-
Notifications
You must be signed in to change notification settings - Fork 1
/
notion_arxiv_browse_chat.py
130 lines (124 loc) · 6.36 KB
/
notion_arxiv_browse_chat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
from os.path import join
from notion_client import Client
import arxiv
import questionary
import textwrap
from prompt_toolkit import PromptSession
from prompt_toolkit.history import InMemoryHistory, FileHistory
import yaml
import requests
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
try:
from langchain.document_loaders import PDFMinerLoader, PyPDFLoader, BSHTMLLoader, UnstructuredURLLoader # for loading the pdf
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
from langchain.vectorstores import Chroma # for the vectorization part
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import get_openai_callback
except:
from langchain_community.document_loaders import PDFMinerLoader, PyPDFLoader, BSHTMLLoader, UnstructuredURLLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain_community.callbacks import get_openai_callback
from notion_tools import QA_notion_blocks, clean_metadata, print_entries, save_qa_history, load_qa_history, print_qa_result
from arxiv_browse_lib import add_to_notion, notion_paper_chat, fetch_K_results, print_arxiv_entry, arxiv_paper_download
# Prompt history for arXiv queries, persisted to a file so Up/Down can recall
# earlier queries.
history = FileHistory("notion_arxiv_history.txt")
session = PromptSession(history=history)
# Separate persisted prompt history for the Q&A chat.
chathistory = FileHistory("qa_chat_history.txt")
chatsession = PromptSession(history=chathistory)

# NOTE(review): yaml.load with FullLoader is kept as in the original; this
# assumes config.yaml is a trusted local file. Prefer yaml.safe_load if the
# config can ever come from an untrusted source.
with open("config.yaml", encoding="utf-8") as file:
    config = yaml.load(file, Loader=yaml.FullLoader)

MAX_RESULTS_PER_PAGE = int(config["MAX_RESULTS_PER_PAGE"])
PDF_DOWNLOAD_ROOT = config["PDF_DOWNLOAD_ROOT"]
EMBED_ROOTDIR = config["EMBED_ROOTDIR"]
os.makedirs(PDF_DOWNLOAD_ROOT, exist_ok=True)
os.makedirs(EMBED_ROOTDIR, exist_ok=True)
print(f"PDFs will be downloaded to {PDF_DOWNLOAD_ROOT}")
print(f"Computed embeddings will be saved to {EMBED_ROOTDIR}")

# Bind these names unconditionally so that code further down never hits a
# NameError when Notion is not configured; `save2notion` records whether both
# the token and a usable database id are present.
notion = None
database_id = None
save2notion = False
if "NOTION_TOKEN" in os.environ:
    notion = Client(auth=os.environ["NOTION_TOKEN"])
    # .get(): a missing key is treated the same as an unset database id.
    database_id = config.get("database_id")
    # notion.databases.query(database_id, filter={"property": "Name", "text": {"is_not_empty": True}}, )
    # None covers an empty/null YAML value; the strings cover the template
    # placeholder and a literal "None" typed into the config.
    if database_id in (None, "", "None", "PUT_YOUR_DATABASE_ID_HERE"):
        print("Please set the database_id in config.yaml.")
    else:
        save2notion = True
else:
    print("Please set the NOTION_TOKEN environment variable.")

default_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, chunk_overlap=200)
# Example queries:
#   2106.05963
#   au:Yann LeCun
# Controls:
#   Ctrl-C in the navigation loop leaves it and starts a new query
#   Ctrl-C (or Ctrl-D) in the query prompt exits the program
#   Up/Down navigates through prompts and query history


def _paper_id(paper):
    """Return the last '/'-separated segment of ``paper.entry_id``."""
    return paper.entry_id.split("/")[-1]


def _review_paper(paper):
    """Print one search result, then optionally add it to Notion and chat.

    The Q&A chat is only offered after the entry has been added, because the
    chat call needs the ``page_id`` returned by ``add_to_notion``.
    """
    arxiv_id = _paper_id(paper)
    print_arxiv_entry(paper)
    if not save2notion:
        # Without a token and database id the add call cannot succeed.
        print("Notion is not configured; skipping save and Q&A chat.")
        return
    # ask() yields a falsy value when the user declines or cancels.
    if not questionary.confirm("Add this entry?").ask():
        return
    page_id, _ = add_to_notion(notion, database_id, paper)
    if questionary.confirm("Q&A Chatting with this file?").ask():
        pages = arxiv_paper_download(arxiv_id, pdf_download_root=PDF_DOWNLOAD_ROOT)
        notion_paper_chat(arxiv_id=arxiv_id, pages=pages, save_page_id=page_id,
                          notion_client=notion, embed_rootdir=EMBED_ROOTDIR,
                          chatsession=chatsession, )


def _browse_results(search_obj, results, offset):
    """Let the user page through search results and review selected papers.

    Args:
        search_obj: the search object handed to ``fetch_K_results``.
        results: the results of the page currently shown.
        offset: offset of ``results`` within the whole search.

    Returns when the selection prompt is cancelled.
    """
    last_selection = None  # index of the last chosen paper, used as default
    while True:
        print("Multiple results found. Please select one:")
        choices = [f"{i + 1}: [{_paper_id(paper)}] {paper.title} "
                   for i, paper in enumerate(results)]
        # A full page suggests there may be more results after it.
        if len(results) == MAX_RESULTS_PER_PAGE:
            choices.append("0: Next page")
        if offset > 0:
            choices.append("-1: Prev page")
        default = None if last_selection is None else choices[last_selection]
        selection = questionary.select("Select paper:", choices=choices,
                                       default=default).ask()
        if selection is None:
            # Prompt was cancelled (Ctrl-C): go back to the query prompt.
            return
        # Every choice starts with "<number>:", so the text before the first
        # colon is the numeric code (paper number, 0 = next, -1 = prev).
        selection = int(selection.split(":")[0])
        if selection in (0, -1):
            offset += MAX_RESULTS_PER_PAGE if selection == 0 else -MAX_RESULTS_PER_PAGE
            results = fetch_K_results(search_obj, K=MAX_RESULTS_PER_PAGE, offset=offset)
            # The old index refers to the previous page; keeping it could
            # highlight the wrong paper or index past the new choice list.
            last_selection = None
            continue
        last_selection = selection - 1
        _review_paper(results[last_selection])


# main loop
while True:
    try:
        query = session.prompt("Enter arXiv ID or query str: ", multiline=False)
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / Ctrl-D at the query prompt ends the program.
        break
    if not query.strip():
        # Nothing to search for; ask again.
        continue
    try:
        cnt = 0
        search_obj = arxiv.Search(query, )
        results_arxiv = fetch_K_results(search_obj, K=MAX_RESULTS_PER_PAGE, offset=cnt)
        if len(results_arxiv) == 0:
            print("No results found.")
        elif len(results_arxiv) == 1:
            _review_paper(results_arxiv[0])
        else:
            _browse_results(search_obj, results_arxiv, cnt)
    except KeyboardInterrupt:
        break
    except Exception as e:
        # Top-level boundary of the interactive loop: report the failure and
        # return to the query prompt instead of crashing the session.
        print("Chat loop failed with exception:")
        print(e)
        continue