diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..d29023f
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+vectorstore/* filter=lfs diff=lfs merge=lfs -text
+vectorstore/*.sqlite filter=lfs diff=lfs merge=lfs -text
+vectorstore/**/*.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/src/paper_query/data/loaders.py b/src/paper_query/data/loaders.py
index c57e19a..8f3b3dc 100644
--- a/src/paper_query/data/loaders.py
+++ b/src/paper_query/data/loaders.py
@@ -10,13 +10,45 @@ from paper_query.llm import setup_model
 
 
 
-def pypdf_loader(file_path: str) -> Document:
+def pypdf_loader(file_path: str, interpret_images: bool = False, **image_kwargs) -> Document:
+    """Load a PDF file, optionally interpreting its images with a chat model.
+
+    Args:
+        file_path: Path of the PDF file to load.
+        interpret_images: If True, delegate to ``_pypdf_loader_w_images``;
+            otherwise load the text only.
+        **image_kwargs: Forwarded to ``_pypdf_loader_w_images`` when
+            ``interpret_images`` is True. ``model`` and ``provider`` are
+            required; ``max_tokens`` is optional. Ignored otherwise.
+
+    Returns:
+        The loaded document.
+
+    Raises:
+        ValueError: If ``interpret_images`` is True and ``model`` or
+            ``provider`` is missing from ``image_kwargs``.
+    """
+    if not interpret_images:
+        return _pypdf_loader(file_path)
+
+    # Both are required positional parameters of _pypdf_loader_w_images; check
+    # them here so a missing one gives a clear ValueError, not a TypeError.
+    missing = [key for key in ("model", "provider") if key not in image_kwargs]
+    if missing:
+        raise ValueError(
+            "When interpret_images is True, 'model' and 'provider' must be provided "
+            f"in image_kwargs (missing: {', '.join(missing)})."
+        )
+    return _pypdf_loader_w_images(file_path, **image_kwargs)
+
+
+def _pypdf_loader(file_path: str) -> Document:
     """Function to load text from a PDF file."""
     logger.debug("Loading PDF file using PyPDFLoader")
     return PyPDFLoader(file_path, mode="single").load()[0]
 
 
-def pypdf_loader_w_images(
+def _pypdf_loader_w_images(
     file_path: str, model: str, provider: str, max_tokens: int = 1024
 ) -> Document:
     """Function to load text from a PDF file with images."""
diff --git a/src/paper_query/llm/models.py b/src/paper_query/llm/models.py
index f183ea1..b0610ba 100644
--- a/src/paper_query/llm/models.py
+++ b/src/paper_query/llm/models.py
@@ -1,10 +1,11 @@
 import os
 
 from langchain.chat_models import init_chat_model
+from langchain_core.language_models.chat_models import BaseChatModel
 from loguru import logger
 
 
-def setup_model(model_name: str, model_provider: str, **kwargs):
+def setup_model(model_name: str, model_provider: str, **kwargs) -> BaseChatModel:
     """Initialize the chat model."""
     logger.info(f"Initializing 
{model_name} model from {model_provider}") if model_provider == "openai": diff --git a/src/paper_query/ui/strain_relief_app.py b/src/paper_query/ui/strain_relief_app.py index c4eead8..2cbe89d 100644 --- a/src/paper_query/ui/strain_relief_app.py +++ b/src/paper_query/ui/strain_relief_app.py @@ -30,7 +30,7 @@ def strain_relief_chatbot(): """Chatbot for the StrainRelief paper.""" initialize_session_state() - st.title("The StrainRelief Chatbot") + st.title("StrainReliefChat") chat_tab, about_tab = st.tabs(["Chat", "About"]) st.sidebar.title("API Configuration") @@ -46,12 +46,14 @@ def strain_relief_chatbot(): # Display current model st.sidebar.markdown(f"Using **{st.session_state.model_name}** model.") - st.session_state.chatbot = HybridQueryChatbot( - model_name=st.session_state.model_name.lower(), - model_provider="openai", - paper_path=str(assets_dir / "strainrelief_preprint.pdf"), - references_dir=str(assets_dir / "references"), - ) + # Only instantiate chatbot once and store in session state + if st.session_state.chatbot is None: + st.session_state.chatbot = HybridQueryChatbot( + model_name=st.session_state.model_name.lower(), + model_provider="openai", + paper_path=str(assets_dir / "strainrelief_preprint.pdf"), + references_dir=str(assets_dir / "references"), + ) with chat_tab: if "messages" not in st.session_state: diff --git a/test/data/test_loaders.py b/test/data/test_loaders.py index 32bbef6..f3c3f73 100644 --- a/test/data/test_loaders.py +++ b/test/data/test_loaders.py @@ -5,7 +5,6 @@ from paper_query.data.loaders import ( code_loader, pypdf_loader, - pypdf_loader_w_images, references_loader, ) @@ -13,7 +12,7 @@ def test_pypdf_loader(test_assets_dir): """Test the pypdf_loader function.""" path = test_assets_dir / "example_pdf.pdf" - doc = pypdf_loader(path) + doc = pypdf_loader(path, interpret_images=False) assert isinstance(doc, Document) @@ -22,7 +21,7 @@ def test_pypdf_loader_w_images(test_assets_dir): """Test the pypdf_loader_w_images function.""" path = 
test_assets_dir / "example_pdf.pdf" # TODO: change to free model - doc = pypdf_loader_w_images(path, "gpt-4.1-nano", "openai") + doc = pypdf_loader(path, interpret_images=True, model="gpt-4.1-nano", provider="openai") assert isinstance(doc, Document) diff --git a/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/data_level0.bin b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/data_level0.bin new file mode 100644 index 0000000..d12da93 --- /dev/null +++ b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/data_level0.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88dd85fa556c4fa603eca6de31919f9bb8ef126a170d1da3caab8045063fca8a +size 62840000 diff --git a/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/header.bin b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/header.bin new file mode 100644 index 0000000..2c277ba --- /dev/null +++ b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/header.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b5532955f22844eaeaf4dd7836818cf08c164d4a2de55a326648288dcf3911 +size 100 diff --git a/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/index_metadata.pickle b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/index_metadata.pickle new file mode 100644 index 0000000..1f7aeb4 Binary files /dev/null and b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/index_metadata.pickle differ diff --git a/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/length.bin b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/length.bin new file mode 100644 index 0000000..0c044fd --- /dev/null +++ b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/length.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d29acdf449874a292afbe2eaeecb42b7b484c8c6b1ba1283ea3f21c76f053cb +size 40000 diff --git a/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/link_lists.bin b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/link_lists.bin new file mode 100644 index 0000000..04c4e8b --- /dev/null 
+++ b/vectorstore/30412df4-0651-48bc-86f1-8c77209061b8/link_lists.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75564ee38bb483c9c474b93eb1b617523116f3aeb594b61bf39d884835f03e9b +size 30692 diff --git a/vectorstore/chroma.sqlite3 b/vectorstore/chroma.sqlite3 new file mode 100644 index 0000000..e4aaaee --- /dev/null +++ b/vectorstore/chroma.sqlite3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ab584cdaceced622524725f43262bc985bec4241f38e7ac5f6c938a0cd05985 +size 66457600