From e94338909b1f142479b83ac7d8be50e93b6f695e Mon Sep 17 00:00:00 2001
From: Fabian Moertter
Date: Wed, 2 Aug 2023 13:05:14 +0200
Subject: [PATCH 1/2] Caching: cache embedded_pdf

---
Review notes (ignored by git-am; not part of the commit message):
 * Line breaks were rebuilt from the hunk header counts. Leading
   whitespace and blank context lines are a best-effort reconstruction;
   apply with --ignore-whitespace and check against pipelines.py.

 pipelines.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pipelines.py b/pipelines.py
index 1927cff..fe5dbda 100644
--- a/pipelines.py
+++ b/pipelines.py
@@ -1,9 +1,11 @@
+from functools import lru_cache
 from filepreprocessing import pdf_get_text_chunks
 from tqdm import tqdm
 from embedding import get_embedding_sentence_transformer
 from similarity import model_qa, cosine_similarity
 
 
+@lru_cache(maxsize=64)
 def embedding_loaded_pdf(file_path, chunk_size, overlap):
 
     # FIRST WE LOAD PDF

From f7c27bca4c8f1ad2bb6745ecf44fc83c04225508 Mon Sep 17 00:00:00 2001
From: Fabian Moertter
Date: Wed, 2 Aug 2023 13:28:16 +0200
Subject: [PATCH 2/2] Features: add/remove/search multiple docs fixes and
 adding comments

---
Review notes (ignored by git-am; not part of the commit message):
 * get_answer_file: each selected document is now embedded from its own
   file['datapath']. The submitted version passed
   input.document_input_files()[0]['datapath'] on every loop iteration,
   so every selected document was embedded from the first file of the
   most recent upload.
 * Because of that one-line change, the post-image blob id on the
   "index" line below no longer matches the resulting file.
 * NOTE(review): the upload effect appends to the list returned by
   documents.get() and passes the same list object back to
   documents.set(); presumably that is why a second effect on
   input.document_input_files refreshes the checkbox group. Confirm
   against the reactive.Value documentation before relying on it.
 * Line breaks were rebuilt from the hunk header counts. Leading
   whitespace and blank context lines are a best-effort reconstruction;
   apply with --ignore-whitespace and check against app.py.

 app.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 5 deletions(-)

diff --git a/app.py b/app.py
index 31debb5..55e6057 100644
--- a/app.py
+++ b/app.py
@@ -33,10 +33,13 @@
     ui.nav('Ask File',
         ui.layout_sidebar(
             ui.panel_sidebar(
-                ui.input_file('document_input_file',
-                              'Select a PDF file you wish to ask a question about',
-                              multiple=False, accept='.pdf', button_label='Select',
+                ui.input_file('document_input_files',
+                              'Select one or more PDF file(s) you wish to ask a question about',
+                              multiple=True, accept='.pdf', button_label='Select',
                               placeholder='Your PDF here..'),
+                ui.input_checkbox_group('selected_files', '', []),
+                # TODO: Hide remove button behind panel_conditional when no file is uploaded yet
+                ui.input_action_button('remove_selected', label="Remove selected PDF(s)"),
                 ui.input_text_area('question_input_file', 'What wisdom do you seek from this file?', rows=4),
                 ui.input_slider('n_chunks_file', 'Number of chunks', min=1, max=5, value=3),
                 ui.input_action_button(id="run_process_file", label="Do Magic", class_='btn-success'),
@@ -45,7 +48,7 @@
             ),
             ui.panel_main(
                 ui.panel_conditional(
-                    """input.run_process_file > 0 && input.question_input_file != ''""", # && input.document_input_file != null
+                    """input.run_process_file > 0 && input.question_input_file != ''""", # && input.document_input_files != null
                     ui.output_text('get_answer_file'),
                 ),
                 width=8,
@@ -60,6 +63,19 @@
 
 
 def server(input, output, session):
+    documents = reactive.Value([])
+
+    @reactive.Effect
+    @reactive.event(input.document_input_files)
+    def _():
+        docs = documents.get()
+        document_names = [ file['name'] for file in docs ]
+
+        for file in input.document_input_files():
+            if file['name'] not in document_names:
+                docs.append(file)
+        documents.set(docs)
+
     val = reactive.Value(3)
     @reactive.Effect
     @reactive.event(input.n_chunks_db)
@@ -83,11 +99,57 @@ async def get_answer_db():
         answer = re.sub('\s*$', '', answer)
         return answer
 
+    @reactive.Effect
+    @reactive.event(input.document_input_files)
+    def _():
+        """
+        Update checkbox_group after uploading file(s).
+        """
+        docs = documents.get()
+        choices = [ file['name'] for file in docs ]
+        ui.update_checkbox_group('selected_files', label="Selected file(s):", choices=choices, selected=choices)
+
+    @reactive.Effect
+    @reactive.event(documents.get)
+    def _():
+        """
+        Update checkbox_group after deleting file(s).
+        """
+        docs = documents.get()
+        choices = [ file['name'] for file in docs ]
+        if choices:
+            label = "Selected file(s):"
+        else:
+            label = ""
+        ui.update_checkbox_group('selected_files', label=label, choices=choices, selected=[])
+        # Use this to auto select remaining documents after deletion
+        # ui.update_checkbox_group('selected_files', label=label, choices=choices, selected=choices)
+
+    @reactive.Effect
+    @reactive.event(input.remove_selected)
+    def _():
+        """
+        Remove selected file(s).
+        """
+        docs = documents.get()
+        docs_to_keep = [ file for file in docs if file['name'] not in input.selected_files() ]
+        # Use this to keep not selected files
+        # docs_to_keep = [ file for file in docs if file['name'] in input.selected_files() ]
+        documents.set(docs_to_keep)
+
     @output()
     @render.text
     @reactive.event(input.run_process_file)
     async def get_answer_file():
-        db_items = embedding_loaded_pdf(file_path=input.document_input_file()[0]['datapath'], chunk_size=200, overlap=10)
+        db_items = []
+
+        docs = documents.get()
+
+        for file in docs:
+            if file['name'] in input.selected_files():
+                file['db_items'] = embedding_loaded_pdf(file_path=file['datapath'], chunk_size=200, overlap=10)
+                db_items.extend(file['db_items'])
+
         answer = pipeline_return_question_and_answer(query=input.question_input_file(),
                                                      db_items=db_items,
                                                      n_chunks=input.n_chunks_file())