From 75068a021f9f71047343b01f3e223c2f8c60a1e1 Mon Sep 17 00:00:00 2001 From: marauder37 Date: Tue, 4 Jul 2023 17:38:20 +1000 Subject: [PATCH 1/6] Update llama_index to 0.6.38.post1 --- requirements/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/base.txt b/requirements/base.txt index 3a67ecd..1f2178b 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -32,6 +32,6 @@ channels_redis # NLP-Related # ------------------------------------------------------------------------------ -llama_index==0.5.25 # https://github.com/jerryjliu/llama_index +llama_index==0.6.38.post1 # https://github.com/jerryjliu/llama_index PyPDF2==3.* # https://pypdf2.readthedocs.io/en/latest/ docx2txt==0.8 From 8c3948334716dd8346a1b56cd42a7eb01b364cd1 Mon Sep 17 00:00:00 2001 From: marauder37 Date: Tue, 4 Jul 2023 17:47:56 +1000 Subject: [PATCH 2/6] Migrate deprecated GPTSimpleVectorIndex to GPTVectorStoreIndex --- delphic/tasks/index_tasks.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/delphic/tasks/index_tasks.py b/delphic/tasks/index_tasks.py index 56e0307..57a84f3 100644 --- a/delphic/tasks/index_tasks.py +++ b/delphic/tasks/index_tasks.py @@ -1,3 +1,4 @@ +import json import logging import os import tempfile @@ -8,7 +9,7 @@ from django.core.files import File from langchain import OpenAI from llama_index import ( - GPTSimpleVectorIndex, + GPTVectorStoreIndex, LLMPredictor, ServiceContext, download_loader, @@ -23,11 +24,11 @@ @celery_app.task def create_index(collection_id): """ - Celery task to create a GPTSimpleVectorIndex for a given Collection object. + Celery task to create a GPTVectorStoreIndex for a given Collection object. This task takes the ID of a Collection object, retrieves it from the database along with its related documents, and saves the document files - to a temporary directory. Then, it creates a GPTSimpleVectorIndex using + to a temporary directory.
Then, it creates a GPTVectorStoreIndex using the provided code and saves the index to the Comparison.model FileField. Args: @@ -60,15 +61,18 @@ def create_index(collection_id): with temp_file_path.open("wb") as f: f.write(file_data) - # Create the GPTSimpleVectorIndex - SimpleDirectoryReader = download_loader("SimpleDirectoryReader") + # Create the GPTVectorStoreIndex + try: + SimpleDirectoryReader = download_loader("SimpleDirectoryReader") + except Exception as e: + logger.error(f"Error downloading SimpleDirectoryReader: {e}") + raise + loader = SimpleDirectoryReader( tempdir_path, recursive=True, exclude_hidden=False ) documents = loader.load_data() - # index = GPTSimpleVectorIndex(documents) - # documents = SimpleDirectoryReader(str(tempdir_path)).load_data() llm_predictor = LLMPredictor( llm=OpenAI( temperature=0, @@ -81,11 +85,11 @@ ) # build index - index = GPTSimpleVectorIndex.from_documents( + index = GPTVectorStoreIndex.from_documents( documents, service_context=service_context ) - index_str = index.save_to_string() + index_str = json.dumps(index.storage_context.to_dict()) # Save the index_str to the Comparison.model FileField with tempfile.NamedTemporaryFile(delete=False) as f: @@ -105,7 +109,9 @@ return True except Exception as e: - logger.error(f"Error creating index for collection {collection_id}: {e}") + logger.error( + f"{type(e).__name__} creating index for collection {collection_id}: {e}" + ) collection.status = CollectionStatus.ERROR collection.save() From 93fd9b37c46590957b8b0f8d324d04ca3179ac82 Mon Sep 17 00:00:00 2001 From: marauder37 Date: Tue, 4 Jul 2023 17:49:19 +1000 Subject: [PATCH 3/6] Make index query via `query_engine` --- config/api/websockets/queries.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/api/websockets/queries.py b/config/api/websockets/queries.py index e4eb687..10c41ae 100644 --- a/config/api/websockets/queries.py +++
b/config/api/websockets/queries.py @@ -39,7 +39,9 @@ async def receive(self, text_data): {query_str} """ - response = self.index.query(modified_query_str) + + query_engine = self.index.as_query_engine() + response = query_engine.query(modified_query_str) # Format the response as markdown markdown_response = f"## Response\n\n{response}\n\n" From 55ed2ab2ce8dec64d21eab5b20db913e3ef6dc0e Mon Sep 17 00:00:00 2001 From: marauder37 Date: Tue, 4 Jul 2023 17:51:26 +1000 Subject: [PATCH 4/6] Migrate deprecated GPTSimpleVectorIndex to VectorStoreIndex --- delphic/utils/collections.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/delphic/utils/collections.py b/delphic/utils/collections.py index 2b50385..8fd9bf6 100644 --- a/delphic/utils/collections.py +++ b/delphic/utils/collections.py @@ -1,10 +1,11 @@ +import json import logging import textwrap from pathlib import Path from django.conf import settings -from langchain import OpenAI -from llama_index import GPTSimpleVectorIndex, LLMPredictor, ServiceContext +from llama_index import StorageContext, load_index_from_storage +from llama_index.indices.base import BaseIndex from delphic.indexes.models import Collection @@ -27,7 +28,7 @@ def format_source(source): return formatted_source -async def load_collection_model(collection_id: str | int) -> GPTSimpleVectorIndex: +async def load_collection_model(collection_id: str | int) -> "BaseIndex": """ Load the Collection model from cache or the database, and return the index. Args: collection_id (Union[str, int]): The ID of the Collection model instance. Returns: - GPTSimpleVectorIndex: The loaded index. + VectorStoreIndex: The loaded index. This function performs the following steps: 1. Retrieve the Collection object with the given collection_id. 2. Check if a JSON file with the name '/cache/model_{collection_id}.json' exists. - 3.
If the JSON file doesn't exist, load the JSON from the Collection.model FileField and save it to + 3. If the JSON file doesn't exist, load the JSON from the `Collection.model` FileField and save it to '/cache/model_{collection_id}.json'. - 4. Call GPTSimpleVectorIndex.load_from_disk with the cache_file_path. + 4. Call VectorStoreIndex.load_from_disk with the cache_file_path. """ # Retrieve the Collection object collection = await Collection.objects.aget(id=collection_id) @@ -61,21 +62,12 @@ async def load_collection_model(collection_id: str | int) -> GPTSimpleVectorInde with cache_file_path.open("w+", encoding="utf-8") as cache_file: cache_file.write(model_file.read().decode("utf-8")) - # define LLM - logger.info( - f"load_collection_model() - Setup service context with tokens {settings.MAX_TOKENS} and " - f"model {settings.MODEL_NAME}" - ) - llm_predictor = LLMPredictor( - llm=OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=512) - ) - service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor) - - # Call GPTSimpleVectorIndex.load_from_disk + # Call VectorStoreIndex.load_from_disk logger.info("load_collection_model() - Load llama index") - index = GPTSimpleVectorIndex.load_from_disk( - cache_file_path, service_context=service_context - ) + with cache_file_path.open("r") as cache_file: + storage_context = StorageContext.from_dict(json.load(cache_file)) + index = load_index_from_storage(storage_context) + logger.info( "load_collection_model() - Llamaindex loaded and ready for query..."
) From 2f05aa5f1ccc7e0d513ba59e8efb62deb2cb6ff1 Mon Sep 17 00:00:00 2001 From: marauder37 Date: Tue, 4 Jul 2023 17:53:08 +1000 Subject: [PATCH 5/6] Auto-reload Celery tasks --- compose/local/django/celery/worker/start | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start index 5bbb5d2..e92cf8d 100644 --- a/compose/local/django/celery/worker/start +++ b/compose/local/django/celery/worker/start @@ -4,5 +4,5 @@ set -o errexit set -o nounset -#exec watchfiles celery.__main__.main --args '-A config.celery_app worker -l INFO' -exec celery -A config.celery_app worker -l INFO +exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker -l INFO' +#exec celery -A config.celery_app worker -l INFO From 3c9a1a7bc103189efba38637062b1357b01d5361 Mon Sep 17 00:00:00 2001 From: marauder37 Date: Tue, 4 Jul 2023 18:12:29 +1000 Subject: [PATCH 6/6] Exclude local configuration from version control Update .gitignore with current cookiecutter-django setup, which is to exclude .idea/ entirely --- .gitignore | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 75a0317..74f1a07 100644 --- a/.gitignore +++ b/.gitignore @@ -162,29 +162,6 @@ typings/ # Local History for Visual Studio Code .history/ - -# Provided default Pycharm Run/Debug Configurations should be tracked by git -# In case of local modifications made by Pycharm, use update-index command -# for each changed file, like this: -# git update-index --assume-unchanged .idea/chat_all_the_docs.iml -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff: -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/dictionaries - -# Sensitive or high-churn files:
-.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.xml -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml - # Gradle: .idea/**/gradle.xml .idea/**/libraries @@ -338,3 +315,14 @@ delphic/media/* ### Models for Question Answering cache/* + +# https://github.com/cookiecutter/cookiecutter-django/blob/de8759fdbd45ac288b97e050073a5d09f50029db/.gitignore#L211 +# Even though the project might be opened and edited +# in any of the JetBrains IDEs, it makes no sense whatsoever +# to 'run' anything within it since any particular cookiecutter +# is declarative by nature. +.idea/ + +### Local configuration files +/.envs/.local +/frontend/.frontend