diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index ae040ad..6bc4398 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -24,7 +24,3 @@ jobs: uses: cytopia/docker-black@0.8 with: path: 'workers/embedder.py' - - name: Python Black (scheduler) - uses: cytopia/docker-black@0.8 - with: - path: 'workers/scheduler.py' diff --git a/LICENSE b/LICENSE index 261eeb9..a4ed920 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2024 Research Chain Team Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 9e6d120..fac5a8d 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,27 @@ Frontend is launched separately to back end, run the following command to start - `environment.yml` is the linux env, but for macOS (silicon) and windows there are other available - Apple intel is not supported anymore, but you can still get it working by manually installing any missing package that comes up during the program execution. +- `pull access denied for X` error: The connection may occasionally get throttled, resulting in this error. + To solve this issue, let all the current downloads finish downloading, and restart the program. + Repeat until every file is downloaded. + +#### Running locally + +If you intend on running this project locally, whether for development, debugging or performance purposes, +you'll still need to have these three docker containers launched somewhere in the background: `ollama`, `postgres` and `rabbitmq`. 
+ +Here are the commands to create the conda environment and start all three containers at once: + +- `conda env create -f environment.yml` +- `sudo docker-compose -f docker/docker-compose.yml up ollama postgres rabbitmq` + +The rest of the workers and services can now be launched directly via `main.py`: + +- `conda activate ResearchChain` +- `python main.py -w crawler` + +For a list of all available workers and services see: +- `python main.py -h` ### This is a monorepo for both a tool, and apps for it: diff --git a/configs/none.json b/configs/none.json new file mode 100644 index 0000000..0001f50 --- /dev/null +++ b/configs/none.json @@ -0,0 +1,5 @@ +{ + "worker_type": "none", + "llm_config_name": "none", + "embedder_config_name": "none" +} \ No newline at end of file diff --git a/configurator.py b/configurator.py index c039b10..1892b85 100644 --- a/configurator.py +++ b/configurator.py @@ -23,7 +23,6 @@ type=str, dest="worker_type", choices=[ - "webui", "crawler", "embedder", "summarizer", @@ -31,6 +30,18 @@ default="none", help="Select one of the ready worker configs to be used", ) +parser.add_argument( + "-s", + "--run-scheduler", + type=str, + dest="scheduler_type", + choices=[ + "webui", + "deep_searcher", + ], + default="none", + help="Select one of the available schedulers", +) parser.add_argument( "-c", "--custom-worker-path", @@ -78,11 +89,11 @@ def get_runtime_config(): global runtime_config - fallback_config_path = "configs/crawler.json" + empty_config_path = "configs/none.json" - if args.worker_type == "webui": + if args.worker_type == "none": # fixme: this is a workaround, webui should be started from it's folder - return load_runtime_config_from_file(fallback_config_path) + return load_runtime_config_from_file(empty_config_path) # fetch cache if runtime_config: diff --git a/core/chainables/web.py b/core/chainables/web.py index 8489f20..73c9527 100644 --- a/core/chainables/web.py +++ b/core/chainables/web.py @@ -77,3 +77,35 @@ def web_news_lookup_prompt(): ), ] ) 
+ + +def basic_query_prompt(): + return ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a personal assistant. " + "Your job is to respond to the requests given to you by the user. " + "You are to follow the requests given precisely and intelligently. " + "Answer or complete the request to the best of your abilities. ", + ), + ("user", "{user_request}"), + ] + ) + + +# some schedulers may require data-extraction capabilities outside data-gathering +def structured_extraction_prompt(): + return ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a data extraction and analysis specialist. " + "Your job is to respond in a structured way to the question you were given. " + "You are to follow the orders given precisely and intelligently. " + "You are provided with data chunk, use it, to fulfill user's request. " + "Satisfy the requested task to the best of your abilities. ", + ), + ("user", "Data: ```{data}``` User request: '{user_request}'"), + ] + ) diff --git a/core/databases/db_completion_tasks.py b/core/databases/db_completion_tasks.py index e408b3e..6993c0b 100644 --- a/core/databases/db_completion_tasks.py +++ b/core/databases/db_completion_tasks.py @@ -1,4 +1,3 @@ -from typing import Optional from sqlalchemy import String, TEXT, Integer, Boolean, select, update from sqlalchemy.orm import Mapped, mapped_column, Session, relationship @@ -106,6 +105,20 @@ def db_get_incomplete_completion_tasks(amount: int = 1): return results +def db_get_complete_completion_tasks(amount: int = 1): + with Session(engine) as session: + session.expire_on_commit = False + + query = ( + select(CompletionTask).where(CompletionTask.completed == True).limit(amount) + ) + + results = list(session.scalars(query).all()) + session.expunge_all() + + return results + + def db_release_executing_tasks(uuid_list: list[str]): with Session(engine) as session: session.execute( @@ -149,3 +162,21 @@ def db_update_completion_task_after_summarizing(summary: str, uuid: str): ) 
session.commit() + + +def db_refresh_completion_tasks(timeout_seconds: int = 600): + # release completion tasks whose execution timed out, putting them back into the awaiting state: a task counts as timed out when it is still executing, not completed, and its execution_date is older than (now - timeout_seconds) + # defaults to 600 seconds = 10 minutes; assumes execution_date is the unix time (in seconds) at which execution started - TODO confirm + timeout_date = utils.gen_unix_time() - timeout_seconds + with Session(engine) as session: + session.execute( + update(CompletionTask) + .where(CompletionTask.executing == True) + .where(CompletionTask.completed == False) + .where(CompletionTask.execution_date < timeout_date) + .values( + executing=False, + ) + ) + + session.commit() diff --git a/core/databases/db_crawl_tasks.py b/core/databases/db_crawl_tasks.py index 320efa5..a83b14f 100644 --- a/core/databases/db_crawl_tasks.py +++ b/core/databases/db_crawl_tasks.py @@ -225,3 +225,21 @@ def db_increment_task_embedding_progression(uuid: str, model_name: str): ) session.commit() + + +def db_refresh_crawl_tasks(timeout_seconds: int = 600): + # release crawl tasks whose execution timed out, putting them back into the awaiting state: a task counts as timed out when it is still executing, not completed, and its execution_date is older than (now - timeout_seconds) + # defaults to 600 seconds = 10 minutes; assumes execution_date is the unix time (in seconds) at which execution started - TODO confirm + timeout_date = utils.gen_unix_time() - timeout_seconds + with Session(engine) as session: + session.execute( + update(CrawlTask) + .where(CrawlTask.executing == True) + .where(CrawlTask.completed == False) + .where(CrawlTask.execution_date < timeout_date) + .values( + executing=False, + ) + ) + + session.commit() diff --git a/core/tools/model_loader.py b/core/tools/model_loader.py index 33370f7..0c88ed3 100644 --- a/core/tools/model_loader.py +++ b/core/tools/model_loader.py @@ -3,6 +3,7 @@ from langchain_community.llms.ollama import Ollama from huggingface_hub import hf_hub_download from llama_cpp import Llama +from langchain_experimental.llms.ollama_functions import OllamaFunctions from configurator import get_runtime_config from core.tools import errorlib @@ -11,6 +12,7 @@ llm_config = runtime_configuration.llm_config embedder_config = runtime_configuration.embedder_config + # problem with the current 
caching: we have to share those singletons across instances # fixme: n_gpu_layers=-1 is a poor approach, it can and will cause crashes. @@ -27,6 +29,11 @@ def load_ollama_llm() -> Ollama: return llm +def load_ollama_functional_llm() -> OllamaFunctions: + # todo: As far as i see OllamaFunctions could be used by default, but old code has to be adapted + return OllamaFunctions(model=llm_config.model_name, base_url="http://ollama:11434") + + def load_ollama_embedder() -> OllamaEmbeddings: cached_embedder = runtime_configuration.embedder_object if cached_embedder: @@ -89,6 +96,25 @@ def load_llm(): return load_ollama_llm() +def load_functional_llm(): + # EXPERIMENTAL + if llm_config is None: + errorlib.pretty_error( + title="Tried loading functional LLM without a valid configuration", + advice=f"Your worker configuration file is likely missing " + f"a valid {Fore.CYAN}llm_config_name{Fore.RESET} variable", + ) + + if llm_config.supplier == "hugging_face": + errorlib.pretty_error( + title="Tried running functional model with a HF configuration.", + advice=f"Functional models are not yet supported with llama.cpp loaders. " + f"Please switch to {Fore.CYAN}Ollama{Fore.RESET} or stop using functional models.", + ) + else: + return load_ollama_functional_llm() + + def load_embedder(): if embedder_config is None: errorlib.pretty_error( diff --git a/docker/deep_searcher/Dockerfile b/docker/deep_searcher/Dockerfile new file mode 100644 index 0000000..c0290bd --- /dev/null +++ b/docker/deep_searcher/Dockerfile @@ -0,0 +1,16 @@ +FROM condaforge/miniforge3 + +WORKDIR /app + +COPY . 
/app + +RUN apt-get update && apt-get install -y \ + build-essential \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +RUN conda env create -f environment.yml + +SHELL ["conda", "run", "-n", "ResearchChain", "/bin/bash", "-c"] + +ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "ResearchChain", "python3", "main.py", "-s", "deep_searcher"] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index e2861ff..c7c7895 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -26,6 +26,8 @@ services: - app-network pgadmin: + profiles: + - pgadmin image: dpage/pgadmin4 depends_on: - postgres @@ -39,8 +41,6 @@ services: ollama: image: ollama/ollama - expose: - - 11434 ports: - 11434:11434 pull_policy: always @@ -113,6 +113,8 @@ services: - app-network frontend: + profiles: + - frontend image: frontend depends_on: - postgres @@ -125,6 +127,24 @@ services: networks: - app-network + deep_searcher: + profiles: + - deep_searcher + image: deep_searcher + depends_on: + postgres: + condition: service_started + rabbitmq: + condition: service_healthy + ollama: + condition: service_started + build: + context: ../. 
+ dockerfile: ./docker/deep_searcher/Dockerfile + networks: + - app-network + - ollama-network + volumes: ollama: pgdata: diff --git a/docker/webui/Dockerfile b/docker/webui/Dockerfile index 8c4a0cb..cd07c01 100644 --- a/docker/webui/Dockerfile +++ b/docker/webui/Dockerfile @@ -16,4 +16,4 @@ EXPOSE 8000 SHELL ["conda", "run", "-n", "ResearchChain", "/bin/bash", "-c"] -ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "ResearchChain", "python3", "main.py", "-w", "webui"] +ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "ResearchChain", "python3", "main.py", "-s", "webui"] diff --git a/environment.yml b/environment.yml index a28ae00..b11ec2f 100644 --- a/environment.yml +++ b/environment.yml @@ -3,27 +3,46 @@ channels: - conda-forge - defaults dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 - beautifulsoup4=4.12.2 - brotli-python=1.0.9 - bzip2=1.0.8 - ca-certificates=2024.3.11 - certifi=2024.6.2 + - cffi=1.16.0 + - charset-normalizer=3.3.2 - colorama=0.4.6 + - cudatoolkit=11.8.0 - filelock=3.13.1 - fsspec=2023.10.0 - googlesearch=3.0.0 + - h2=4.1.0 + - hpack=4.0.0 - huggingface_hub=0.20.3 + - hyperframe=6.0.1 + - idna=3.7 + - keyutils=1.6.1 - krb5=1.20.1 + - ld_impl_linux-64=2.40 - libblas=3.9.0 - libcxx=8.0.0 + - libcxxabi=8.0.0 - libedit=3.1.20230828 - libfaiss=1.7.4 - libffi=3.4.2 + - libgcc-ng=14.1.0 + - libgfortran-ng=11.3.0 - libgfortran5=11.3.0 + - libgomp=14.1.0 - liblapack=3.9.0 + - libnsl=2.0.1 - libopenblas=0.3.23 - libpq=12.17 - libsqlite=3.45.3 + - libstdcxx-ng=14.1.0 + - libuuid=2.38.1 + - libxcrypt=4.4.36 - libzlib=1.2.13 - llvm-openmp=18.1.3 - ncurses=6.4.20240210 @@ -31,6 +50,7 @@ dependencies: - packaging=23.2 - pip=24.0 - psycopg2=2.9.9 + - pycparser=2.22 - pysocks=1.7.1 - python=3.9.19 - python_abi=3.9 @@ -42,12 +62,13 @@ dependencies: - sqlite=3.45.3 - tk=8.6.13 - tqdm=4.65.0 - - typing_extensions=4.9.0 - tzdata=2024a - wheel=0.43.0 - xz=5.2.6 - yaml=0.2.5 - zlib=1.2.13 + - zstandard=0.23.0 + - zstd=1.5.6 - pip: - 
aiohttp==3.9.5 - aiosignal==1.3.1 @@ -57,7 +78,6 @@ dependencies: - attrs==23.2.0 - black==24.4.2 - chardet==5.2.0 - - charset-normalizer==3.3.2 - click==8.1.7 - dataclasses-json==0.6.4 - diskcache==5.6.3 @@ -68,19 +88,20 @@ dependencies: - fastapi==0.111.0 - fastapi-cli==0.0.4 - frozenlist==1.4.1 + - greenlet==3.0.3 - h11==0.14.0 - httpcore==1.0.5 - httptools==0.6.1 - httpx==0.27.0 - - idna==3.7 - jinja2==3.1.3 - jsonpatch==1.33 - jsonpointer==2.4 - - langchain==0.1.17 - - langchain-community==0.0.36 - - langchain-core==0.1.50 - - langchain-text-splitters==0.0.1 - - langsmith==0.1.48 + - langchain==0.2.8 + - langchain-community==0.2.7 + - langchain-core==0.2.19 + - langchain-experimental==0.0.62 + - langchain-text-splitters==0.2.2 + - langsmith==0.1.85 - llama-cpp-python==0.2.61 - markdown-it-py==3.0.0 - markupsafe==2.1.5 diff --git a/main.py b/main.py index cc24e22..5172ad6 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ from configurator import get_runtime_config, args from core.databases import db_base from core.tools import errorlib +from schedulers.deepsearcher import start_deep_searcher from workers.crawler import start_crawler from workers.embedder import start_embedder from workers.summarizer import start_summarizer @@ -13,17 +14,33 @@ colorama_init() db_base.db_init() -if args.worker_type == "webui": - # fixme: this is a workaround, webui should be started from it's folder - uvicorn.run("webui.main:app", host="0.0.0.0", port=8000) +# todo: change deep_searcher into a scheduler instead of worker, -s flag +# this is because workers use cpu, gpu and memory resources in a distributed way, +# while a scheduler only does lightweight management without any significant load, +# and most importantly contrary to worker, doesn't require a config file to run +# theoretically this makes the crawler a scheduler as well, but for now it is a core part -if args.worker_type == "webui": +if args.worker_type == "none" and args.scheduler_type == "none": 
errorlib.pretty_error( - title=f"No flags were provided", - advice=f"---", + title=f"No worker or scheduler was selected to run.", + advice=f"You have to select either worker or a scheduler to run.", ) +if args.worker_type != "none" and args.scheduler_type != "none": + errorlib.pretty_error( + title=f"Both a worker and a scheduler was selected to run.", + advice=f"You may only select either worker or a scheduler to run.", + ) + + +if args.scheduler_type == "webui": + uvicorn.run("webui.main:app", host="0.0.0.0", port=8000) + try: + # todo: dynamically allocate available-scheduler array + if args.scheduler_type == "deep_searcher": + start_deep_searcher() + runtime_config = get_runtime_config() if runtime_config.worker_type == "crawler": start_crawler() @@ -31,6 +48,7 @@ start_embedder() if runtime_config.worker_type == "summarizer": start_summarizer() + except requests.exceptions.ConnectionError: errorlib.pretty_error( title=f"OLLAMA called but not running", diff --git a/schedulers/deepsearcher.py b/schedulers/deepsearcher.py new file mode 100644 index 0000000..1e4f714 --- /dev/null +++ b/schedulers/deepsearcher.py @@ -0,0 +1,93 @@ +# The first automatic scheduler. +# When launched, looks at the available data, picks interesting topics and requests deeper searches about them. 
+from __future__ import annotations + +from langchain_core.output_parsers import StrOutputParser + +from core.chainables.web import structured_extraction_prompt +from core.databases.db_completion_tasks import ( + CompletionTask, + db_get_complete_completion_tasks, + db_add_completion_task, +) +from core.tools.model_loader import load_llm +from core.tools.utils import sleep_noisy, remove_characters + +# get finished tasks +# extract interesting talking points +# repeat until a unique one has been found +# create new tasks out of it + +# we can also make a great use of embedding here +# use very broad embeddings to see which topics align the most with others +# to see how well certain topic is developed so far + +# poc: just get a random summary and dispatch a new one from it +# then: grab a random topic from one of the summaries, and focus on it, +# continuously condensing summaries and requesting them on deeper topics + +# TODO: move all LLM load to summarizer, local for now + +llm = None +output_parser = StrOutputParser() + + +def are_workers_free(): + # todo: check if there are non-busy workers available + return True + + +def get_random_completion() -> CompletionTask | None: + completions_list = db_get_complete_completion_tasks() + if len(completions_list) > 0: + return completions_list[0] + else: + return None + + +def extract_interesting_topics(text: str) -> list[str]: + # todo: separate function for extracting topics and getting one, + # run the extraction one only when lacking topics + # todo: perform semantic grouping of subjects + their context + # fixme: instead of TODO, extracting a single topic as a POC for now + extraction_request = "Find exactly one topic from this text, it must be interesting. Reply in 3 words at most." 
+ + global llm + if llm is None: + llm = load_llm() + extraction_chain = structured_extraction_prompt() | llm | output_parser + + result = extraction_chain.invoke({"data": text, "user_request": extraction_request}) + + return [result] + + +def schedule_new_completion(query: str) -> str: + # fixme: regex remove anything not a-zA-Z + pure_query = remove_characters(query, ['"', "'", ':', '?', '!']) + return db_add_completion_task(pure_query, "info") + + +def start_deep_searcher(): + while True: + # fixme: replace False with free worker checking + if not are_workers_free(): + sleep_noisy(6) + continue + + completion = get_random_completion() + + if not completion: + sleep_noisy(6) + continue + + completion_text = completion.completion_result + topics = extract_interesting_topics(completion_text) + + for topic in topics: + print('dispatching new summaries:', topic) + schedule_new_completion(topic) + + sleep_noisy(6) + print("DBG: shutting down deep_searcher - only 1 loop scheduled") + return diff --git a/webui/frontend/src/app/components/PromptInput.tsx b/webui/frontend/src/app/components/PromptInput.tsx index c1c883f..46b474a 100644 --- a/webui/frontend/src/app/components/PromptInput.tsx +++ b/webui/frontend/src/app/components/PromptInput.tsx @@ -149,6 +149,7 @@ function PromptInput() { +