diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
index ae040ad..6bc4398 100644
--- a/.github/workflows/black.yml
+++ b/.github/workflows/black.yml
@@ -24,7 +24,3 @@ jobs:
uses: cytopia/docker-black@0.8
with:
path: 'workers/embedder.py'
- - name: Python Black (scheduler)
- uses: cytopia/docker-black@0.8
- with:
- path: 'workers/scheduler.py'
diff --git a/LICENSE b/LICENSE
index 261eeb9..a4ed920 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright [yyyy] [name of copyright owner]
+ Copyright 2024 Research Chain Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
index 9e6d120..fac5a8d 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,27 @@ Frontend is launched separately to back end, run the following command to start
- `environment.yml` is the linux env, but for macOS (silicon) and windows there are other available
- Apple intel is not supported anymore, but you can still get it working by manually installing
any missing package that comes up during the program execution.
+- `pull access denied for X` error: The connection may occasionally get throttled, resulting in this error.
+ To solve this issue, let all the current downloads finish downloading, and restart the program.
+ Repeat until every file is downloaded.
+
+#### Running locally
+
+If you intend on running this project locally, whether for development, debugging or performance purposes,
+you'll still need to have these three docker containers launched somewhere in the background: `ollama`, `postgres` and `rabbitmq`.
+
+Here's a full set of commands on how to initialize the conda environment and run them all at once:
+
+- `conda env create -f environment.yml`
+- `sudo docker-compose -f docker/docker-compose.yml up ollama postgres rabbitmq`
+
+The rest of the workers and services can now be launched directly via `main.py`:
+
+- `conda activate ResearchChain`
+- `python main.py -w crawler`
+
+For a list of all available workers and services, see:
+- `python main.py -h`
### This is a monorepo for both a tool, and apps for it:
diff --git a/configs/none.json b/configs/none.json
new file mode 100644
index 0000000..0001f50
--- /dev/null
+++ b/configs/none.json
@@ -0,0 +1,5 @@
+{
+ "worker_type": "none",
+ "llm_config_name": "none",
+ "embedder_config_name": "none"
+}
\ No newline at end of file
diff --git a/configurator.py b/configurator.py
index c039b10..1892b85 100644
--- a/configurator.py
+++ b/configurator.py
@@ -23,7 +23,6 @@
type=str,
dest="worker_type",
choices=[
- "webui",
"crawler",
"embedder",
"summarizer",
@@ -31,6 +30,18 @@
default="none",
help="Select one of the ready worker configs to be used",
)
+parser.add_argument(
+ "-s",
+ "--run-scheduler",
+ type=str,
+ dest="scheduler_type",
+ choices=[
+ "webui",
+ "deep_searcher",
+ ],
+ default="none",
+ help="Select one of the available schedulers",
+)
parser.add_argument(
"-c",
"--custom-worker-path",
@@ -78,11 +89,11 @@
def get_runtime_config():
global runtime_config
- fallback_config_path = "configs/crawler.json"
+ empty_config_path = "configs/none.json"
- if args.worker_type == "webui":
+ if args.worker_type == "none":
# fixme: this is a workaround, webui should be started from it's folder
- return load_runtime_config_from_file(fallback_config_path)
+ return load_runtime_config_from_file(empty_config_path)
# fetch cache
if runtime_config:
diff --git a/core/chainables/web.py b/core/chainables/web.py
index 8489f20..73c9527 100644
--- a/core/chainables/web.py
+++ b/core/chainables/web.py
@@ -77,3 +77,35 @@ def web_news_lookup_prompt():
),
]
)
+
+
+def basic_query_prompt():
+ return ChatPromptTemplate.from_messages(
+ [
+ (
+ "system",
+ "You are a personal assistant. "
+ "Your job is to respond to the requests given to you by the user. "
+ "You are to follow the requests given precisely and intelligently. "
+ "Answer or complete the request to the best of your abilities. ",
+ ),
+ ("user", "{user_request}"),
+ ]
+ )
+
+
+# some schedulers may require data-extraction capabilities outside data-gathering
+def structured_extraction_prompt():
+ return ChatPromptTemplate.from_messages(
+ [
+ (
+ "system",
+ "You are a data extraction and analysis specialist. "
+ "Your job is to respond in a structured way to the question you were given. "
+ "You are to follow the orders given precisely and intelligently. "
+ "You are provided with a data chunk, use it to fulfill the user's request. "
+ "Satisfy the requested task to the best of your abilities. ",
+ ),
+ ("user", "Data: ```{data}``` User request: '{user_request}'"),
+ ]
+ )
diff --git a/core/databases/db_completion_tasks.py b/core/databases/db_completion_tasks.py
index e408b3e..6993c0b 100644
--- a/core/databases/db_completion_tasks.py
+++ b/core/databases/db_completion_tasks.py
@@ -1,4 +1,3 @@
-from typing import Optional
from sqlalchemy import String, TEXT, Integer, Boolean, select, update
from sqlalchemy.orm import Mapped, mapped_column, Session, relationship
@@ -106,6 +105,20 @@ def db_get_incomplete_completion_tasks(amount: int = 1):
return results
+def db_get_complete_completion_tasks(amount: int = 1):
+ with Session(engine) as session:
+ session.expire_on_commit = False
+
+ query = (
+ select(CompletionTask).where(CompletionTask.completed == True).limit(amount)
+ )
+
+ results = list(session.scalars(query).all())
+ session.expunge_all()
+
+ return results
+
+
def db_release_executing_tasks(uuid_list: list[str]):
with Session(engine) as session:
session.execute(
@@ -149,3 +162,21 @@ def db_update_completion_task_after_summarizing(summary: str, uuid: str):
)
session.commit()
+
+
+def db_refresh_completion_tasks(timeout_seconds: int = 600):
+ # find completion tasks with timed-out execution and restart them to the awaiting state
+ # timing out after 600 seconds = 10 minutes by default
+ timeout_date = utils.gen_unix_time() - timeout_seconds
+ with Session(engine) as session:
+ session.execute(
+ update(CompletionTask)
+ .where(CompletionTask.executing == True)
+ .where(CompletionTask.completed == False)
+ .where(CompletionTask.execution_date < timeout_date)
+ .values(
+ executing=False,
+ )
+ )
+
+ session.commit()
diff --git a/core/databases/db_crawl_tasks.py b/core/databases/db_crawl_tasks.py
index 320efa5..a83b14f 100644
--- a/core/databases/db_crawl_tasks.py
+++ b/core/databases/db_crawl_tasks.py
@@ -225,3 +225,21 @@ def db_increment_task_embedding_progression(uuid: str, model_name: str):
)
session.commit()
+
+
+def db_refresh_crawl_tasks(timeout_seconds: int = 600):
+ # find crawl tasks with timed-out execution and restart them to the awaiting state
+ # timing out after 600 seconds = 10 minutes by default
+ timeout_date = utils.gen_unix_time() - timeout_seconds
+ with Session(engine) as session:
+ session.execute(
+ update(CrawlTask)
+ .where(CrawlTask.executing == True)
+ .where(CrawlTask.completed == False)
+ .where(CrawlTask.execution_date < timeout_date)
+ .values(
+ executing=False,
+ )
+ )
+
+ session.commit()
diff --git a/core/tools/model_loader.py b/core/tools/model_loader.py
index 33370f7..0c88ed3 100644
--- a/core/tools/model_loader.py
+++ b/core/tools/model_loader.py
@@ -3,6 +3,7 @@
from langchain_community.llms.ollama import Ollama
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
+from langchain_experimental.llms.ollama_functions import OllamaFunctions
from configurator import get_runtime_config
from core.tools import errorlib
@@ -11,6 +12,7 @@
llm_config = runtime_configuration.llm_config
embedder_config = runtime_configuration.embedder_config
+
# problem with the current caching: we have to share those singletons across instances
# fixme: n_gpu_layers=-1 is a poor approach, it can and will cause crashes.
@@ -27,6 +29,11 @@ def load_ollama_llm() -> Ollama:
return llm
+def load_ollama_functional_llm() -> OllamaFunctions:
+ # todo: As far as I see OllamaFunctions could be used by default, but old code has to be adapted
+ return OllamaFunctions(model=llm_config.model_name, base_url="http://ollama:11434")
+
+
def load_ollama_embedder() -> OllamaEmbeddings:
cached_embedder = runtime_configuration.embedder_object
if cached_embedder:
@@ -89,6 +96,25 @@ def load_llm():
return load_ollama_llm()
+def load_functional_llm():
+ # EXPERIMENTAL
+ if llm_config is None:
+ errorlib.pretty_error(
+ title="Tried loading functional LLM without a valid configuration",
+ advice=f"Your worker configuration file is likely missing "
+ f"a valid {Fore.CYAN}llm_config_name{Fore.RESET} variable",
+ )
+
+ if llm_config.supplier == "hugging_face":
+ errorlib.pretty_error(
+ title="Tried running functional model with a HF configuration.",
+ advice=f"Functional models are not yet supported with llama.cpp loaders. "
+ f"Please switch to {Fore.CYAN}Ollama{Fore.RESET} or stop using functional models.",
+ )
+ else:
+ return load_ollama_functional_llm()
+
+
def load_embedder():
if embedder_config is None:
errorlib.pretty_error(
diff --git a/docker/deep_searcher/Dockerfile b/docker/deep_searcher/Dockerfile
new file mode 100644
index 0000000..c0290bd
--- /dev/null
+++ b/docker/deep_searcher/Dockerfile
@@ -0,0 +1,16 @@
+FROM condaforge/miniforge3
+
+WORKDIR /app
+
+COPY . /app
+
+RUN apt-get update && apt-get install -y \
+ build-essential \
+ g++ \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN conda env create -f environment.yml
+
+SHELL ["conda", "run", "-n", "ResearchChain", "/bin/bash", "-c"]
+
+ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "ResearchChain", "python3", "main.py", "-s", "deep_searcher"]
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index e2861ff..c7c7895 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -26,6 +26,8 @@ services:
- app-network
pgadmin:
+ profiles:
+ - pgadmin
image: dpage/pgadmin4
depends_on:
- postgres
@@ -39,8 +41,6 @@ services:
ollama:
image: ollama/ollama
- expose:
- - 11434
ports:
- 11434:11434
pull_policy: always
@@ -113,6 +113,8 @@ services:
- app-network
frontend:
+ profiles:
+ - frontend
image: frontend
depends_on:
- postgres
@@ -125,6 +127,24 @@ services:
networks:
- app-network
+ deep_searcher:
+ profiles:
+ - deep_searcher
+ image: deep_searcher
+ depends_on:
+ postgres:
+ condition: service_started
+ rabbitmq:
+ condition: service_healthy
+ ollama:
+ condition: service_started
+ build:
+ context: ../.
+ dockerfile: ./docker/deep_searcher/Dockerfile
+ networks:
+ - app-network
+ - ollama-network
+
volumes:
ollama:
pgdata:
diff --git a/docker/webui/Dockerfile b/docker/webui/Dockerfile
index 8c4a0cb..cd07c01 100644
--- a/docker/webui/Dockerfile
+++ b/docker/webui/Dockerfile
@@ -16,4 +16,4 @@ EXPOSE 8000
SHELL ["conda", "run", "-n", "ResearchChain", "/bin/bash", "-c"]
-ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "ResearchChain", "python3", "main.py", "-w", "webui"]
+ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "ResearchChain", "python3", "main.py", "-s", "webui"]
diff --git a/environment.yml b/environment.yml
index a28ae00..b11ec2f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,27 +3,46 @@ channels:
- conda-forge
- defaults
dependencies:
+ - _libgcc_mutex=0.1
+ - _openmp_mutex=4.5
- beautifulsoup4=4.12.2
- brotli-python=1.0.9
- bzip2=1.0.8
- ca-certificates=2024.3.11
- certifi=2024.6.2
+ - cffi=1.16.0
+ - charset-normalizer=3.3.2
- colorama=0.4.6
+ - cudatoolkit=11.8.0
- filelock=3.13.1
- fsspec=2023.10.0
- googlesearch=3.0.0
+ - h2=4.1.0
+ - hpack=4.0.0
- huggingface_hub=0.20.3
+ - hyperframe=6.0.1
+ - idna=3.7
+ - keyutils=1.6.1
- krb5=1.20.1
+ - ld_impl_linux-64=2.40
- libblas=3.9.0
- libcxx=8.0.0
+ - libcxxabi=8.0.0
- libedit=3.1.20230828
- libfaiss=1.7.4
- libffi=3.4.2
+ - libgcc-ng=14.1.0
+ - libgfortran-ng=11.3.0
- libgfortran5=11.3.0
+ - libgomp=14.1.0
- liblapack=3.9.0
+ - libnsl=2.0.1
- libopenblas=0.3.23
- libpq=12.17
- libsqlite=3.45.3
+ - libstdcxx-ng=14.1.0
+ - libuuid=2.38.1
+ - libxcrypt=4.4.36
- libzlib=1.2.13
- llvm-openmp=18.1.3
- ncurses=6.4.20240210
@@ -31,6 +50,7 @@ dependencies:
- packaging=23.2
- pip=24.0
- psycopg2=2.9.9
+ - pycparser=2.22
- pysocks=1.7.1
- python=3.9.19
- python_abi=3.9
@@ -42,12 +62,13 @@ dependencies:
- sqlite=3.45.3
- tk=8.6.13
- tqdm=4.65.0
- - typing_extensions=4.9.0
- tzdata=2024a
- wheel=0.43.0
- xz=5.2.6
- yaml=0.2.5
- zlib=1.2.13
+ - zstandard=0.23.0
+ - zstd=1.5.6
- pip:
- aiohttp==3.9.5
- aiosignal==1.3.1
@@ -57,7 +78,6 @@ dependencies:
- attrs==23.2.0
- black==24.4.2
- chardet==5.2.0
- - charset-normalizer==3.3.2
- click==8.1.7
- dataclasses-json==0.6.4
- diskcache==5.6.3
@@ -68,19 +88,20 @@ dependencies:
- fastapi==0.111.0
- fastapi-cli==0.0.4
- frozenlist==1.4.1
+ - greenlet==3.0.3
- h11==0.14.0
- httpcore==1.0.5
- httptools==0.6.1
- httpx==0.27.0
- - idna==3.7
- jinja2==3.1.3
- jsonpatch==1.33
- jsonpointer==2.4
- - langchain==0.1.17
- - langchain-community==0.0.36
- - langchain-core==0.1.50
- - langchain-text-splitters==0.0.1
- - langsmith==0.1.48
+ - langchain==0.2.8
+ - langchain-community==0.2.7
+ - langchain-core==0.2.19
+ - langchain-experimental==0.0.62
+ - langchain-text-splitters==0.2.2
+ - langsmith==0.1.85
- llama-cpp-python==0.2.61
- markdown-it-py==3.0.0
- markupsafe==2.1.5
diff --git a/main.py b/main.py
index cc24e22..5172ad6 100644
--- a/main.py
+++ b/main.py
@@ -6,6 +6,7 @@
from configurator import get_runtime_config, args
from core.databases import db_base
from core.tools import errorlib
+from schedulers.deepsearcher import start_deep_searcher
from workers.crawler import start_crawler
from workers.embedder import start_embedder
from workers.summarizer import start_summarizer
@@ -13,17 +14,33 @@
colorama_init()
db_base.db_init()
-if args.worker_type == "webui":
- # fixme: this is a workaround, webui should be started from it's folder
- uvicorn.run("webui.main:app", host="0.0.0.0", port=8000)
+# todo: change deep_searcher into a scheduler instead of worker, -s flag
+# this is because workers use cpu, gpu and memory resources in a distributed way,
+# while a scheduler only does lightweight management without any significant load,
+# and most importantly contrary to worker, doesn't require a config file to run
+# theoretically this makes the crawler a scheduler as well, but for now it is a core part
-if args.worker_type == "webui":
+if args.worker_type == "none" and args.scheduler_type == "none":
errorlib.pretty_error(
- title=f"No flags were provided",
- advice=f"---",
+ title=f"No worker or scheduler was selected to run.",
+ advice=f"You have to select either a worker or a scheduler to run.",
)
+if args.worker_type != "none" and args.scheduler_type != "none":
+ errorlib.pretty_error(
+ title=f"Both a worker and a scheduler were selected to run.",
+ advice=f"You may only select either a worker or a scheduler to run.",
+ )
+
+
+if args.scheduler_type == "webui":
+ uvicorn.run("webui.main:app", host="0.0.0.0", port=8000)
+
try:
+ # todo: dynamically allocate available-scheduler array
+ if args.scheduler_type == "deep_searcher":
+ start_deep_searcher()
+
runtime_config = get_runtime_config()
if runtime_config.worker_type == "crawler":
start_crawler()
@@ -31,6 +48,7 @@
start_embedder()
if runtime_config.worker_type == "summarizer":
start_summarizer()
+
except requests.exceptions.ConnectionError:
errorlib.pretty_error(
title=f"OLLAMA called but not running",
diff --git a/schedulers/deepsearcher.py b/schedulers/deepsearcher.py
new file mode 100644
index 0000000..1e4f714
--- /dev/null
+++ b/schedulers/deepsearcher.py
@@ -0,0 +1,93 @@
+# The first automatic scheduler.
+# When launched, looks at the available data, picks interesting topics and requests deeper searches about them.
+from __future__ import annotations
+
+from langchain_core.output_parsers import StrOutputParser
+
+from core.chainables.web import structured_extraction_prompt
+from core.databases.db_completion_tasks import (
+ CompletionTask,
+ db_get_complete_completion_tasks,
+ db_add_completion_task,
+)
+from core.tools.model_loader import load_llm
+from core.tools.utils import sleep_noisy, remove_characters
+
+# get finished tasks
+# extract interesting talking points
+# repeat until a unique one has been found
+# create new tasks out of it
+
+# we can also make a great use of embedding here
+# use very broad embeddings to see which topics align the most with others
+# to see how well certain topic is developed so far
+
+# poc: just get a random summary and dispatch a new one from it
+# then: grab a random topic from one of the summaries, and focus on it,
+# continuously condensing summaries and requesting them on deeper topics
+
+# TODO: move all LLM load to summarizer, local for now
+
+llm = None
+output_parser = StrOutputParser()
+
+
+def are_workers_free():
+ # todo: check if there are non-busy workers available
+ return True
+
+
+def get_random_completion() -> CompletionTask | None:
+ completions_list = db_get_complete_completion_tasks()
+ if len(completions_list) > 0:
+ return completions_list[0]
+ else:
+ return None
+
+
+def extract_interesting_topics(text: str) -> list[str]:
+ # todo: separate function for extracting topics and getting one,
+ # run the extraction one only when lacking topics
+ # todo: perform semantic grouping of subjects + their context
+ # fixme: instead of TODO, extracting a single topic as a POC for now
+ extraction_request = "Find exactly one topic from this text, it must be interesting. Reply in 3 words at most."
+
+ global llm
+ if llm is None:
+ llm = load_llm()
+ extraction_chain = structured_extraction_prompt() | llm | output_parser
+
+ result = extraction_chain.invoke({"data": text, "user_request": extraction_request})
+
+ return [result]
+
+
+def schedule_new_completion(query: str) -> str:
+ # fixme: regex remove anything not a-zA-Z
+ pure_query = remove_characters(query, ['"', "'", ':', '?', '!'])
+ return db_add_completion_task(pure_query, "info")
+
+
+def start_deep_searcher():
+ while True:
+ # fixme: are_workers_free() is a stub — implement real free-worker checking
+ if not are_workers_free():
+ sleep_noisy(6)
+ continue
+
+ completion = get_random_completion()
+
+ if not completion:
+ sleep_noisy(6)
+ continue
+
+ completion_text = completion.completion_result
+ topics = extract_interesting_topics(completion_text)
+
+ for topic in topics:
+ print('dispatching new summaries:', topic)
+ schedule_new_completion(topic)
+
+ sleep_noisy(6)
+ print("DBG: shutting down deep_searcher - only 1 loop scheduled")
+ return
diff --git a/webui/frontend/src/app/components/PromptInput.tsx b/webui/frontend/src/app/components/PromptInput.tsx
index c1c883f..46b474a 100644
--- a/webui/frontend/src/app/components/PromptInput.tsx
+++ b/webui/frontend/src/app/components/PromptInput.tsx
@@ -149,6 +149,7 @@ function PromptInput() {