From 726997d963fefb71ad4c68b4664a210e7189b356 Mon Sep 17 00:00:00 2001
From: Kailas Mahavarkar <66670953+KailasMahavarkar@users.noreply.github.com>
Date: Mon, 20 Apr 2026 16:27:56 +0530
Subject: [PATCH] docs(autoresearch): README reflects /.env + env_key pattern

After #163 (.env + typed env.py) and #164 (env_key pointer in provider
config), the autoresearch README still documented the old "inline
api_key in config.json" shape. Updated:

* Config example now shows env_key pointers (ollama_key, openrouter_key)
  instead of api_key strings
* Added a short preamble explaining that secrets live in /.env and
  config.json is shape-only
* Setup section now tells the user to copy both config.example.json AND
  .env.example, and edit /.env for real keys
* "Add a new model" section points at env.py + .env.example for
  provider-level onboarding
* Short illustrative sketches (env.py shape, transport shim, QA fallback
  loop) added next to the new pointers

The scan of the other docs also turned up stale module paths left over
from the runners/ + transport/ split: benchmarks/framework/README.md,
website/docs/benchmarks/overview.md, and
website/docs/guides/first-memory.md are brought in line in the same
pass.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 benchmarks/framework/README.md      | 50 +++++++++++++++++++++--------
 tools/autoresearch/README.md        | 41 ++++++++++++++++++------
 website/docs/benchmarks/overview.md | 11 +++----
 website/docs/guides/first-memory.md | 42 ++++++++++++++++++------
 4 files changed, 110 insertions(+), 34 deletions(-)

diff --git a/benchmarks/framework/README.md b/benchmarks/framework/README.md
index 6f9fce5..ca20d4d 100644
--- a/benchmarks/framework/README.md
+++ b/benchmarks/framework/README.md
@@ -6,9 +6,9 @@ Benchmark GraphStore retrieval quality on three standardized datasets.
 
 | Benchmark | Protocol | Scoring | Runner |
 |---|---|---|---|
-| **LongMemEval** | Per-record: reset - ingest haystack - query - score | Accuracy, R@K, LLM judge | `runner.py` / `run_longmemeval.py` |
-| **LoCoMo** | Per-conversation: ingest once - query all QAs | Token-level F1 (Porter stemming) | `run_locomo.py` |
-| **BEAM** | Per-chat: ingest chunks - answer probing questions | External BEAM evaluator | `run_beam.py` |
+| **LongMemEval** | Per-record: reset - ingest haystack - query - score | Accuracy, R@K, LLM judge | `runners/runner.py` + `runners/longmemeval.py` |
+| **LoCoMo** | Per-conversation: ingest once - query all QAs | Token-level F1 + LLM judge | `runners/locomo.py` |
+| **BEAM** | Per-chat: ingest chunks - answer probing questions | External BEAM evaluator | `runners/beam.py` |
 
 ## Quickstart
 
@@ -78,27 +78,47 @@ cost ingest_tokens, query_tokens
 ```
 
 ```
 framework/
   cli.py                       # Unified CLI (all 3 benchmarks)
-  runner.py                    # Generic per-record runner (LongMemEval)
-  run_locomo.py                # LoCoMo protocol (ingest-once, F1 + LLM judge)
-  run_beam.py                  # BEAM protocol (chunk + answer generation)
-  run_longmemeval.py           # LongMemEval native runner (NDCG, per-type)
-  adapter.py                   # MemoryAdapter protocol
-  adapters/graphstore_.py      # Native-DSL adapter (5-signal REMEMBER)
-  adapters/graphstore_skill.py # Skill-based ingest adapter (LLM-planned DSL)
   datasets.py                  # Dataset loaders (longmemeval, locomo)
   metrics.py                   # Quality, latency, memory metrics
   report.py                    # JSON, CSV, Markdown output
   entity_extraction.py         # NER for graph enrichment (used by graphstore_.py)
-  ratchet_recall.py            # LoCoMo evidence-recall metrics
-  ratchet_test.py              # Ratchet test harness (50Q random 10/cat)
-  llm_runner.py                # Shared LLM transport: rate-limit + retry + fallback
-  llm_client.py                # LoCoMo reader/judge wrappers (delegates to llm_runner)
-  llm_judge.py                 # LongMemEval per-category judge prompts
   docker_runner.py             # Docker entry point
   Dockerfile.bench             # CPU container
   Dockerfile.bench.gpu         # GPU container
+
+  runners/
+    runner.py                  # Generic per-record runner (LongMemEval)
+    locomo.py                  # LoCoMo protocol (ingest-once, F1 + LLM judge)
+    beam.py                    # BEAM protocol (chunk + answer generation)
+    longmemeval.py             # LongMemEval native runner (NDCG, per-type)
+    ratchet_recall.py          # LoCoMo evidence-recall metrics
+    ratchet_test.py            # Ratchet test harness (50Q random 10/cat)
+
+  transport/
+    llm_runner.py              # Thin re-export of graphstore.llm_runner
+    llm_client.py              # LoCoMo reader/judge wrappers (delegates to llm_runner)
+    llm_judge.py               # LongMemEval per-category judge prompts
+
+  adapters/
+    base.py                    # MemoryAdapter protocol + shared types
+    graphstore_.py             # Native-DSL adapter (5-signal REMEMBER)
+    graphstore_skill.py        # Skill-based ingest adapter (LLM-planned DSL)
 ```
+The canonical LLM transport lives in `src/graphstore/llm_runner.py`. The
+`transport/` re-export keeps bench-side imports stable. Provider-chain and
+config.json parsing live in `tools/autoresearch/providers.py`; secrets
+come from `/.env` via `/env.py`.
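+
+A minimal sketch of the transport shim (illustrative; the real module may
+re-export explicit names rather than `*`):
+
+```python
+# benchmarks/framework/transport/llm_runner.py (sketch)
+# Alias the canonical transport so bench-side imports stay stable.
+from graphstore.llm_runner import *  # noqa: F401,F403
+```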
+
 
 ## Docker
 
diff --git a/tools/autoresearch/README.md b/tools/autoresearch/README.md
index 1b79c72..3429473 100644
--- a/tools/autoresearch/README.md
+++ b/tools/autoresearch/README.md
@@ -159,6 +159,11 @@ unverified candidate.
 Two-level hierarchy: **providers** own connection settings, **models** are
 leaves under them. A single API key per provider is reused by all its models.
 
+Secrets never live in `config.json`. Each provider declares an `env_key`
+that points at a field on the typed `ENV` object in `/env.py`; values
+come from `/.env` (gitignored) or the shell. See `/.env.example` for the
+full list of variables.
+
 ```json
 {
   "active_provider": "local_ollama",
@@ -167,7 +172,7 @@ leaves under them. A single API key per provider is reused by all its models.
   "providers": {
     "local_ollama": {
       "base_url": "http://localhost:11434",
-      "api_key": "...",
+      "env_key": "ollama_key",
       "is_local": true,
       "litellm_prefix": "ollama_chat",
       "models": {
@@ -181,7 +186,7 @@ leaves under them. A single API key per provider is reused by all its models.
     },
     "openrouter": {
       "base_url": "https://openrouter.ai/api/v1",
-      "api_key": "sk-or-v1-...",
+      "env_key": "openrouter_key",
       "is_local": false,
       "litellm_prefix": "openrouter",
       "models": {
@@ -211,9 +216,13 @@ auto-migrated in `migrate_config()` at load time.
 ### Setup
 
 ```bash
-# Copy and fill in API keys (config.json is gitignored)
-cp autoresearch/config.example.json autoresearch/config.json
-# edit autoresearch/config.json and add real api_key values
+# Copy the shape-only config template (config.json is gitignored):
+cp tools/autoresearch/config.example.json tools/autoresearch/config.json
+
+# Put real API keys in /.env at the repo root. config.json never holds
+# secrets; it only declares env_key pointers.
+cp .env.example .env
+# edit /.env and set OPENROUTER_API_KEY + OLLAMA_API_KEY
 
 # Run a single loop
 python -m tools.autoresearch.run_loop --algo spreading --iterations 18
@@ -323,7 +332,27 @@ change), and **target-function-only AST comparison** for the others.
 3. Test it: `python -m tools.autoresearch.run_loop --algo X --iterations 1 --model <model-id>`
 
 If the model is on a new provider (e.g., anthropic), add a new top-level
-provider entry with its `base_url`, `api_key`, and `litellm_prefix`.
+provider entry with its `base_url`, `env_key`, and `litellm_prefix`.
+Declare the matching field on `_Env` in `/env.py` and add the raw env
+var to `/.env.example`.
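+
+For illustration, a minimal `/env.py` shape consistent with the fields
+above (the real file may load `/.env` differently; `anthropic_key` is a
+hypothetical example):
+
+```python
+# /env.py (sketch; .env loading elided, values may come from the shell)
+import os
+from dataclasses import dataclass
+
+@dataclass(frozen=True)
+class _Env:
+    ollama_key: str = os.getenv("OLLAMA_API_KEY", "")
+    openrouter_key: str = os.getenv("OPENROUTER_API_KEY", "")
+    anthropic_key: str = os.getenv("ANTHROPIC_API_KEY", "")  # hypothetical
+
+ENV = _Env()
+```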
 
 ## Known limitations
 
diff --git a/website/docs/benchmarks/overview.md b/website/docs/benchmarks/overview.md
index 7323ef3..fdf5af1 100644
--- a/website/docs/benchmarks/overview.md
+++ b/website/docs/benchmarks/overview.md
@@ -93,14 +93,15 @@ set_cache_dir('/tmp/gs_models')
 install_embedder('jina-v5-small-retrieval')
 "
 
-# Run full benchmark (requires LLM - set QA_MODEL in benchmarks/framework/llm_client.py)
-python -m benchmarks.framework.run_locomo \
+# Run full benchmark (requires LLM - set OPENROUTER_API_KEY/OLLAMA_API_KEY in /.env;
+# QA_MODEL_PRIORITY lives in src/graphstore/llm_runner.py)
+python -m benchmarks.framework.runners.locomo \
   --data-path /tmp/locomo \
   --embedder installed:jina-v5-small-retrieval \
   --k 10
 
 # Run direct evidence recall test (no LLM needed)
-python -m benchmarks.framework.ratchet_recall
+python -m benchmarks.framework.runners.ratchet_recall
 ```
 
 For a walkthrough of ingestion and querying on a single LoCoMo conversation, see [First memory](../guides/first-memory).
@@ -131,7 +132,7 @@ Disk numbers at **100k nodes**, in-memory at **10k nodes** (disk WAL sync dominates).
 
 ## BEAM
 
-graphstore includes a benchmark-side BEAM answer-generation runner at `benchmarks/framework/run_beam.py`. Keeps graphstore core untouched and emits BEAM-compatible answer JSON so BEAM's own evaluator can score it.
+graphstore includes a benchmark-side BEAM answer-generation runner at `benchmarks/framework/runners/beam.py`. Keeps graphstore core untouched and emits BEAM-compatible answer JSON so BEAM's own evaluator can score it.
 
 ### Workflow
 
@@ -143,7 +144,7 @@ ### Workflow
 
 ```bash
 # Generate answers for BEAM 100K chats 1..2
-uv run python3 -m benchmarks.framework.run_beam \
+uv run python3 -m benchmarks.framework.runners.beam \
   --beam-root /tmp/BEAM \
   --chat-size 100K \
   --start-index 1 \
diff --git a/website/docs/guides/first-memory.md b/website/docs/guides/first-memory.md
index cf0c1f7..8a8cdb5 100644
--- a/website/docs/guides/first-memory.md
+++ b/website/docs/guides/first-memory.md
@@ -141,12 +141,12 @@ uv run python3 -m benchmarks.framework.ratchet_recall
 
 ## Full LoCoMo with LLM
 
-Set the model in `benchmarks/framework/llm_client.py`:
-
-```python
-QA_MODEL = "minimax/minimax-m2.7:nitro"  # OpenRouter paid
-QA_MODEL_OR = "minimax/minimax-m2.7:nitro"
-```
+1. Set API keys in `/.env` at the repo root (see `/.env.example`). The
+   shared transport reads `OPENROUTER_API_KEY` + `OLLAMA_API_KEY`.
+2. The preferred QA model is declared in
+   `src/graphstore/llm_runner.py` as `QA_MODEL_PRIORITY`. Edit that list
+   to swap models. Default: `gemma4:31b-cloud` (Ollama) with
+   `google/gemma-4-31b-it` (OpenRouter) as fallback.
 
 Run:
 
@@ -155,7 +155,7 @@ uv run python3 -c "
 import os
 os.environ['GRAPHSTORE_MODEL_CACHE_DIR'] = '/tmp/gs_models'
-from benchmarks.framework.run_locomo import run_locomo
+from benchmarks.framework.runners.locomo import run_locomo
 from benchmarks.framework.datasets import load_locomo
 from benchmarks.framework.adapters.graphstore_ import GraphStoreAdapter
 
@@ -173,8 +173,34 @@ print(f'Overall F1: {summary[\"overall_f1\"]:.4f}')
 "
 
 # Full 1986Q (~$0.40 on MiniMax nitro)
-uv run python3 -m benchmarks.framework.run_locomo \
+uv run python3 -m benchmarks.framework.runners.locomo \
   --data-path /tmp/locomo \
   --embedder installed:jina-v5-small-retrieval \
   --k 10
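 ```
+
+`QA_MODEL_PRIORITY` is consumed roughly like this (illustrative sketch,
+assuming the litellm client implied by the provider config; the real
+runner adds rate-limit handling and retries, and exact names may differ):
+
+```python
+import litellm  # transport dependency used by the framework
+
+QA_MODEL_PRIORITY = [
+    "ollama_chat/gemma4:31b-cloud",      # preferred
+    "openrouter/google/gemma-4-31b-it",  # fallback
+]
+
+def qa_complete(prompt: str) -> str:
+    last_err = None
+    for model in QA_MODEL_PRIORITY:
+        try:
+            resp = litellm.completion(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+            )
+            return resp.choices[0].message.content
+        except Exception as err:  # rate limit, auth failure, provider down
+            last_err = err
+    raise RuntimeError("all QA models failed") from last_err
+```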