Merged
41 changes: 26 additions & 15 deletions benchmarks/framework/README.md
@@ -6,9 +6,9 @@ Benchmark GraphStore retrieval quality on three standardized datasets.

| Benchmark | Protocol | Scoring | Runner |
|---|---|---|---|
| **LongMemEval** | Per-record: reset - ingest haystack - query - score | Accuracy, R@K, LLM judge | `runner.py` / `run_longmemeval.py` |
| **LoCoMo** | Per-conversation: ingest once - query all QAs | Token-level F1 (Porter stemming) | `run_locomo.py` |
| **BEAM** | Per-chat: ingest chunks - answer probing questions | External BEAM evaluator | `run_beam.py` |
| **LongMemEval** | Per-record: reset - ingest haystack - query - score | Accuracy, R@K, LLM judge | `runners/runner.py` + `runners/longmemeval.py` |
| **LoCoMo** | Per-conversation: ingest once - query all QAs | Token-level F1 + LLM judge | `runners/locomo.py` |
| **BEAM** | Per-chat: ingest chunks - answer probing questions | External BEAM evaluator | `runners/beam.py` |

## Quickstart

@@ -78,27 +78,38 @@ cost ingest_tokens, query_tokens
```
framework/
cli.py # Unified CLI (all 3 benchmarks)
runner.py # Generic per-record runner (LongMemEval)
run_locomo.py # LoCoMo protocol (ingest-once, F1 + LLM judge)
run_beam.py # BEAM protocol (chunk + answer generation)
run_longmemeval.py # LongMemEval native runner (NDCG, per-type)
adapter.py # MemoryAdapter protocol
adapters/graphstore_.py # Native-DSL adapter (5-signal REMEMBER)
adapters/graphstore_skill.py # Skill-based ingest adapter (LLM-planned DSL)
datasets.py # Dataset loaders (longmemeval, locomo)
metrics.py # Quality, latency, memory metrics
report.py # JSON, CSV, Markdown output
entity_extraction.py # NER for graph enrichment (used by graphstore_.py)
ratchet_recall.py # LoCoMo evidence-recall metrics
ratchet_test.py # Ratchet test harness (50Q random 10/cat)
llm_runner.py # Shared LLM transport: rate-limit + retry + fallback
llm_client.py # LoCoMo reader/judge wrappers (delegates to llm_runner)
llm_judge.py # LongMemEval per-category judge prompts
docker_runner.py # Docker entry point
Dockerfile.bench # CPU container
Dockerfile.bench.gpu # GPU container

runners/
runner.py # Generic per-record runner (LongMemEval)
locomo.py # LoCoMo protocol (ingest-once, F1 + LLM judge)
beam.py # BEAM protocol (chunk + answer generation)
longmemeval.py # LongMemEval native runner (NDCG, per-type)
ratchet_recall.py # LoCoMo evidence-recall metrics
ratchet_test.py # Ratchet test harness (50Q random 10/cat)

transport/
llm_runner.py # Thin re-export of graphstore.llm_runner
llm_client.py # LoCoMo reader/judge wrappers (delegates to runner)
llm_judge.py # LongMemEval per-category judge prompts

adapters/
base.py # MemoryAdapter protocol + shared types
graphstore_.py # Native-DSL adapter (5-signal REMEMBER)
graphstore_skill.py # Skill-based ingest adapter (LLM-planned DSL)
```
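The runners drive storage backends through a small adapter surface. A rough sketch of that surface, assuming method names inferred from the protocol descriptions above rather than copied from the real `adapters/base.py`:

```python
from typing import Protocol, runtime_checkable

# Hypothetical sketch of the MemoryAdapter surface; the real base.py
# may use different names and extra shared types.
@runtime_checkable
class MemoryAdapter(Protocol):
    def reset(self) -> None: ...  # wipe state between records (LongMemEval)
    def ingest(self, text: str) -> None: ...  # add a session/chunk to memory
    def query(self, question: str, k: int = 10) -> list[str]: ...  # top-k retrieval
```

Any class implementing these three methods can be plugged into `runners/runner.py` without subclassing, which is how `graphstore_.py` and `graphstore_skill.py` coexist.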

The canonical LLM transport lives in `src/graphstore/llm_runner.py`; the
`transport/` modules are thin re-exports that keep bench-side imports
stable. The provider chain and `config.json` parsing live in
`tools/autoresearch/providers.py`; secrets come from `/.env` via `/env.py`.

## Docker

```bash
23 changes: 17 additions & 6 deletions tools/autoresearch/README.md
@@ -159,6 +159,11 @@ unverified candidate.
Two-level hierarchy: **providers** own connection settings, **models** are
leaves under them. A single API key per provider is reused by all its models.

Secrets never live in `config.json`. Each provider declares an `env_key`
that points at a field on the typed `ENV` object in `/env.py`; values
come from `/.env` (gitignored) or the shell. See `/.env.example` for the
full list of variables.
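The indirection can be sketched as follows. The field names and the `Env` shape here are assumptions for illustration, not the real `/env.py`:

```python
from dataclasses import dataclass

# Hypothetical sketch of the env_key indirection; the real typed ENV
# object in /env.py has its own field names and loader.
@dataclass(frozen=True)
class Env:
    openrouter_key: str = ""
    ollama_key: str = ""

def resolve_api_key(provider_cfg: dict, env: Env) -> str:
    # env_key names a field on Env, so config.json never holds the secret.
    return getattr(env, provider_cfg["env_key"])
```

So a provider entry with `"env_key": "openrouter_key"` resolves to `env.openrouter_key`, which the loader fills from the corresponding variable in `/.env` or the shell.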

```json
{
"active_provider": "local_ollama",
@@ -167,7 +172,7 @@
"providers": {
"local_ollama": {
"base_url": "http://localhost:11434",
"api_key": "...",
"env_key": "ollama_key",
"is_local": true,
"litellm_prefix": "ollama_chat",
"models": {
@@ -181,7 +186,7 @@
},
"openrouter": {
"base_url": "https://openrouter.ai/api/v1",
"api_key": "sk-or-v1-...",
"env_key": "openrouter_key",
"is_local": false,
"litellm_prefix": "openrouter",
"models": {
@@ -211,9 +216,13 @@ auto-migrated in `migrate_config()` at load time.
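One plausible shape of that migration, assuming a `<provider>_key` naming convention that the real `migrate_config()` may not share:

```python
def migrate_config(cfg: dict) -> dict:
    """Move legacy inline api_key entries to env_key pointers (sketch)."""
    for name, provider in cfg.get("providers", {}).items():
        if "api_key" in provider and "env_key" not in provider:
            # Assumed convention: env_key is "<provider name>_key".
            provider["env_key"] = f"{name}_key"
        # Drop the inline secret either way.
        provider.pop("api_key", None)
    return cfg
```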
### Setup

```bash
# Copy and fill in API keys (config.json is gitignored)
cp autoresearch/config.example.json autoresearch/config.json
# edit autoresearch/config.json and add real api_key values
# Copy the shape-only config template (config.json is gitignored):
cp tools/autoresearch/config.example.json tools/autoresearch/config.json

# Put real API keys in /.env at the repo root. config.json never holds
# secrets; it only declares env_key pointers.
cp .env.example .env
# edit /.env and set OPENROUTER_API_KEY + OLLAMA_API_KEY

# Run a single loop
python -m tools.autoresearch.run_loop --algo spreading --iterations 18
@@ -323,7 +332,9 @@ change), and **target-function-only AST comparison** for the others.
3. Test it: `python -m tools.autoresearch.run_loop --algo X --iterations 1 --model <model>`

If the model is on a new provider (e.g., anthropic), add a new top-level
provider entry with its `base_url`, `api_key`, and `litellm_prefix`.
provider entry with its `base_url`, `env_key`, and `litellm_prefix`.
Declare the matching field on `_Env` in `/env.py` and add the raw env
var to `/.env.example`.

## Known limitations

11 changes: 6 additions & 5 deletions website/docs/benchmarks/overview.md
@@ -93,14 +93,15 @@ set_cache_dir('/tmp/gs_models')
install_embedder('jina-v5-small-retrieval')
"

# Run full benchmark (requires LLM - set QA_MODEL in benchmarks/framework/llm_client.py)
python -m benchmarks.framework.run_locomo \
# Run full benchmark (requires LLM - set OPENROUTER_API_KEY/OLLAMA_API_KEY in /.env;
# QA_MODEL_PRIORITY lives in src/graphstore/llm_runner.py)
python -m benchmarks.framework.runners.locomo \
--data-path /tmp/locomo \
--embedder installed:jina-v5-small-retrieval \
--k 10

# Run direct evidence recall test (no LLM needed)
python -m benchmarks.framework.ratchet_recall
python -m benchmarks.framework.runners.ratchet_recall
```

For a walkthrough of ingestion and querying on a single LoCoMo conversation, see [First memory](../guides/first-memory).
@@ -131,7 +132,7 @@ Disk numbers at **100k nodes**, in-memory at **10k nodes** (disk WAL sync domina

## BEAM

graphstore includes a benchmark-side BEAM answer-generation runner at `benchmarks/framework/run_beam.py`. Keeps graphstore core untouched and emits BEAM-compatible answer JSON so BEAM's own evaluator can score it.
graphstore includes a benchmark-side BEAM answer-generation runner at `benchmarks/framework/runners/beam.py`. It keeps the graphstore core untouched and emits BEAM-compatible answer JSON so BEAM's own evaluator can score it.

### Workflow

@@ -143,7 +144,7 @@

```bash
# Generate answers for BEAM 100K chats 1..2
uv run python3 -m benchmarks.framework.run_beam \
uv run python3 -m benchmarks.framework.runners.beam \
--beam-root /tmp/BEAM \
--chat-size 100K \
--start-index 1 \
16 changes: 8 additions & 8 deletions website/docs/guides/first-memory.md
@@ -141,12 +141,12 @@ uv run python3 -m benchmarks.framework.ratchet_recall

## Full LoCoMo with LLM

Set the model in `benchmarks/framework/llm_client.py`:

```python
QA_MODEL = "minimax/minimax-m2.7:nitro" # OpenRouter paid
QA_MODEL_OR = "minimax/minimax-m2.7:nitro"
```
1. Set API keys in `/.env` at the repo root (see `/.env.example`). The
shared transport reads `OPENROUTER_API_KEY` + `OLLAMA_API_KEY`.
2. The preferred QA model is declared in
`src/graphstore/llm_runner.py` as `QA_MODEL_PRIORITY`. Edit that list
to swap models. Default: `gemma4:31b-cloud` (Ollama) with
`google/gemma-4-31b-it` (OpenRouter) as fallback.
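The priority-list behavior can be sketched roughly like this; the list entries, `ask` signature, and backend callables are illustrative, not the real `llm_runner` API:

```python
# Hypothetical sketch of priority-based model fallback.
QA_MODEL_PRIORITY = [
    ("ollama", "gemma4:31b-cloud"),           # preferred
    ("openrouter", "google/gemma-4-31b-it"),  # fallback
]

def ask(prompt: str, backends: dict) -> str:
    """Try each (provider, model) pair in order until one succeeds."""
    last_err = None
    for provider, model in QA_MODEL_PRIORITY:
        try:
            return backends[provider](model, prompt)
        except Exception as err:  # provider down, rate-limited, etc.
            last_err = err
    raise RuntimeError(f"all models failed: {last_err}")
```

Reordering or extending `QA_MODEL_PRIORITY` is all that is needed to swap models; no call sites change.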

Run:

@@ -155,7 +155,7 @@
uv run python3 -c "
import os
os.environ['GRAPHSTORE_MODEL_CACHE_DIR'] = '/tmp/gs_models'
from benchmarks.framework.run_locomo import run_locomo
from benchmarks.framework.runners.locomo import run_locomo
from benchmarks.framework.datasets import load_locomo
from benchmarks.framework.adapters.graphstore_ import GraphStoreAdapter

@@ -173,7 +173,7 @@ print(f'Overall F1: {summary[\"overall_f1\"]:.4f}')
"

# Full 1986Q (~$0.40 on MiniMax nitro)
uv run python3 -m benchmarks.framework.run_locomo \
uv run python3 -m benchmarks.framework.runners.locomo \
--data-path /tmp/locomo \
--embedder installed:jina-v5-small-retrieval \
--k 10