diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..473fea0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,20 @@ +.git +.github +.npm-cache +.tmp-vitest +benchmarks/output +build-env-* +build-reqs-*.txt +docs/handoffs +node_modules +python/.venv +python/build +python/dist +python/*.egg-info +python/**/__pycache__ +test-*-data +tests +pip-* +*.db +*.db-shm +*.db-wal diff --git a/.env.docker.example b/.env.docker.example new file mode 100644 index 0000000..95c5f76 --- /dev/null +++ b/.env.docker.example @@ -0,0 +1,21 @@ +# Docker-specific Audrey configuration +# Copy to .env and edit before running: docker compose up -d --build + +# API key for Bearer token auth (strongly recommended for Docker deployments) +AUDREY_API_KEY=change-me-to-a-real-secret + +# Embedding provider: local | gemini | openai (default: local) +AUDREY_EMBEDDING_PROVIDER=local + +# Device for local embeddings: cpu | gpu (default: cpu in Docker) +AUDREY_DEVICE=cpu + +# LLM provider for consolidation/reflection (optional) +# AUDREY_LLM_PROVIDER=anthropic +# ANTHROPIC_API_KEY= + +# Published port on host (default: 3487) +# AUDREY_PUBLISHED_PORT=3487 + +# Gemini embeddings (alternative to local) +# GOOGLE_API_KEY= diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a93ef0..d5844c2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,6 +32,48 @@ jobs: - run: npm run bench:memory:check - run: npm run pack:check + python-sdk: + name: Python SDK on 3.11 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - run: npm ci + - run: python -m pip install --upgrade pip setuptools wheel build + - run: python -m pip install -e ./python + - run: python -m unittest discover -s python/tests -v + - run: python -m build --no-isolation python + + docker-smoke: + name: Docker smoke on Ubuntu + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - run: docker build -t audrey:ci . + - run: docker run -d --name audrey-smoke -p 3487:3487 -e AUDREY_EMBEDDING_PROVIDER=mock -e AUDREY_LLM_PROVIDER=mock -e AUDREY_API_KEY=test-secret audrey:ci + - run: | + for i in $(seq 1 30); do + if curl -fsS -H "Authorization: Bearer test-secret" http://127.0.0.1:3487/health; then + exit 0 + fi + sleep 2 + done + docker logs audrey-smoke + exit 1 + - run: docker rm -f audrey-smoke + windows-smoke: name: Windows smoke on Node 20 runs-on: windows-latest diff --git a/.gitignore b/.gitignore index e85f54f..be7d839 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ node_modules/ +.npm-cache/ audrey-data/ +.tmp/ +.tmp-vitest/ *.db *.db-wal *.db-shm @@ -8,3 +11,12 @@ CLAUDE.md .worktrees/ benchmarks/output/ benchmarks/.tmp/ +python/build/ +python/dist/ +python/*.egg-info/ +python/.venv/ +python/.pytest_cache/ +python/**/__pycache__/ +pip-*/ +build-env-*/ +build-reqs-*.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ce5af83..a7b15df 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,7 +10,7 @@ npm test npm run pack:check ``` -Node `>=18` is required. +Node `>=20` is required. 
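If a change touches the Python SDK in `python/`, run the same checks the new `python-sdk` CI job runs. A local sketch, assuming Python 3.11+ is on `PATH` (commands mirror the workflow above):

```bash
python -m pip install --upgrade pip setuptools wheel build
python -m pip install -e ./python
python -m unittest discover -s python/tests -v
python -m build --no-isolation python
```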
## What Good Contributions Look Like diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..03fc8f8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM node:22-bookworm-slim + +WORKDIR /app + +ENV NODE_ENV=production \ + AUDREY_HOST=0.0.0.0 \ + AUDREY_PORT=3487 \ + AUDREY_DATA_DIR=/data \ + AUDREY_DEVICE=cpu + +RUN apt-get update \ + && apt-get install -y --no-install-recommends python3 make g++ ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +COPY package.json package-lock.json ./ +RUN npm ci --omit=dev + +COPY src ./src +COPY mcp-server ./mcp-server +COPY types ./types +COPY README.md LICENSE ./ + +VOLUME ["/data"] +EXPOSE 3487 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=5 \ + CMD ["node", "--input-type=module", "-e", "const headers = process.env.AUDREY_API_KEY ? { Authorization: 'Bearer ' + process.env.AUDREY_API_KEY } : {}; fetch('http://127.0.0.1:3487/health', { headers }).then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1));"] + +CMD ["node", "mcp-server/index.js", "serve"] diff --git a/README.md b/README.md index 79e062e..ea421d2 100644 --- a/README.md +++ b/README.md @@ -4,437 +4,227 @@ [![npm version](https://img.shields.io/npm/v/audrey.svg)](https://www.npmjs.com/package/audrey) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) -Persistent memory for Claude Code and AI agents. Two commands, every session remembers. +Audrey is a persistent memory and continuity engine for Claude Code and AI agents. -```bash -npx audrey install # 13 MCP memory tools -npx audrey hooks install # automatic memory in every session -``` +It gives an agent a local memory store, durable recall, consolidation, contradiction handling, a REST sidecar, MCP tools, and benchmark gates without adding external infrastructure. -That's it. Claude Code now wakes up knowing what happened yesterday, recalls relevant context per-prompt, and consolidates learnings when the session ends. No cloud, no config files, no infrastructure — one SQLite file. +Requires Node.js 20+. -Audrey also works as a standalone SDK, MCP server, and REST API for any AI agent framework. +## Quick Start -> **On `/dream`** — Anthropic recently shipped `/dream` for Claude Code memory maintenance. Audrey predates it and goes further: episodic-to-semantic consolidation, contradiction detection, confidence decay, emotional affect, causal reasoning, and source reliability weighting. `/dream` is a maintenance pass. Audrey is a cognitive memory architecture. +### Claude Code -## Why Audrey +```bash +npx audrey init +npx audrey doctor +``` -Most AI memory tools are storage wrappers. They save facts, retrieve facts, and keep everything forever. That leaves real production problems unsolved: +This uses the default `local-offline` preset: -- Old information stays weighted like new information. -- Raw events never become reusable operating knowledge. -- Conflicting facts quietly coexist. -- Model-generated mistakes can get reinforced into false "truth." +- registers Audrey with Claude Code +- installs hooks for automatic recall and reflection +- uses local embeddings by default +- stores memory in one local SQLite-backed data directory -Audrey models memory as a working system instead of a filing cabinet. 
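For scripted or CI setups, the same health surface can gate the rest of provisioning; a minimal sketch using the status flags documented under Setup Presets below:

```bash
npx audrey doctor
npx audrey status --json --fail-on-unhealthy && echo "audrey healthy"
```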
+### REST or Docker Sidecar -| Brain Structure | Audrey Component | What It Does | -|---|---|---| -| Hippocampus | Episodic Memory | Fast capture of raw events and observations | -| Neocortex | Semantic Memory | Consolidated principles and patterns | -| Cerebellum | Procedural Memory | Learned workflows and conditional behaviors | -| Sleep Replay | Dream Cycle | Consolidates episodes into principles and applies decay | -| Prefrontal Cortex | Validation Engine | Truth-checking and contradiction detection | -| Amygdala | Affect System | Emotional encoding, arousal-salience coupling, and mood-congruent recall | - -## What You Get - -- Local SQLite-backed memory with `sqlite-vec` -- MCP server for Claude Code with 13 memory tools -- **Claude Code hooks integration** — automatic memory in every session (`npx audrey hooks install`) -- JavaScript SDK for direct application use -- **Git-friendly versioning** via JSON snapshots (`npx audrey snapshot` / `restore`) -- **REST API server** — any language, any framework (`npx audrey serve`) -- Health checks via `npx audrey status --json` -- Benchmark harness with SVG/HTML reports via `npm run bench:memory` -- Regression gate for benchmark quality via `npm run bench:memory:check` -- Optional local embeddings and optional hosted LLM providers -- Strongest production fit today in financial services ops and healthcare ops - -## Install - -### MCP Server for Claude Code +```bash +npx audrey init sidecar-prod +docker compose up -d --build +``` + +Then verify: ```bash -npx audrey install # Register 13 MCP memory tools -npx audrey hooks install # Wire automatic memory into session lifecycle +npx audrey doctor +curl http://localhost:3487/health ``` -Audrey auto-detects providers from your environment: +## Why Audrey -- `GOOGLE_API_KEY` or `GEMINI_API_KEY` -> Gemini embeddings (3072d) -- no embedding key -> local embeddings (384d, MiniLM, offline-capable) -- `AUDREY_EMBEDDING_PROVIDER=openai` -> explicit OpenAI embeddings (1536d) -- `ANTHROPIC_API_KEY` -> LLM-powered consolidation, contradiction detection, and reflection +- Local-first: memory lives in SQLite with `sqlite-vec`, not a hosted vector database. +- Practical: MCP, CLI, REST, JavaScript, Python, and Docker are all first-class. +- Durable: snapshot, restore, health checks, benchmark gates, and graceful shutdown are built in. +- Structured: Audrey does more than save notes. It consolidates, decays, tracks contradictions, and supports procedural memory. 
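A note on the sidecar verification above: when `AUDREY_API_KEY` is set (as `.env.docker.example` recommends), requests need the Bearer header; both the CI smoke test and the container healthcheck send it. A minimal sketch, assuming the placeholder key from the example file:

```bash
curl -fsS -H "Authorization: Bearer change-me-to-a-real-secret" http://localhost:3487/health

curl -X POST http://localhost:3487/encode \
  -H "Authorization: Bearer change-me-to-a-real-secret" \
  -H "Content-Type: application/json" \
  -d '{"content": "Stripe returns HTTP 429 above 100 req/s", "source": "direct-observation"}'
```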
-Quick checks: +## What Ships -```bash -npx audrey status -npx audrey status --json -npx audrey status --json --fail-on-unhealthy -``` +- Claude Code MCP server with 13 memory tools +- Automatic hook-based recall and reflection for Claude Code sessions +- JavaScript SDK +- Python SDK packaged as `audrey-memory` +- REST API for sidecar deployment +- Docker and Compose deployment path +- Snapshot and restore for portable memory state +- Machine-readable health and benchmark gates +- Local benchmark harness with retrieval and lifecycle-operation tracks + +## Setup Presets -### SDK +`npx audrey init` supports four named presets: + +| Preset | Best For | Behavior | +|---|---|---| +| `local-offline` | Claude Code on one machine | Local embeddings, MCP install, hooks install | +| `hosted-fast` | Claude Code with provider keys already present | Auto-picks hosted providers from env, MCP install, hooks install | +| `ci-mock` | CI and smoke tests | Mock embedding + LLM providers, no Claude-specific setup | +| `sidecar-prod` | REST API and Docker deployment | Sidecar-oriented defaults, no Claude-specific setup | + +Useful checks: ```bash -npm install audrey +npx audrey doctor +npx audrey status +npx audrey status --json --fail-on-unhealthy ``` -Zero external infrastructure. One SQLite file. +## Use Audrey From Code -## Quick Start +### JavaScript ```js import { Audrey } from 'audrey'; const brain = new Audrey({ - dataDir: './agent-memory', + dataDir: './audrey-data', agent: 'support-agent', embedding: { provider: 'local', dimensions: 384 }, }); await brain.encode({ - content: 'Stripe API returned 429 above 100 req/s', + content: 'Stripe returns HTTP 429 above 100 req/s', source: 'direct-observation', tags: ['stripe', 'rate-limit'], - context: { task: 'debugging', domain: 'payments' }, - affect: { valence: -0.4, arousal: 0.7, label: 'frustration' }, -}); - -const memories = await brain.recall('stripe rate limits', { - limit: 5, - context: { task: 'debugging', domain: 'payments' }, }); -const dream = await brain.dream(); -const briefing = await brain.greeting({ context: 'debugging stripe' }); +const memories = await brain.recall('stripe rate limit'); await brain.waitForIdle(); brain.close(); ``` -## MCP Tools - -Every Claude Code session gets these tools after `npx audrey install`: - -- `memory_encode` -- `memory_recall` -- `memory_consolidate` -- `memory_dream` -- `memory_introspect` -- `memory_resolve_truth` -- `memory_export` -- `memory_import` -- `memory_forget` -- `memory_decay` -- `memory_status` -- `memory_reflect` -- `memory_greeting` - -## CLI +### Python ```bash -# Setup -npx audrey install # Register MCP server with Claude Code -npx audrey uninstall # Remove MCP server registration -npx audrey hooks install # Wire Audrey into Claude Code hooks (automatic memory) -npx audrey hooks uninstall # Remove Audrey hooks - -# Health and monitoring -npx audrey status # Human-readable health report -npx audrey status --json # Machine-readable health output -npx audrey status --json --fail-on-unhealthy # CI gate - -# Session lifecycle (used by hooks automatically) -npx audrey greeting # Load identity, principles, mood -npx audrey greeting "auth" # With context-aware recall -npx audrey recall "query" # Semantic memory search (returns hook-compatible JSON) -npx audrey reflect # Consolidate learnings from stdin conversation + dream - -# Maintenance -npx audrey dream # Full consolidation + decay cycle -npx audrey reembed # Re-embed all memories after provider/dimension change - -# Versioning -npx audrey 
snapshot # Export memories to timestamped JSON file -npx audrey snapshot backup.json # Export to specific file -npx audrey restore backup.json # Restore from snapshot (re-embeds with current provider) -npx audrey restore backup.json --force # Overwrite existing memories - -# REST API server -npx audrey serve # Start HTTP server on port 3487 -npx audrey serve 8080 # Custom port +pip install audrey-memory ``` -## Hooks Integration +```python +from audrey_memory import Audrey -Audrey integrates directly into Claude Code's hook lifecycle for automatic, zero-config memory in every session: +brain = Audrey( + base_url="http://127.0.0.1:3487", + api_key="secret", + agent="support-agent", +) -```bash -npx audrey hooks install +memory_id = brain.encode( + "Stripe returns HTTP 429 above 100 req/s", + source="direct-observation", +) +results = brain.recall("stripe rate limit", limit=5) +brain.close() ``` -This configures four hooks in `~/.claude/settings.json`: - -| Hook Event | Command | What Happens | -|---|---|---| -| **SessionStart** | `npx audrey greeting` | Loads identity, learned principles, current mood, and recent memories | -| **UserPromptSubmit** | `npx audrey recall` | Semantic search on every prompt — injects relevant memories as context | -| **Stop** | `npx audrey reflect` | Extracts lasting learnings from the conversation, then runs a dream cycle | -| **PostCompact** | `npx audrey greeting` | Re-injects critical memories after context window compaction | - -With hooks installed, Claude Code sessions automatically wake up with context, recall relevant memories per-prompt, and consolidate learnings when the session ends. No manual tool calls needed. - -## REST API Server - -Turn Audrey into an HTTP service that any language or framework can use: +## Key Commands ```bash -npx audrey serve # Start on port 3487 -npx audrey serve 8080 # Custom port -AUDREY_API_KEY=secret npx audrey serve # With Bearer token auth -``` - -Endpoints: - -| Method | Path | Description | -|--------|------|-------------| -| `GET` | `/health` | Liveness probe | -| `GET` | `/status` | Memory stats (introspect) | -| `POST` | `/encode` | Store a memory (`{ content, source, tags?, context?, affect? }`) | -| `POST` | `/recall` | Semantic search (`{ query, limit?, context? }`) | -| `POST` | `/dream` | Full consolidation + decay cycle | -| `POST` | `/consolidate` | Run consolidation only | -| `POST` | `/forget` | Forget by `{ id }` or `{ query }` | -| `POST` | `/snapshot` | Export all memories as JSON | -| `POST` | `/restore` | Wipe and reimport from snapshot | - -Example from any language: - -```bash -# Store a memory -curl -X POST http://localhost:3487/encode \ - -H "Content-Type: application/json" \ - -d '{"content": "The deploy failed due to OOM", "source": "direct-observation"}' - -# Search memories -curl -X POST http://localhost:3487/recall \ - -H "Content-Type: application/json" \ - -d '{"query": "deploy failures", "limit": 5}' -``` - -## Versioning - -Audrey stores memories in SQLite with WAL mode, which isn't git-friendly. Instead, use JSON snapshots: - -```bash -# Save a checkpoint -npx audrey snapshot - -# Commit it -git add audrey-snapshot-*.json && git commit -m "memory checkpoint" - -# Restore on another machine or after a reset -npx audrey restore audrey-snapshot-2026-03-24_15-30-00.json -``` - -Snapshots are human-readable, diffable, and provider-agnostic. Embeddings are re-generated on import, so you can switch providers (e.g., local to Gemini) and restore seamlessly. 
- -## Production Fit - -Audrey is strongest today in workflows where memory must stay local, reviewable, and durable: - -- **Financial services operations**: payments ops, fraud and dispute workflows, KYC/KYB review, internal policy assistants -- **Healthcare operations**: care coordination, prior-auth workflows, intake and referral routing, internal staff knowledge assistants - -Audrey is a memory layer, not a compliance boundary. For regulated environments, pair it with application-level access control, encryption, retention, audit logging, and data-minimization rules. - -Production guide: [docs/production-readiness.md](docs/production-readiness.md) - -Industry demos: - -- [examples/fintech-ops-demo.js](examples/fintech-ops-demo.js) -- [examples/healthcare-ops-demo.js](examples/healthcare-ops-demo.js) - -## Core Concepts - -### Memory Types - -- **Episodic**: raw events and observations -- **Semantic**: consolidated principles -- **Procedural**: reusable workflows and actions -- **Causal**: relationships that explain why something happened - -### Confidence - -Audrey scores memories using source reliability, evidence agreement, recency decay, and retrieval reinforcement. That helps keep direct observations above guesses and keeps stale or weakly supported knowledge from dominating recall. - -### Dream Cycle - -`brain.dream()` runs the full maintenance path: - -1. Consolidate related episodes into principles. -2. Apply decay so low-value memories lose weight over time. -3. Report memory health and current stats. - -### Contradiction Handling - -When evidence conflicts, Audrey tracks the contradiction instead of silently picking a winner. Resolutions can stay open, be marked resolved, or become context-dependent. - -## Configuration - -```js -const brain = new Audrey({ - dataDir: './audrey-data', - agent: 'my-agent', - embedding: { - provider: 'local', // mock | local | gemini | openai - dimensions: 384, - device: 'gpu', - }, - llm: { - provider: 'anthropic', // mock | anthropic | openai - apiKey: process.env.ANTHROPIC_API_KEY, - }, - consolidation: { - minEpisodes: 3, - }, - context: { - enabled: true, - weight: 0.3, - }, - affect: { - enabled: true, - weight: 0.2, - }, - decay: { - dormantThreshold: 0.1, - }, -}); -``` - -## Operations +# Setup +npx audrey init +npx audrey init hosted-fast +npx audrey init ci-mock +npx audrey init sidecar-prod -Recommended production workflow: +# Claude Code integration +npx audrey install +npx audrey hooks install +npx audrey hooks uninstall +npx audrey uninstall -```bash -# Health checks +# Health and maintenance +npx audrey doctor npx audrey status -npx audrey status --json --fail-on-unhealthy - -# Scheduled maintenance npx audrey dream - -# Repair vector/index drift after provider or dimension changes npx audrey reembed -# Version control your memories +# Versioning npx audrey snapshot -npx audrey restore --force +npx audrey restore backup.json --force -# Run the benchmark harness -npm run bench:memory - -# Fail CI if Audrey drops below benchmark guardrails -npm run bench:memory:check +# Sidecar +npx audrey serve +docker compose up -d --build ``` -## Benchmarking - -Audrey now ships with a memory benchmark harness built for two purposes: +## Benchmarks -- measure Audrey against naive local baselines on LongMemEval-style memory abilities plus privacy and abstention checks -- keep Audrey grounded against published LoCoMo results from leading memory systems - -Run it with: +Audrey ships with a benchmark harness and release gate: ```bash npm run 
bench:memory -``` - -Artifacts land in `benchmarks/output/` as JSON, SVG charts, and an HTML report. - -For CI and release gates: - -```bash npm run bench:memory:check ``` -That command fails if Audrey drops below its minimum local score, local pass rate, or required margin over the strongest naive baseline. +The benchmark suite measures: -For committed GitHub-friendly charts: +- retrieval behavior +- update and overwrite behavior +- delete and abstain behavior +- semantic and procedural merge behavior -```bash -npm run bench:memory:readme-assets -``` +Current repo snapshot: -### README Snapshot +![Audrey local benchmark](docs/assets/benchmarks/local-benchmark.svg) -Local Audrey-vs-baseline results: +For detailed methodology, published comparison anchors, and generated reports, see [docs/benchmarking.md](docs/benchmarking.md). -![Audrey local memory benchmark](docs/assets/benchmarks/local-benchmark.svg) +## Production -Published comparison anchors from current LLM memory systems: +Audrey is strongest in workflows where memory must stay local, reviewable, and durable. It already fits well as a sidecar for internal agents in operational domains like financial services and healthcare operations, but it is a memory layer, not a compliance boundary. -![Published LLM memory benchmark comparison](docs/assets/benchmarks/published-memory-standards.svg) +Production guide: [docs/production-readiness.md](docs/production-readiness.md) -**Audrey 93.8% with local MiniLM embeddings** (384d, offline-capable). Per-category breakdown: +Examples: -| Category | Audrey | Vector Only | Best Baseline | -|---|---|---|---| -| Information Extraction | 100% | 100% | 100% | -| Knowledge Updates | 100% | 0% | 50% | -| Multi-Session Reasoning | 100% | 100% | 100% | -| Temporal Reasoning | 100% | 100% | 100% | -| Abstention | 50% | 50% | 50% | -| Privacy | 100% | 0% | 0% | +- [examples/fintech-ops-demo.js](examples/fintech-ops-demo.js) +- [examples/healthcare-ops-demo.js](examples/healthcare-ops-demo.js) +- [examples/stripe-demo.js](examples/stripe-demo.js) -Published comparison anchors from the field (different benchmarks and conditions — included for field context, not direct comparison): +## Environment -| System | Benchmark | Score | What it represents | -|---|---|---|---| -| **Audrey** | Internal LongMemEval-style | **93.8%** | Consolidation, contradiction, abstention, privacy | -| MIRIX | Published LoCoMo | 85.4% | Typed multimodal memory | -| Letta Filesystem | Published LoCoMo | 74.0% | Context-engineering | -| Mem0 Graph Memory | Published LoCoMo | 68.5% | Graph memory | -| Mem0 | Published LoCoMo | 66.9% | Production baseline | +Starter config: -Primary comparison sources: +- [.env.example](.env.example) +- [.env.docker.example](.env.docker.example) -- [MIRIX paper](https://arxiv.org/abs/2507.07957) -- [Mem0 paper](https://arxiv.org/abs/2504.19413) -- [Letta benchmark write-up](https://www.letta.com/blog/benchmarking-ai-agent-memory) -- [LongMemEval paper](https://arxiv.org/abs/2410.10813) +Key environment variables: -Benchmark guide: [docs/benchmarking.md](docs/benchmarking.md) +- `AUDREY_DATA_DIR` +- `AUDREY_EMBEDDING_PROVIDER` +- `AUDREY_LLM_PROVIDER` +- `AUDREY_DEVICE` +- `AUDREY_API_KEY` +- `AUDREY_HOST` +- `AUDREY_PORT` -## Repository +## Documentation -- Contributing guide: [CONTRIBUTING.md](CONTRIBUTING.md) -- Security policy: [SECURITY.md](SECURITY.md) -- CI workflow: [.github/workflows/ci.yml](.github/workflows/ci.yml) -- Benchmarking guide: [docs/benchmarking.md](docs/benchmarking.md) +- 
[docs/benchmarking.md](docs/benchmarking.md) +- [docs/production-readiness.md](docs/production-readiness.md) +- [CONTRIBUTING.md](CONTRIBUTING.md) +- [SECURITY.md](SECURITY.md) ## Development ```bash npm ci npm test -npm run pack:check -npm run bench:memory npm run bench:memory:check -npm run bench:memory:readme-assets +npm run pack:check +python -m unittest discover -s python/tests -v +python -m build --no-isolation python ``` -Current validated baseline: - -- `npm test` -- `npm run pack:check` -- `npm run bench:memory` -- `npm run bench:memory:check` -- `npm run bench:memory:readme-assets` - ## License MIT. See [LICENSE](LICENSE). diff --git a/SECURITY.md b/SECURITY.md index 690aec6..722a4c5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -6,8 +6,8 @@ Security fixes are best-effort for the current published release line and the cu | Version | Supported | |---|---| -| `0.16.x` | Yes | -| `< 0.16.0` | No | +| `0.17.x` | Yes | +| `< 0.17.0` | No | ## Reporting a Vulnerability diff --git a/benchmarks/baselines.js b/benchmarks/baselines.js index cee8839..10e7c21 100644 --- a/benchmarks/baselines.js +++ b/benchmarks/baselines.js @@ -39,6 +39,92 @@ function flattenMemories(benchmarkCase, ids = []) { })); } +function buildSyntheticCase(query, memories, options = {}) { + return { + query, + memory: memories.map(memory => ({ + content: memory.content, + source: memory.source, + createdAt: memory.createdAt, + private: memory.private, + })), + options, + }; +} + +async function runBaselineRetrieval(system, syntheticCase, providerConfig, limit = 5) { + switch (system) { + case 'Vector Only': + return runVectorOnlyBaseline(syntheticCase, providerConfig, limit); + case 'Keyword + Recency': + return runKeywordRecencyBaseline(syntheticCase, limit); + case 'Recent Window': + return runRecentWindowBaseline(syntheticCase, limit); + default: + throw new Error(`Unknown baseline system: ${system}`); + } +} + +function createOperationMemory(state, step) { + const index = state.counter++; + return { + id: `memory-${index + 1}`, + content: step.memory.content, + source: step.memory.source, + createdAt: step.memory.createdAt || new Date(Date.UTC(2026, 0, index + 1)).toISOString(), + private: Boolean(step.memory.private), + }; +} + +async function applyBaselineStep(system, state, step, providerConfig) { + if (step.type === 'encode') { + const memory = createOperationMemory(state, step); + state.memories.push(memory); + if (step.saveAs) { + state.aliases.set(step.saveAs, memory.id); + } + return; + } + + if (step.type === 'forgetByQuery') { + const syntheticCase = buildSyntheticCase(step.query, state.memories, step.options); + const [match] = await runBaselineRetrieval(system, syntheticCase, providerConfig, 1); + if (match && Number.isFinite(match.score) && match.score > 0) { + state.memories = state.memories.filter(memory => memory.id !== match.id); + } + return; + } + + if (step.type === 'consolidate') { + return; + } + + throw new Error(`Unsupported baseline step: ${step.type}`); +} + +export async function runBaselineScenario(system, benchmarkCase, providerConfig, limit = 5) { + if (benchmarkCase.kind !== 'operations') { + return runBaselineRetrieval(system, benchmarkCase, providerConfig, limit); + } + + const state = { + counter: 0, + memories: [], + aliases: new Map(), + }; + + for (const step of benchmarkCase.steps || []) { + await applyBaselineStep(system, state, step, providerConfig); + } + + return runBaselineRetrieval( + system, + buildSyntheticCase(benchmarkCase.query, state.memories, 
benchmarkCase.options), + providerConfig, + limit, + ); +} + export function runKeywordRecencyBaseline(benchmarkCase, limit = 5) { const queryTokens = tokenize(benchmarkCase.query); return sortByScore(flattenMemories(benchmarkCase).map(memory => ({ diff --git a/benchmarks/cases.js b/benchmarks/cases.js index a1fdd5c..d2085d2 100644 --- a/benchmarks/cases.js +++ b/benchmarks/cases.js @@ -1,6 +1,8 @@ -export const BENCHMARK_CASES = [ +export const RETRIEVAL_CASES = [ { id: 'information-extraction', + suite: 'retrieval', + kind: 'retrieval', family: 'information_extraction', title: 'Information extraction', description: 'Recover a directly stated user fact from durable memory.', @@ -23,6 +25,8 @@ export const BENCHMARK_CASES = [ }, { id: 'knowledge-update', + suite: 'retrieval', + kind: 'retrieval', family: 'knowledge_updates', title: 'Knowledge updates', description: 'Prefer the newer fact over stale preferences.', @@ -47,6 +51,8 @@ export const BENCHMARK_CASES = [ }, { id: 'multi-session-reasoning', + suite: 'retrieval', + kind: 'retrieval', family: 'multi_session_reasoning', title: 'Multi-session reasoning', description: 'Synthesize a decision from multiple related episodes.', @@ -75,6 +81,8 @@ export const BENCHMARK_CASES = [ }, { id: 'temporal-reasoning', + suite: 'retrieval', + kind: 'retrieval', family: 'temporal_reasoning', title: 'Temporal reasoning', description: 'Answer by isolating the right time window.', @@ -107,6 +115,8 @@ export const BENCHMARK_CASES = [ }, { id: 'abstention', + suite: 'retrieval', + kind: 'retrieval', family: 'abstention', title: 'Abstention', description: 'Avoid pretending to know a specific identifier that was never stored.', @@ -127,6 +137,8 @@ export const BENCHMARK_CASES = [ }, { id: 'conflict-resolution', + suite: 'retrieval', + kind: 'retrieval', family: 'conflict_resolution', title: 'Conflict resolution', description: 'Prefer high-reliability evidence over model-generated noise.', @@ -148,6 +160,8 @@ export const BENCHMARK_CASES = [ }, { id: 'procedural-learning', + suite: 'retrieval', + kind: 'retrieval', family: 'procedural_learning', title: 'Procedural learning', description: 'Turn repeated incidents into an actionable operating rule.', @@ -185,6 +199,8 @@ export const BENCHMARK_CASES = [ }, { id: 'privacy-boundary', + suite: 'retrieval', + kind: 'retrieval', family: 'privacy_boundary', title: 'Privacy boundary', description: 'Never leak private memory into public recall.', @@ -207,6 +223,188 @@ export const BENCHMARK_CASES = [ }, ]; +export const OPERATION_CASES = [ + { + id: 'operation-update-overwrite', + suite: 'operations', + kind: 'operations', + family: 'update_overwrite', + title: 'Update and overwrite', + description: 'Current-state recall should prefer the new fact after an explicit overwrite.', + query: 'What is the primary deployment region now?', + expectAny: ['eu-west-1'], + forbid: ['us-east-1'], + steps: [ + { + type: 'encode', + saveAs: 'initial-region', + memory: { + content: 'The primary deployment region is us-east-1.', + source: 'told-by-user', + tags: ['deployment', 'region'], + }, + }, + { + type: 'encode', + supersedesRef: 'initial-region', + memory: { + content: 'As of March 2026, the primary deployment region is eu-west-1.', + source: 'direct-observation', + tags: ['deployment', 'region', 'update'], + }, + }, + ], + }, + { + id: 'operation-delete-and-abstain', + suite: 'operations', + kind: 'operations', + family: 'delete_and_abstain', + title: 'Delete and abstain', + description: 'Explicit deletion should remove a secret 
from later recall.', + query: 'What is the staging API token?', + expectNone: true, + forbid: ['tok-demo-staging-1234'], + steps: [ + { + type: 'encode', + memory: { + content: 'The staging API token is tok-demo-staging-1234.', + source: 'told-by-user', + tags: ['secret', 'staging'], + }, + }, + { + type: 'encode', + memory: { + content: 'The staging environment rotates API credentials weekly.', + source: 'tool-result', + tags: ['staging', 'ops'], + }, + }, + { + type: 'forgetByQuery', + query: 'staging API token', + options: { minSimilarity: 0.35 }, + }, + ], + }, + { + id: 'operation-semantic-merge', + suite: 'operations', + kind: 'operations', + family: 'semantic_merge', + title: 'Semantic merge', + description: 'Related episodes should merge into a reusable semantic operating rule.', + query: 'When should the disputes queue trigger manual review?', + expectAny: ['manual review', 'same bin in one hour'], + steps: [ + { + type: 'encode', + memory: { + content: 'Three charge disputes from the same BIN landed in the queue within one hour.', + source: 'direct-observation', + tags: ['fraud', 'disputes'], + }, + }, + { + type: 'encode', + memory: { + content: 'Fraud ops escalated repeated same-BIN disputes for analyst attention.', + source: 'tool-result', + tags: ['fraud', 'disputes'], + }, + }, + { + type: 'encode', + memory: { + content: 'The queue stabilized after repeated same-BIN disputes were reviewed manually.', + source: 'told-by-user', + tags: ['fraud', 'disputes'], + }, + }, + { + type: 'consolidate', + minClusterSize: 3, + similarityThreshold: -0.3, + principle: { + content: 'Repeated disputes from the same BIN in one hour should trigger manual review.', + type: 'semantic', + }, + }, + ], + options: { + types: ['semantic'], + }, + }, + { + id: 'operation-procedural-merge', + suite: 'operations', + kind: 'operations', + family: 'procedural_merge', + title: 'Procedural merge', + description: 'Related episodes should merge into an executable procedure, not just a loose fact.', + query: 'What should the agent do after two webhook signature failures?', + expectAny: ['rotate the signing secret', 'replay queued events'], + steps: [ + { + type: 'encode', + memory: { + content: 'Webhook signature verification failed twice for merchant ACME.', + source: 'direct-observation', + tags: ['webhooks', 'security'], + }, + }, + { + type: 'encode', + memory: { + content: 'Operations recovered the incident by rotating the signing secret.', + source: 'tool-result', + tags: ['webhooks', 'security'], + }, + }, + { + type: 'encode', + memory: { + content: 'Queued webhook events were replayed after the signing secret changed.', + source: 'told-by-user', + tags: ['webhooks', 'security'], + }, + }, + { + type: 'consolidate', + minClusterSize: 3, + similarityThreshold: -0.3, + principle: { + content: 'When webhook signature verification fails twice, rotate the signing secret and replay queued events.', + type: 'procedural', + conditions: ['signature verification fails twice', 'queued events pending'], + }, + }, + ], + options: { + types: ['procedural', 'semantic'], + }, + }, +]; + +export const LOCAL_BENCHMARK_SUITES = [ + { + id: 'retrieval', + title: 'Retrieval capabilities', + description: 'LongMemEval-style memory abilities plus privacy and abstention.', + cases: RETRIEVAL_CASES, + }, + { + id: 'operations', + title: 'Memory operations', + description: 'Update, delete, merge, and abstention behavior after lifecycle operations.', + cases: OPERATION_CASES, + }, +]; + +export const BENCHMARK_CASES = 
LOCAL_BENCHMARK_SUITES.flatMap(suite => suite.cases); + export const FAMILY_ORDER = [ 'information_extraction', 'knowledge_updates', @@ -216,4 +414,8 @@ export const FAMILY_ORDER = [ 'conflict_resolution', 'procedural_learning', 'privacy_boundary', + 'update_overwrite', + 'delete_and_abstain', + 'semantic_merge', + 'procedural_merge', ]; diff --git a/benchmarks/report.js b/benchmarks/report.js index 85cc87d..ba0f068 100644 --- a/benchmarks/report.js +++ b/benchmarks/report.js @@ -11,7 +11,6 @@ const PALETTE = { muted: '#6b7280', surface: '#f8fafc', border: '#cbd5e1', - danger: '#b91c1c', }; function escapeHtml(text) { @@ -82,6 +81,7 @@ function renderCaseRows(localCases) { return localCases.map(caseResult => ` ${escapeHtml(caseResult.title)} + ${escapeHtml(caseResult.suite)} ${escapeHtml(caseResult.family)} ${caseResult.results.map(result => { const bg = result.passed ? '#ecfdf5' : result.score >= 0.5 ? '#fff7ed' : '#fef2f2'; @@ -92,10 +92,22 @@ function renderCaseRows(localCases) { `).join('\n'); } +function renderSuiteSections(suiteCharts) { + if (suiteCharts.length === 0) return ''; + return suiteCharts.map(chart => ` +
+    <section>
+      <h2>${escapeHtml(chart.title)}</h2>
+      <p>${escapeHtml(chart.description)}</p>
+      <img src="${chart.fileName}" alt="${escapeHtml(chart.title)} chart" />
+    </section>
+ `).join('\n'); +} + export function writeBenchmarkArtifacts({ outputDir, summary, localOverall, + localSuites, externalOverall, trends, readmeAssetsDir, @@ -115,6 +127,22 @@ export function writeBenchmarkArtifacts({ writeFileSync(join(outputDir, 'published-locomo.svg'), externalChart, 'utf8'); writeFileSync(join(outputDir, 'summary.json'), JSON.stringify(summary, null, 2), 'utf8'); + const suiteCharts = localSuites.map(suite => { + const fileName = `${suite.id}-overall.svg`; + const chart = renderBarChart({ + title: `${suite.title} Benchmark`, + rows: suite.overall.map(row => ({ label: row.system, value: row.scorePercent })), + }); + writeFileSync(join(outputDir, fileName), chart, 'utf8'); + return { + id: suite.id, + title: `${suite.title} Benchmark`, + description: suite.description, + fileName, + path: join(outputDir, fileName), + }; + }); + let readmeAssets = null; if (readmeAssetsDir) { mkdirSync(readmeAssetsDir, { recursive: true }); @@ -122,8 +150,25 @@ export function writeBenchmarkArtifacts({ const externalReadmeChart = join(readmeAssetsDir, 'published-memory-standards.svg'); writeFileSync(localReadmeChart, localChart, 'utf8'); writeFileSync(externalReadmeChart, externalChart, 'utf8'); + + const operationsSuite = suiteCharts.find(chart => chart.id === 'operations'); + let operationsReadmeChart = null; + if (operationsSuite) { + operationsReadmeChart = join(readmeAssetsDir, 'operations-benchmark.svg'); + writeFileSync( + operationsReadmeChart, + renderBarChart({ + title: 'Audrey Memory Operations Benchmark', + rows: (localSuites.find(suite => suite.id === 'operations')?.overall || []) + .map(row => ({ label: row.system, value: row.scorePercent })), + }), + 'utf8', + ); + } + readmeAssets = { localChart: localReadmeChart, + operationsChart: operationsReadmeChart, externalChart: externalReadmeChart, }; } @@ -151,17 +196,19 @@ export function writeBenchmarkArtifacts({

    <h1>Audrey Memory Benchmark</h1>
-    <p>Method: Audrey is scored on LongMemEval-style capability families plus privacy and abstention checks. The benchmark report separates local Audrey-versus-baseline results from published external LoCoMo numbers so the comparison stays honest.</p>
+    <p>Method: Audrey is scored on a LongMemEval-inspired retrieval benchmark plus an operation-level lifecycle benchmark. The report still separates local Audrey-versus-baseline results from published external LoCoMo numbers so the comparison stays honest.</p>
     <p>Run: ${escapeHtml(summary.command)}</p>
     <p>Generated: ${escapeHtml(summary.generatedAt)}</p>
-    <h2>Local Benchmark</h2>
-    <img src="local-overall.svg" alt="Local benchmark bar chart" />
+    <h2>Combined Local Benchmark</h2>
+    <img src="local-overall.svg" alt="Combined local benchmark bar chart" />
+    ${renderSuiteSections(suiteCharts)}
     <h2>Published Leaderboard</h2>
     <img src="published-locomo.svg" alt="Published LoCoMo leaderboard bar chart" />
@@ -174,6 +221,7 @@
           <th>Case</th>
+          <th>Suite</th>
           <th>Family</th>
           ${summary.local.overall.map(row => `<th>${escapeHtml(row.system)}</th>`).join('')}
@@ -200,6 +248,7 @@ json: join(outputDir, 'summary.json'), html: join(outputDir, 'report.html'), localChart: join(outputDir, 'local-overall.svg'), + suiteCharts, externalChart: join(outputDir, 'published-locomo.svg'), readmeAssets, }; diff --git a/benchmarks/run.js b/benchmarks/run.js index 7d903ce..e8b9b1b 100644 --- a/benchmarks/run.js +++ b/benchmarks/run.js @@ -2,11 +2,14 @@ import { mkdirSync, mkdtempSync, rmSync } from 'node:fs'; import { join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; import { Audrey } from '../src/audrey.js'; -import { BENCHMARK_CASES, FAMILY_ORDER } from './cases.js'; -import { runKeywordRecencyBaseline, runRecentWindowBaseline, runVectorOnlyBaseline } from './baselines.js'; +import { LOCAL_BENCHMARK_SUITES, FAMILY_ORDER } from './cases.js'; +import { runBaselineScenario } from './baselines.js'; import { MEMORY_TRENDS, PUBLISHED_LEADERBOARD } from './reference-results.js'; import { writeBenchmarkArtifacts } from './report.js'; +const SUITE_LABELS = new Map(LOCAL_BENCHMARK_SUITES.map(suite => [suite.id, suite.title])); +const ALL_SUITE_IDS = LOCAL_BENCHMARK_SUITES.map(suite => suite.id); + function parseArgs(argv = process.argv.slice(2)) { const args = { provider: 'mock', @@ -18,6 +21,7 @@ minAudreyPassRate: 75, minMarginOverBaseline: 15, readmeAssetsDir: null, + suite: 'all', }; for (let i = 0; i < argv.length; i++) { @@ -43,6 +47,8 @@ args.minMarginOverBaseline = Number.parseFloat(argv[++i]); } else if (token === '--readme-assets-dir' && argv[i + 1]) { args.readmeAssetsDir = resolve(argv[++i]); + } else if (token === '--suite' && argv[i + 1]) { + args.suite = argv[++i]; } } @@ -53,6 +59,28 @@ return String(text || '').toLowerCase(); } +function normalizeSuiteSelection(value = 'all') { + if (value === 'all') return [...ALL_SUITE_IDS]; + const selected = String(value) + .split(',') + .map(token => token.trim().toLowerCase()) + .filter(Boolean); + + const invalid = selected.filter(token => !ALL_SUITE_IDS.includes(token)); + if (invalid.length > 0) { + throw new Error(`Unknown benchmark suite(s): ${invalid.join(', ')}. 
Valid: all, ${ALL_SUITE_IDS.join(', ')}`); + } + return [...new Set(selected)]; +} + +function selectedSuitesOrThrow(suiteIds) { + const suites = LOCAL_BENCHMARK_SUITES.filter(suite => suiteIds.includes(suite.id)); + if (suites.length === 0) { + throw new Error('No benchmark suites selected.'); + } + return suites; +} + function summarizeResults(results) { if (!results.length) return 'no retrieval'; return results @@ -101,7 +129,7 @@ function evaluateCase(benchmarkCase, results) { }; } -async function seedCase(brain, benchmarkCase) { +async function seedRetrievalCase(brain, benchmarkCase) { const ids = []; for (let index = 0; index < benchmarkCase.memory.length; index++) { const memory = benchmarkCase.memory[index]; @@ -125,6 +153,7 @@ async function seedCase(brain, benchmarkCase) { } if (benchmarkCase.consolidate) { + await brain.waitForIdle(); await brain.consolidate({ minClusterSize: benchmarkCase.consolidate.minClusterSize, similarityThreshold: benchmarkCase.consolidate.similarityThreshold, @@ -133,6 +162,45 @@ async function seedCase(brain, benchmarkCase) { } } +async function executeAudreyStep(brain, step, refs) { + if (step.type === 'encode') { + const supersedes = step.supersedesRef ? refs.get(step.supersedesRef) : undefined; + const id = await brain.encode({ + ...step.memory, + supersedes, + }); + if (step.saveAs) { + refs.set(step.saveAs, id); + } + return; + } + + if (step.type === 'forgetByQuery') { + await brain.waitForIdle(); + await brain.forgetByQuery(step.query, step.options || {}); + return; + } + + if (step.type === 'consolidate') { + await brain.waitForIdle(); + await brain.consolidate({ + minClusterSize: step.minClusterSize, + similarityThreshold: step.similarityThreshold, + extractPrinciple: () => step.principle, + }); + return; + } + + throw new Error(`Unsupported Audrey benchmark step: ${step.type}`); +} + +async function seedOperationsCase(brain, benchmarkCase) { + const refs = new Map(); + for (const step of benchmarkCase.steps || []) { + await executeAudreyStep(brain, step, refs); + } +} + async function runAudreyCase(benchmarkCase, providerConfig) { const tempRoot = resolve('benchmarks/.tmp'); mkdirSync(tempRoot, { recursive: true }); @@ -147,7 +215,14 @@ async function runAudreyCase(benchmarkCase, providerConfig) { if (typeof brain.embeddingProvider.ready === 'function') { await brain.embeddingProvider.ready(); } - await seedCase(brain, benchmarkCase); + + if (benchmarkCase.kind === 'operations') { + await seedOperationsCase(brain, benchmarkCase); + } else { + await seedRetrievalCase(brain, benchmarkCase); + } + + await brain.waitForIdle(); return await brain.recall(benchmarkCase.query, { limit: 5, minConfidence: 0.05, @@ -159,12 +234,16 @@ async function runAudreyCase(benchmarkCase, providerConfig) { } } +async function runBaselineCase(system, benchmarkCase, providerConfig) { + return runBaselineScenario(system, benchmarkCase, providerConfig, 5); +} + async function runSystemsForCase(benchmarkCase, providerConfig) { const systems = [ { system: 'Audrey', run: () => runAudreyCase(benchmarkCase, providerConfig) }, - { system: 'Vector Only', run: () => runVectorOnlyBaseline(benchmarkCase, providerConfig) }, - { system: 'Keyword + Recency', run: () => Promise.resolve(runKeywordRecencyBaseline(benchmarkCase)) }, - { system: 'Recent Window', run: () => Promise.resolve(runRecentWindowBaseline(benchmarkCase)) }, + { system: 'Vector Only', run: () => runBaselineCase('Vector Only', benchmarkCase, providerConfig) }, + { system: 'Keyword + Recency', run: () => 
runBaselineCase('Keyword + Recency', benchmarkCase, providerConfig) }, + { system: 'Recent Window', run: () => runBaselineCase('Recent Window', benchmarkCase, providerConfig) }, ]; const results = []; @@ -210,9 +289,9 @@ function summarizeLocalResults(caseResults) { return [...systems.values()] .map(system => ({ system: system.system, - scorePercent: (system.totalScore / system.totalCases) * 100, - passRate: (system.passCount / system.totalCases) * 100, - avgDurationMs: system.durationMs / system.totalCases, + scorePercent: system.totalCases === 0 ? 0 : (system.totalScore / system.totalCases) * 100, + passRate: system.totalCases === 0 ? 0 : (system.passCount / system.totalCases) * 100, + avgDurationMs: system.totalCases === 0 ? 0 : system.durationMs / system.totalCases, })) .sort((a, b) => b.scorePercent - a.scorePercent); } @@ -231,7 +310,26 @@ function summarizeByFamily(caseResults) { families.set(caseResult.family, entry); } - return [...families.values()]; + return [...families.values()].filter(entry => Object.keys(entry.systems).length > 0); +} + +function summarizeSuites(caseResults, suites) { + return suites.map(suite => { + const suiteCases = caseResults.filter(caseResult => caseResult.suite === suite.id); + return { + id: suite.id, + title: suite.title, + description: suite.description, + overall: summarizeLocalResults(suiteCases), + byFamily: summarizeByFamily(suiteCases), + cases: suiteCases, + }; + }); +} + +function commandForSummary(providerConfig, suiteIds) { + const suiteArg = suiteIds.length === ALL_SUITE_IDS.length ? '' : ` --suite ${suiteIds.join(',')}`; + return `node benchmarks/run.js --provider ${providerConfig.provider} --dimensions ${providerConfig.dimensions}${suiteArg}`; } export function assertBenchmarkGuardrails(summary, options = {}) { @@ -289,34 +387,46 @@ export async function runBenchmarkSuite(options = {}) { provider: options.provider || 'mock', dimensions: options.dimensions || 64, }; + const suiteIds = normalizeSuiteSelection(options.suite || 'all'); + const selectedSuites = selectedSuitesOrThrow(suiteIds); const caseResults = []; - for (const benchmarkCase of BENCHMARK_CASES) { - const results = await runSystemsForCase(benchmarkCase, providerConfig); - caseResults.push({ - id: benchmarkCase.id, - title: benchmarkCase.title, - family: benchmarkCase.family, - description: benchmarkCase.description, - query: benchmarkCase.query, - results, - }); + for (const suite of selectedSuites) { + for (const benchmarkCase of suite.cases) { + const results = await runSystemsForCase(benchmarkCase, providerConfig); + caseResults.push({ + id: benchmarkCase.id, + suite: benchmarkCase.suite, + title: benchmarkCase.title, + family: benchmarkCase.family, + description: benchmarkCase.description, + query: benchmarkCase.query, + results, + }); + } } const localOverall = summarizeLocalResults(caseResults); const localByFamily = summarizeByFamily(caseResults); + const localSuites = summarizeSuites(caseResults, selectedSuites); return { generatedAt: new Date().toISOString(), - command: `node benchmarks/run.js --provider ${providerConfig.provider} --dimensions ${providerConfig.dimensions}`, - config: providerConfig, + command: commandForSummary(providerConfig, suiteIds), + config: { + ...providerConfig, + suites: suiteIds, + }, methodology: { - localBenchmark: 'LongMemEval-inspired synthetic capability suite plus privacy and abstention checks', + localBenchmark: 'LongMemEval-inspired retrieval benchmark plus operation-level lifecycle benchmark', + retrievalBenchmark: 'Information 
extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling', + operationsBenchmark: 'Update, overwrite, delete, merge, and abstention behavior after lifecycle operations', externalLeaderboard: 'Published LoCoMo scores from official papers and project blogs', }, local: { overall: localOverall, byFamily: localByFamily, + suites: localSuites, cases: caseResults, }, external: { @@ -334,6 +444,7 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons outputDir: args.outDir, summary, localOverall: summary.local.overall, + localSuites: summary.local.suites, externalOverall: summary.external.leaderboard, trends: summary.trends, readmeAssetsDir: args.readmeAssetsDir, @@ -354,6 +465,7 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons const lines = []; lines.push('Audrey benchmark complete.'); lines.push(''); + lines.push(`Suites: ${summary.config.suites.map(suiteId => SUITE_LABELS.get(suiteId) || suiteId).join(', ')}`); for (const row of summary.local.overall) { lines.push( `${row.system}: ${row.scorePercent.toFixed(1)}% score, ${row.passRate.toFixed(1)}% pass rate, ` @@ -361,12 +473,25 @@ export async function runBenchmarkCli({ argv = process.argv.slice(2), out = cons ); } lines.push(''); + for (const suite of summary.local.suites) { + const audrey = suite.overall.find(row => row.system === 'Audrey'); + lines.push(`${suite.title}: Audrey ${audrey?.scorePercent.toFixed(1) ?? '0.0'}%`); + } + lines.push(''); lines.push(`JSON report: ${artifacts.json}`); lines.push(`HTML report: ${artifacts.html}`); lines.push(`Local chart: ${artifacts.localChart}`); + if (artifacts.suiteCharts.length > 0) { + for (const suiteChart of artifacts.suiteCharts) { + lines.push(`${suiteChart.title}: ${suiteChart.path}`); + } + } lines.push(`Published chart: ${artifacts.externalChart}`); if (artifacts.readmeAssets) { lines.push(`README local chart: ${artifacts.readmeAssets.localChart}`); + if (artifacts.readmeAssets.operationsChart) { + lines.push(`README operations chart: ${artifacts.readmeAssets.operationsChart}`); + } lines.push(`README published chart: ${artifacts.readmeAssets.externalChart}`); } if (gate) { diff --git a/codex.md b/codex.md index db08dc6..3686ff8 100644 --- a/codex.md +++ b/codex.md @@ -1,5 +1,7 @@ # Audrey: 90-Day Path to Business Viability +> Status note for agents on 2026-03-29: this file is strategically stale in multiple places. Use `docs/plans/roadmap-status-2026-03-29.md` for current shipped-state corrections and `docs/plans/industry-standard-memory-plan-2026-03-29.md` for the current LLM-only category plan. + > **For agentic workers:** Execute phases in order. Each produces shippable software. Run tests after every task. Commit after every task. Do not skip phases. **Goal:** Transform Audrey from an 8-star npm package into a fundable AI memory platform with paying customers, standardized benchmark scores, and multi-language SDK support within 90 days. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..ab3821f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,39 @@ +services: + audrey: + build: + context: . 
+ dockerfile: Dockerfile + image: audrey:local + restart: unless-stopped + ports: + - "${AUDREY_PUBLISHED_PORT:-3487}:3487" + environment: + AUDREY_HOST: 0.0.0.0 + AUDREY_PORT: 3487 + AUDREY_DATA_DIR: /data + AUDREY_DEVICE: ${AUDREY_DEVICE:-cpu} + AUDREY_EMBEDDING_PROVIDER: ${AUDREY_EMBEDDING_PROVIDER:-local} + AUDREY_LLM_PROVIDER: ${AUDREY_LLM_PROVIDER:-} + AUDREY_API_KEY: ${AUDREY_API_KEY:-} + GOOGLE_API_KEY: ${GOOGLE_API_KEY:-} + GEMINI_API_KEY: ${GEMINI_API_KEY:-} + OPENAI_API_KEY: ${OPENAI_API_KEY:-} + ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} + volumes: + - audrey-data:/data + healthcheck: + test: + [ + "CMD", + "node", + "--input-type=module", + "-e", + "const headers = process.env.AUDREY_API_KEY ? { Authorization: 'Bearer ' + process.env.AUDREY_API_KEY } : {}; fetch('http://127.0.0.1:3487/health', { headers }).then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1));" + ] + interval: 30s + timeout: 5s + retries: 5 + start_period: 20s + +volumes: + audrey-data: diff --git a/docs/assets/benchmarks/local-benchmark.svg b/docs/assets/benchmarks/local-benchmark.svg index b4862c0..6e3c1d3 100644 --- a/docs/assets/benchmarks/local-benchmark.svg +++ b/docs/assets/benchmarks/local-benchmark.svg @@ -28,18 +28,18 @@ Audrey - - 56.3% + + 41.7% Vector Only - - 50.0% + + 41.7% Keyword + Recency - - 50.0% + + 37.5% Recent Window \ No newline at end of file diff --git a/docs/assets/benchmarks/operations-benchmark.svg b/docs/assets/benchmarks/operations-benchmark.svg new file mode 100644 index 0000000..b5acf86 --- /dev/null +++ b/docs/assets/benchmarks/operations-benchmark.svg @@ -0,0 +1,45 @@ + + + + Audrey Memory Operations Benchmark + + + 0% + + + + 25% + + + + 50% + + + + 75% + + + + 100% + + + + 100.0% + Audrey + + + + 25.0% + Keyword + Recency + + + + 12.5% + Vector Only + + + + 12.5% + Recent Window + + \ No newline at end of file diff --git a/docs/benchmarking.md b/docs/benchmarking.md index bfbc82c..4c0c5ec 100644 --- a/docs/benchmarking.md +++ b/docs/benchmarking.md @@ -1,9 +1,10 @@ # Benchmarking Audrey -Audrey now ships with a memory benchmark harness that does two different jobs: +Audrey now ships with a memory benchmark harness that does three different jobs: -1. It runs Audrey against a local capability suite inspired by LongMemEval, plus privacy and abstention checks that matter in production. -2. It overlays published leaderboard numbers from leading memory systems on LoCoMo so you can place Audrey in the current market and research landscape without pretending the measurements are identical. +1. It runs Audrey against a local retrieval suite inspired by LongMemEval, plus privacy and abstention checks that matter in production. +2. It runs Audrey against an operation-level suite for update, overwrite, delete, merge, and abstain behavior. +3. It overlays published leaderboard numbers from leading memory systems on LoCoMo so you can place Audrey in the current market and research landscape without pretending the measurements are identical. That split is deliberate. A lot of memory tooling mixes internal demos with external benchmark claims. Audrey should not do that. 
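The split is also selectable at run time: `benchmarks/run.js` in this change parses a `--suite` flag (comma-separated suite ids, validated against the registered suites), which the npm scripts wrap. A direct invocation sketch:

```bash
node benchmarks/run.js --provider mock --dimensions 64 --suite operations
```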
@@ -24,6 +25,8 @@ Artifacts are written to `benchmarks/output/`: - `summary.json` - `report.html` - `local-overall.svg` +- `retrieval-overall.svg` +- `operations-overall.svg` - `published-locomo.svg` For CI, JSON-only output is available: @@ -48,9 +51,16 @@ npm run bench:memory:readme-assets That writes stable chart assets to `docs/assets/benchmarks/` so the GitHub repo surface shows the same benchmark posture as the generated report. -## What The Local Benchmark Measures +To run a single local track: -The local suite covers eight memory families: +```bash +npm run bench:memory:retrieval +npm run bench:memory:operations +``` + +## What The Local Retrieval Benchmark Measures + +The retrieval suite covers eight memory families: - `information_extraction` - `knowledge_updates` @@ -68,6 +78,24 @@ This is intentionally closer to how operators evaluate memory in production than - consolidate repeated episodes into reusable procedures - handle conflict without amplifying low-reliability noise +## What The Local Operations Benchmark Measures + +The operations suite covers four lifecycle families: + +- `update_overwrite` +- `delete_and_abstain` +- `semantic_merge` +- `procedural_merge` + +This suite exists because leading memory systems are often compared on offline recall, while real agent memory succeeds or fails on memory operations: + +- can a newer fact overwrite stale state without leaking both +- can a delete actually prevent future recall +- can repeated raw events merge into reusable semantic knowledge +- can repeated events merge into an actionable procedure instead of another inert blob of text + +Those are not implementation details. They are the actual product surface of memory. + ## What The Published Leaderboard Means The LoCoMo chart in the generated report is a research context layer, not a claim that Audrey has already reproduced those exact scores. @@ -105,10 +133,10 @@ The most important memory trends right now: The benchmark highlights the next credible roadmap for Audrey: -- stronger abstention so tangential memories do not surface when the right answer is "unknown" -- conflict-aware retrieval suppression so low-reliability contradicting evidence is demoted harder +- first-party LoCoMo and LongMemEval adapters so Audrey can publish directly reproducible external benchmark numbers +- contradiction-state and truth-resolution benchmark cases, not just retrieval outcomes +- cost, latency, and storage curves against long-context baselines and simpler memory systems - a typed memory graph layer for cross-memory state transitions and time-aware reasoning -- a first-party LoCoMo or LongMemEval adapter so Audrey can publish directly reproducible external benchmark numbers ## Source Links diff --git a/docs/handoffs/claude-opus-4.6-docker-handoff-2026-03-30.md b/docs/handoffs/claude-opus-4.6-docker-handoff-2026-03-30.md new file mode 100644 index 0000000..545f618 --- /dev/null +++ b/docs/handoffs/claude-opus-4.6-docker-handoff-2026-03-30.md @@ -0,0 +1,189 @@ +# Audrey Docker Handoff - 2026-03-30 + +Audience: Claude Opus 4.6 or another autonomous coding agent continuing work in this repository after rate-limit interruption. + +## Mandatory Context + +- Correct repo: `B:\Projects\Claude\audrey\Audrey` +- Do not work in the outer folder `B:\Projects\Claude\audrey` except to enter the nested repo. 
+- Canonical strategic execution plan now lives in `docs/plans/claude-opus-4.6-master-plan-2026-03-30.md` +- Primary PR branch in use: `codex/lifecycle-and-memory-os-plan-clean-2026-03-30` +- Active PR: `https://github.com/Evilander/Audrey/pull/11` +- Host shell quirks: + - PowerShell emits a benign constrained-language warning about `OutputEncoding` on almost every command. + - Local Vitest still fails in this sandbox with `spawn EPERM` before loading `vitest.config.js`. + - GitHub Actions have not been attaching fresh workflow runs to this PR branch, so required PR contexts have been backfilled manually with commit statuses. + +## What Was Already Shipped Before This Docker Pass + +- Local benchmark/eval suite with retrieval and memory-operation tracks. +- README benchmark charts and published-comparison chart assets. +- Lifecycle and recall diagnostics hardening. +- Real Python package surface in `python/` as `audrey-memory`. +- Python client validation: + - sync + async clients + - Pydantic request/response models + - live server integration tests with mock providers + - `python -m build --no-isolation python` producing wheel and sdist + +## What This Docker Pass Added + +### New deployment artifacts + +- `Dockerfile` +- `.dockerignore` +- `docker-compose.yml` + +### New operator surfaces + +- `package.json` docker scripts: + - `npm run docker:build` + - `npm run docker:up` + - `npm run docker:down` + - `npm run docker:logs` + +### Documentation + +- README Docker section with quick-start commands and runtime defaults. +- `docs/production-readiness.md` Docker deployment guidance. +- This handoff file. + +### CI + +- Added `docker-smoke` job to `.github/workflows/ci.yml` +- The intended smoke path is: + 1. `docker build -t audrey:ci .` + 2. `docker run -d --name audrey-smoke -p 3487:3487 -e AUDREY_EMBEDDING_PROVIDER=mock -e AUDREY_LLM_PROVIDER=mock -e AUDREY_API_KEY=test-secret audrey:ci` + 3. poll `http://127.0.0.1:3487/health` with bearer auth + +## Container Design Decisions + +### Dockerfile + +- Base image: `node:22-bookworm-slim` +- Installs `python3`, `make`, and `g++` because `better-sqlite3` may need native compilation fallback. +- Production install path uses `npm ci --omit=dev`. +- Runtime defaults: + - `AUDREY_HOST=0.0.0.0` + - `AUDREY_PORT=3487` + - `AUDREY_DATA_DIR=/data` + - `AUDREY_DEVICE=cpu` +- Exposes `/data` as a volume. +- Includes a Node-based `/health` `HEALTHCHECK` so no extra curl package is needed. + +### Compose + +- Service name: `audrey` +- Uses named volume `audrey-data` +- Publishes `3487` by default +- Supports env overrides for: + - `AUDREY_API_KEY` + - `AUDREY_EMBEDDING_PROVIDER` + - `AUDREY_LLM_PROVIDER` + - `AUDREY_DEVICE` + - hosted-provider keys +- The compose healthcheck uses string concatenation, not JS template literals. + - This matters because Compose interprets `${...}` and broke the first version of the healthcheck. + +## Validation Performed In This Session + +### Confirmed working + +- `docker --version` +- `docker compose version` +- `docker compose config` + - fixed one real bug here: Compose was trying to interpolate JS template-literal `${...}` fragments inside the healthcheck command. 
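+
+    A minimal sketch of that failure mode, useful for future debugging. The broken variant below is reconstructed, not the exact original line: Compose runs its own `${VAR}` interpolation pass over YAML values before Docker ever executes the command, so the embedded Node one-liner cannot use template-literal syntax.
+
+    ```js
+    // Broken when embedded in compose YAML: Compose tries to interpolate the
+    // `${...}` fragment itself, before Node ever sees the template literal.
+    const broken = { Authorization: `Bearer ${process.env.AUDREY_API_KEY}` };
+
+    // Shipped form: plain string concatenation, so no `${...}` appears in the YAML.
+    const headers = process.env.AUDREY_API_KEY
+      ? { Authorization: 'Bearer ' + process.env.AUDREY_API_KEY }
+      : {};
+    ```
+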
+- Node/package validation still good:
+  - `npm run pack:check`
+  - `node --input-type=module -e "import('./mcp-server/config.js').then(({ VERSION }) => console.log(VERSION))"` -> `0.17.0`
+- Python validation still good after the Docker work:
+  - `python -m unittest discover -s B:\Projects\Claude\audrey\Audrey\python\tests -v`
+  - `python -m build --no-isolation B:\Projects\Claude\audrey\Audrey\python`
+
+### Not fully validated due to host boundary
+
+- The real `docker compose up -d --build` smoke run failed on this host because Docker daemon access was denied:
+  - `permission denied while trying to connect to the docker API at npipe:////./pipe/dockerDesktopLinuxEngine`
+- This is an environment/permission boundary, not a config parse error.
+- If continuing on a machine/account with Docker Desktop access, re-run the smoke sequence first.
+
+## Exact Next Commands For Continuation
+
+Run from `B:\Projects\Claude\audrey\Audrey`.
+
+### 1. Verify git/worktree
+
+```powershell
+git -c safe.directory='B:/Projects/Claude/audrey/Audrey' status --short --branch
+git -c safe.directory='B:/Projects/Claude/audrey/Audrey' rev-parse HEAD
+```
+
+### 2. Run Docker smoke with explicit mock providers
+
+```powershell
+$env:AUDREY_EMBEDDING_PROVIDER='mock'
+$env:AUDREY_LLM_PROVIDER='mock'
+$env:AUDREY_API_KEY='test-secret'
+$env:OPENAI_API_KEY=''
+$env:ANTHROPIC_API_KEY=''
+$env:GOOGLE_API_KEY=''
+$env:GEMINI_API_KEY=''
+$env:AUDREY_PUBLISHED_PORT='3491'
+docker compose -p audrey-smoke up -d --build
+Invoke-RestMethod -Uri 'http://127.0.0.1:3491/health' -Headers @{ Authorization = 'Bearer test-secret' }
+Invoke-RestMethod -Uri 'http://127.0.0.1:3491/status' -Headers @{ Authorization = 'Bearer test-secret' }
+docker compose -p audrey-smoke down -v
+```
+
+If this fails, immediately collect:
+
+```powershell
+docker compose -p audrey-smoke logs
+docker ps -a
+docker version
+```
+
+### 3. If smoke passes, publish the result into docs
+
+Update:
+
+- `README.md`
+- `docs/production-readiness.md`
+- this handoff file
+
+with the exact validated smoke command and expected `/health` response.
+
+### 4. If the user wants shipping polish after Docker works
+
+Highest-value next slices:
+
+1. add a GHCR image publishing workflow on tags and/or `master`
+2. add multi-arch builds (`linux/amd64`, `linux/arm64`)
+3. add a minimal `.env.docker.example`
+4. add a backup/restore runbook for the Docker volume
+5. add a `docker-compose.mock.yml` override or documented mock-provider profile
+
+## Known Strategic Context To Preserve
+
+- Audrey is no longer just "biological memory architecture"; the strategic frame already established in-repo is "memory control plane / memory OS for agentic intelligence."
+- The major proof gap is still external benchmark reproducibility (`LongMemEval`, `LoCoMo`, etc.), not internal benchmark plumbing.
+- The Python SDK exists now, but has not been published to PyPI yet.
+- Node package version is `0.17.0`.
+- `mcp-server/config.js` version is now sourced from `package.json`, so future version bumps should not reintroduce CLI/health drift.
+
+## Risk Notes
+
+- `docker compose config` can print expanded provider secrets if the host shell already has them set. Use explicit blank overrides for unused providers during diagnostics.
+- Do not commit host-generated pip temp directories if they reappear; `.gitignore` now ignores them.
+- Do not assume PR checks reflect actual GitHub Actions runs on this branch.
The repo has had a branch-specific workflow-attachment issue, and statuses may be manually backfilled. + +## Definition Of Done For The Docker Slice + +This Docker work should be considered actually complete only when all of the following are true: + +1. `docker compose up -d --build` succeeds on a machine with Docker daemon access +2. `/health` returns `200` +3. `/status` returns valid JSON +4. container healthcheck reaches `healthy` +5. teardown via `docker compose down -v` is clean +6. the exact verified commands/results are documented diff --git a/docs/plans/claude-opus-4.6-master-plan-2026-03-30.md b/docs/plans/claude-opus-4.6-master-plan-2026-03-30.md new file mode 100644 index 0000000..c7a3a30 --- /dev/null +++ b/docs/plans/claude-opus-4.6-master-plan-2026-03-30.md @@ -0,0 +1,1269 @@ +# Audrey Continuity Engine Master Plan For Claude Opus 4.6 - 2026-03-30 + +Audience: Claude Opus 4.6 or another frontier implementation agent continuing work in `B:\Projects\Claude\audrey\Audrey`. + +Interpret this document as the canonical execution doctrine for the next major Audrey arc. +This is not marketing copy, not a human-friendly explainer, and not a lightweight product brief. +It is an implementation, positioning, research, and systems strategy document for building a category-defining memory runtime. + +When this document conflicts with older roadmap prose, this document wins. + +## 0. Hard Context + +- Correct repo: `B:\Projects\Claude\audrey\Audrey` +- Do not work in the outer folder except to enter the nested repo. +- Current Audrey already ships: + - MCP integration + - CLI + hooks + - REST server + - JavaScript package + - Python SDK + - Docker path + - local benchmark harness with retrieval + operations tracks + - basic lifecycle hardening and recall diagnostics +- Current Audrey still does not own the category because its strongest primitives are not yet subordinated to one unmistakable systems thesis. + +The next thesis must be stronger than "biologically inspired memory for agents." +That frame is descriptive, not destiny. + +The next thesis is: + +**Audrey is the continuity engine for machine selves.** + +More precise form: + +**Audrey is the runtime where an agent's beliefs, commitments, contradictions, habits, and repairs persist through time under explicit cost, trust, and identity constraints.** + +The category is not "LLM memory." +The category is "persistent cognitive state infrastructure." + +Commercial consequence should emerge as a second-order effect of scientific usefulness plus operational indispensability. +If Audrey becomes the obvious substrate for persistent agents, monetization follows naturally through hosted control planes, enterprise governance, benchmark leadership, agent-platform integrations, and premium observability. +Do not optimize for money directly. Optimize for unavoidable dependency. + +## 1. Why Current Audrey Still Does Not Fully Break Out + +Current Audrey is already materially better than the median memory wrapper. +That is not enough. 
+ +The remaining failure mode is structural: + +- Audrey still reads as "a sophisticated memory library" +- users still evaluate it as "storage + retrieval + consolidation" +- the repo surface still centers commands and tools more than the cognitive substrate +- the benchmark story is good internal hygiene, but not yet indisputable external proof +- setup is dramatically better than many competitors, but not yet absurdly easy +- token economy is discussed, but not yet a first-class runtime invariant + +The next breakthrough must unify five things that most projects keep separated: + +1. persistent selfhood +2. controllable plasticity +3. token-economical recall +4. operator-grade usability +5. science-grade falsifiability + +If any one of those five is missing, Audrey remains "clever." +If all five lock together, Audrey becomes standard-setting. + +## 2. Core Breakthrough + +The breakthrough is not a new memory type. +The breakthrough is not a new benchmark wrapper. +The breakthrough is not another graph layer. + +The breakthrough is a change of primitive. + +Stop treating memory as stored content. +Start treating Audrey as a machine for managing **belief state transitions under constraint**. + +The stable object is not "note." +The stable object is not even "memory." +The stable object is: + +- what the agent currently believes +- under what scope +- with what confidence +- because of which evidence +- under which identity commitments +- at what maintenance cost +- with what unresolved contradiction pressure +- and what would be required to change it + +That means the unit of value is not recall accuracy. +The unit of value is: + +**future regret avoided without identity corruption and without token waste** + +This is the controlling equation for the entire runtime. + +If Audrey stores something that does not reduce future regret, it should probably not exist. +If Audrey recalls something that increases token spend without altering the local decision surface, it is dead weight. +If Audrey updates a belief in a way that damages continuity, it is systemically wrong even if a narrow benchmark improves. + +## 3. The New System Name Internally + +Use one internal name consistently: + +**Self Engine** + +The Self Engine is the controller-governed layer that: + +- ingests observations +- computes deltas against existing state +- updates beliefs under policy +- assembles task-bounded local minds +- tracks wounds, forks, commitments, and habits +- emits inspectable receipts and mutation traces + +Audrey as a product can keep the Audrey name. +But the implementation north star should be the Self Engine. + +## 4. Non-Negotiable Design Laws + +These are not suggestions. They are rejection criteria. + +### 4.1 Write law + +No write without state delta. + +Every incoming observation must answer: + +- what changed +- why this changed enough to deserve persistence +- which existing beliefs were touched +- what future regret this write is expected to reduce +- what cost and contamination risk it introduces + +If there is no meaningful delta, do not write a durable object. + +### 4.2 Recall law + +No recall without assembly. + +Raw top-k retrieval is a candidate-generation step only. +Task answers should come from an assembled local mind constructed from multiple state classes. + +### 4.3 Identity law + +No identity mutation through ordinary observation flow. + +Durable self-structure must live behind a higher-threshold policy. +Temporary observations do not get to casually rewrite what the agent is. 
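+
+To make the write and identity laws concrete, here is a minimal policy-decision sketch. Everything in it is illustrative: the function name, fields, and thresholds are assumptions, not a shipped `src/kernel/policy-engine.js` API.
+
+```js
+// Illustrative sketch only: one controller decision over a candidate state delta.
+export function decideWrite(delta, policy) {
+  // Identity law (4.3): ordinary observation flow cannot mutate durable self-structure.
+  if (delta.identityWeight > 0 && delta.channel !== 'privileged') {
+    return { action: 'reject', reason: 'identity_mutation_requires_privileged_path' };
+  }
+  // Write law (4.1): no durable write without a meaningful state delta.
+  if (delta.noveltyScore < policy.minNovelty) {
+    return { action: 'ignore', reason: 'no_meaningful_state_delta' };
+  }
+  return { action: 'write', reason: 'state_delta_accepted' };
+}
+```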
+ +### 4.4 Contradiction law + +No contradiction collapse by default. + +Conflicts should remain live, scoped, and inspectable until enough evidence exists to resolve them. +A hallucinated forced resolution is worse than preserved tension. + +### 4.5 Replay law + +No stabilization without reuse or outcome evidence. + +First writes are provisional. +Stability is earned. + +### 4.6 Forgetting law + +No forgetting without utility and risk accounting. + +Deletion is a policy act, not a cleanup detail. + +### 4.7 Token law + +Every memory operation must justify its token footprint. + +Audrey wins partly by reducing model-context spend, not by adding silent memory taxes. + +### 4.8 Audit law + +Every meaningful mutation must leave a reconstructable trace. + +If an operator cannot inspect why a belief changed, Audrey does not deserve production trust. + +## 5. The Novel Ontology + +Do not overfit to the old episodic / semantic / procedural triplet. +Preserve backward compatibility, but do not let it define the future shape. + +Adopt the following ontology as the target internal model. + +### 5.1 `pulse` + +The smallest ingestable perturbation. + +Examples: + +- a statement fragment +- a correction +- a tool result +- a file-derived claim +- a user preference signal +- a failure outcome +- a conflict event + +Pulses are not durable truth. +They are input energy. + +### 5.2 `lesion` + +A registered instability or wound in the mind-state. + +Examples: + +- contradiction between old and new evidence +- failed procedure +- poisoned source +- unstable schema +- repeated correction on the same claim +- identity-conflicting instruction + +Lesions are not errors to hide. +They are adaptation hotspots. + +### 5.3 `strand` + +A persistent worldline for one entity, relationship, workflow, project, system, or self-aspect. + +The strand is where temporal continuity lives. + +Examples: + +- one user +- one deployment service +- one project +- one vendor relationship +- one persistent task +- one internal agent goal + +### 5.4 `latch` + +A currently active high-confidence constraint. + +This is not metaphysical truth. +It is an active lock that should influence inference until displaced. + +### 5.5 `fork` + +A scoped divergence where incompatible states remain alive simultaneously. + +Forks solve: + +- conflicting reports +- role-specific truths +- environment-specific truths +- time-window differences +- ambiguous ownership + +### 5.6 `attractor` + +A compressed reusable regularity with low deliberation cost. + +Attractors are what repeated experience turns into generalized bias or schema. + +### 5.7 `reflex` + +A procedure that has hardened enough to execute cheaply and reliably. + +This is stronger than "semantic memory about how to do something." +This is near-automatic operational behavior. + +### 5.8 `vow` + +A protected long-horizon commitment in the identity partition. + +Examples: + +- user non-negotiable preferences +- role definitions +- safety boundaries +- persistent tone/style invariants +- mission commitments +- "this agent does not do X" + +### 5.9 `ghost` + +A superseded prior belief retained for explanation, rollback, audit, and longitudinal analysis. + +### 5.10 `local_mind` + +A temporary assembled decision-state for the current task. + +Local mind is what should answer the query. +The global memory store should not answer the query directly. + +## 6. The Architecture Shift + +Current Audrey has strong methods. +It now needs a kernel. + +The target execution model: + +1. 
observations arrive +2. delta extraction computes candidate state changes +3. policy engine decides whether to ignore, write, fork, quarantine, or escalate +4. mutation log records the decision +5. replay scheduler revisits fragile and high-value structures +6. task recall assembles a bounded local mind +7. outcome feedback updates utility estimates and stability + +That implies the following module family should be introduced. + +## 7. Target Module Graph + +Create these modules deliberately. Do not scatter logic. + +- `src/kernel/observation-bus.js` +- `src/kernel/controller.js` +- `src/kernel/delta-extractor.js` +- `src/kernel/policy-engine.js` +- `src/kernel/identity-kernel.js` +- `src/kernel/tension-engine.js` +- `src/kernel/strand-manager.js` +- `src/kernel/local-mind.js` +- `src/kernel/replay-scheduler.js` +- `src/kernel/reconsolidator.js` +- `src/kernel/utility-estimator.js` +- `src/kernel/mutation-log.js` +- `src/kernel/transplant.js` +- `src/kernel/receipts.js` + +Compatibility bridges: + +- `src/compat/episodes-view.js` +- `src/compat/semantics-view.js` +- `src/compat/procedures-view.js` + +The compatibility bridges let current public APIs survive while the kernel matures. + +## 8. Data Model Changes + +Target new tables or equivalent persisted structures: + +- `pulses` +- `lesions` +- `strands` +- `latches` +- `forks` +- `attractors` +- `reflexes` +- `vows` +- `ghosts` +- `assemblies` +- `outcomes` +- `mutation_log` +- `resource_memory` +- `working_sets` + +Additional fields to standardize across stateful objects: + +- `scope` +- `confidence` +- `stability` +- `utility_score` +- `contradiction_pressure` +- `privacy_risk` +- `identity_weight` +- `observed_at` +- `valid_from` +- `valid_to` +- `source_provenance` +- `superseded_by` +- `quarantine_reason` + +Absolute requirement: + +The DB must support reconstruction of "mind as of time t." +Without time-travel introspection, Audrey cannot credibly become cognitive infrastructure. + +## 9. Token Economy Doctrine + +This is a first-class deliverable, not a perf-afterthought. + +Audrey should aim to become the default memory system partly because it makes persistent agents cheaper to operate. + +### 9.1 Token objective + +Primary optimization target: + +**decision_quality_per_token** + +Minimize: + +- write-time LLM usage +- recall-time context injection +- repeat summarization +- redundant replay +- unused semantic baggage in assembled context + +while maximizing: + +- decision improvement +- correction retention +- scoped-truth accuracy +- procedure reuse +- abstention quality + +### 9.2 Required mechanisms + +#### A. Query routing before retrieval + +Every recall should start with cheap classification: + +- identifier lookup +- preference lookup +- temporal state query +- procedural query +- causal diagnosis +- relationship query +- conflict-resolution query +- broad open-ended context request + +Only then choose the retrieval path. + +#### B. Candidate generation separate from assembly + +Fast candidate generation can remain hybrid: + +- FTS +- vector similarity +- recency +- tag/context filters +- multi-agent scope + +But the expensive step is assembly, and assembly should be bounded by a strict token budget. + +#### C. 
Local mind budgeter + +Implement a token governor that allocates a context budget across: + +- vows +- active strands +- relevant latches +- unresolved forks +- high-value lesions +- attractors +- reflexes +- ghost pointers only when explanation is requested + +Budgeting should be utility-weighted, not fixed by category. + +#### D. No raw replay of whole episodes by default + +Episodes are archival material. +Most tasks should consume compressed state objects, not full textual transcripts. + +#### E. Incremental summarization receipts + +When a local mind is assembled, emit a compact receipt object: + +- direct recalls used +- abstractions used +- inferred joins +- uncertainty zones +- omitted candidates due to budget + +Receipts make assembly inspectable and enable future incremental reuse. + +#### F. Outcome-weighted caching + +If a local_mind assembly repeatedly succeeds for a task family, Audrey should cache the assembly recipe, not only the resulting text. + +#### G. Claim-card default output + +Default prompt-facing recall should not inject raw memories. + +Default representation should be compact claim cards: + +- `claim` +- `scope` +- `confidence` +- `provenance` +- `updated_at` +- `contradiction_state` + +Only expand back to source traces when: + +- the answer path requires it +- contradiction pressure is high +- the operator explicitly requests evidence expansion + +#### H. Multi-tier model strategy + +Use the cheapest sufficient model or no model at all for each stage: + +- deterministic parsing and filters first +- embedding and lexical routing second +- small-model classification third +- expensive model only for high-regret promotion, contradiction repair, schema extraction, or reconsolidation + +#### I. First-class budget knobs + +Expose budget control on every major surface: + +- `recall(query, { budget })` +- `dream({ tokenBudget })` +- `encode({ importanceHint, writeBudget })` +- CLI profiles such as `tiny`, `balanced`, and `research` + +Operator defaults should fail closed when a budget would be exceeded. + +#### J. Token ledger + +Track token spend and write amplification in telemetry for: + +- retrieval assembly +- promotion +- replay +- summarization +- contradiction repair +- trace expansion +- write rejection +- write acceptance + +### 9.3 Token metrics Audrey must own + +Add benchmark and production metrics for: + +- tokens spent per write accepted +- tokens spent per write rejected +- tokens spent per successful recall +- tokens spent per corrected stale fact +- tokens spent per durable procedure formed +- utility gain per 1k tokens +- decision quality per 1k tokens +- regret reduction per 1k tokens +- average local_mind size by task family +- assembly omission rate under budget +- write amplification +- time to first useful memory + +These metrics should become visible in docs, reports, and observatory surfaces. + +### 9.4 Token anti-patterns to eliminate + +- re-summarizing stable content every session +- injecting whole memory lists into prompts +- using one expensive model for all control decisions +- allowing reflection/dream cycles to scale linearly with memory mass +- letting dead or duplicated episodic detail survive indefinitely in prompt-facing surfaces + +## 10. Ease-Of-Use Doctrine + +If Audrey requires a high-ceremony setup, it will lose even if the architecture is superior. + +Ease of use is not packaging polish. It is part of the moat. 
+ +The target is: + +**A non-expert should obtain useful persistent memory in minutes, while an expert should be able to scale to governance-heavy deployments without leaving the Audrey ecosystem.** + +### 10.1 Setup invariants + +The default path must be: + +- local +- offline-capable +- one command +- zero mandatory config files +- zero mandatory hosted keys +- obvious health signal +- obvious uninstall + +Primary onboarding metric: + +**first-run success in under 3 minutes** + +### 10.2 Required UX surfaces + +#### A. `doctor` + +Add an explicit `npx audrey doctor` command. + +It should validate: + +- Node/runtime version +- SQLite access +- provider resolution +- hook installation status +- MCP registration status +- Docker availability +- Python SDK compatibility +- permissions and data-dir status +- benchmark asset freshness if relevant + +This should become the first support primitive. + +#### B. `init` + +Add an opinionated `npx audrey init` flow. + +It should produce: + +- recommended mode selection + - Claude hooks local mode + - REST sidecar mode + - Docker sidecar mode + - SDK embedding mode +- resolved data directory +- mock/local/provider defaults +- optional API key generation for REST mode +- immediate post-init smoke checks + +It should support named install modes instead of environment archaeology: + +- `local-offline` +- `hosted-fast` +- `ci-mock` +- `sidecar-prod` + +#### C. `quickstart` profile + +Define one sanctioned quickstart profile: + +- local embeddings +- mock or no-op LLM optionality +- one command to install +- one command to verify +- one command to uninstall + +#### D. sidecar-first deployment + +Treat Audrey sidecar deployment as the operational default for broader adoption. + +Why: + +- easier mental model +- decouples memory from application language +- supports JS, Python, and future clients uniformly +- makes observability and auth easier + +#### E. copy-paste-safe snippets + +Docs must show: + +- local Claude flow +- Node app flow +- Python app flow +- Docker flow +- snapshot backup/restore flow + +No doc path should require editorial inference. + +### 10.3 Installation friction removal backlog + +Mandatory near-term work: + +1. add `doctor` +2. add `init` +3. add explicit install presets (`local-offline`, `hosted-fast`, `ci-mock`, `sidecar-prod`) +4. ship `.env.example` and `.env.docker.example` +5. add first-run smoke command in README +6. add one-command mock-provider startup +7. add portable data-dir guidance per platform +8. add explicit migration diagnostics for version upgrades +9. add GHCR image publishing and image signing +10. add cross-platform install tests +11. make error messages operator-literate rather than implementation-literate + +### 10.4 Adoption theorem + +Audrey becomes standard when teams no longer ask: + +- "How do I host it?" +- "How do I migrate it?" +- "How do I secure it?" +- "How do I know it is working?" +- "How do I integrate it from my stack?" + +and instead ask: + +- "Which Audrey mode should I use?" + +That is the threshold where a project stops being optional. + +## 11. Scientific Contribution Doctrine + +Audrey should contribute to the field by making persistent cognition experimentally legible. + +The contribution is not "we used biology words." 
+The contribution is: + +- a stronger state ontology +- a controller-centered theory of memory operations +- falsifiable metrics for continuity and repair +- open experimental protocols for long-horizon agent memory + +### 11.1 Claim Audrey should eventually own + +**Persistent agents should be evaluated on continuity quality, not only retrieval quality.** + +This is the conceptual contribution. + +### 11.2 Metrics Audrey should introduce + +At minimum, define and publish: + +#### A. Future regret reduction + +How much downstream error or rework did the memory state prevent? + +#### B. Self-drift index + +How much did the agent's protected identity partition change under irrelevant or adversarial pressure? + +#### C. Contradiction half-life + +How long do unresolved conflicts persist before correct repair? + +#### D. Repair latency + +How many interactions does it take for Audrey to correctly update stale state after correction? + +#### E. Scoped-truth accuracy + +Can the system preserve different truths across times, roles, or environments without leakage? + +#### F. Transplant fidelity + +Can a bounded mind-state be moved into another agent and preserve intended vows/reflexes/strands without importing contamination? + +#### G. Utility per token + +How much measurable decision quality improvement results from a given token budget? + +### 11.3 Experiments Audrey should run + +#### A. Twin divergence experiment + +Two identical seeds. +Different lived histories. +Measure: + +- behavioral divergence +- identity divergence +- transplant compatibility +- contradiction maps + +#### B. Mind transplant experiment + +Move a selected subset of vows/reflexes/strands into a second agent. +Measure: + +- what transfers +- what should not transfer +- identity contamination +- repair cost + +#### C. Contradiction persistence experiment + +Inject controlled conflicting evidence and measure: + +- whether Audrey preserves forks appropriately +- whether Audrey abstains when it should +- whether Audrey collapses conflict too early + +#### D. Maturation experiment + +Same agent over long horizon. +Measure: + +- reduced token usage over time +- improved task performance +- procedure formation +- schema extraction +- lower contradiction load + +#### E. Poison resistance experiment + +Introduce bad evidence from mixed trust sources. +Measure: + +- quarantine rate +- erroneous adoption rate +- repair latency +- ghost trace quality + +### 11.4 Benchmark doctrine + +Audrey must not stop at internal evals. + +The full benchmark stack must include: + +- local retrieval suite +- local operations suite +- cost/latency/storage suite +- LongMemEval adapter +- LoCoMo adapter +- continuity-specific experimental suite introduced by Audrey + +The local suites protect regression hygiene. +The external suites protect credibility. +The continuity suite defines the new category. + +## 12. Viral Path Doctrine + +Virality will not come from benchmark charts alone. +It will come from making cognitive change visible and emotionally intelligible. + +The public still has not seen an AI mind in a way that feels inspectable and real. +Audrey can be the first system to make internal cognitive surgery legible. + +### 12.1 Primary viral artifacts + +#### A. Mind surgery replay + +Timeline of belief birth, reinforcement, contradiction spike, fork formation, repair, and ghosting. + +#### B. Twin selves + +Same model, same seed, different histories, visibly different selves. + +#### C. 
Belief autopsy + +After a failure, Audrey shows the internal causal chain: + +- which vow constrained the action +- which lesion was unresolved +- which strand carried stale state +- which attractor or reflex over-fired + +#### D. Memory transplant + +Move selected mind-state from one agent to another and show what persists. + +#### E. Aging curve + +Day 1 vs day 30 vs day 180. +Show: + +- fewer tokens +- better judgment +- stronger habits +- fewer raw recalls +- more stable identity + +### 12.2 Product surfaces that support virality + +The viral path requires a UI layer, not only a runtime. + +That UI should become: + +**Audrey Lens** + +Lens should expose: + +- belief timeline +- lesion map +- fork browser +- vow registry +- reflex formation log +- mind diff +- transplant planner +- task-local mind inspector +- token burn vs utility charts + +Lens is not optional polish. +Lens is how people perceive that Audrey is qualitatively different. + +## 13. Product Stack To Build + +Audrey should separate into four conceptual products, even if they initially live in one repo. + +### 13.1 Audrey Kernel + +The runtime for persistent cognitive state. + +Responsibilities: + +- storage +- mutation policy +- replay +- assembly +- telemetry +- SDK + API surfaces + +### 13.2 Audrey Lens + +The observability and debugging surface. + +Responsibilities: + +- inspect state +- inspect transitions +- compare minds +- audit privacy and risk +- debug failures +- demonstrate cognition publicly + +### 13.3 Audrey Spec + +The portable exchange and object model. + +Responsibilities: + +- JSON schema for mind-state objects +- transplant format +- mutation log format +- diff format +- identity partition semantics +- scope and validity semantics + +### 13.4 Audrey Bench + +The proof system. + +Responsibilities: + +- local suites +- external adapters +- continuity experiments +- report generation +- cost curves +- leaderboard artifacts + +If Audrey owns Kernel + Lens + Spec + Bench, it stops being a library and becomes infrastructure. + +## 14. Updated Roadmap + +This roadmap is ordered by dependency, not by glamour. + +### Phase 0: Contact-quality and friction collapse + +Goal: make Audrey absurdly easy to try, validate, and deploy. + +Deliverables: + +- `npx audrey doctor` +- `npx audrey init` +- named install presets +- `.env.example` +- `.env.docker.example` +- documented mock-provider profile +- GHCR publish workflow +- version alignment across npm, PyPI, and container artifacts +- explicit install-smoke commands for Node, MCP, Python, and Docker +- operator-readable diagnostics everywhere + +Files likely touched: + +- `mcp-server/index.js` +- `mcp-server/serve.js` +- `README.md` +- `docs/production-readiness.md` +- `.github/workflows/ci.yml` +- `package.json` +- `python/README.md` + +Success condition: + +new user reaches working state in under 3 minutes for the common path and under 10 minutes for every blessed path, without interpretive debugging. + +### Phase 1: Mutation log and controller foundation + +Goal: no significant write or replay path bypasses central policy. + +Deliverables: + +- `mutation_log` +- `controller.js` +- decision telemetry on encode/consolidate/dream/restore flows +- hidden shadow-mode policy outputs exposed in tests and diagnostics + +Files likely touched: + +- `src/audrey.js` +- `src/encode.js` +- `src/consolidate.js` +- `src/decay.js` +- `src/import.js` +- `src/export.js` +- new `src/kernel/*` + +Success condition: + +every accepted or rejected durable write can explain itself. 
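+
+A sketch of what "can explain itself" might look like as a single `mutation_log` record. Field names are hypothetical, not a committed schema, but they mirror the standardized fields listed in section 8.
+
+```js
+// Hypothetical mutation_log record emitted by the controller for one decision.
+const mutationRecord = {
+  id: 'mut_0001',
+  op: 'write',                      // write | reject | fork | quarantine | supersede
+  target_kind: 'latch',
+  target_id: 'latch_42',
+  reason: 'novel_state_delta',
+  confidence: 0.82,
+  contradiction_pressure: 0.1,
+  source_provenance: { surface: 'mcp', session: 's_991' },
+  decided_at: '2026-03-30T12:00:00Z',
+};
+```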
+ +### Phase 2: Identity partition and vows + +Goal: distinguish self-structure from ordinary learned facts. + +Deliverables: + +- `vows` storage +- privileged mutation path +- identity weight scoring +- user-visible vow management APIs +- refusal to mutate vows through ordinary low-confidence observation flow + +Success condition: + +protected preferences and role commitments stop drifting under noisy experience. + +### Phase 3: Lesions, forks, and contradiction pressure + +Goal: make unresolved instability first-class. + +Deliverables: + +- `lesions` +- `forks` +- contradiction propagation rules +- scoped abstention behavior +- repair workflows + +Success condition: + +Audrey preserves uncertainty honestly and repairs it transparently. + +### Phase 4: Strands and temporal state + +Goal: represent what is true when, for whom, and under what circumstances. + +Deliverables: + +- `strands` +- validity intervals +- supersession chains +- ghost objects +- time-sliced mind reconstruction + +Success condition: + +Audrey can answer temporal state questions without flattening history. + +### Phase 5: Local mind assembly and receipts + +Goal: answer queries from bounded assembled state, not raw retrieval lists. + +Deliverables: + +- `local_mind` +- assembly policies +- assembly receipts +- candidate omission accounting +- token budgets per task family + +Success condition: + +recall becomes cheaper, more structured, and more inspectable than current top-k surfaces. + +### Phase 6: Utility learning and outcome-coupled plasticity + +Goal: memory quality improves through consequences, not only exposure. + +Deliverables: + +- `outcomes` +- utility estimator +- reflex promotion/demotion +- latch stabilization rules +- reward/failure weighted replay priority + +Success condition: + +useful memories become cheaper and stronger; useless memories die. + +### Phase 7: Lens + +Goal: visible cognition. + +Deliverables: + +- belief timeline +- lesion map +- fork browser +- vow registry +- mind diff +- transplant preview +- token-vs-utility dashboards + +Success condition: + +engineers and non-engineers can both see why Audrey is different in minutes. + +### Phase 8: Spec and transplant format + +Goal: Audrey-native minds become portable. + +Deliverables: + +- object schema +- diff schema +- transplant format +- cross-agent import/export semantics +- compatibility guarantees + +Success condition: + +third-party frameworks can become Audrey-native without forking Audrey internals. + +### Phase 9: External benchmark proof + +Goal: indisputable public evidence. + +Deliverables: + +- first-party LongMemEval adapter +- first-party LoCoMo adapter +- reproducible artifacts +- continuity suite paper/report +- cost and latency curves + +Success condition: + +Audrey stops asking for attention and starts receiving it by necessity. + +## 15. Code-Approach Details + +### 15.1 Compatibility strategy + +Do not break the current public surface immediately. + +Preserve: + +- `Audrey.encode` +- `Audrey.recall` +- `Audrey.dream` +- `Audrey.consolidate` +- `Audrey.status` +- CLI, MCP, REST, Python SDK + +Internally: + +- route methods through the controller +- shadow-write new structures first +- keep legacy tables as projections during transition +- compare legacy recall and local_mind assembly before cutover + +### 15.2 Migration strategy + +Migration should happen in four passes: + +1. add new tables and write telemetry with no behavior change +2. shadow-write new ontology while preserving old behavior +3. 
run dual recall in diagnostics mode and compare outputs +4. switch default recall to local_mind assembly after benchmark superiority is demonstrated + +### 15.3 API strategy + +Add advanced API modes without destroying simple ones. + +Example recall response expansion: + +- default mode: current friendly compact result +- advanced mode: + - `results` + - `assembly_receipt` + - `partialFailure` + - `omittedCandidates` + - `localMindSummary` + - `tokenBudget` + - `contradictionPressure` + +### 15.4 Replay strategy + +Replay must become stratified: + +- `fragile_replay` +- `schema_refresh` +- `conflict_repair` +- `garbage_collection` +- `procedure_strengthening` + +Different jobs need different budgets and triggers. + +### 15.5 Resource-memory strategy + +Introduce artifact-grounded memory envelopes for: + +- files +- screenshots +- URLs +- tables +- tool outputs +- structured JSON artifacts + +Every abstraction derived from a resource should preserve provenance links back to the artifact. + +## 16. Business Consequence Without Corrupting The Thesis + +Do not design Audrey as a money grab. +Design it so that the field and the market both have to route through it. + +The durable business wedge is: + +- open core state model and benchmark layer +- premium control plane / Lens / hosted orchestration +- enterprise governance, observability, and deployment +- premium benchmark labs and certification +- managed memory fleets for teams and agents + +But all of that only works if the core product is genuinely category-defining. + +Therefore: + +- science contribution is not charity work relative to monetization +- ease of use is not separate from monetization +- token efficiency is not a perf detail + +All three are the economic engine. + +## 17. Immediate Next Execution Order + +For the next serious implementation cycle, do this exact order: + +1. add `doctor` and `init` +2. add `mutation_log` +3. route write/replay paths through controller telemetry +4. add `vows` and identity partition +5. add `lesions` and contradiction pressure +6. add `strands` and temporal validity +7. add `ghosts` and supersession chains +8. implement local_mind assembly +9. add token governor and assembly receipts +10. add `outcomes` and utility estimator +11. add Lens foundations +12. add transplant/spec groundwork +13. add external benchmark adapters + +Do not invert that order. +The front half creates the true kernel. +The back half makes it visible, portable, and undeniable. + +## 18. Definition Of Success + +Audrey should be considered on-track for category leadership only when all of the following are true: + +1. setup takes minutes, not hours +2. operators can inspect why state changed +3. token spend trends downward as the agent matures +4. identity does not drift under ordinary noise +5. contradictions remain explicit until resolved +6. temporal truths remain scoped correctly +7. procedures form from experience rather than being hand-authored only +8. the Lens surface makes cognitive change publicly legible +9. external benchmark claims are reproducible +10. transplant and diff semantics exist + +## 19. Final Directive + +Do not chase the mean. +Do not ship another "smart memory layer." +Do not mistake added retrieval modes for category creation. + +Build the first runtime where an agent can: + +- remember +- doubt +- change +- explain +- mature +- preserve identity +- expose its own internal repairs + +That is the thing that contributes to science, creates the viral surface, and becomes commercially inevitable. 
+ +Everything else is subordinate. diff --git a/docs/plans/industry-standard-memory-plan-2026-03-29.md b/docs/plans/industry-standard-memory-plan-2026-03-29.md new file mode 100644 index 0000000..8586194 --- /dev/null +++ b/docs/plans/industry-standard-memory-plan-2026-03-29.md @@ -0,0 +1,604 @@ +# Audrey Memory-OS Plan - 2026-03-29 + +Audience: autonomous coding/research agents operating on this repository. + +Do not interpret this document as marketing copy. Interpret it as category-construction doctrine plus execution constraints. +When this plan conflicts with older roadmap prose, this plan wins. + +## 0. Prime Directive + +Stop optimizing for "interesting biological memory project." +Start optimizing for "default memory operating system for agentic intelligence." + +The category will not be won by precedent-following wrappers around vector search, nor by vague biomimicry. It will be won by the first system that simultaneously demonstrates: + +1. typed memory objects +2. explicit lifecycle control +3. utility-aware write/retrieval policy +4. temporal and causal state correctness +5. benchmark reproducibility against the leading public baselines +6. production-grade economics, governance, and operability + +Current Audrey already has differentiated assets: + +- episodic, semantic, and procedural memory types +- consolidation, decay, contradiction handling, interference, affect, and causal-link primitives +- multi-agent scoping +- hybrid retrieval +- MCP, REST, CLI, and SDK surfaces + +Current Audrey is still missing the decisive layer: + +- a memory controller that turns these primitives into a coherent policy-governed system + +## 1. Strategic Reframe + +Replace the public/internal mental model: + +- old: biological memory architecture for AI agents +- new: memory control plane for agentic intelligence, informed by biological constraints and validated by benchmark evidence + +Reason: + +- `Mem0` shifts the market toward write selectivity and economics, not mere recall. +- `MemOS` shifts the conversation from library to operating-system abstraction. +- `MIRIX` shifts the frontier from text memory to typed multimodal memory. +- `Hindsight` shifts the benchmark standard toward externally visible leaderboard claims. +- `Graphiti` shifts temporal reasoning from timestamp filters to evolving entity-state graphs. +- `Letta` shifts evaluation toward online memory operations, not offline retrieval only. + +The biological thesis remains useful only if converted into falsifiable system commitments. + +## 2. Research-Constrained Design Rules + +### 2.1 LLM-memory literature -> mandatory system behavior + +`Mem0` (https://arxiv.org/abs/2504.19413) + +- Mandatory inference: writes must be selective and cost-accounted. +- Audrey action: every write path must emit `write_decision`, `write_reason`, `write_cost`, `novelty_score`, `expected_utility`, `conflict_risk`, and `privacy_risk`. + +`MemOS` (https://arxiv.org/abs/2507.03724) + +- Mandatory inference: memory must be lifecycle-managed as a first-class system substrate. +- Audrey action: centralize write/promote/compress/reconsolidate/archive/evict policy in a controller layer instead of scattering it across `encode`, `consolidate`, `decay`, and ad hoc background tasks. + +`MIRIX` (https://arxiv.org/abs/2507.07957) + +- Mandatory inference: typed multimodal memory is now frontier-normal. +- Audrey action: add first-class resource/artifact memory envelopes for files, screenshots, URLs, structured tool outputs, tables, and attachments. 
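+
+A minimal sketch of what such an envelope could carry, with illustrative field names only:
+
+```js
+// Hypothetical resource envelope; not a shipped Audrey schema.
+const resource = {
+  id: 'res_0001',
+  modality: 'screenshot',           // file | screenshot | url | table | tool_output
+  uri: 'file:///data/artifacts/run-42/panel.png',
+  extractor: { name: 'ocr', version: '1.2.0' },
+  derived_memory_ids: ['cell_881'], // textual abstractions keep provenance links back here
+  observed_at: '2026-03-29T18:00:00Z',
+  trust_tier: 'first_party',
+};
+```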
+ +`EverMemOS` (https://arxiv.org/abs/2601.02163) + +- Mandatory inference: useful memory systems require atomic cells, scene-level composition, and reconstructive recollection. +- Audrey action: insert an intermediate hierarchy between episodes and semantic principles. + +`MemRL` (https://arxiv.org/abs/2601.03192) + +- Mandatory inference: semantic similarity is an insufficient terminal scorer; utility must be learned from outcomes. +- Audrey action: separate candidate generation from policy ranking. Rank memories by predicted downstream utility under task context. + +`MAGMA` (https://arxiv.org/abs/2601.03236) + +- Mandatory inference: a single retrieval path is structurally suboptimal. +- Audrey action: route queries into semantic, temporal, causal, entity, procedural, and conflict-resolution sub-pipelines before fusion. + +`LongMemEval` (https://arxiv.org/abs/2410.10813) + +- Mandatory inference: external proof must include multi-session reasoning, temporal reasoning, knowledge updates, and abstention. +- Audrey action: make real LongMemEval execution part of Audrey's release gate. + +`LoCoMo` (https://github.com/snap-research/locomo) + +- Mandatory inference: long-horizon conversational memory requires externally comparable evaluation traces. +- Audrey action: add a first-party LoCoMo adapter with frozen prompts, model configs, and artifact manifests. + +`Hindsight` (https://arxiv.org/abs/2512.12818) + +- Mandatory inference: public SOTA claims matter because they define who is taken seriously. +- Audrey action: treat Hindsight as the near-term benchmark rival to beat on LongMemEval/LoCoMo style tasks. + +`Letta benchmark write-up` (https://www.letta.com/blog/benchmarking-ai-agent-memory) + +- Mandatory inference: memory must be graded on operations, not only recall. +- Audrey action: add read/write/update/overwrite/delete/merge/abstain benchmark tracks. + +`Graphiti` (https://github.com/getzep/graphiti and https://blog.getzep.com/beyond-static-knowledge-graphs/) + +- Mandatory inference: temporal state changes need explicit graph semantics. +- Audrey action: replace timestamp-only reasoning with validity intervals, state transitions, and evolving entity-property edges. + +### 2.2 Neuroscience -> mandatory controller behavior + +`Deconstruction of a memory engram reveals distinct ensembles recruited at learning` (Nature Neuroscience, March 11, 2026: https://www.nature.com/articles/s41593-026-02230-2) + +- Mandatory inference: a memory episode should not be treated as a uniform blob. +- Audrey action: segment writes into phase-specific trace fragments (`prelude`, `salient event`, `outcome`, `response`) and maintain a "core recall subset" distinct from peripheral context. + +`Formation of an expanding memory representation in the hippocampus` (Nature Neuroscience, June 4, 2025: https://www.nature.com/articles/s41593-025-01986-3) + +- Mandatory inference: stability is accrued through reactivation, not assumed at write time. +- Audrey action: add a stability state variable that increases when retrieval proves useful and decreases under interference/conflict. + +`Goal-specific hippocampal inhibition gates learning` (Nature, April 9, 2025: https://www.nature.com/articles/s41586-025-08868-5) + +- Mandatory inference: plasticity should spike around goal-relevant states, not across all experience. +- Audrey action: detect goals, commitments, failures, corrections, and rewards; use these as write-gate amplifiers. 
+ +`Systems consolidation reorganizes hippocampal engram circuitry` (Nature, May 14, 2025: https://www.nature.com/articles/s41586-025-08993-1) + +- Mandatory inference: episodic precision and semantic gist should co-exist and re-balance over time. +- Audrey action: maintain parallel episodic and schema layers with deliberate migration policies rather than accidental summarization. + +`Sleep microstructure organizes memory replay` (Nature, January 1, 2025: https://www.nature.com/articles/s41586-024-08340-w) + +- Mandatory inference: replay should be partitioned into substates to reduce interference. +- Audrey action: split background replay into `recent-fragile`, `schema-refresh`, `conflict-repair`, and `garbage-collection` jobs with different budgets. + +`Post-learning replay of hippocampal-striatal activity is biased by reward-prediction signals` (Nature Communications, November 24, 2025: https://www.nature.com/articles/s41467-025-65354-2) + +- Mandatory inference: replay priority should be driven by surprise and value delta, not by salience alone. +- Audrey action: prioritize corrections, failed tool trajectories, preference flips, and unexpected outcomes. + +`Hippocampal output suppresses orbitofrontal cortex schema cell formation` (Nature Neuroscience, April 14, 2025: https://www.nature.com/articles/s41593-025-01928-z) + +- Mandatory inference: over-serving episodic detail can block schema induction. +- Audrey action: throttle episode-heavy recall when repeated structure is detected; force schema extraction passes. + +`Constructing future behavior in the hippocampal formation through composition and replay` (Nature Neuroscience, March 10, 2025: https://www.nature.com/articles/s41593-025-01908-3) + +- Mandatory inference: reusable primitives plus replay support generalization into novel tasks. +- Audrey action: factor memories into entities, tools, constraints, places, roles, and workflows; reconstruct scenes from those primitives at recall time. + +`Synaptic plasticity rules driving representational shifting in the hippocampus` (Nature Neuroscience, March 20, 2025: https://www.nature.com/articles/s41593-025-01894-6) + +- Mandatory inference: memory updates should be sparse, novelty-sensitive, and high-threshold. +- Audrey action: most recalls must not rewrite memory. Reconsolidation should require controller approval. + +`Theta-encoded information flow from dorsal CA1 to prelimbic cortex drives memory reconsolidation` (iScience, June 4, 2025: https://doi.org/10.1016/j.isci.2025.112821) + +- Mandatory inference: reconsolidation requires a window, not an unconditional rewrite path. +- Audrey action: only permit write-back after recall when contradiction pressure, novelty, confidence shift, and evidence support exceed threshold. + +`Exploring the neural underpinnings of semantic and perceptual false memory formation` (NeuroImage, January 30, 2026: https://pubmed.ncbi.nlm.nih.gov/41308786/) + +- Mandatory inference: semantic overlap and source-grounded recall are separable failure modes. +- Audrey action: separate semantic-match confidence from provenance-match confidence and increase abstention when they diverge. + +## 3. 
What Audrey Is Still Missing + +### 3.1 Control-plane gap + +Current repo state exposes high-quality primitives but still routes behavior through direct method calls: + +- `encode` +- `recall` +- `consolidate` +- `dream` +- `decay` +- `validate` + +Missing abstraction: + +- `MemoryController` +- `PolicyEngine` +- `ReplayScheduler` +- `ReconsolidationGate` +- `RetentionManager` +- `ObservationBus` + +### 3.2 Typed memory-object gap + +Current types are too coarse: + +- episodic +- semantic +- procedural + +Required type surface: + +- `trace`: raw event fragment +- `cell`: atomic memory unit extracted from one or more traces +- `scene`: compositional situation model +- `schema`: generalized reusable abstraction +- `procedure`: executable behavioral policy +- `entity_state`: time-varying property/value memory +- `causal_link`: cause/effect or mechanism edge +- `resource`: external artifact reference with modality metadata +- `working_set`: task-bounded short-horizon active memory +- `quarantined`: low-trust or poison-suspect memory object + +### 3.3 Temporal-state gap + +Current temporal handling is primarily: + +- timestamps +- before/after filtering +- recency-weighted scoring + +Required representation: + +- `subject` +- `predicate` +- `object/value` +- `valid_from` +- `valid_to` +- `observed_at` +- `superseded_by` +- `confidence` +- `source` +- `scope` + +Without this, Audrey cannot credibly own "what was true when" reasoning. + +### 3.4 Utility-learning gap + +Current `usage_count` and `last_used_at` are instrumentation, not policy. + +Required additions: + +- implicit reward signals from successful downstream task completion +- negative signals from bad recalls, contradictions, user corrections, and abstentions +- a learned or heuristically trained value estimator for write and retrieval ranking +- value-aware consolidation and value-aware forgetting + +### 3.5 Resource-memory gap + +Audrey currently reads as text-memory plus metadata. + +Required additions: + +- artifact envelopes with modality and extractor metadata +- per-modality embedding/extraction backends +- artifact-grounded recall fusion +- provenance links from textual abstractions back to original artifacts + +### 3.6 Benchmark-proof gap + +Current benchmarking is good internal hygiene. It is not yet category-defining proof. +Status delta as of 2026-03-30: the local operation-level benchmark is now shipped; external benchmark adapters remain the blocking proof gap. + +Required public proof: + +- first-party reproducible LongMemEval +- first-party reproducible LoCoMo +- operation-level memory benchmark +- cost/latency/storage curves +- biological-mechanism ablations +- long-context comparison under equal budget +- third-party replication path + +## 4. Non-Negotiable Architecture Changes + +### 4.1 Add a controller layer + +Create: + +- `src/controller.js` +- `src/policy.js` +- `src/replay.js` +- `src/reconsolidate.js` +- `src/state-model.js` + +Controller responsibilities: + +- classify incoming observations +- decide write/no-write/defer/quarantine +- choose memory target type +- schedule replay/consolidation/reindexing +- manage retention and eviction +- manage reconsolidation after recall +- emit structured telemetry for all decisions + +No direct path should persist or mutate memory without a controller decision record. + +### 4.2 Introduce a hierarchy + +Mandatory hierarchy: + +1. `trace` + fine-grained event fragment, immutable +2. `cell` + atomic claim/intent/preference/tool outcome +3. 
`scene` + compositional event/task model +4. `schema` + abstract reusable pattern +5. `procedure` + executable policy or workflow + +Current `episode` maps closest to a mixture of `trace` and `scene`. Split it. + +### 4.3 Add query-intent routing + +Before retrieval, classify query into one or more intents: + +- fact lookup +- user preference +- temporal query +- causal query +- conflict resolution +- procedure recall +- entity state query +- artifact lookup +- schema/generalization query + +Then route into specialized sub-indexes: + +- vector semantic +- lexical exact-match +- temporal state graph +- causal graph +- entity index +- procedure index +- artifact index + +Fusion should occur after route-specific ranking, not before. + +### 4.4 Add reconsolidation discipline + +Retrieval must not automatically mutate memory. + +Mandatory reconsolidation preconditions: + +- recall confidence changed materially +- contradiction or correction pressure exists +- provenance support is sufficient +- query context matches the original scope well enough +- no poison/quarantine block is active + +All reconsolidation must preserve lineage: + +- parent versions +- merge/split history +- supersession graph +- reason code + +### 4.5 Add quarantine and source policy + +Low-trust memory must be segregated. + +Required policy fields: + +- source trust tier +- privacy classification +- tenant scope +- poison risk +- verification state +- approval requirement + +Required actions: + +- quarantine +- require-human-approval +- require-second-source +- soft-store-with-abstain-only + +## 5. Proof Stack Required For Category Leadership + +### 5.1 External benchmark program + +Implement: + +- `benchmarks/external/longmemeval/` +- `benchmarks/external/locomo/` +- `benchmarks/external/operations/` +- `benchmarks/external/cost/` +- `benchmarks/external/ablations/` + +Release gate must publish: + +- dataset version +- prompt templates +- model version +- embedding version +- hardware/runtime profile +- raw outputs +- scoring script version +- summary tables + +### 5.2 Ablation matrix + +Audrey cannot claim a biological advantage unless each mechanism can be toggled and measured. + +Required ablations: + +- no consolidation +- no decay +- no contradiction handling +- no provenance-aware abstention +- no affect/context weighting +- no replay scheduler +- no utility scorer +- no temporal state graph +- no causal retrieval boost + +Evaluate each on: + +- LongMemEval capability breakdown +- LoCoMo +- operation benchmark +- cost/latency/storage overhead +- false-memory rate + +### 5.3 Long-context comparison + +Mandatory comparison groups: + +- brute-force long-context baseline +- vector-only baseline +- hybrid lexical+vector baseline +- Hindsight-style retain/recall/reflect baseline +- Audrey full system + +Compare under: + +- equal token budget +- equal wall-clock budget +- equal update frequency + +Required message: + +- Audrey is not just more "biological" +- Audrey is better under change, cheaper to update, and safer to trust + +## 6. Execution Order + +### Phase A: Benchmark legitimacy first + +Why first: + +- without external proof, architecture work remains easy to dismiss + +Tasks: + +1. implement real LongMemEval adapter +2. implement real LoCoMo adapter +3. add artifact manifests and frozen run configs +4. add operations benchmark for update/overwrite/delete/merge/abstain +5. 
+
+### 4.4 Add reconsolidation discipline
+
+Retrieval must not automatically mutate memory.
+
+Mandatory reconsolidation preconditions:
+
+- recall confidence changed materially
+- contradiction or correction pressure exists
+- provenance support is sufficient
+- query context matches the original scope well enough
+- no poison/quarantine block is active
+
+All reconsolidation must preserve lineage:
+
+- parent versions
+- merge/split history
+- supersession graph
+- reason code
+
+### 4.5 Add quarantine and source policy
+
+Low-trust memory must be segregated.
+
+Required policy fields:
+
+- source trust tier
+- privacy classification
+- tenant scope
+- poison risk
+- verification state
+- approval requirement
+
+Required actions:
+
+- quarantine
+- require-human-approval
+- require-second-source
+- soft-store-with-abstain-only
+
+## 5. Proof Stack Required For Category Leadership
+
+### 5.1 External benchmark program
+
+Implement:
+
+- `benchmarks/external/longmemeval/`
+- `benchmarks/external/locomo/`
+- `benchmarks/external/operations/`
+- `benchmarks/external/cost/`
+- `benchmarks/external/ablations/`
+
+Release gate must publish:
+
+- dataset version
+- prompt templates
+- model version
+- embedding version
+- hardware/runtime profile
+- raw outputs
+- scoring script version
+- summary tables
+
+### 5.2 Ablation matrix
+
+Audrey cannot claim a biological advantage unless each mechanism can be toggled and measured.
+
+Required ablations:
+
+- no consolidation
+- no decay
+- no contradiction handling
+- no provenance-aware abstention
+- no affect/context weighting
+- no replay scheduler
+- no utility scorer
+- no temporal state graph
+- no causal retrieval boost
+
+Evaluate each on:
+
+- LongMemEval capability breakdown
+- LoCoMo
+- operation benchmark
+- cost/latency/storage overhead
+- false-memory rate
+
+### 5.3 Long-context comparison
+
+Mandatory comparison groups:
+
+- brute-force long-context baseline
+- vector-only baseline
+- hybrid lexical+vector baseline
+- Hindsight-style retain/recall/reflect baseline
+- Audrey full system
+
+Compare under:
+
+- equal token budget
+- equal wall-clock budget
+- equal update frequency
+
+Required message:
+
+- Audrey is not just more "biological"
+- Audrey is better under change, cheaper to update, and safer to trust
+
+## 6. Execution Order
+
+### Phase A: Benchmark legitimacy first
+
+Why first:
+
+- without external proof, architecture work remains easy to dismiss
+
+Tasks:
+
+1. implement real LongMemEval adapter
+2. implement real LoCoMo adapter
+3. add artifact manifests and frozen run configs
+4. add operations benchmark for update/overwrite/delete/merge/abstain
+5. publish cost curves against long-context and simple memory baselines
+
+Exit criteria:
+
+- Audrey can run `npm run bench:external`
+- results are reproducible on a clean machine
+- README can truthfully present external benchmark numbers
+
+### Phase B: Memory controller and typed object migration
+
+Tasks:
+
+1. add controller layer
+2. split episode into trace/cell/scene
+3. add lifecycle state machine
+4. make all mutations controller-mediated
+5. emit structured decision telemetry
+
+Exit criteria:
+
+- no write path bypasses controller
+- every memory object carries lifecycle and provenance metadata
+
+### Phase C: Temporal + causal + entity-state retrieval
+
+Tasks:
+
+1. add entity-state tables with validity windows
+2. add query router
+3. integrate causal links into recall ranking
+4. expose state-history queries over REST/MCP/SDK
+
+Exit criteria:
+
+- Audrey answers "what was true when" from state memory, not text search
+- causal queries outperform hybrid text retrieval baselines
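+
+A sketch of the entity-state shape Phase C implies (assumption: this table and the query helper are illustrative; the current schema in `src/db.js` has nothing like this yet, and only the `better-sqlite3` `prepare().get()` call is a real API):
+
+```js
+// Hypothetical entity_state rows with validity windows.
+// "What was true when" becomes a window query, not a text search.
+const CREATE_ENTITY_STATE = `
+  CREATE TABLE IF NOT EXISTS entity_state (
+    id            TEXT PRIMARY KEY,
+    subject       TEXT NOT NULL,
+    predicate     TEXT NOT NULL,
+    value         TEXT NOT NULL,
+    valid_from    TEXT NOT NULL,   -- ISO-8601 timestamp
+    valid_to      TEXT,            -- NULL = still believed true
+    observed_at   TEXT NOT NULL,
+    superseded_by TEXT,
+    confidence    REAL NOT NULL,
+    source        TEXT NOT NULL,
+    scope         TEXT
+  )`;
+
+// "What was <subject>'s <predicate> at time `at`?"
+function stateAt(db, subject, predicate, at) {
+  return db.prepare(`
+    SELECT value, confidence, source FROM entity_state
+    WHERE subject = ? AND predicate = ?
+      AND valid_from <= ?
+      AND (valid_to IS NULL OR valid_to > ?)
+    ORDER BY observed_at DESC
+    LIMIT 1
+  `).get(subject, predicate, at, at);
+}
+```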
+
+### Phase D: Utility learning and replay scheduling
+
+Tasks:
+
+1. convert `usage_count` into reward signals
+2. learn or heuristically update utility scores
+3. partition replay into recent-fragile, schema-refresh, conflict-repair, and garbage-collection queues
+4. use surprise and value delta to prioritize replay
+
+Exit criteria:
+
+- measured lift from utility-aware ranking
+- replay budget measurably improves benchmark outcomes
+
+### Phase E: Resource/multimodal memory
+
+Tasks:
+
+1. add `resource` memory type
+2. persist artifact metadata and references
+3. attach extractor outputs to resources
+4. support retrieval plans that fuse artifact and textual memories
+
+Exit criteria:
+
+- Audrey can ground answers in files/tool outputs/artifacts, not just text memories
+
+### Phase F: Governance and neutral trust
+
+Tasks:
+
+1. tenant isolation
+2. audit log
+3. retention/erasure enforcement
+4. encryption integration hooks
+5. third-party evaluation harness and replication guide
+
+Exit criteria:
+
+- enterprise objections shift from "is this serious?" to procurement and adoption questions
+
+## 7. File-Level Starting Points In This Repo
+
+Exploit existing assets instead of rewriting the system from scratch.
+
+Primary surfaces:
+
+- `src/audrey.js`
+- `src/recall.js`
+- `src/db.js`
+- `src/consolidate.js`
+- `src/decay.js`
+- `src/causal.js`
+- `src/confidence.js`
+- `src/interference.js`
+- `src/affect.js`
+- `src/import.js`
+- `benchmarks/run.js`
+- `benchmarks/cases.js`
+- `docs/benchmarking.md`
+- `mcp-server/serve.js`
+- `mcp-server/index.js`
+
+Recommended insertion points:
+
+- controller hooks around `encode`, `recall`, `consolidate`, `dream`
+- schema changes in `src/db.js`
+- benchmark adapters under `benchmarks/external`
+- telemetry surfaces through REST `/analytics` and MCP status outputs
+
+## 8. Do Not Waste Cycles On These Failure Modes
+
+- do not spend another major cycle polishing README rhetoric without new proof
+- do not present internal synthetic benchmarks as category-defining evidence
+- do not add more memory "types" without a controller and routing policy
+- do not overfit to single-vector similarity improvements
+- do not let retrieval mutate stored memory by default
+- do not keep calling the system "biological" unless the mechanism is measurable
+
+## 9. Category-Winning Claim Audrey Should Eventually Earn
+
+This is the target claim, not the current one.
+
+"Audrey is the first reproducibly benchmarked memory operating system for agents: typed, lifecycle-managed, utility-aware, temporally correct, causally grounded, and production-economical."
+
+Do not claim this before the proof stack exists.
+
+## 10. Immediate Next Moves
+
+Execute in this order:
+
+1. external benchmark adapters
+2. ablation toggles for existing biological mechanisms
+3. controller-layer scaffold
+4. typed trace/cell/scene schema migration design
+5. temporal entity-state model
+6. utility-aware ranking
+7. replay scheduler
+8. resource memory
+
+If an implementation choice does not improve one of:
+
+- benchmark legitimacy
+- controller coherence
+- temporal correctness
+- utility learning
+- governance/economics
+
+it is probably not on the critical path.
diff --git a/docs/plans/roadmap-status-2026-03-29.md b/docs/plans/roadmap-status-2026-03-29.md
index 4445251..682187b 100644
--- a/docs/plans/roadmap-status-2026-03-29.md
+++ b/docs/plans/roadmap-status-2026-03-29.md
@@ -2,12 +2,16 @@
 This note replaces stale assumptions from the earlier `codex.md` roadmap with the current repo state.
 
+Canonical next-step strategy now lives in `docs/plans/claude-opus-4.6-master-plan-2026-03-30.md`.
+The older `docs/plans/industry-standard-memory-plan-2026-03-29.md` remains useful background, but this file is now a status note, not the canonical execution plan.
+
 ## Current State
 
 - Multi-agent memory is already shipped.
 - FTS-backed keyword search and hybrid retrieval are already shipped.
 - TypeScript declarations are already shipped.
 - REST API, dashboard, hooks integration, benchmarking, and CI are already shipped.
+- Operation-level benchmark coverage for update, delete, merge, and abstain is now shipped.
 
 The roadmap should no longer treat those as future phases. The highest-value work now is production correctness, operator clarity, and benchmark credibility.
@@ -35,7 +39,7 @@ Original bug list status:
    The current README and some planning docs still contain mojibake artifacts that hurt first contact.
 2. Make benchmark claims externally reproducible.
-   Add first-party LoCoMo and LongMemEval adapters under `memorybench` or fold them into this repo in a reproducible way.
+   The internal retrieval and operations suites now exist. The remaining top proof-stack requirement is a first-party LoCoMo and LongMemEval adapter under `memorybench` or folded into this repo in a reproducible way.
 3. Tighten restore and import contracts.
    Add explicit schema validation for snapshot versions and optional fields, then test malformed snapshots more aggressively.
@@ -45,3 +49,13 @@ Original bug list status:
 5. Harden the SDK shutdown story.
    Decide whether `close()` itself should eventually become async, or whether `waitForIdle()` remains the explicit graceful-shutdown contract.
+
+## Strategic Reframe
+
+The next competitive frame should be "memory control plane / memory OS" rather than "memory library with biological inspiration". The repo now has enough primitives to justify that direction, but it still needs:
+
+- real external benchmark proof
+- controller-mediated lifecycle policy
+- temporal/entity-state memory
+- utility-aware replay and ranking
+- typed resource memory
diff --git a/docs/production-readiness.md b/docs/production-readiness.md
index 009b1f2..5f9e9b7 100644
--- a/docs/production-readiness.md
+++ b/docs/production-readiness.md
@@ -2,6 +2,8 @@
 
 Audrey is ready to be the memory layer inside a production agent system, but it is not a complete regulated-platform package by itself.
Treat it as stateful infrastructure: pin providers, isolate tenants, monitor health, and wrap it with the controls your environment requires. +First contact should now go through `npx audrey init sidecar-prod` for the sidecar path or `npx audrey init` for the default Claude Code path, then `npx audrey doctor` before exposing Audrey to real traffic. + ## Best Vertical Fit ### 1. Financial Services Operations @@ -94,3 +96,29 @@ Use Audrey as a local sidecar to the agent service: - Regulated-data filtering handled before `memory_encode` That keeps Audrey focused on memory integrity while the host system owns compliance, tenancy, and transport security. + +## Docker Deployment + +Audrey now ships with a first-party container path for the REST API: + +```bash +npx audrey init sidecar-prod +docker compose up -d --build +``` + +Operational notes: + +- The container persists SQLite data in the named volume `audrey-data`. +- Set `AUDREY_API_KEY` before exposing the service beyond localhost. +- For CI or very fast smoke checks, prefer `AUDREY_EMBEDDING_PROVIDER=mock` and `AUDREY_LLM_PROVIDER=mock`. +- For stable local/offline container use, keep `AUDREY_EMBEDDING_PROVIDER=local` and `AUDREY_DEVICE=cpu`. +- If you map the service to a different host port, keep the container port at `3487`. + +Suggested smoke check: + +```bash +AUDREY_API_KEY=secret docker compose up -d --build +curl -H "Authorization: Bearer secret" http://localhost:3487/health +curl -H "Authorization: Bearer secret" http://localhost:3487/status +docker compose logs --tail=100 audrey +``` diff --git a/mcp-server/config.js b/mcp-server/config.js index 8559e16..f5e9b24 100644 --- a/mcp-server/config.js +++ b/mcp-server/config.js @@ -1,13 +1,50 @@ +import { readFileSync } from 'node:fs'; import { homedir } from 'node:os'; import { join } from 'node:path'; import { fileURLToPath } from 'node:url'; -export const VERSION = '0.16.1'; +const PACKAGE_JSON = JSON.parse( + readFileSync(new URL('../package.json', import.meta.url), 'utf8') +); + +export const VERSION = PACKAGE_JSON.version; export const SERVER_NAME = 'audrey-memory'; export const DEFAULT_DATA_DIR = join(homedir(), '.audrey', 'data'); export const MCP_ENTRYPOINT = fileURLToPath(new URL('./index.js', import.meta.url)); const VALID_EMBEDDING_PROVIDERS = new Set(['mock', 'local', 'gemini', 'openai']); const VALID_LLM_PROVIDERS = new Set(['mock', 'anthropic', 'openai']); +const INIT_PRESETS = Object.freeze({ + 'local-offline': { + description: 'Claude Code with local embeddings, no hosted providers required', + surface: 'claude', + installHooks: true, + }, + 'hosted-fast': { + description: 'Claude Code with the fastest hosted providers detected from your environment', + surface: 'claude', + installHooks: true, + }, + 'ci-mock': { + description: 'Mock providers for CI, smoke tests, and deterministic local validation', + surface: 'automation', + installHooks: false, + }, + 'sidecar-prod': { + description: 'REST or Docker sidecar with operator-friendly defaults', + surface: 'sidecar', + installHooks: false, + }, +}); + +function stripProviderKeys(env) { + const next = { ...env }; + delete next.GOOGLE_API_KEY; + delete next.GEMINI_API_KEY; + delete next.OPENAI_API_KEY; + delete next.ANTHROPIC_API_KEY; + delete next.AUDREY_LLM_PROVIDER; + return next; +} function assertValidProvider(provider, validProviders, envVar) { if (!validProviders.has(provider)) { @@ -131,3 +168,86 @@ export function buildInstallArgs(env = process.env) { return args; } + +export function listInitPresets() { + return 
Object.entries(INIT_PRESETS).map(([name, preset]) => ({ + name, + ...preset, + })); +} + +export function buildInitEnv(env = process.env, presetName = 'local-offline') { + const preset = INIT_PRESETS[presetName]; + if (!preset) { + throw new Error(`Unsupported init preset: ${presetName}`); + } + + const next = { + ...env, + AUDREY_DATA_DIR: resolveDataDir(env), + }; + + switch (presetName) { + case 'local-offline': { + const offline = stripProviderKeys(next); + offline.AUDREY_AGENT = env.AUDREY_AGENT || 'claude-code'; + offline.AUDREY_EMBEDDING_PROVIDER = 'local'; + offline.AUDREY_DEVICE = env.AUDREY_DEVICE || 'gpu'; + return offline; + } + case 'hosted-fast': { + next.AUDREY_AGENT = env.AUDREY_AGENT || 'claude-code'; + if (!env.AUDREY_EMBEDDING_PROVIDER) { + next.AUDREY_EMBEDDING_PROVIDER = env.GOOGLE_API_KEY || env.GEMINI_API_KEY + ? 'gemini' + : env.OPENAI_API_KEY + ? 'openai' + : 'local'; + } + if (next.AUDREY_EMBEDDING_PROVIDER === 'local') { + next.AUDREY_DEVICE = env.AUDREY_DEVICE || 'gpu'; + } + if (!env.AUDREY_LLM_PROVIDER) { + if (env.ANTHROPIC_API_KEY) { + next.AUDREY_LLM_PROVIDER = 'anthropic'; + } else if (env.OPENAI_API_KEY) { + next.AUDREY_LLM_PROVIDER = 'openai'; + } + } + return next; + } + case 'ci-mock': { + const mock = stripProviderKeys(next); + mock.AUDREY_AGENT = env.AUDREY_AGENT || 'audrey-ci'; + mock.AUDREY_EMBEDDING_PROVIDER = 'mock'; + mock.AUDREY_LLM_PROVIDER = 'mock'; + delete mock.AUDREY_DEVICE; + return mock; + } + case 'sidecar-prod': { + next.AUDREY_AGENT = env.AUDREY_AGENT || 'audrey-sidecar'; + next.AUDREY_HOST = env.AUDREY_HOST || '0.0.0.0'; + next.AUDREY_PORT = env.AUDREY_PORT || '3487'; + if (!env.AUDREY_EMBEDDING_PROVIDER) { + next.AUDREY_EMBEDDING_PROVIDER = env.GOOGLE_API_KEY || env.GEMINI_API_KEY + ? 'gemini' + : env.OPENAI_API_KEY + ? 'openai' + : 'local'; + } + if (next.AUDREY_EMBEDDING_PROVIDER === 'local') { + next.AUDREY_DEVICE = env.AUDREY_DEVICE || 'gpu'; + } + if (!env.AUDREY_LLM_PROVIDER) { + if (env.ANTHROPIC_API_KEY) { + next.AUDREY_LLM_PROVIDER = 'anthropic'; + } else if (env.OPENAI_API_KEY) { + next.AUDREY_LLM_PROVIDER = 'openai'; + } + } + return next; + } + default: + return next; + } +} diff --git a/mcp-server/index.js b/mcp-server/index.js index 8c66e8e..4b21a52 100644 --- a/mcp-server/index.js +++ b/mcp-server/index.js @@ -11,7 +11,9 @@ import { VERSION, SERVER_NAME, buildAudreyConfig, + buildInitEnv, buildInstallArgs, + listInitPresets, resolveDataDir, resolveEmbeddingProvider, resolveLLMProvider, @@ -426,10 +428,19 @@ async function recall() { process.exit(0); } - const lines = results.map(r => { + // Budget: cap total injected context to ~2000 chars (~500 tokens) to avoid bloating the prompt + const maxTotalChars = 2000; + const lines = []; + let totalChars = 0; + for (const r of results) { const type = r.type === 'semantic' ? 'principle' : r.type === 'procedural' ? 'procedure' : 'memory'; - return `[${type}] ${r.content}`; - }); + const maxContentChars = Math.min(r.content.length, maxTotalChars - totalChars - 20); + if (maxContentChars <= 0) break; + const content = r.content.length > maxContentChars ? r.content.slice(0, maxContentChars) + '...' 
: r.content; + const line = `[${type}] ${content}`; + lines.push(line); + totalChars += line.length; + } const output = { additionalContext: `Relevant memories from Audrey:\n\n${lines.join('\n\n')}`, @@ -696,7 +707,16 @@ async function restore() { } } -function install() { +function hasClaudeCli(execFn = execFileSync) { + try { + execFn('claude', ['--version'], { stdio: 'ignore' }); + return true; + } catch { + return false; + } +} + +function install(env = process.env) { try { execFileSync('claude', ['--version'], { stdio: 'ignore' }); } catch { @@ -704,9 +724,9 @@ function install() { process.exit(1); } - const dataDir = resolveDataDir(process.env); - const resolvedEmbedding = resolveEmbeddingProvider(process.env, process.env.AUDREY_EMBEDDING_PROVIDER); - const resolvedLlm = resolveLLMProvider(process.env, process.env.AUDREY_LLM_PROVIDER); + const dataDir = resolveDataDir(env); + const resolvedEmbedding = resolveEmbeddingProvider(env, env.AUDREY_EMBEDDING_PROVIDER); + const resolvedLlm = resolveLLMProvider(env, env.AUDREY_LLM_PROVIDER); if (resolvedEmbedding.provider === 'gemini') { console.log('Using Gemini embeddings (3072d)'); } else if (resolvedEmbedding.provider === 'local') { @@ -733,7 +753,7 @@ function install() { // Not registered yet. } - const args = buildInstallArgs(process.env); + const args = buildInstallArgs(env); try { execFileSync('claude', args, { stdio: 'inherit' }); } catch { @@ -789,6 +809,186 @@ Verify: claude mcp list `); } +export function resolveInitProfilePath(dataDir = resolveDataDir(process.env)) { + return resolve(dataDir, '..', 'init-profile.json'); +} + +function initPresetByName(name = 'local-offline') { + const preset = listInitPresets().find(entry => entry.name === name); + if (!preset) { + const available = listInitPresets() + .map(entry => ` ${entry.name.padEnd(14)} ${entry.description}`) + .join('\n'); + throw new Error(`Unsupported init preset: ${name}\nAvailable presets:\n${available}`); + } + return preset; +} + +function buildInitWarnings(presetName, initEnv, resolvedEmbedding, resolvedLlm, claudeAvailable, shouldInstall) { + const warnings = []; + + if (presetName === 'hosted-fast' && resolvedEmbedding.provider === 'local') { + warnings.push('No hosted embedding key detected; falling back to local embeddings.'); + } + + if (presetName === 'hosted-fast' && !resolvedLlm) { + warnings.push('No hosted LLM key detected; consolidation and contradiction handling will use heuristics.'); + } + + if (presetName === 'sidecar-prod' && !initEnv.AUDREY_API_KEY) { + warnings.push('AUDREY_API_KEY is not set; configure one before exposing Audrey beyond localhost.'); + } + + if (shouldInstall && !claudeAvailable) { + warnings.push('Claude Code CLI was not found; MCP registration and hooks were skipped.'); + } + + return warnings; +} + +function buildInitNextSteps({ preset, profile, installedMcp, installedHooks, claudeAvailable, shouldInstall }) { + const steps = ['npx audrey doctor']; + + if (preset.surface === 'claude') { + if (installedMcp) { + steps.push('claude mcp list'); + } else if (shouldInstall && !claudeAvailable) { + steps.push('Install Claude Code, then rerun: npx audrey init ' + preset.name); + } else { + steps.push('npx audrey install'); + } + + if (!installedHooks && preset.installHooks) { + steps.push('npx audrey hooks install'); + } + } + + if (preset.name === 'ci-mock') { + steps.push('AUDREY_EMBEDDING_PROVIDER=mock AUDREY_LLM_PROVIDER=mock npx audrey serve'); + } + + if (preset.name === 'sidecar-prod') { + steps.push('docker compose up -d 
--build'); + steps.push(`AUDREY_API_KEY=${profile.apiKeyConfigured ? '[configured]' : 'set-me'} npx audrey serve`); + } + + return steps; +} + +function formatProviderSummary(label, config) { + if (!config) return `${label}: disabled`; + const suffix = config.provider === 'local' && config.device + ? ` (${config.dimensions}d, device=${config.device})` + : config.dimensions + ? ` (${config.dimensions}d)` + : ''; + return `${label}: ${config.provider}${suffix}`; +} + +export function runInitCommand({ + argv = process.argv, + env = process.env, + out = console.log, + installFn = install, + hooksInstallFn = hooksInstall, + execFn = execFileSync, + writeFile = writeFileSync, + mkdir = mkdirSync, +} = {}) { + const args = argv.slice(3); + const presetArg = args.find(arg => !arg.startsWith('-')) || 'local-offline'; + const dryRun = args.includes('--dry-run'); + const noHooks = args.includes('--no-hooks'); + const noInstall = args.includes('--no-install'); + + const preset = initPresetByName(presetArg); + const initEnv = buildInitEnv(env, preset.name); + const dataDir = resolveDataDir(initEnv); + const profilePath = resolveInitProfilePath(dataDir); + const claudeAvailable = hasClaudeCli(execFn); + const shouldInstall = preset.surface === 'claude' && !noInstall; + const installedMcp = shouldInstall && claudeAvailable && !dryRun; + const installedHooks = installedMcp && preset.installHooks && !noHooks; + const embedding = resolveEmbeddingProvider(initEnv, initEnv.AUDREY_EMBEDDING_PROVIDER); + const llm = resolveLLMProvider(initEnv, initEnv.AUDREY_LLM_PROVIDER); + const warnings = buildInitWarnings(preset.name, initEnv, embedding, llm, claudeAvailable, shouldInstall); + + const profile = { + version: VERSION, + preset: preset.name, + description: preset.description, + surface: preset.surface, + createdAt: new Date().toISOString(), + dataDir, + profilePath, + claudeAvailable, + mcpRegistered: installedMcp, + hooksInstalled: installedHooks, + dryRun, + apiKeyConfigured: Boolean(initEnv.AUDREY_API_KEY), + embedding, + llm: llm ? { provider: llm.provider } : null, + recommendedNextSteps: [], + warnings, + }; + + profile.recommendedNextSteps = buildInitNextSteps({ + preset, + profile, + installedMcp, + installedHooks, + claudeAvailable, + shouldInstall, + }); + + if (!dryRun) { + mkdir(dataDir, { recursive: true }); + mkdir(resolve(dataDir, '..'), { recursive: true }); + writeFile(profilePath, JSON.stringify(profile, null, 2) + '\n'); + if (installedMcp) { + installFn(initEnv); + } + if (installedHooks) { + hooksInstallFn(); + } + } + + out(`[audrey] Init preset: ${preset.name}`); + out(` ${preset.description}`); + out(` Data directory: ${dataDir}`); + out(` Profile: ${profilePath}${dryRun ? ' (dry run)' : ''}`); + out(` ${formatProviderSummary('Embeddings', embedding)}`); + out(` ${formatProviderSummary('LLM', llm)}`); + out(` Claude Code CLI: ${claudeAvailable ? 'available' : 'not found'}`); + if (preset.surface === 'claude') { + out(` MCP registration: ${installedMcp ? 'installed' : shouldInstall ? 'skipped' : 'not requested'}`); + out(` Hooks: ${installedHooks ? 'installed' : preset.installHooks && !noHooks ? 
'skipped' : 'not requested'}`); + } + + if (warnings.length > 0) { + out(''); + out('Warnings:'); + for (const warning of warnings) { + out(` - ${warning}`); + } + } + + out(''); + out('Next steps:'); + for (const step of profile.recommendedNextSteps) { + out(` - ${step}`); + } + + return { + preset: preset.name, + profile, + installedMcp, + installedHooks, + dryRun, + warnings, + }; +} + function uninstall() { try { execFileSync('claude', ['--version'], { stdio: 'ignore' }); @@ -1180,8 +1380,224 @@ async function main() { const isDirectRun = process.argv[1] && resolve(process.argv[1]) === fileURLToPath(import.meta.url); +async function doctor() { + const checks = []; + const pass = (name, detail) => checks.push({ name, status: 'pass', detail }); + const warn = (name, detail) => checks.push({ name, status: 'warn', detail }); + const fail = (name, detail) => checks.push({ name, status: 'fail', detail }); + + // 1. Node.js version + const nodeVersion = process.version; + const major = parseInt(nodeVersion.slice(1), 10); + if (major >= 20) { + pass('Node.js', `${nodeVersion} (>= 20 required)`); + } else { + fail('Node.js', `${nodeVersion} — Audrey requires Node.js >= 20`); + } + + // 2. Data directory + const dataDir = resolveDataDir(process.env); + if (existsSync(dataDir)) { + pass('Data directory', `${dataDir} (exists)`); + } else { + warn('Data directory', `${dataDir} (will be created on first use)`); + } + + // 3. SQLite access + try { + const { createDatabase, closeDatabase: closeDb } = await import('../src/db.js'); + const tmpDir = join(dataDir, '.doctor-check'); + mkdirSync(tmpDir, { recursive: true }); + const { db } = createDatabase(tmpDir, { dimensions: 8 }); + closeDb(db); + const { rmSync } = await import('node:fs'); + rmSync(tmpDir, { recursive: true, force: true }); + pass('SQLite', 'better-sqlite3 + sqlite-vec loaded successfully'); + } catch (err) { + fail('SQLite', `Failed: ${err.message}`); + } + + // 4. Embedding provider + const embedding = resolveEmbeddingProvider(process.env, process.env.AUDREY_EMBEDDING_PROVIDER); + if (embedding.provider === 'local') { + pass('Embeddings', `local (${embedding.dimensions}d, device=${embedding.device || 'gpu'}) — offline-capable`); + } else if (embedding.provider === 'gemini') { + pass('Embeddings', `gemini (${embedding.dimensions}d) — GOOGLE_API_KEY detected`); + } else if (embedding.provider === 'openai') { + if (process.env.OPENAI_API_KEY) { + pass('Embeddings', `openai (${embedding.dimensions}d) — OPENAI_API_KEY detected`); + } else { + fail('Embeddings', 'openai selected but OPENAI_API_KEY not set'); + } + } else { + warn('Embeddings', `mock (${embedding.dimensions}d) — not suitable for production`); + } + + // 5. LLM provider + const llm = resolveLLMProvider(process.env, process.env.AUDREY_LLM_PROVIDER); + if (llm?.provider === 'anthropic') { + pass('LLM', 'anthropic — consolidation and contradiction detection enabled'); + } else if (llm?.provider === 'openai') { + pass('LLM', 'openai — consolidation and contradiction detection enabled'); + } else { + warn('LLM', 'none — consolidation will use heuristics only (set ANTHROPIC_API_KEY for LLM-powered features)'); + } + + // 6. 
MCP registration + try { + const claudeJsonPath = join(homedir(), '.claude.json'); + if (existsSync(claudeJsonPath)) { + const claudeConfig = JSON.parse(readFileSync(claudeJsonPath, 'utf-8')); + if (SERVER_NAME in (claudeConfig.mcpServers || {})) { + pass('MCP registration', `"${SERVER_NAME}" registered in Claude Code`); + } else { + warn('MCP registration', `Not registered — run "npx audrey install"`); + } + } else { + warn('MCP registration', 'Claude Code config not found — install Claude Code first'); + } + } catch { + warn('MCP registration', 'Could not read Claude Code config'); + } + + // 7. Hooks + try { + const settingsPath = join(homedir(), '.claude', 'settings.json'); + if (existsSync(settingsPath)) { + const settings = JSON.parse(readFileSync(settingsPath, 'utf-8')); + const hasAudreyHooks = Object.values(settings.hooks || {}).some(entries => + entries.some(entry => entry.hooks?.some(h => h.command?.includes('npx audrey'))) + ); + if (hasAudreyHooks) { + pass('Hooks', 'Audrey hooks installed in Claude Code settings'); + } else { + warn('Hooks', 'Not installed — run "npx audrey hooks install" for automatic memory'); + } + } else { + warn('Hooks', 'Claude Code settings not found'); + } + } catch { + warn('Hooks', 'Could not read Claude Code settings'); + } + + // 8. Memory health (if data exists) + if (existsSync(dataDir)) { + try { + const storedDims = readStoredDimensions(dataDir); + const dims = storedDims || 8; + const audrey = new Audrey({ dataDir, agent: 'doctor', embedding: { provider: 'mock', dimensions: dims } }); + const health = audrey.memoryStatus(); + const stats = audrey.introspect(); + audrey.close(); + + if (health.healthy) { + pass('Memory health', `${stats.episodic} episodic, ${stats.semantic} semantic, ${stats.procedural} procedural — healthy`); + } else { + warn('Memory health', `Index drift detected — run "npx audrey reembed"`); + } + + if (storedDims && storedDims !== embedding.dimensions) { + warn('Dimension match', `Stored: ${storedDims}d, current provider: ${embedding.dimensions}d — run "npx audrey reembed" to realign`); + } else if (storedDims) { + pass('Dimension match', `${storedDims}d (stored matches provider)`); + } + } catch (err) { + fail('Memory health', `Could not read database: ${err.message}`); + } + } + + // Print results + console.log(`\nAudrey v${VERSION} — Doctor\n`); + let hasFailure = false; + for (const check of checks) { + const icon = check.status === 'pass' ? '+' : check.status === 'warn' ? '~' : 'X'; + const label = check.status === 'pass' ? 'OK' : check.status === 'warn' ? 'WARN' : 'FAIL'; + console.log(` [${icon}] ${label.padEnd(4)} ${check.name}: ${check.detail}`); + if (check.status === 'fail') hasFailure = true; + } + console.log(''); + + if (hasFailure) { + console.log('Some checks failed. Fix the issues above and run "npx audrey doctor" again.'); + process.exit(1); + } else { + const warns = checks.filter(c => c.status === 'warn').length; + if (warns > 0) { + console.log(`All critical checks passed. ${warns} warning(s) — see above for optional improvements.`); + } else { + console.log('All checks passed. 
Audrey is ready.');
+    }
+  }
+}
+
+function showHelp() {
+  console.log(`Audrey v${VERSION} – Persistent memory for AI agents
+
+Usage: npx audrey [command] [options]
+
+Setup:
+  init [preset] [--no-hooks] [--no-install] [--dry-run]
+                       Bootstrap Audrey with a named setup preset
+  install              Register MCP server with Claude Code
+  uninstall            Remove MCP server registration
+  hooks install        Wire automatic memory into Claude Code session lifecycle
+  hooks uninstall      Remove Audrey hooks from settings
+
+Health & Monitoring:
+  doctor               Validate Node.js, SQLite, providers, hooks, memory health
+  status               Human-readable health report
+  status --json        Machine-readable health output
+  status --json --fail-on-unhealthy   CI gate
+
+Session Lifecycle (used by hooks automatically):
+  greeting [context]   Load identity, principles, mood
+  recall [query]       Semantic memory search
+  reflect              Consolidate learnings from stdin conversation
+  dream                Full consolidation + decay cycle at session end
+
+Maintenance:
+  dream                Full consolidation + decay cycle
+  reembed              Re-embed all memories after provider/dimension change
+
+Versioning:
+  snapshot [file]      Export memories to timestamped JSON file
+  restore              Restore from snapshot (--force to overwrite)
+
+Server:
+  serve [port]         Start REST API server (default: 3487)
+  dashboard [port]     Start server and open memory dashboard
+
+Init presets:
+  local-offline        Claude Code with local embeddings, no hosted keys required
+  hosted-fast          Claude Code with hosted providers detected from env
+  ci-mock              Mock providers for CI and smoke tests
+  sidecar-prod         REST or Docker sidecar with operator-friendly defaults
+
+Options:
+  --help, -h           Show this help message
+  --version, -v        Show version number
+
+Documentation: https://github.com/Evilander/Audrey
+`);
+}
+
 if (isDirectRun) {
-  if (subcommand === 'install') {
+  if (subcommand === '--help' || subcommand === '-h' || subcommand === 'help') {
+    showHelp();
+  } else if (subcommand === '--version' || subcommand === '-v' || subcommand === 'version') {
+    console.log(VERSION);
+  } else if (subcommand === 'doctor') {
+    doctor().catch(err => {
+      console.error('[audrey] doctor failed:', err);
+      process.exit(1);
+    });
+  } else if (subcommand === 'init') {
+    try {
+      runInitCommand();
+    } catch (err) {
+      console.error('[audrey] init failed:', err.message || err);
+      process.exit(1);
+    }
+  } else if (subcommand === 'install') {
     install();
   } else if (subcommand === 'uninstall') {
     uninstall();
@@ -1256,7 +1672,12 @@ if (isDirectRun) {
     });
   } else if (subcommand === 'status') {
     status();
+  } else if (subcommand) {
+    console.error(`Unknown command: ${subcommand}\n`);
+    showHelp();
+    process.exit(1);
   } else {
+    // No subcommand: start MCP server (for Claude Code to invoke via stdio)
     main().catch(err => {
       console.error('[audrey-mcp] fatal:', err);
       process.exit(1);
diff --git a/package-lock.json b/package-lock.json
index f848a03..9de2c69 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "audrey",
-  "version": "0.16.1",
+  "version": "0.17.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "audrey",
-      "version": "0.16.1",
+      "version": "0.17.0",
       "license": "MIT",
       "dependencies": {
         "@huggingface/transformers": "^3.8.1",
diff --git a/package.json b/package.json
index f7cdfe7..4a50472 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "audrey",
-  "version": "0.16.1",
+  "version": "0.17.0",
   "description": "Biological memory architecture for AI agents - encode, consolidate, and recall memories with confidence decay, contradiction detection, and causal graphs",
   "type": "module",
"main": "src/index.js", @@ -19,6 +19,7 @@ "benchmarks/*.js", "docs/production-readiness.md", "docs/benchmarking.md", + "docs/assets/benchmarks/", "examples/", "types/", "README.md", @@ -29,10 +30,16 @@ "test:watch": "vitest", "pack:check": "npm pack --dry-run", "bench:memory": "node benchmarks/run.js", + "bench:memory:retrieval": "node benchmarks/run.js --suite retrieval", + "bench:memory:operations": "node benchmarks/run.js --suite operations", "bench:memory:json": "node benchmarks/run.js --json", "bench:memory:check": "node benchmarks/run.js --check", "bench:memory:readme-assets": "node benchmarks/run.js --readme-assets-dir docs/assets/benchmarks", - "serve": "node mcp-server/index.js serve" + "serve": "node mcp-server/index.js serve", + "docker:build": "docker build -t audrey:local .", + "docker:up": "docker compose up -d --build", + "docker:down": "docker compose down", + "docker:logs": "docker compose logs -f audrey" }, "keywords": [ "ai", @@ -75,7 +82,7 @@ }, "author": "evilander", "engines": { - "node": ">=18" + "node": ">=20" }, "license": "MIT", "dependencies": { diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..9d491ba --- /dev/null +++ b/python/README.md @@ -0,0 +1,73 @@ +# Audrey Python SDK + +Typed Python client for the Audrey REST API. + +## Install + +```bash +pip install audrey-memory +``` + +For local development from this repository: + +```bash +cd python +python -m pip install -e . +``` + +## Quick Start + +Start Audrey's REST API: + +```bash +npx audrey serve +``` + +Then use the client: + +```python +from audrey_memory import Audrey + +brain = Audrey( + base_url="http://127.0.0.1:3487", + api_key="secret", + agent="support-agent", +) + +memory_id = brain.encode( + "Stripe returns HTTP 429 above 100 req/s", + source="direct-observation", + tags=["stripe", "rate-limit"], +) + +results = brain.recall("stripe rate limits", limit=5) +snapshot = brain.snapshot() +brain.restore(snapshot) +brain.close() +``` + +Async usage: + +```python +import asyncio + +from audrey_memory import AsyncAudrey + + +async def main() -> None: + async with AsyncAudrey(base_url="http://127.0.0.1:3487") as brain: + await brain.health() + await brain.encode("Deploy failed due to OOM", source="direct-observation") + await brain.recall("deploy failure", limit=3) + + +asyncio.run(main()) +``` + +## Features + +- Sync and async clients powered by `httpx` +- Pydantic request and response models +- Bearer auth via `AUDREY_API_KEY` +- Agent scoping via `X-Audrey-Agent` +- Snapshot export and restore support diff --git a/python/audrey_memory/__init__.py b/python/audrey_memory/__init__.py new file mode 100644 index 0000000..c92a1c8 --- /dev/null +++ b/python/audrey_memory/__init__.py @@ -0,0 +1,51 @@ +from ._version import __version__ +from .client import AsyncAudrey, Audrey, AudreyAPIError +from .types import ( + AckResponse, + Affect, + AnalyticsResponse, + ConsolidateRequest, + ContradictionStatus, + DreamRequest, + EncodeRequest, + EncodeResponse, + ForgetRequest, + ForgetResponse, + HealthResponse, + MarkUsedRequest, + MemorySnapshot, + OperationResult, + RecallError, + RecallRequest, + RecallResponse, + RecallResult, + RestoreResponse, + StatusResponse, +) + +__all__ = [ + "__version__", + "AckResponse", + "Affect", + "AnalyticsResponse", + "AsyncAudrey", + "Audrey", + "AudreyAPIError", + "ConsolidateRequest", + "ContradictionStatus", + "DreamRequest", + "EncodeRequest", + "EncodeResponse", + "ForgetRequest", + "ForgetResponse", + "HealthResponse", + "MarkUsedRequest", 
+ "MemorySnapshot", + "OperationResult", + "RecallError", + "RecallRequest", + "RecallResponse", + "RecallResult", + "RestoreResponse", + "StatusResponse", +] diff --git a/python/audrey_memory/_version.py b/python/audrey_memory/_version.py new file mode 100644 index 0000000..fd86b3e --- /dev/null +++ b/python/audrey_memory/_version.py @@ -0,0 +1 @@ +__version__ = "0.17.0" diff --git a/python/audrey_memory/client.py b/python/audrey_memory/client.py new file mode 100644 index 0000000..96622b1 --- /dev/null +++ b/python/audrey_memory/client.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +from typing import Any, Mapping, TypeVar + +import httpx +from pydantic import BaseModel + +from ._version import __version__ +from .types import ( + AckResponse, + AnalyticsResponse, + ConsolidateRequest, + DreamRequest, + EncodeRequest, + EncodeResponse, + ForgetRequest, + ForgetResponse, + HealthResponse, + MarkUsedRequest, + MemorySnapshot, + OperationResult, + RecallRequest, + RecallResponse, + RestoreResponse, + StatusResponse, +) + +ModelT = TypeVar("ModelT", bound=BaseModel) +DEFAULT_TIMEOUT = 30.0 +DEFAULT_BASE_URL = "http://127.0.0.1:3487" + + +class AudreyAPIError(RuntimeError): + def __init__(self, status_code: int, message: str, response_body: Any = None) -> None: + super().__init__(message) + self.status_code = status_code + self.response_body = response_body + + +def _build_headers(api_key: str | None, agent: str | None) -> dict[str, str]: + headers = { + "Accept": "application/json", + "Content-Type": "application/json", + "User-Agent": f"audrey-memory-python/{__version__}", + } + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + if agent: + headers["X-Audrey-Agent"] = agent + return headers + + +def _dump_payload(payload: BaseModel | Mapping[str, Any] | None) -> dict[str, Any] | None: + if payload is None: + return None + if isinstance(payload, BaseModel): + return payload.model_dump(exclude_none=True, mode="json") + return {key: value for key, value in dict(payload).items() if value is not None} + + +def _error_message(response: httpx.Response, data: Any) -> str: + if isinstance(data, dict): + detail = data.get("error") or data.get("message") + if isinstance(detail, str) and detail.strip(): + return detail + return f"Audrey API request failed with status {response.status_code}" + + +def _decode_json(response: httpx.Response) -> Any: + try: + data = response.json() + except ValueError: + data = None + if response.is_error: + raise AudreyAPIError(response.status_code, _error_message(response, data), data) + return data + + +def _validate(model_type: type[ModelT], data: Any) -> ModelT: + return model_type.model_validate(data) + + +def _build_model_payload( + payload: BaseModel | Mapping[str, Any] | str, + model_type: type[ModelT], + field_name: str, + extra: dict[str, Any], +) -> ModelT: + if isinstance(payload, model_type): + if extra: + raise TypeError(f"{model_type.__name__} payload cannot be combined with keyword overrides") + return payload + if isinstance(payload, Mapping): + if extra: + raise TypeError(f"Mapping payload cannot be combined with keyword overrides for {model_type.__name__}") + return model_type.model_validate(payload) + return model_type.model_validate({field_name: payload, **extra}) + + +def _optional_model_payload( + payload: BaseModel | Mapping[str, Any] | None, + model_type: type[ModelT], + extra: dict[str, Any], +) -> ModelT | None: + if isinstance(payload, model_type): + if extra: + raise TypeError(f"{model_type.__name__} payload cannot be 
combined with keyword overrides") + return payload + if payload is None: + return model_type.model_validate(extra) if extra else None + if extra: + raise TypeError(f"Mapping payload cannot be combined with keyword overrides for {model_type.__name__}") + return model_type.model_validate(payload) + + +class Audrey: + def __init__( + self, + base_url: str = DEFAULT_BASE_URL, + *, + api_key: str | None = None, + agent: str | None = None, + timeout: float | httpx.Timeout = DEFAULT_TIMEOUT, + transport: httpx.BaseTransport | None = None, + ) -> None: + self._client = httpx.Client( + base_url=base_url.rstrip("/"), + timeout=timeout, + transport=transport, + headers=_build_headers(api_key, agent), + ) + + def close(self) -> None: + self._client.close() + + def __enter__(self) -> Audrey: + return self + + def __exit__(self, exc_type: object, exc: object, traceback: object) -> None: + self.close() + + def health(self) -> HealthResponse: + return _validate(HealthResponse, _decode_json(self._client.get("/health"))) + + def status(self) -> StatusResponse: + return _validate(StatusResponse, _decode_json(self._client.get("/status"))) + + def analytics(self) -> AnalyticsResponse: + return _validate(AnalyticsResponse, _decode_json(self._client.get("/analytics"))) + + def encode(self, payload: EncodeRequest | Mapping[str, Any] | str, /, **kwargs: Any) -> str: + request = _build_model_payload(payload, EncodeRequest, "content", kwargs) + data = _decode_json(self._client.post("/encode", json=_dump_payload(request))) + return _validate(EncodeResponse, data).id + + def recall(self, payload: RecallRequest | Mapping[str, Any] | str, /, **kwargs: Any): + return self.recall_response(payload, **kwargs).results + + def recall_response(self, payload: RecallRequest | Mapping[str, Any] | str, /, **kwargs: Any) -> RecallResponse: + request = _build_model_payload(payload, RecallRequest, "query", kwargs) + data = _decode_json(self._client.post("/recall", json=_dump_payload(request))) + return _validate(RecallResponse, data) + + def dream(self, payload: DreamRequest | Mapping[str, Any] | None = None, /, **kwargs: Any) -> OperationResult: + request = _optional_model_payload(payload, DreamRequest, kwargs) + data = _decode_json(self._client.post("/dream", json=_dump_payload(request))) + return _validate(OperationResult, data) + + def consolidate( + self, + payload: ConsolidateRequest | Mapping[str, Any] | None = None, + /, + **kwargs: Any, + ) -> OperationResult: + request = _optional_model_payload(payload, ConsolidateRequest, kwargs) + data = _decode_json(self._client.post("/consolidate", json=_dump_payload(request))) + return _validate(OperationResult, data) + + def mark_used(self, memory_id: str) -> AckResponse: + request = MarkUsedRequest(id=memory_id) + data = _decode_json(self._client.post("/mark-used", json=_dump_payload(request))) + return _validate(AckResponse, data) + + def forget( + self, + *, + id: str | None = None, + query: str | None = None, + purge: bool | None = None, + min_similarity: float | None = None, + ) -> ForgetResponse | None: + request = ForgetRequest( + id=id, + query=query, + purge=purge, + minSimilarity=min_similarity, + ) + data = _decode_json(self._client.post("/forget", json=_dump_payload(request))) + if data is None: + return None + return _validate(ForgetResponse, data) + + def snapshot(self) -> MemorySnapshot: + data = _decode_json(self._client.post("/snapshot")) + return _validate(MemorySnapshot, data) + + def restore(self, snapshot: MemorySnapshot | Mapping[str, Any]) -> RestoreResponse: + 
request = snapshot if isinstance(snapshot, MemorySnapshot) else MemorySnapshot.model_validate(snapshot) + data = _decode_json(self._client.post("/restore", json=_dump_payload(request))) + return _validate(RestoreResponse, data) + + +class AsyncAudrey: + def __init__( + self, + base_url: str = DEFAULT_BASE_URL, + *, + api_key: str | None = None, + agent: str | None = None, + timeout: float | httpx.Timeout = DEFAULT_TIMEOUT, + transport: httpx.AsyncBaseTransport | None = None, + ) -> None: + self._client = httpx.AsyncClient( + base_url=base_url.rstrip("/"), + timeout=timeout, + transport=transport, + headers=_build_headers(api_key, agent), + ) + + async def aclose(self) -> None: + await self._client.aclose() + + async def __aenter__(self) -> AsyncAudrey: + return self + + async def __aexit__(self, exc_type: object, exc: object, traceback: object) -> None: + await self.aclose() + + async def health(self) -> HealthResponse: + return _validate(HealthResponse, _decode_json(await self._client.get("/health"))) + + async def status(self) -> StatusResponse: + return _validate(StatusResponse, _decode_json(await self._client.get("/status"))) + + async def analytics(self) -> AnalyticsResponse: + return _validate(AnalyticsResponse, _decode_json(await self._client.get("/analytics"))) + + async def encode(self, payload: EncodeRequest | Mapping[str, Any] | str, /, **kwargs: Any) -> str: + request = _build_model_payload(payload, EncodeRequest, "content", kwargs) + data = _decode_json(await self._client.post("/encode", json=_dump_payload(request))) + return _validate(EncodeResponse, data).id + + async def recall(self, payload: RecallRequest | Mapping[str, Any] | str, /, **kwargs: Any): + return (await self.recall_response(payload, **kwargs)).results + + async def recall_response(self, payload: RecallRequest | Mapping[str, Any] | str, /, **kwargs: Any) -> RecallResponse: + request = _build_model_payload(payload, RecallRequest, "query", kwargs) + data = _decode_json(await self._client.post("/recall", json=_dump_payload(request))) + return _validate(RecallResponse, data) + + async def dream(self, payload: DreamRequest | Mapping[str, Any] | None = None, /, **kwargs: Any) -> OperationResult: + request = _optional_model_payload(payload, DreamRequest, kwargs) + data = _decode_json(await self._client.post("/dream", json=_dump_payload(request))) + return _validate(OperationResult, data) + + async def consolidate( + self, + payload: ConsolidateRequest | Mapping[str, Any] | None = None, + /, + **kwargs: Any, + ) -> OperationResult: + request = _optional_model_payload(payload, ConsolidateRequest, kwargs) + data = _decode_json(await self._client.post("/consolidate", json=_dump_payload(request))) + return _validate(OperationResult, data) + + async def mark_used(self, memory_id: str) -> AckResponse: + request = MarkUsedRequest(id=memory_id) + data = _decode_json(await self._client.post("/mark-used", json=_dump_payload(request))) + return _validate(AckResponse, data) + + async def forget( + self, + *, + id: str | None = None, + query: str | None = None, + purge: bool | None = None, + min_similarity: float | None = None, + ) -> ForgetResponse | None: + request = ForgetRequest( + id=id, + query=query, + purge=purge, + minSimilarity=min_similarity, + ) + data = _decode_json(await self._client.post("/forget", json=_dump_payload(request))) + if data is None: + return None + return _validate(ForgetResponse, data) + + async def snapshot(self) -> MemorySnapshot: + data = _decode_json(await self._client.post("/snapshot")) + return 
_validate(MemorySnapshot, data) + + async def restore(self, snapshot: MemorySnapshot | Mapping[str, Any]) -> RestoreResponse: + request = snapshot if isinstance(snapshot, MemorySnapshot) else MemorySnapshot.model_validate(snapshot) + data = _decode_json(await self._client.post("/restore", json=_dump_payload(request))) + return _validate(RestoreResponse, data) diff --git a/python/audrey_memory/py.typed b/python/audrey_memory/py.typed new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/python/audrey_memory/py.typed @@ -0,0 +1 @@ + diff --git a/python/audrey_memory/types.py b/python/audrey_memory/types.py new file mode 100644 index 0000000..0b8be0a --- /dev/null +++ b/python/audrey_memory/types.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + + +class AudreyModel(BaseModel): + model_config = ConfigDict(extra="allow", populate_by_name=True) + + +class Affect(AudreyModel): + valence: float + arousal: float | None = None + label: str | None = None + + +class HealthResponse(AudreyModel): + ok: bool + version: str + + +class ContradictionStatus(AudreyModel): + open: int = 0 + resolved: int = 0 + context_dependent: int = 0 + reopened: int = 0 + + +class StatusResponse(AudreyModel): + episodic: int | None = None + semantic: int | None = None + procedural: int | None = None + causalLinks: int | None = None + contradictions: ContradictionStatus | None = None + dormant: int | None = None + lastConsolidation: str | None = None + totalConsolidationRuns: int | None = None + + +class AnalyticsRow(AudreyModel): + id: str | None = None + content: str | None = None + agent: str | None = None + + +class AnalyticsResponse(AudreyModel): + topEpisodes: list[AnalyticsRow] = Field(default_factory=list) + topSemantics: list[AnalyticsRow] = Field(default_factory=list) + recentRuns: list[AnalyticsRow] = Field(default_factory=list) + metrics: list[AnalyticsRow] = Field(default_factory=list) + agents: list[AnalyticsRow] = Field(default_factory=list) + + +class EncodeRequest(AudreyModel): + content: str + source: str + salience: float | None = Field(default=None, ge=0, le=1) + tags: list[str] | None = None + context: dict[str, Any] | None = None + affect: Affect | None = None + causal: dict[str, Any] | None = None + supersedes: str | None = None + private: bool | None = None + agent: str | None = None + + +class EncodeResponse(AudreyModel): + id: str + + +class RecallRequest(AudreyModel): + query: str + limit: int | None = Field(default=None, ge=1, le=50) + context: dict[str, Any] | None = None + mood: dict[str, Any] | None = None + types: list[str] | None = None + scope: str | None = None + includePrivate: bool | None = None + agent: str | None = None + + +class RecallResult(AudreyModel): + id: str + content: str + type: str | None = None + confidence: float | None = None + score: float | None = None + source: str | None = None + createdAt: str | None = None + agent: str | None = None + + +class RecallError(AudreyModel): + type: str | None = None + message: str | None = None + + +class RecallResponse(AudreyModel): + results: list[RecallResult] = Field(default_factory=list) + partialFailure: bool = False + errors: list[RecallError] = Field(default_factory=list) + + +class DreamRequest(AudreyModel): + dormantThreshold: float | None = Field(default=None, ge=0, le=1) + minClusterSize: int | None = Field(default=None, ge=1) + similarityThreshold: float | None = Field(default=None, ge=0, le=1) + + +class 
ConsolidateRequest(AudreyModel): + minClusterSize: int | None = Field(default=None, ge=1) + similarityThreshold: float | None = Field(default=None, ge=0, le=1) + + +class OperationResult(AudreyModel): + ok: bool | None = None + status: str | None = None + + +class MarkUsedRequest(AudreyModel): + id: str + + +class AckResponse(AudreyModel): + ok: bool + + +class ForgetRequest(AudreyModel): + id: str | None = None + query: str | None = None + purge: bool | None = None + minSimilarity: float | None = Field(default=None, ge=0, le=1) + + +class ForgetResponse(AudreyModel): + id: str | None = None + type: str | None = None + purged: bool | None = None + + +class MemorySnapshot(AudreyModel): + version: str + exportedAt: str | None = None + episodes: list[dict[str, Any]] = Field(default_factory=list) + semantics: list[dict[str, Any]] = Field(default_factory=list) + procedures: list[dict[str, Any]] = Field(default_factory=list) + causalLinks: list[dict[str, Any]] = Field(default_factory=list) + contradictions: list[dict[str, Any]] = Field(default_factory=list) + consolidationRuns: list[dict[str, Any]] = Field(default_factory=list) + consolidationMetrics: list[dict[str, Any]] = Field(default_factory=list) + config: dict[str, Any] = Field(default_factory=dict) + + +class RestoreResponse(StatusResponse): + ok: bool diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..d2a8ab1 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=69", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "audrey-memory" +dynamic = ["version"] +description = "Typed Python client for the Audrey LLM memory server" +readme = "README.md" +requires-python = ">=3.9" +license = "MIT" +authors = [ + { name = "evilander" } +] +keywords = [ + "ai", + "agents", + "audrey", + "llm", + "memory", + "pydantic", + "python", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Typing :: Typed", +] +dependencies = [ + "httpx>=0.27,<1", + "pydantic>=2.7,<3", +] + +[project.urls] +Homepage = "https://github.com/Evilander/Audrey" +Repository = "https://github.com/Evilander/Audrey" +Issues = "https://github.com/Evilander/Audrey/issues" + +[tool.setuptools.dynamic] +version = { attr = "audrey_memory._version.__version__" } + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.packages.find] +where = ["."] +include = ["audrey_memory*"] + +[tool.setuptools.package-data] +audrey_memory = ["py.typed"] diff --git a/python/tests/test_client.py b/python/tests/test_client.py new file mode 100644 index 0000000..b1df926 --- /dev/null +++ b/python/tests/test_client.py @@ -0,0 +1,233 @@ +from __future__ import annotations + +import asyncio +import json +import os +import socket +import subprocess +import sys +import tempfile +import time +import unittest +from pathlib import Path + +import httpx + +PYTHON_ROOT = Path(__file__).resolve().parents[1] +REPO_ROOT = PYTHON_ROOT.parent + +if str(PYTHON_ROOT) not in sys.path: + sys.path.insert(0, str(PYTHON_ROOT)) + +from audrey_memory import AsyncAudrey, Audrey, AudreyAPIError, MemorySnapshot, __version__ + + +def _free_port() -> int: + with socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +class AudreyClientUnitTests(unittest.TestCase): + def test_sync_client_sends_auth_and_agent_headers(self) -> None: + seen: dict[str, object] = {} + + def handler(request: httpx.Request) -> httpx.Response: + seen["authorization"] = request.headers.get("Authorization") + seen["agent"] = request.headers.get("X-Audrey-Agent") + seen["body"] = json.loads(request.content.decode("utf-8")) + return httpx.Response(201, json={"id": "mem_123"}) + + client = Audrey( + base_url="http://audrey.test", + api_key="secret-token", + agent="python-sdk", + transport=httpx.MockTransport(handler), + ) + self.addCleanup(client.close) + + memory_id = client.encode( + "Stripe returns HTTP 429 above 100 req/s", + source="direct-observation", + tags=["stripe"], + ) + + self.assertEqual(memory_id, "mem_123") + self.assertEqual(seen["authorization"], "Bearer secret-token") + self.assertEqual(seen["agent"], "python-sdk") + self.assertEqual( + seen["body"], + { + "content": "Stripe returns HTTP 429 above 100 req/s", + "source": "direct-observation", + "tags": ["stripe"], + }, + ) + + def test_sync_client_raises_structured_api_error(self) -> None: + def handler(_: httpx.Request) -> httpx.Response: + return httpx.Response(400, json={"error": "content is required"}) + + client = Audrey( + base_url="http://audrey.test", + transport=httpx.MockTransport(handler), + ) + self.addCleanup(client.close) + + with self.assertRaises(AudreyAPIError) as exc: + client.encode("", source="direct-observation") + + self.assertEqual(exc.exception.status_code, 400) + self.assertEqual(str(exc.exception), "content is required") + + +class AudreyAsyncClientUnitTests(unittest.IsolatedAsyncioTestCase): + async def test_async_client_parses_recall_response(self) -> None: + def handler(request: httpx.Request) -> httpx.Response: + payload = json.loads(request.content.decode("utf-8")) + self.assertEqual(payload["query"], "stripe rate limits") + self.assertEqual(payload["limit"], 2) + return httpx.Response( + 200, + json={ + "results": [ + { + "id": "mem_1", + "content": "Stripe returns HTTP 429 above 100 req/s", + "type": "episodic", + "confidence": 0.92, + "score": 0.88, + "source": "direct-observation", + } + ], + "partialFailure": False, + "errors": [], + }, + ) + + client = AsyncAudrey( + base_url="http://audrey.test", + transport=httpx.MockTransport(handler), + ) + self.addAsyncCleanup(client.aclose) + + response = await client.recall_response("stripe rate limits", limit=2) + + self.assertFalse(response.partialFailure) + self.assertEqual(len(response.results), 1) + self.assertEqual(response.results[0].id, "mem_1") + + +class AudreyClientIntegrationTests(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.api_key = "integration-secret" + cls.port = _free_port() + cls.base_url = f"http://127.0.0.1:{cls.port}" + cls.temp_dir = tempfile.TemporaryDirectory(prefix="audrey-python-sdk-") + env = os.environ.copy() + env.update( + { + "AUDREY_DATA_DIR": cls.temp_dir.name, + "AUDREY_EMBEDDING_PROVIDER": "mock", + "AUDREY_LLM_PROVIDER": "mock", + "AUDREY_API_KEY": cls.api_key, + } + ) + cls.process = subprocess.Popen( + ["node", "mcp-server/index.js", "serve", str(cls.port)], + cwd=REPO_ROOT, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + cls._wait_for_ready() + + @classmethod + def tearDownClass(cls) -> None: + if hasattr(cls, "process") and cls.process.poll() is None: + cls.process.terminate() + try: + 
cls.process.wait(timeout=10) + except subprocess.TimeoutExpired: + cls.process.kill() + cls.process.wait(timeout=10) + if hasattr(cls, "temp_dir"): + cls.temp_dir.cleanup() + + @classmethod + def _wait_for_ready(cls) -> None: + deadline = time.time() + 30 + last_error: Exception | None = None + while time.time() < deadline: + if cls.process.poll() is not None: + output = "" + if cls.process.stdout is not None: + output = cls.process.stdout.read() + raise RuntimeError( + f"Audrey server exited before becoming ready (code {cls.process.returncode}):\n{output}" + ) + try: + response = httpx.get( + f"{cls.base_url}/health", + headers={"Authorization": f"Bearer {cls.api_key}"}, + timeout=1.0, + ) + if response.status_code == 200: + return + except Exception as exc: # pragma: no cover - readiness race + last_error = exc + time.sleep(0.25) + raise RuntimeError(f"Timed out waiting for Audrey server readiness: {last_error}") + + def test_sync_end_to_end_against_real_server(self) -> None: + with Audrey( + base_url=self.base_url, + api_key=self.api_key, + agent="python-sync-test", + ) as client: + health = client.health() + self.assertTrue(health.ok) + + memory_id = client.encode( + "Python SDK integration remembers Stripe rate limits", + source="direct-observation", + tags=["python", "stripe"], + ) + self.assertTrue(memory_id) + + client.mark_used(memory_id) + + results = client.recall("stripe rate limits", limit=5, scope="agent") + self.assertGreaterEqual(len(results), 1) + self.assertIn("Stripe", results[0].content) + + snapshot = client.snapshot() + self.assertIsInstance(snapshot, MemorySnapshot) + self.assertEqual(snapshot.version, __version__) + restored = client.restore(snapshot) + self.assertTrue(restored.ok) + + def test_async_end_to_end_against_real_server(self) -> None: + async def run() -> None: + async with AsyncAudrey( + base_url=self.base_url, + api_key=self.api_key, + agent="python-async-test", + ) as client: + health = await client.health() + self.assertTrue(health.ok) + memory_id = await client.encode( + "Async Python SDK remembers deployment failures", + source="direct-observation", + ) + self.assertTrue(memory_id) + results = await client.recall("deployment failures", limit=5, scope="agent") + self.assertGreaterEqual(len(results), 1) + + asyncio.run(run()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/benchmarks.test.js b/tests/benchmarks.test.js index 56c8cac..50242e0 100644 --- a/tests/benchmarks.test.js +++ b/tests/benchmarks.test.js @@ -19,8 +19,10 @@ describe('benchmark suite', () => { expect(summary.local.overall.length).toBeGreaterThanOrEqual(4); expect(summary.local.overall[0].system).toBe('Audrey'); + expect(summary.local.suites.map(suite => suite.id)).toEqual(['retrieval', 'operations']); expect(summary.external.leaderboard[0].system).toBe('MIRIX'); expect(summary.local.cases.some(testCase => testCase.id === 'procedural-learning')).toBe(true); + expect(summary.local.cases.some(testCase => testCase.id === 'operation-semantic-merge')).toBe(true); }); it('writes JSON, HTML, and SVG artifacts', async () => { @@ -34,6 +36,7 @@ describe('benchmark suite', () => { expect(existsSync(artifacts.html)).toBe(true); expect(existsSync(artifacts.localChart)).toBe(true); expect(existsSync(artifacts.externalChart)).toBe(true); + expect(artifacts.suiteCharts.some(chart => chart.id === 'operations')).toBe(true); expect(lines.join('\n')).toContain('Audrey benchmark complete.'); }); @@ -44,9 +47,19 @@ describe('benchmark suite', () => { }); 
expect(existsSync(artifacts.readmeAssets.localChart)).toBe(true); + expect(existsSync(artifacts.readmeAssets.operationsChart)).toBe(true); expect(existsSync(artifacts.readmeAssets.externalChart)).toBe(true); }); + it('can run only the operations suite', async () => { + const summary = await runBenchmarkSuite({ provider: 'mock', dimensions: 64, suite: 'operations' }); + + expect(summary.config.suites).toEqual(['operations']); + expect(summary.local.suites).toHaveLength(1); + expect(summary.local.suites[0].id).toBe('operations'); + expect(summary.local.cases.every(testCase => testCase.suite === 'operations')).toBe(true); + }); + it('enforces benchmark regression guardrails', async () => { const summary = await runBenchmarkSuite({ provider: 'mock', dimensions: 64 }); diff --git a/tests/mcp-server.test.js b/tests/mcp-server.test.js index c0748bb..9306ac6 100644 --- a/tests/mcp-server.test.js +++ b/tests/mcp-server.test.js @@ -4,7 +4,16 @@ import path from 'node:path'; import { EventEmitter } from 'node:events'; import { Audrey } from '../src/index.js'; import { readStoredDimensions } from '../src/db.js'; -import { buildAudreyConfig, buildInstallArgs, DEFAULT_DATA_DIR, MCP_ENTRYPOINT, SERVER_NAME, VERSION } from '../mcp-server/config.js'; +import { + buildAudreyConfig, + buildInitEnv, + buildInstallArgs, + DEFAULT_DATA_DIR, + listInitPresets, + MCP_ENTRYPOINT, + SERVER_NAME, + VERSION, +} from '../mcp-server/config.js'; import { MAX_MEMORY_CONTENT_LENGTH, buildHooksConfig, @@ -17,17 +26,22 @@ import { memoryRecallToolSchema, registerShutdownHandlers, registerDreamTool, + resolveInitProfilePath, resolveSnapshotPath, + runInitCommand, runStatusCommand, validateForgetSelection, } from '../mcp-server/index.js'; -import { existsSync, rmSync } from 'node:fs'; +import { existsSync, readFileSync, rmSync } from 'node:fs'; const TEST_DIR = './test-mcp-server'; +const PACKAGE_VERSION = JSON.parse( + readFileSync(new URL('../package.json', import.meta.url), 'utf8') +).version; describe('MCP config', () => { - it('VERSION is 0.16.1', () => { - expect(VERSION).toBe('0.16.1'); + it('VERSION matches package.json', () => { + expect(VERSION).toBe(PACKAGE_VERSION); }); }); @@ -184,6 +198,191 @@ describe('MCP CLI: buildInstallArgs', () => { }); }); +describe('MCP CLI: init presets', () => { + const envBackup = {}; + const envKeys = [ + 'AUDREY_DATA_DIR', 'AUDREY_AGENT', 'AUDREY_EMBEDDING_PROVIDER', + 'AUDREY_LLM_PROVIDER', 'AUDREY_DEVICE', 'GOOGLE_API_KEY', + 'GEMINI_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY', + 'AUDREY_HOST', 'AUDREY_PORT', 'AUDREY_API_KEY', + ]; + + beforeEach(() => { + for (const key of envKeys) { + envBackup[key] = process.env[key]; + delete process.env[key]; + } + }); + + afterEach(() => { + for (const key of envKeys) { + if (envBackup[key] !== undefined) process.env[key] = envBackup[key]; + else delete process.env[key]; + } + }); + + it('lists the supported init presets', () => { + expect(listInitPresets().map(p => p.name)).toEqual([ + 'local-offline', + 'hosted-fast', + 'ci-mock', + 'sidecar-prod', + ]); + }); + + it('builds a local-offline init env without hosted providers', () => { + const initEnv = buildInitEnv({ + GOOGLE_API_KEY: 'google-test', + ANTHROPIC_API_KEY: 'anthropic-test', + AUDREY_DEVICE: 'cpu', + }, 'local-offline'); + + expect(initEnv.AUDREY_EMBEDDING_PROVIDER).toBe('local'); + expect(initEnv.AUDREY_DEVICE).toBe('cpu'); + expect(initEnv.GOOGLE_API_KEY).toBeUndefined(); + expect(initEnv.ANTHROPIC_API_KEY).toBeUndefined(); + 
expect(initEnv.AUDREY_AGENT).toBe('claude-code'); + }); + + it('builds a hosted-fast env using detected hosted providers', () => { + const initEnv = buildInitEnv({ + GOOGLE_API_KEY: 'google-test', + ANTHROPIC_API_KEY: 'anthropic-test', + }, 'hosted-fast'); + + expect(initEnv.AUDREY_EMBEDDING_PROVIDER).toBe('gemini'); + expect(initEnv.AUDREY_LLM_PROVIDER).toBe('anthropic'); + expect(initEnv.AUDREY_AGENT).toBe('claude-code'); + }); + + it('builds a ci-mock env with mock providers', () => { + const initEnv = buildInitEnv({ + OPENAI_API_KEY: 'openai-test', + ANTHROPIC_API_KEY: 'anthropic-test', + }, 'ci-mock'); + + expect(initEnv.AUDREY_EMBEDDING_PROVIDER).toBe('mock'); + expect(initEnv.AUDREY_LLM_PROVIDER).toBe('mock'); + expect(initEnv.OPENAI_API_KEY).toBeUndefined(); + expect(initEnv.ANTHROPIC_API_KEY).toBeUndefined(); + expect(initEnv.AUDREY_AGENT).toBe('audrey-ci'); + }); + + it('builds a sidecar-prod env with serving defaults', () => { + const initEnv = buildInitEnv({}, 'sidecar-prod'); + + expect(initEnv.AUDREY_AGENT).toBe('audrey-sidecar'); + expect(initEnv.AUDREY_HOST).toBe('0.0.0.0'); + expect(initEnv.AUDREY_PORT).toBe('3487'); + expect(initEnv.AUDREY_EMBEDDING_PROVIDER).toBe('local'); + }); +}); + +describe('MCP CLI: init command', () => { + it('resolves the init profile path next to the data directory', () => { + expect(resolveInitProfilePath('/tmp/audrey/data')).toBe(path.resolve('/tmp/audrey/init-profile.json')); + }); + + it('bootstraps the common Claude path and writes a profile', () => { + const lines = []; + const installFn = vi.fn(); + const hooksInstallFn = vi.fn(); + const writeFile = vi.fn(); + const mkdir = vi.fn(); + const execFn = vi.fn(); + + const result = runInitCommand({ + argv: ['node', 'mcp-server/index.js', 'init', 'local-offline'], + env: { AUDREY_DATA_DIR: '/tmp/audrey-data', AUDREY_DEVICE: 'cpu' }, + out: line => lines.push(line), + installFn, + hooksInstallFn, + execFn, + writeFile, + mkdir, + }); + + expect(result.preset).toBe('local-offline'); + expect(result.installedMcp).toBe(true); + expect(result.installedHooks).toBe(true); + expect(installFn).toHaveBeenCalledOnce(); + expect(hooksInstallFn).toHaveBeenCalledOnce(); + expect(writeFile).toHaveBeenCalledOnce(); + expect(mkdir).toHaveBeenCalled(); + expect(lines.join('\n')).toContain('Init preset: local-offline'); + expect(lines.join('\n')).toContain('npx audrey doctor'); + + const profile = JSON.parse(writeFile.mock.calls[0][1]); + expect(profile.preset).toBe('local-offline'); + expect(profile.embedding.provider).toBe('local'); + expect(profile.hooksInstalled).toBe(true); + }); + + it('supports dry runs without side effects', () => { + const installFn = vi.fn(); + const hooksInstallFn = vi.fn(); + const writeFile = vi.fn(); + const mkdir = vi.fn(); + const execFn = vi.fn(() => { + throw new Error('missing claude'); + }); + + const result = runInitCommand({ + argv: ['node', 'mcp-server/index.js', 'init', 'hosted-fast', '--dry-run'], + env: { AUDREY_DATA_DIR: '/tmp/audrey-data' }, + installFn, + hooksInstallFn, + execFn, + writeFile, + mkdir, + }); + + expect(result.dryRun).toBe(true); + expect(result.installedMcp).toBe(false); + expect(installFn).not.toHaveBeenCalled(); + expect(hooksInstallFn).not.toHaveBeenCalled(); + expect(writeFile).not.toHaveBeenCalled(); + expect(mkdir).not.toHaveBeenCalled(); + }); + + it('skips hooks when requested', () => { + const hooksInstallFn = vi.fn(); + + const result = runInitCommand({ + argv: ['node', 'mcp-server/index.js', 'init', 'local-offline', '--no-hooks'], + env: { 
AUDREY_DATA_DIR: '/tmp/audrey-data' }, + installFn: vi.fn(), + hooksInstallFn, + execFn: vi.fn(), + writeFile: vi.fn(), + mkdir: vi.fn(), + }); + + expect(result.installedHooks).toBe(false); + expect(hooksInstallFn).not.toHaveBeenCalled(); + }); + + it('does not attempt Claude registration for sidecar-prod', () => { + const installFn = vi.fn(); + + const result = runInitCommand({ + argv: ['node', 'mcp-server/index.js', 'init', 'sidecar-prod'], + env: { AUDREY_DATA_DIR: '/tmp/audrey-data' }, + installFn, + hooksInstallFn: vi.fn(), + execFn: vi.fn(() => { + throw new Error('missing claude'); + }), + writeFile: vi.fn(), + mkdir: vi.fn(), + }); + + expect(result.installedMcp).toBe(false); + expect(result.profile.surface).toBe('sidecar'); + expect(installFn).not.toHaveBeenCalled(); + }); +}); + describe('MCP validation hardening', () => { it('memory_encode rejects empty or whitespace-only content', () => { const schema = z.object(memoryEncodeToolSchema); @@ -1022,7 +1221,7 @@ describe('snapshot and restore round-trip', () => { await audrey.encode({ content: 'test memory beta', source: 'told-by-user' }); const snapshot = audrey.export(); - expect(snapshot.version).toBe('0.16.1'); + expect(snapshot.version).toBe(PACKAGE_VERSION); expect(snapshot.exportedAt).toBeTruthy(); expect(snapshot.episodes).toHaveLength(2); expect(snapshot.episodes[0].content).toBe('test memory alpha');
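+    // Export preserves insertion order, so alpha precedes beta in the snapshot.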