diff --git a/apps/docs/docs.json b/apps/docs/docs.json index 22b3e9ff..2bbfdc0a 100644 --- a/apps/docs/docs.json +++ b/apps/docs/docs.json @@ -185,6 +185,17 @@ ], "tab": "Developer Platform" }, + { + "icon": "book-open", + "anchors": [ + { + "anchor": "API Reference", + "icon": "unplug", + "openapi": "https://api.supermemory.ai/v3/openapi" + } + ], + "tab": "API Reference" + }, { "icon": "plug", "anchors": [ @@ -234,15 +245,35 @@ "tab": "SDKs" }, { - "icon": "book-open", + "icon": "flask-conical", "anchors": [ { - "anchor": "API Reference", - "icon": "unplug", - "openapi": "https://api.supermemory.ai/v3/openapi" + "anchor": "MemoryBench", + "icon": "flask-conical", + "pages": [ + "memorybench/overview", + "memorybench/github", + { + "group": "Getting Started", + "pages": ["memorybench/installation", "memorybench/quickstart"] + }, + { + "group": "Development", + "pages": [ + "memorybench/architecture", + "memorybench/extend-provider", + "memorybench/extend-benchmark", + "memorybench/contributing" + ] + }, + { + "group": "Reference", + "pages": ["memorybench/cli", "memorybench/integrations"] + } + ] } ], - "tab": "API Reference" + "tab": "MemoryBench" }, { "icon": "chef-hat", @@ -269,7 +300,6 @@ ], "tab": "Cookbook" }, - { "icon": "list-ordered", "anchors": [ diff --git a/apps/docs/memorybench/architecture.mdx b/apps/docs/memorybench/architecture.mdx new file mode 100644 index 00000000..5d087817 --- /dev/null +++ b/apps/docs/memorybench/architecture.mdx @@ -0,0 +1,99 @@ +--- +title: "Architecture" +description: "Understanding MemoryBench's design and implementation" +sidebarTitle: "Architecture" +--- + +## System Overview + +```mermaid +flowchart TB + B["Benchmarks
(LoCoMo, LongMemEval..)"] + P["Providers
(Supermemory, Mem0, Zep)"] + J["Judges
(GPT-4o, Claude..)"] + + B --> O[Orchestrator] + P --> O + J --> O + + O --> Pipeline + + subgraph Pipeline[" "] + direction LR + I[Ingest] --> IX[Indexing] --> S[Search] --> A[Answer] --> E[Evaluate] + end + + style B fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style P fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style J fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style O fill:#0369A1,stroke:#0369A1,color:#fff + style I fill:#F1F5F9,stroke:#64748B,color:#334155 + style IX fill:#F1F5F9,stroke:#64748B,color:#334155 + style S fill:#F1F5F9,stroke:#64748B,color:#334155 + style A fill:#F1F5F9,stroke:#64748B,color:#334155 + style E fill:#F1F5F9,stroke:#64748B,color:#334155 +``` + +## Core Components + +| Component | Role | +|-----------|------| +| **Benchmarks** | Load test data and provide questions with ground truth answers | +| **Providers** | Memory services being evaluated (handle ingestion and search) | +| **Judges** | LLM-based evaluators that score answers against ground truth | + +See [Integrations](/memorybench/integrations) for all supported benchmarks and providers, and [Supported Models](/memorybench/supported-models) for the available judge and answering models. + +## Pipeline + +```mermaid +flowchart LR + A[Ingest] --> B[Index] --> C[Search] --> D[Answer] --> E[Evaluate] --> F[Report] + + style A fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style B fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style C fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style D fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style E fill:#E0F2FE,stroke:#0369A1,color:#0C4A6E + style F fill:#DCFCE7,stroke:#16A34A,color:#166534 +``` + +| Phase | What Happens | +|-------|--------------| +| **Ingest** | Load benchmark sessions → Push to provider | +| **Index** | Wait for provider indexing | +| **Search** | Query provider → Retrieve context | +| **Answer** | Build prompt → Generate answer via LLM | +| **Evaluate** | Compare to ground truth → Score via judge | +| **Report** | Aggregate scores → Output accuracy + latency | + +Each phase checkpoints independently. 
Failed runs resume from the last successful checkpoint. + +## Advanced Checkpointing + +Runs persist to `data/runs/{runId}/`: + +``` +data/runs/my-run/ +├── checkpoint.json # Run state and progress +├── results/ # Search results per question +└── report.json # Final report +``` + +Re-running with the same run ID resumes from the last checkpoint. Use `--force` to restart. + +## File Structure + +``` +src/ +├── cli/commands/ # run, compare, test, serve, status... +├── orchestrator/phases/ # ingest, search, answer, evaluate, report +├── benchmarks/ +│ └── <name>/index.ts # e.g. locomo/, longmemeval/, convomem/ +├── providers/ +│ └── <name>/ +│ ├── index.ts # Provider implementation +│ └── prompts.ts # Custom prompts (optional) +├── judges/ # openai.ts, anthropic.ts, google.ts +└── types/ # provider.ts, benchmark.ts, unified.ts +``` diff --git a/apps/docs/memorybench/cli.mdx b/apps/docs/memorybench/cli.mdx new file mode 100644 index 00000000..3ab5c503 --- /dev/null +++ b/apps/docs/memorybench/cli.mdx @@ -0,0 +1,117 @@ +--- +title: "CLI Reference" +description: "Command-line interface for running MemoryBench evaluations" +sidebarTitle: "CLI" +--- + +## Commands + +### run + +Execute the full benchmark pipeline. + +```bash +bun run src/index.ts run -p <provider> -b <benchmark> -j <judge> -r <run-id> +``` + +| Option | Description | +|--------|-------------| +| `-p, --provider` | Memory provider (`supermemory`, `mem0`, `zep`) | +| `-b, --benchmark` | Benchmark (`locomo`, `longmemeval`, `convomem`) | +| `-j, --judge` | Judge model (default: `gpt-4o`) | +| `-r, --run-id` | Run identifier (auto-generated if omitted) | +| `-m, --answering-model` | Model for answer generation (default: `gpt-4o`) | +| `-l, --limit` | Limit number of questions | +| `-s, --sample` | Sample N questions per category | +| `--sample-type` | Sampling strategy: `consecutive` (default), `random` | +| `--force` | Clear checkpoint and restart | + +See [Supported Models](/memorybench/supported-models) for all available judge and answering models. 
+ +--- + +### compare + +Run a benchmark across multiple providers in parallel. + +```bash +bun run src/index.ts compare -p supermemory,mem0,zep -b locomo -j gpt-4o +``` + +--- + +### test + +Evaluate a single question for debugging. + +```bash +bun run src/index.ts test -r <run-id> -q <question-id> +``` + +--- + +### status + +Check progress of a run. + +```bash +bun run src/index.ts status -r <run-id> +``` + +--- + +### show-failures + +Debug failed questions with full context. + +```bash +bun run src/index.ts show-failures -r <run-id> +``` + +--- + +### list-questions + +Browse benchmark questions. + +```bash +bun run src/index.ts list-questions -b <benchmark> +``` + +--- + +### Random Sampling + +Sample N questions per category with optional randomization. + +```bash +bun run src/index.ts run -p supermemory -b longmemeval -s 3 --sample-type random +``` + +--- + +### serve + +Start the web UI. + +```bash +bun run src/index.ts serve +``` + +Opens at [http://localhost:3000](http://localhost:3000). + +--- + +### help + +Get help on providers, models, or benchmarks. + +```bash +bun run src/index.ts help providers +bun run src/index.ts help models +bun run src/index.ts help benchmarks +``` + +## Checkpointing + +Runs are saved to `data/runs/{runId}/` and automatically resume from the last successful phase. Use `--force` to restart. diff --git a/apps/docs/memorybench/contributing.mdx b/apps/docs/memorybench/contributing.mdx new file mode 100644 index 00000000..2f8e45e2 --- /dev/null +++ b/apps/docs/memorybench/contributing.mdx @@ -0,0 +1,89 @@ +--- +title: "Contributing" +description: "Guidelines for contributing to MemoryBench" +sidebarTitle: "Contributing" +--- + +## Getting Started + +1. Fork the repository +2. Clone your fork: + ```bash + git clone https://github.com/YOUR_USERNAME/memorybench + cd memorybench + bun install + ``` +3. 
Create a branch: + ```bash + git checkout -b feature/your-feature + ``` + +## Development Workflow + +### Running Tests + +```bash +bun test +``` + +### Running the CLI + +```bash +bun run src/index.ts +``` + +### Running the Web UI + +```bash +cd ui +bun run dev +``` + +## Code Structure + +| Directory | Purpose | +|-----------|---------| +| `src/cli/` | CLI commands | +| `src/orchestrator/` | Pipeline execution | +| `src/benchmarks/` | Benchmark adapters | +| `src/providers/` | Provider integrations | +| `src/judges/` | LLM judge implementations | +| `src/types/` | TypeScript interfaces | +| `ui/` | Next.js web interface | + +## Contribution Types + +### Adding a Provider + +See [Extending MemoryBench](/memorybench/extend-provider) for the full guide. + +1. Create `src/providers/yourprovider/index.ts` +2. Implement the `Provider` interface +3. Register in `src/providers/index.ts` +4. Add config in `src/utils/config.ts` +5. Submit PR with tests + +### Adding a Benchmark + +1. Create `src/benchmarks/yourbenchmark/index.ts` +2. Implement the `Benchmark` interface +3. Register in `src/benchmarks/index.ts` +4. Document question types +5. Submit PR with sample data + +### Bug Fixes + +1. Create an issue describing the bug +2. Reference the issue in your PR +3. Include test cases that reproduce the bug + +## Pull Request Guidelines + +- Keep PRs focused on a single change +- Update documentation if needed +- Ensure all tests pass +- Follow existing code style + +## Questions? + +Open an issue on [GitHub](https://github.com/supermemoryai/memorybench/issues). 
diff --git a/apps/docs/memorybench/extend-benchmark.mdx b/apps/docs/memorybench/extend-benchmark.mdx new file mode 100644 index 00000000..c66cf4d9 --- /dev/null +++ b/apps/docs/memorybench/extend-benchmark.mdx @@ -0,0 +1,75 @@ +--- +title: "Extend Benchmark" +description: "Add a custom benchmark dataset to MemoryBench" +sidebarTitle: "Extend Benchmark" +--- + +## Benchmark Interface + +```typescript +interface Benchmark { + name: string + load(config?: BenchmarkConfig): Promise<void> + getQuestions(filter?: QuestionFilter): UnifiedQuestion[] + getHaystackSessions(questionId: string): UnifiedSession[] + getGroundTruth(questionId: string): string + getQuestionTypes(): QuestionTypeRegistry +} +``` + +--- + +## Adding a Custom Benchmark + +### 1. Create the Benchmark + +```typescript +// src/benchmarks/mybenchmark/index.ts +import type { Benchmark, UnifiedQuestion, UnifiedSession } from "../../types" + +export class MyBenchmark implements Benchmark { + name = "mybenchmark" + private questions: UnifiedQuestion[] = [] + private sessions: Map<string, UnifiedSession[]> = new Map() + + async load() { + const data = await this.loadDataset() + this.processData(data) + } + + getQuestions(filter?: QuestionFilter) { + let result = [...this.questions] + if (filter?.limit) result = result.slice(0, filter.limit) + return result + } + + getHaystackSessions(questionId: string) { + return this.sessions.get(questionId) || [] + } + + getGroundTruth(questionId: string) { + return this.questions.find(q => q.questionId === questionId)?.groundTruth || "" + } + + getQuestionTypes() { + return { + "type1": { id: "type1", description: "Type 1 questions" }, + "type2": { id: "type2", description: "Type 2 questions" }, + } + } +} +``` + +### 2. 
Register the Benchmark + +```typescript +// src/benchmarks/index.ts +import { MyBenchmark } from "./mybenchmark" + +export const benchmarks = { + locomo: LoComoBenchmark, + longmemeval: LongMemEvalBenchmark, + convomem: ConvoMemBenchmark, + mybenchmark: MyBenchmark, // Add here +} +``` diff --git a/apps/docs/memorybench/extend-provider.mdx b/apps/docs/memorybench/extend-provider.mdx new file mode 100644 index 00000000..f3fec92e --- /dev/null +++ b/apps/docs/memorybench/extend-provider.mdx @@ -0,0 +1,118 @@ +--- +title: "Extend Provider" +description: "Add a custom memory provider to MemoryBench" +sidebarTitle: "Extend Provider" +--- + +## Provider Interface + +```typescript +interface Provider { + name: string + prompts?: ProviderPrompts + initialize(config: ProviderConfig): Promise<void> + ingest(sessions: UnifiedSession[], options: IngestOptions): Promise<IngestResult> + awaitIndexing(result: IngestResult, containerTag: string): Promise<void> + search(query: string, options: SearchOptions): Promise<unknown[]> + clear(containerTag: string): Promise<void> +} +``` + +--- + +## Adding a Custom Provider + +### 1. 
Create the Provider + +```typescript +// src/providers/myprovider/index.ts +import type { Provider, ProviderConfig, UnifiedSession } from "../../types" + +export class MyProvider implements Provider { + name = "myprovider" + private client: MyClient | null = null + + async initialize(config: ProviderConfig) { + this.client = new MyClient({ apiKey: config.apiKey }) + } + + async ingest(sessions: UnifiedSession[], options: IngestOptions) { + const documentIds: string[] = [] + for (const session of sessions) { + const response = await this.client.add({ + content: JSON.stringify(session.messages), + metadata: session.metadata + }) + documentIds.push(response.id) + } + return { documentIds } + } + + async awaitIndexing(result: IngestResult) { + // Poll until indexing complete + } + + async search(query: string, options: SearchOptions) { + return await this.client.search({ q: query, limit: 10 }) + } + + async clear(containerTag: string) { + await this.client.delete(containerTag) + } +} +``` + +### 2. Register the Provider + +```typescript +// src/providers/index.ts +import { MyProvider } from "./myprovider" + +export const providers = { + supermemory: SupermemoryProvider, + mem0: Mem0Provider, + zep: ZepProvider, + myprovider: MyProvider, // Add here +} +``` + +### 3. Add Configuration + +```typescript +// src/utils/config.ts +case "myprovider": + return { + apiKey: process.env.MYPROVIDER_API_KEY!, + } +``` + +--- + +## Custom Prompts + +Providers can define custom answer and judge prompts for better results. 
+ +```typescript +// src/providers/myprovider/prompts.ts +export const MY_PROMPTS: ProviderPrompts = { + answerPrompt: (question, context, questionDate) => { + return `Based on context:\n${context}\n\nAnswer: ${question}` + }, + + judgePrompt: (question, groundTruth, hypothesis) => ({ + default: "Compare answer to ground truth...", + temporal: "Allow off-by-one for dates...", + adversarial: "Check if model correctly abstained...", + }) +} +``` + +Then reference in your provider: + +```typescript +export class MyProvider implements Provider { + name = "myprovider" + prompts = MY_PROMPTS // Custom prompts + // ... +} +``` diff --git a/apps/docs/memorybench/github.mdx b/apps/docs/memorybench/github.mdx new file mode 100644 index 00000000..34468d21 --- /dev/null +++ b/apps/docs/memorybench/github.mdx @@ -0,0 +1,5 @@ +--- +title: "MemoryBench on GitHub" +url: "https://github.com/supermemoryai/memorybench" +icon: github +--- diff --git a/apps/docs/memorybench/installation.mdx b/apps/docs/memorybench/installation.mdx new file mode 100644 index 00000000..ef21cac8 --- /dev/null +++ b/apps/docs/memorybench/installation.mdx @@ -0,0 +1,60 @@ +--- +title: "Installation" +description: "Get MemoryBench up and running in your environment" +sidebarTitle: "Installation" +--- + +## Prerequisites + +- [Bun](https://bun.sh) runtime installed +- API keys for providers and LLM judges you want to use + +## Install MemoryBench + +```bash +git clone https://github.com/supermemoryai/memorybench +cd memorybench +bun install +``` + +## Configure API Keys + +Create a `.env.local` file in the root directory: + +```bash +# Memory Providers (add keys for providers you want to test) +SUPERMEMORY_API_KEY=your_key +MEM0_API_KEY=your_key +ZEP_API_KEY=your_key + +# LLM Judges (at least one required) +OPENAI_API_KEY=your_key +ANTHROPIC_API_KEY=your_key +GOOGLE_API_KEY=your_key +``` + + +You only need API keys for the providers and judges you plan to use. 
For example, to benchmark Supermemory with GPT-4o as judge, you only need `SUPERMEMORY_API_KEY` and `OPENAI_API_KEY`. + + +## Verify Installation + +```bash +bun run src/index.ts help +``` + +You should see the list of available commands. + +## Start the Web Interface + +```bash +bun run src/index.ts serve +``` + +Opens at [http://localhost:3000](http://localhost:3000). + +## Next Steps + +- [CLI Reference](/memorybench/cli) - Play around with MemoryBench +- [Architecture](/memorybench/architecture) - Understand how MemoryBench works +- [Extend MemoryBench](/memorybench/extend-provider) - Add custom providers, benchmarks, and prompts diff --git a/apps/docs/memorybench/integrations.mdx b/apps/docs/memorybench/integrations.mdx new file mode 100644 index 00000000..e4532d50 --- /dev/null +++ b/apps/docs/memorybench/integrations.mdx @@ -0,0 +1,39 @@ +--- +title: "Integrations" +description: "Supported benchmarks and providers in MemoryBench" +sidebarTitle: "Integrations" +--- + +## Benchmarks + +| Benchmark | Description | Source | Categories | +|-----------|-------------|--------|------------| +| LoCoMo | Long context memory testing fact recall across extended conversations | [snap-research/locomo](https://github.com/snap-research/locomo) | `single-hop`, `multi-hop`, `temporal`, `world-knowledge`, `adversarial` | +| LongMemEval | Long-term memory evaluation across multiple sessions with knowledge updates | [xiaowu0162/longmemeval](https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned) | `single-session-user`, `single-session-assistant`, `multi-session`, `temporal-reasoning`, `knowledge-update` | +| ConvoMem | Conversational memory focused on personalization and preference learning | [Salesforce/ConvoMem](https://huggingface.co/datasets/Salesforce/ConvoMem) | `user_evidence`, `assistant_facts_evidence`, `preference_evidence`, `changing_evidence`, `abstention_evidence` | + + +We're actively adding support for more benchmarks. 
[Contribute your own](/memorybench/extend-benchmark) or [create a feature request](https://github.com/supermemoryai/memorybench/issues). + + +--- + +## Providers + + + + Chunk-based semantic search + + + + LLM-powered memory extraction + + + + Knowledge graph construction + + + + +We're actively adding support for more providers. [Contribute your own](/memorybench/extend-provider) or [create a feature request](https://github.com/supermemoryai/memorybench/issues). + diff --git a/apps/docs/memorybench/overview.mdx b/apps/docs/memorybench/overview.mdx new file mode 100644 index 00000000..14e01466 --- /dev/null +++ b/apps/docs/memorybench/overview.mdx @@ -0,0 +1,53 @@ +--- +title: "MemoryBench" +description: "Open-source framework for standardized, reproducible benchmarks of memory layer providers" +sidebarTitle: "Overview" +icon: "flask-conical" +--- + +Our goal is to make evaluation more rigorous, accessible, and in line with industry standards. Design and run evaluations tailored to your specific needs, and run industry-standard benchmarks easily on any memory provider. With MemoryBench, you can build trust in a provider through transparent, reproducible, and domain-relevant evaluations. + + + +
+ ```bun run src/index.ts serve``` +
+ + + Get MemoryBench up and running in your environment + + + + Command-line interface for running evaluations + + + + Understanding MemoryBench's design and implementation + +
+ +## Works with any memory provider + + + + Cloud-based memory layer + + + + Graph-based memory + + + + Long-term memory for AI + + + + +We're actively adding support for more providers. [Contribute your own](/memorybench/extend-provider) or [create a feature request](https://github.com/supermemoryai/memorybench/issues). + + +## Contribute + + + Found a bug or have a feature request? Let us know. + diff --git a/apps/docs/memorybench/quickstart.mdx b/apps/docs/memorybench/quickstart.mdx new file mode 100644 index 00000000..e52094a9 --- /dev/null +++ b/apps/docs/memorybench/quickstart.mdx @@ -0,0 +1,61 @@ +--- +title: "Quick Start" +description: "Run your first benchmark evaluation in 3 steps" +sidebarTitle: "Quick Start" +--- + +## 1. Run Your First Benchmark + +```bash +bun run src/index.ts run -p supermemory -b longmemeval -j gpt-4o -r my-first-run +``` + +## 2. View Results + +### Option A: Web UI + +```bash +bun run src/index.ts serve +``` + +Open [http://localhost:3000](http://localhost:3000) to see results visually. + +### Option B: CLI + +```bash +# Check run status +bun run src/index.ts status -r my-first-run + +# View failed questions for debugging +bun run src/index.ts show-failures -r my-first-run +``` + +## 3. Compare Providers + +Run the same benchmark across multiple providers: + +```bash +bun run src/index.ts compare -p supermemory,mem0,zep -b locomo -j gpt-4o +``` + +Results are saved to `data/runs/{runId}/report.json`. + +## Sample Output + +```json +{ + "accuracy": 0.72, + "accuracyByType": { + "single-hop": 0.85, + "multi-hop": 0.65, + "temporal": 0.70, + "adversarial": 0.68 + }, + "avgLatency": 1250, + "totalQuestions": 50 +} +``` + +## What's Next + +Head to [CLI Reference](/memorybench/cli) to play around with all the commands, or check out [Architecture](/memorybench/architecture) to understand how MemoryBench works under the hood. 
diff --git a/apps/docs/memorybench/supported-models.mdx b/apps/docs/memorybench/supported-models.mdx new file mode 100644 index 00000000..fd374c64 --- /dev/null +++ b/apps/docs/memorybench/supported-models.mdx @@ -0,0 +1,49 @@ +--- +title: "Supported Models" +description: "Available models for judges and answer generation" +sidebarTitle: "Supported Models" +--- + +Models available for evaluation judges and answer generation in MemoryBench. + +## OpenAI + +| Model Name | Slug | +|------------|------| +| GPT-4o | `gpt-4o` | +| GPT-4o Mini | `gpt-4o-mini` | +| GPT-4.1 | `gpt-4.1` | +| GPT-4.1 Mini | `gpt-4.1-mini` | +| GPT-4.1 Nano | `gpt-4.1-nano` | +| GPT-5 | `gpt-5` | +| GPT-5 Mini | `gpt-5-mini` | +| o1 | `o1` | +| o1 Pro | `o1-pro` | +| o3 | `o3` | +| o3 Mini | `o3-mini` | +| o3 Pro | `o3-pro` | +| o4 Mini | `o4-mini` | + +## Anthropic + +| Model Name | Slug | +|------------|------| +| Claude Opus 4.5 | `opus-4.5` | +| Claude Sonnet 4.5 | `sonnet-4.5` | +| Claude Haiku 4.5 | `haiku-4.5` | +| Claude Opus 4.1 | `opus-4.1` | +| Claude Sonnet 4 | `sonnet-4` | + +## Google + +| Model Name | Slug | +|------------|------| +| Gemini 2.5 Pro | `gemini-2.5-pro` | +| Gemini 2.5 Flash | `gemini-2.5-flash` | +| Gemini 2.5 Flash Lite | `gemini-2.5-flash-lite` | +| Gemini 2.0 Flash | `gemini-2.0-flash` | +| Gemini 3 Pro Preview | `gemini-3-pro-preview` | + + +Make sure you have the corresponding API key set in your `.env.local` for the model you want to use. +