7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -23,8 +23,15 @@ elfmem uses [Semantic Versioning](https://semver.org/).


### Added
- **`MemorySystem.learn_document(text, chunk_size, chunker, skip_llm)`:** Ingest a document in one call — chunks text, learns each chunk, auto-consolidates via `dream()` at `inbox_threshold` intervals. Accepts an optional `chunker` callback (e.g. `nltk.sent_tokenize`); default splits at sentence boundaries. Returns `LearnDocumentResult` with chunk and consolidation counts. A usage sketch follows this list.
- **`LearnDocumentResult` type:** New result type with `chunks_total`, `chunks_created`, `chunks_duplicate`, `consolidations`, `blocks_promoted`. Exported from `elfmem`.
- **BM25 keyword search in retrieval pipeline (stage 2b):** `hybrid_retrieve()` now runs BM25 in parallel with vector search, discovering blocks with strong keyword overlap that embedding similarity misses. Soft dependency on `rank_bm25` — when not installed, the stage is silently skipped (zero regression). Install via `pip install elfmem[bm25]`.
- **`dream(skip_llm, skip_contradictions)` parameters:** `dream()` now forwards `skip_llm` and `skip_contradictions` to `consolidate()`, enabling fast-path consolidation without bypassing policy tracking or threshold persistence.
- **`MABenchConfig.context_window_tokens`:** New config field (default 4096) representing the LM Studio model's context window. All answer-context truncation derives from this value; set to 2048 for smaller models.
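A minimal usage sketch of `learn_document()` and the new `dream()` flags from the list above. Only the names documented in these entries are taken from the source; the database path, sample text, and top-level setup are illustrative assumptions. The BM25 stage needs no code at all: it activates whenever `rank_bm25` is importable (`pip install elfmem[bm25]`) and is skipped otherwise.

```python
# Hedged sketch: API names follow the changelog entries above; the db path
# and sample text are placeholders, not project defaults.
import asyncio

import nltk

from elfmem import LearnDocumentResult, MemorySystem

nltk.download("punkt_tab", quiet=True)  # sentence tokenizer data


async def main() -> None:
    system = await MemorySystem.from_config("memory.db")  # path is illustrative

    # One call: chunk, learn each chunk, auto-consolidate via dream()
    # at inbox_threshold intervals.
    result: LearnDocumentResult = await system.learn_document(
        "First sentence. Second sentence. Third sentence.",
        chunk_size=1024,             # target words per chunk
        chunker=nltk.sent_tokenize,  # optional; default splits at sentence boundaries
        skip_llm=True,               # fast path: no LLM calls during consolidation
    )
    print(result.chunks_total, result.chunks_duplicate, result.blocks_promoted)

    # dream() now forwards the fast-path flags through to consolidate().
    await system.dream(skip_llm=True, skip_contradictions=True)


asyncio.run(main())
```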

### Fixed
- **Config wiring: `contradiction_threshold`, `near_dup_exact_threshold`, `near_dup_near_threshold`:** These three `MemoryConfig` fields existed but were not passed from `MemorySystem.consolidate()` to the consolidation operation. Custom config values were silently ignored (defaults matched, so no observable bug at default settings). Now wired through.
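A sketch of what the fix enables, assuming `MemoryConfig` is importable from `elfmem` and accepts these fields as keyword arguments; the import path and constructor shape are assumptions, and only the three field names come from the entry above.

```python
# Hypothetical: import path and constructor are assumptions; the numeric
# values are placeholders, not the library defaults.
from elfmem import MemoryConfig

cfg = MemoryConfig(
    contradiction_threshold=0.85,
    near_dup_exact_threshold=0.97,
    near_dup_near_threshold=0.92,
)
# Before the fix, consolidate() never received these values and silently
# used the defaults; they are now forwarded to the consolidation operation.
```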

### Added
- **LoCoMo benchmark harness:** Complete benchmark suite for evaluating elfmem against LoCoMo (ACL 2024) — 10 conversations, 1,986 QA pairs, 5 categories. Includes metrics (Porter-stemmed F1), typed data loading, BM25 hybrid retrieval, observation transform, and CLI runner with `--test`, `--baselines`, `--resume`, `--top-k`, `--category` flags. Results conform to `benchmark_report_spec.md`.
- **`consolidate(skip_llm=True)`:** Bypass all LLM calls during consolidation (embed + promote only). Reduces ingestion from hours to seconds for bulk import and benchmarks.
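A short sketch of the fast path from the entry above. `consolidate(skip_llm=True)` and `result.promoted` appear in this PR's adapter diff; the session bookkeeping mirrors that diff, and the helper itself is illustrative.

```python
# Sketch: bulk import, then one LLM-free consolidation pass (embed + promote).
# begin_session/end_session usage mirrors the adapter code below.
async def bulk_ingest(system, chunks: list[str]) -> int:
    await system.begin_session(task_type="ingestion")
    for chunk in chunks:
        await system.learn(content=chunk, category="knowledge", source="bulk")
    await system.end_session()

    await system.begin_session(task_type="consolidation")
    result = await system.consolidate(skip_llm=True)  # no LLM calls
    await system.end_session()
    return result.promoted
```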
186 changes: 26 additions & 160 deletions benchmarks/memoryagentbench/adapter.py
@@ -1,27 +1,27 @@
"""elfmem adapter for MemoryAgentBench — chunk ingestion + retrieval.
"""elfmem adapter for MemoryAgentBench — document ingestion + retrieval.

Key difference from LoCoMo: MemoryAgentBench's Conflict Resolution competency
tests contradiction detection — elfmem's primary moat. For CR, we use FULL
consolidation (contradiction detection ON). For other competencies, we use
skip_contradictions for speed.
skip_llm for speed.

BM25 hybrid retrieval is handled natively by elfmem's retrieval pipeline
(stage 2b in hybrid_retrieve). No adapter-level BM25 or RRF needed.
"""

from __future__ import annotations

import logging
import tempfile
import time
from dataclasses import dataclass, field
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import nltk

from rank_bm25 import BM25Okapi

from elfmem import ElfmemConfig, MemorySystem

from benchmarks.memoryagentbench.config import MABenchConfig
from elfmem import ElfmemConfig, MemorySystem

nltk.download("punkt_tab", quiet=True)

@@ -50,65 +50,6 @@ class ExampleResult:
ingestion_seconds: float


class _BM25Index:
"""BM25 index over block content for keyword-boosted retrieval.

Built after consolidation from elfmem's active block content (summaries
when available, raw content otherwise). Stores block IDs so RRF merge
can match by ID rather than approximate content-prefix heuristics.
"""

def __init__(self) -> None:
self._ids: list[str] = []
self._contents: list[str] = []
self._bm25: BM25Okapi | None = None

def add(self, block_id: str, content: str) -> None:
self._ids.append(block_id)
self._contents.append(content)

def build(self) -> None:
if self._contents:
tokenized = [c.lower().split() for c in self._contents]
self._bm25 = BM25Okapi(tokenized)

def search(self, query: str, top_k: int = 10) -> list[tuple[str, str, float]]:
"""Return (block_id, content, bm25_score) triples ranked by score."""
if self._bm25 is None:
return []
scores = self._bm25.get_scores(query.lower().split())
ranked = sorted(
zip(self._ids, self._contents, scores),
key=lambda x: x[2],
reverse=True,
)
return ranked[:top_k]


def chunk_text(text: str, chunk_size: int = 1024) -> list[str]:
"""Split text into sentence-aligned chunks of ~chunk_size words.

Uses NLTK sentence tokenization for clean boundaries.
"""
sentences = nltk.sent_tokenize(text)
chunks: list[str] = []
current: list[str] = []
current_len = 0

for sentence in sentences:
sent_len = len(sentence.split())
if current_len + sent_len > chunk_size and current:
chunks.append(" ".join(current))
current = []
current_len = 0
current.append(sentence)
current_len += sent_len

if current:
chunks.append(" ".join(current))
return chunks


def _context_budget_words(config: MABenchConfig) -> int:
"""Answer-context word budget derived from the model's context window.

@@ -153,34 +94,6 @@ def build_elfmem_config(config: MABenchConfig) -> ElfmemConfig:
})


def _rrf_merge(
vector_blocks: list,
bm25_results: list[tuple[str, str, float]],
top_k: int,
k: int = 60,
) -> tuple[list, str]:
"""Merge vector search and BM25 results via Reciprocal Rank Fusion.

BM25 results carry block IDs (from _BM25Index.search), so matching is
exact — no content-prefix heuristics, no supplementary raw-chunk fallback.
Both retrieval paths use the same block content (summaries or raw text),
so the merged context is consistent.
"""
block_scores: dict[str, float] = {}
block_map: dict[str, object] = {}
for rank, block in enumerate(vector_blocks):
block_scores[block.id] = 1.0 / (k + rank)
block_map[block.id] = block

for rank, (block_id, _content, _score) in enumerate(bm25_results):
if block_id in block_map:
block_scores[block_id] += 1.0 / (k + rank)

ranked = sorted(block_map.values(), key=lambda b: block_scores[b.id], reverse=True)
trimmed = ranked[:top_k]
return trimmed, "\n\n".join(b.content for b in trimmed)


async def process_example(
example: dict[str, Any],
competency: str,
@@ -199,6 +112,7 @@ async def process_example(

# Conflict Resolution needs contradiction detection — elfmem's moat
is_conflict_resolution = competency == "Conflict_Resolution"
skip_llm = not is_conflict_resolution

elfmem_cfg = build_elfmem_config(config)

@@ -207,80 +121,32 @@

system = await MemorySystem.from_config(db_path, config=elfmem_cfg)
try:
# Phase 1: Chunk and ingest context
# Phase 1: Ingest document via learn_document()
start_ingest = time.monotonic()
chunks = chunk_text(context, config.chunk_size)
total_promoted = 0

# CR needs contradiction detection — elfmem's core capability.
# Other competencies use skip_llm for speed (embedding-only retrieval).
skip_llm = not is_conflict_resolution

await system.begin_session(task_type="ingestion")
for i, chunk in enumerate(chunks):
await system.learn(
content=chunk,
tags=[f"chunk:{i}"],
category="knowledge",
source="memoryagentbench",
)
# Consolidate periodically
if (i + 1) % config.consolidate_every_n_chunks == 0:
await system.end_session()
await system.begin_session(task_type="consolidation")
result = await system.consolidate(skip_llm=skip_llm)
total_promoted += result.promoted
await system.end_session()
await system.begin_session(task_type="ingestion")
await system.end_session()

# Final consolidation for remaining chunks
await system.begin_session(task_type="consolidation")
result = await system.consolidate(skip_llm=skip_llm)
total_promoted += result.promoted
await system.end_session()

# Build BM25 from active block content after consolidation.
# With skip_llm=False (CR), blocks carry LLM-generated summaries —
# the same content used by elfmem's vector retrieval. This ensures
# BM25 and vector search are consistent, enabling exact ID-based RRF
# merging without heuristic content-prefix matching.
# With skip_llm=True (other competencies), content falls back to the
# raw chunk text — same as the previous raw-chunk approach.
await system.begin_session(task_type="retrieval")
index_frame = await system.frame("attention", query=None, top_k=10000)
await system.end_session()

bm25_index = _BM25Index()
for block in index_frame.blocks:
bm25_index.add(block.id, block.content)
bm25_index.build()

doc_result = await system.learn_document(
context,
chunk_size=config.chunk_size,
chunker=nltk.sent_tokenize,
source="memoryagentbench",
skip_llm=skip_llm,
)
ingestion_time = time.monotonic() - start_ingest
log.info(f" Ingested {len(chunks)} chunks, {total_promoted} promoted in {ingestion_time:.1f}s")
log.info(
f" Ingested {doc_result.chunks_total} chunks, "
f"{doc_result.blocks_promoted} promoted in {ingestion_time:.1f}s"
)

# Phase 2: Answer each question
qa_results: list[QAResult] = []
for q_idx, (question, answer_list) in enumerate(zip(questions, answers)):
for _q_idx, (question, answer_list) in enumerate(zip(questions, answers, strict=False)):
q_start = time.monotonic()

await system.begin_session(task_type="retrieval")
frame_result = await system.frame("attention", query=question, top_k=config.top_k)
await system.end_session()

# Build context from blocks directly, bypassing the frame's hardcoded
# token_budget (2000 tokens in the attention frame definition). Both
# the BM25 and no-BM25 paths are now bounded only by _context_budget_words,
# which is derived from config.context_window_tokens.
blocks = frame_result.blocks
# elfmem's recall() includes BM25 (stage 2b) + graph expansion
# + contradiction suppression natively.
blocks = await system.recall(query=question, top_k=config.top_k)
context_text = "\n\n".join(b.content for b in blocks)
bm25_hits = bm25_index.search(question, top_k=config.top_k)
if bm25_hits:
blocks, context_text = _rrf_merge(blocks, bm25_hits, config.top_k)

# Truncate context to fit the model's context window.
# Budget is derived from config.context_window_tokens minus
# system prompt, template, question, and answer overhead.
max_context_words = _context_budget_words(config)
words = context_text.split()
if len(words) > max_context_words:
@@ -304,8 +170,8 @@ async def process_example(
return ExampleResult(
source=source,
competency=competency,
chunks_ingested=len(chunks),
blocks_promoted=total_promoted,
chunks_ingested=doc_result.chunks_total,
blocks_promoted=doc_result.blocks_promoted,
qa_results=qa_results,
ingestion_seconds=ingestion_time,
)
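
For reference, the truncation step above is bounded by `_context_budget_words`, whose body is collapsed in this diff. Below is a hedged reconstruction of how such a budget could be derived from `context_window_tokens`, based on the changelog entry and the inline comments; the words-per-token ratio and the overhead constant are pure assumptions, not the PR's actual numbers.

```python
# Hypothetical reconstruction, NOT the collapsed implementation above.
# Assumes ~0.75 words per token and a flat token allowance for the system
# prompt, answer template, question, and generated answer.
def context_budget_words_sketch(context_window_tokens: int = 4096) -> int:
    overhead_tokens = 700  # assumed prompt/template/question/answer slack
    usable = max(context_window_tokens - overhead_tokens, 0)
    return int(usable * 0.75)  # assumed words-per-token conversion
```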