diff --git a/app/tests/eval_harness.rs b/app/tests/eval_harness.rs index a1ff9543..83355b9d 100644 --- a/app/tests/eval_harness.rs +++ b/app/tests/eval_harness.rs @@ -1579,6 +1579,90 @@ async fn judge_e2e_context_locomo() { eprintln!("\nTotal judged: {}", report.total_judged); } +/// Generate LongMemEval E2E answers via Claude CLI Haiku (Max plan, no API key). +/// +/// Mirrors `generate_e2e_context_tuples_locomo_api` for the LME side. Uses the +/// same `run_e2e_context_eval_longmemeval` path that exercises Task #11's +/// dated-context formatter and `question_date` system-prompt anchor. +#[tokio::test] +#[ignore] +async fn generate_e2e_context_tuples_longmemeval_api() { + use origin_lib::eval::token_efficiency::{ + run_e2e_context_eval_longmemeval, save_judgment_tuples, + }; + use std::sync::Arc; + + let path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/data/longmemeval_oracle.json"); + if !path.exists() { + eprintln!("SKIP: longmemeval_oracle.json not found"); + return; + } + + let llm: Arc = + Arc::new(origin_lib::llm_provider::ClaudeCliProvider::haiku()); + + // 50 questions for validation, 1 answer per question. + let tuples = run_e2e_context_eval_longmemeval(&path, llm, 10, 50, 1) + .await + .expect("run_e2e_context_eval_longmemeval with Haiku CLI failed"); + + eprintln!("Generated {} judgment tuples (Haiku CLI)", tuples.len()); + assert!(!tuples.is_empty()); + + let baselines_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/baselines"); + std::fs::create_dir_all(&baselines_dir).ok(); + let out_path = baselines_dir.join("e2e_context_tuples_longmemeval_api.json"); + save_judgment_tuples(&tuples, &out_path).expect("save tuples"); + eprintln!("Saved to {:?}", out_path); +} + +/// Judge LongMemEval API-generated tuples with Claude Haiku (matches the answer model). +/// Run after `generate_e2e_context_tuples_longmemeval_api`. 
+#[tokio::test] +#[ignore] +async fn judge_e2e_context_longmemeval_api_haiku() { + use origin_lib::eval::token_efficiency::{ + aggregate_judgments, judge_with_claude_model, load_judgment_tuples, + }; + + let tuples_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("eval/baselines/e2e_context_tuples_longmemeval_api.json"); + if !tuples_path.exists() { + eprintln!("SKIP: run generate_e2e_context_tuples_longmemeval_api first"); + return; + } + + let tuples = load_judgment_tuples(&tuples_path).expect("load tuples"); + eprintln!("Judging {} tuples with Haiku...", tuples.len()); + + let results = judge_with_claude_model(&tuples, 3, "haiku") + .await + .expect("judge failed"); + + let report = aggregate_judgments(&results, "haiku"); + eprintln!("\n=== E2E Context Eval: LongMemEval (Haiku answers, Haiku judge) ==="); + eprintln!( + "{:<25} | {:<10} | {:<10} | {:<14} | Total", + "Approach", "Accuracy", "Correct", "Context Tok" + ); + eprintln!( + "{:-<25}-+-{:-<10}-+-{:-<10}-+-{:-<14}-+-{:-<6}", + "", "", "", "", "" + ); + for r in &report.results_by_approach { + eprintln!( + "{:<25} | {:<10.1}% | {:<10} | {:<14.0} | {}", + r.approach, + r.accuracy * 100.0, + r.correct, + r.mean_context_tokens, + r.total + ); + } + eprintln!("\nTotal judged: {}", report.total_judged); +} + // --------------------------------------------------------------------------- // API-based E2E: Haiku as answer model, Sonnet as judge // --------------------------------------------------------------------------- @@ -2548,3 +2632,411 @@ async fn probe_overlap_gate() { } } } + +// --------------------------------------------------------------------------- +// Retrieval-only diagnostic: no LLM calls, no API cost. Seeds conv-26, runs +// search_memory for the 5 temporal questions, and dumps which hits came back +// alongside whether the literal answer text exists in the seeded data and +// where it ranks (or whether it ranks at all). 
+// --------------------------------------------------------------------------- +#[tokio::test] +#[ignore] +async fn temporal_retrieval_diag_locomo() { + use origin_core::events::NoopEmitter; + use origin_lib::memory_db::MemoryDB; + use origin_lib::sources::RawDocument; + use std::sync::Arc; + + let locomo_path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/data/locomo10.json"); + let samples = origin_lib::eval::locomo::load_locomo(&locomo_path).expect("load"); + let sample = &samples[0]; + let memories = origin_lib::eval::locomo::extract_observations(sample); + eprintln!("[diag] {} memories seeded", memories.len()); + + let probes = [ + ("LGBTQ support group", "support group"), + ("painted a sunrise", "sunrise"), + ("charity race", "charity race"), + ("camping next month", "camping"), + ("speech at a school", "speech"), + ]; + eprintln!("\n[diag] Substring check on raw extracted content:"); + for (label, needle) in &probes { + let hits: Vec<&str> = memories + .iter() + .filter(|m| m.content.to_lowercase().contains(&needle.to_lowercase())) + .map(|m| m.content.as_str()) + .collect(); + eprintln!(" {:<22} ({} matches in extraction):", label, hits.len()); + for h in hits.iter().take(3) { + eprintln!(" - {}", h.chars().take(120).collect::()); + } + } + + let tmp = tempfile::tempdir().expect("tempdir"); + let db = MemoryDB::new(tmp.path(), Arc::new(NoopEmitter)) + .await + .expect("db"); + let docs: Vec = memories + .iter() + .enumerate() + .map(|(i, mem)| RawDocument { + content: mem.content.clone(), + source_id: format!("locomo_{}_obs_{}", sample.sample_id, i), + source: "memory".to_string(), + title: format!("{} session {}", mem.speaker, mem.session_num), + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + last_modified: chrono::Utc::now().timestamp(), + event_date: origin_lib::eval::dates::seed_event_date( + mem.session_date.as_deref(), + origin_lib::eval::locomo::parse_locomo_date, + ), + ..Default::default() + 
}) + .collect(); + db.upsert_documents(docs).await.expect("upsert"); + + let queries = [ + ( + "Q1 LGBTQ support group", + "When did Caroline go to the LGBTQ support group?", + "support group", + ), + ( + "Q2 painted sunrise", + "When did Melanie paint a sunrise?", + "sunrise", + ), + ( + "Q3 charity race", + "When did Melanie run a charity race?", + "charity race", + ), + ( + "Q4 camping plan", + "When is Melanie planning on going camping?", + "camping", + ), + ( + "Q5 school speech", + "When did Caroline give a speech at a school?", + "speech", + ), + ]; + + for (label, q, needle) in &queries { + eprintln!("\n=== {} ===", label); + eprintln!(" query: {}", q); + let hits = db + .search_memory(q, 50, None, Some("conversation"), None, None, None, None) + .await + .expect("search"); + eprintln!(" returned {} hits", hits.len()); + + let needle_l = needle.to_lowercase(); + let target_rank = hits + .iter() + .position(|h| h.content.to_lowercase().contains(&needle_l)); + match target_rank { + Some(r) => { + eprintln!(" ✓ first hit containing {:?}: rank {}", needle, r + 1); + eprintln!( + " {}", + hits[r].content.chars().take(140).collect::() + ); + } + None => { + eprintln!(" ✗ NO hit in top-50 contains {:?}", needle); + let raw_hits: Vec<&origin_lib::eval::locomo::LocomoMemory> = memories + .iter() + .filter(|m| m.content.to_lowercase().contains(&needle_l)) + .collect(); + if raw_hits.is_empty() { + eprintln!(" + raw extraction also has 0 — extraction dropped it"); + } else { + eprintln!( + " + raw extraction has {} matches — search ranking missed them:", + raw_hits.len() + ); + for m in raw_hits.iter().take(3) { + eprintln!( + " - {}", + m.content.chars().take(140).collect::() + ); + } + } + } + } + eprintln!(" top 5 returned:"); + for (i, h) in hits.iter().take(5).enumerate() { + eprintln!( + " {}. 
[score={:.3}] {}", + i + 1, + h.score, + h.content.chars().take(110).collect::() + ); + } + } +} + +// --------------------------------------------------------------------------- +// Temporal smoke: A/B compare date-aware vs date-blind context on 5 LoCoMo +// temporal questions. No enrichment (search_memory works on vectors+FTS alone), +// so the run completes in ~2 min on Haiku CLI. Used to verify the temporal +// mechanisms before committing to the full 20- or 50-question eval. +// --------------------------------------------------------------------------- +#[tokio::test] +#[ignore] +async fn temporal_smoke_locomo_5q() { + use origin_core::events::NoopEmitter; + use origin_lib::llm_provider::{ClaudeCliProvider, LlmProvider, LlmRequest}; + use origin_lib::memory_db::MemoryDB; + use origin_lib::sources::RawDocument; + use std::sync::Arc; + + let locomo_path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/data/locomo10.json"); + if !locomo_path.exists() { + eprintln!("SKIP: locomo10.json not found"); + return; + } + + let samples = origin_lib::eval::locomo::load_locomo(&locomo_path).expect("load locomo"); + let sample = &samples[0]; + let memories = origin_lib::eval::locomo::extract_observations(sample); + eprintln!( + "[smoke] conv {} — {} observations across {} sessions", + sample.sample_id, + memories.len(), + memories + .iter() + .map(|m| m.session_num) + .collect::>() + .len() + ); + + // Seed with REAL session dates — date prefix and date-blind paths read the + // same DB; they differ only in how they render and prompt. 
+ let tmp = tempfile::tempdir().expect("tempdir"); + let db = MemoryDB::new(tmp.path(), Arc::new(NoopEmitter)) + .await + .expect("db"); + let docs: Vec = memories + .iter() + .enumerate() + .map(|(i, mem)| RawDocument { + content: mem.content.clone(), + source_id: format!("locomo_{}_obs_{}", sample.sample_id, i), + source: "memory".to_string(), + title: format!("{} session {}", mem.speaker, mem.session_num), + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + last_modified: chrono::Utc::now().timestamp(), + event_date: origin_lib::eval::dates::seed_event_date( + mem.session_date.as_deref(), + origin_lib::eval::locomo::parse_locomo_date, + ), + ..Default::default() + }) + .collect(); + db.upsert_documents(docs).await.expect("upsert"); + + // Latest parseable session date = LoCoMo "asked on". + let asked_on: Option = memories + .iter() + .filter_map(|m| m.session_date.clone()) + .filter(|d| origin_lib::eval::locomo::parse_locomo_date(d).is_some()) + .max_by_key(|d| origin_lib::eval::locomo::parse_locomo_date(d).unwrap_or(0)); + eprintln!("[smoke] asked_on (latest session): {:?}", asked_on); + + // First 5 temporal questions (category 2). + let temporal_qs: Vec<&origin_lib::eval::locomo::LocomoQA> = sample + .qa + .iter() + .filter(|qa| qa.category == 2) + .take(5) + .collect(); + + let llm: Arc = Arc::new(ClaudeCliProvider::haiku()); + + let mut results: Vec<(String, String, String, String)> = Vec::new(); // q, gt, A, B + for (i, qa) in temporal_qs.iter().enumerate() { + let gt = qa + .answer + .as_ref() + .map(|v| v.as_str().unwrap_or(&v.to_string()).to_string()) + .unwrap_or_default(); + if gt.is_empty() { + continue; + } + eprintln!("\n[smoke] Q{}: {}", i + 1, qa.question); + eprintln!(" GT: {}", gt); + + let hits = db + .search_memory( + &qa.question, + 30, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .expect("search"); + // Diagnostic: is the relevant content in top-30? 
+ eprintln!(" retrieved {} hits; first 3 contents:", hits.len()); + for r in hits.iter().take(3) { + eprintln!( + " - {}", + r.content.chars().take(90).collect::() + ); + } + + // A: date-aware context + date-anchored system prompt + let ctx_a: String = hits + .iter() + .map(|r| { + format!( + "On {}: {}", + origin_lib::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), + r.content + ) + }) + .collect::>() + .join("\n"); + let sys_a = match asked_on.as_deref() { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences.".to_string(), + }; + let answer_a = llm + .generate(LlmRequest { + system_prompt: Some(sys_a), + user_prompt: format!("Context:\n{}\n\nQuestion: {}", ctx_a, qa.question), + max_tokens: 200, + temperature: 0.1, + label: Some("smoke_A".to_string()), + }) + .await + .unwrap_or_else(|e| format!("ERR: {e}")); + eprintln!(" A (with dates): {}", answer_a.trim()); + + // B: date-blind context + plain system prompt + let ctx_b: String = hits + .iter() + .map(|r| r.content.clone()) + .collect::>() + .join("\n"); + let sys_b = "Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences.".to_string(); + let answer_b = llm + .generate(LlmRequest { + system_prompt: Some(sys_b), + user_prompt: format!("Context:\n{}\n\nQuestion: {}", ctx_b, qa.question), + max_tokens: 200, + temperature: 0.1, + label: Some("smoke_B".to_string()), + }) + .await + .unwrap_or_else(|e| format!("ERR: {e}")); + eprintln!(" B (no dates): {}", answer_b.trim()); + + results.push((qa.question.clone(), gt, answer_a, answer_b)); + } + + // Substring-match scoring: extract the date/year from ground truth and look for it. 
+ fn score(answer: &str, gt: &str) -> bool { + let needle: String = gt.to_lowercase(); + let hay: String = answer.to_lowercase(); + // Match on "may", "2023", "may 2023", "7 may", etc — any non-empty token from GT + // that's a year (4 digits) or a month name should appear in the answer. + let months = [ + "january", + "february", + "march", + "april", + "may", + "june", + "july", + "august", + "september", + "october", + "november", + "december", + "jan", + "feb", + "mar", + "apr", + "jun", + "jul", + "aug", + "sep", + "oct", + "nov", + "dec", + ]; + let years: Vec<&str> = needle + .split(|c: char| !c.is_ascii_digit()) + .filter(|s| s.len() == 4 && s.starts_with("20")) + .collect(); + let needs_year = !years.is_empty(); + let needs_month = months.iter().any(|m| needle.contains(m)); + let year_ok = years.iter().any(|y| hay.contains(y)); + let month_ok = months.iter().any(|m| needle.contains(m) && hay.contains(m)); + let approx_match = needle + .split_whitespace() + .any(|w| w.len() > 3 && hay.contains(w)); + match (needs_year, needs_month) { + (true, true) => year_ok && month_ok, + (true, false) => year_ok, + (false, true) => month_ok, + (false, false) => approx_match, + } + } + + let mut a_correct = 0; + let mut b_correct = 0; + eprintln!("\n========================================"); + eprintln!("Smoke result: A=date-aware, B=date-blind"); + eprintln!("========================================"); + for (i, (q, gt, a, b)) in results.iter().enumerate() { + let a_ok = score(a, gt); + let b_ok = score(b, gt); + if a_ok { + a_correct += 1; + } + if b_ok { + b_correct += 1; + } + eprintln!( + "Q{}: A={} B={} — gt: {:?}", + i + 1, + if a_ok { "✓" } else { "✗" }, + if b_ok { "✓" } else { "✗" }, + gt.chars().take(40).collect::(), + ); + eprintln!(" Q: {}", q.chars().take(80).collect::()); + } + let n = results.len().max(1); + eprintln!( + "\nA (date-aware): {}/{} = {:.0}%", + a_correct, + n, + 100.0 * a_correct as f64 / n as f64 + ); + eprintln!( + "B (date-blind): {}/{} = 
{:.0}%", + b_correct, + n, + 100.0 * b_correct as f64 / n as f64 + ); + eprintln!( + "Lift: {} pp", + (a_correct as i64 - b_correct as i64) * 100 / n as i64 + ); +} diff --git a/crates/origin-core/src/db.rs b/crates/origin-core/src/db.rs index c2a73de2..e3f9794c 100644 --- a/crates/origin-core/src/db.rs +++ b/crates/origin-core/src/db.rs @@ -4112,6 +4112,31 @@ impl MemoryDB { } } + // Migration 44: event_date column on memories — when the event the memory + // describes actually happened, distinct from last_modified (ingestion time). + // Necessary so importing old content (email archives, conversation backfills, + // benchmark seeds) doesn't get penalised by recency decay scoring, while still + // letting the LLM see real dates in retrieved context. + if version < 44 { + let chunk_cols = self.get_table_columns("memories").await?; + let conn = self.conn.lock().await; + + if !chunk_cols.contains("event_date") { + conn.execute("ALTER TABLE memories ADD COLUMN event_date INTEGER", ()) + .await + .map_err(|e| { + OriginError::VectorDb(format!("migration 44 add event_date: {e}")) + })?; + log::info!( + "[memory_db] migration 44: added memories.event_date (NULL-able, no backfill)" + ); + } + + conn.execute("PRAGMA user_version = 44", ()) + .await + .map_err(|e| OriginError::VectorDb(format!("set user_version=44: {e}")))?; + } + Ok(()) } @@ -5067,7 +5092,8 @@ impl MemoryDB { /// 11=byte_start, 12=byte_end, 13=semantic_unit, 14=memory_type, 15=domain, /// 16=source_agent, 17=confidence, 18=confirmed, 19=stability, 20=supersedes, /// 21=entity_id, 22=quality, 23=is_recap, 24=supersede_mode, - /// 25=structured_fields, 26=retrieval_cue, 27=source_text, 28=score/distance/rank + /// 25=structured_fields, 26=retrieval_cue, 27=source_text, 28=created_at, + /// 29=event_date, 30=score/distance/rank fn row_to_search_result(row: &libsql::Row, score: f32) -> Result { Ok(SearchResult { id: row @@ -5111,6 +5137,8 @@ impl MemoryDB { structured_fields: row.get::>(25).unwrap_or(None), 
retrieval_cue: row.get::>(26).unwrap_or(None), source_text: row.get::>(27).unwrap_or(None), + created_at: row.get::(28).unwrap_or(0), + event_date: row.get::>(29).unwrap_or(None), raw_score: 0.0, // Set later during normalization }) } @@ -5134,6 +5162,7 @@ impl MemoryDB { url: Option, chunk_index: i32, last_modified: i64, + event_date: Option, chunk_type: String, language: Option, byte_start: Option, @@ -5223,6 +5252,7 @@ impl MemoryDB { url: doc.url.clone(), chunk_index: i as i32, last_modified: doc.last_modified, + event_date: doc.event_date, chunk_type: chunk.chunk_type.clone(), language: chunk.language.clone(), byte_start: chunk.byte_range.map(|(s, _)| s as i64), @@ -5356,6 +5386,11 @@ impl MemoryDB { .map(|s| s.into()) .unwrap_or(libsql::Value::Null); + let event_date_val = row + .event_date + .map(libsql::Value::Integer) + .unwrap_or(libsql::Value::Null); + conn.execute( "INSERT INTO memories (id, content, source, source_id, title, summary, url, chunk_index, last_modified, chunk_type, language, byte_start, byte_end, @@ -5363,12 +5398,12 @@ impl MemoryDB { stability, supersedes, pending_revision, word_count, entity_id, enrichment_status, quality, is_recap, supersede_mode, structured_fields, retrieval_cue, source_text, - embedding, created_at) + embedding, created_at, event_date) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18, ?19, ?20, ?21, ?22, ?23, ?24, ?25, ?26, ?27, ?28, ?29, ?30, ?31, - vector32(?32), ?33)", + vector32(?32), ?33, ?34)", libsql::params![ row.id, row.content, @@ -5402,7 +5437,8 @@ impl MemoryDB { retrieval_cue_val, source_text_val, vec_str, - row.last_modified // created_at = last_modified at insert time + row.last_modified, // created_at = last_modified at insert time + event_date_val ], ) .await @@ -5454,6 +5490,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + 
c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5465,6 +5503,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5484,7 +5524,7 @@ impl MemoryDB { match rows_result { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(28).unwrap_or(1.0); + let distance: f64 = row.get(30).unwrap_or(1.0); if let Ok(result) = Self::row_to_search_result(&row, distance as f32) { vector_results.push(result); } @@ -5510,6 +5550,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5523,6 +5565,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5545,7 +5589,7 @@ impl MemoryDB { match fts_result { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let rank: f64 = row.get(28).unwrap_or(0.0); + let rank: f64 = row.get(30).unwrap_or(0.0); if let Ok(result) = Self::row_to_search_result(&row, rank as f32) { fts_results.push(result); } @@ -5718,6 +5762,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, 
+ c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5734,7 +5780,7 @@ impl MemoryDB { match conn.query(&sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(28).unwrap_or(1.0); + let distance: f64 = row.get(30).unwrap_or(1.0); if let Ok(result) = Self::row_to_search_result(&row, distance as f32) { vector_results.push(result); } @@ -5777,6 +5823,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5801,7 +5849,7 @@ impl MemoryDB { match conn.query(&fts_sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let rank: f64 = row.get(28).unwrap_or(0.0); + let rank: f64 = row.get(30).unwrap_or(0.0); if let Ok(result) = Self::row_to_search_result(&row, rank as f32) { fts_results.push(result); } @@ -5875,15 +5923,14 @@ impl MemoryDB { .map(|mut r| { let rrf = *score_map.get(&r.id).unwrap_or(&0.0); - // Tiered retrieval: weight by confidence and recency decay + // Tiered retrieval: weight by confidence and recency decay. + // Decay anchored to last_modified (ingestion/edit time), NOT event_date — + // an old email imported today should rank as freshly ingested. event_date + // is for display only. 
let conf = r.confidence.unwrap_or(0.5); let tier = stability_tier(r.memory_type.as_deref()); - // Inline decay rates (match TuningConfig defaults) — search doesn't hold config ref - let dr = match tier { - crate::sources::StabilityTier::Protected => 0.001, - crate::sources::StabilityTier::Standard => 0.01, - crate::sources::StabilityTier::Ephemeral => 0.05, - }; + let decay_cfg = crate::tuning::ConfidenceConfig::default(); + let dr = crate::sources::decay_rate(&tier, &decay_cfg); let age_days = ((now - r.last_modified) as f64 / 86400.0).max(0.0); let recency = (-dr * age_days).exp() as f32; @@ -6471,6 +6518,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -6490,6 +6539,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -6506,7 +6557,7 @@ impl MemoryDB { match conn.query(&sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(28).unwrap_or(1.0); + let distance: f64 = row.get(30).unwrap_or(1.0); let score = (1.0 - distance).max(0.0) as f32; if let Ok(result) = Self::row_to_search_result(&row, score) { results.push(result); @@ -6562,6 +6613,8 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON 
fts.rowid = c.rowid @@ -6589,7 +6642,7 @@ impl MemoryDB { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { // FTS5 rank is negative BM25; negate so higher = better - let rank: f64 = row.get(28).unwrap_or(0.0); + let rank: f64 = row.get(30).unwrap_or(0.0); let score = (-rank) as f32; if let Ok(result) = Self::row_to_search_result(&row, score) { results.push(result); @@ -6845,6 +6898,8 @@ impl MemoryDB { structured_fields: None, retrieval_cue: None, source_text: None, + created_at, + event_date: None, raw_score: 0.0, }); } @@ -24270,4 +24325,100 @@ pub(crate) mod tests { sources_after ); } + + #[tokio::test] + async fn test_search_result_exposes_created_at() { + let (db, _dir) = test_db().await; + + // Seed a chunk with a known historical timestamp (2023-01-01 00:00:00 UTC = 1672531200). + let known_ts: i64 = 1_672_531_200; + let docs = vec![crate::sources::RawDocument { + content: "Alice met Bob in Tokyo".to_string(), + source_id: "doc1".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified: known_ts, + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]; + db.upsert_documents(docs).await.unwrap(); + + let results = db + .search_memory( + "Tokyo", + 5, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert!(!results.is_empty(), "search returned no results"); + let r = &results[0]; + assert_eq!(r.last_modified, known_ts, "last_modified mismatch"); + assert_eq!( + r.created_at, known_ts, + "created_at mismatch (upsert_documents mirrors last_modified -> created_at on INSERT)" + ); + } + + /// Regression: search ranking must depend on `last_modified` (recency anchor), + /// not on `event_date` (display-only). 
A chunk with a 3-year-old event_date + /// but a fresh last_modified (think: imported old email, benchmark seed) + /// must score meaningfully — the recency multiplier should not crush it + /// to ~0 just because the *event* is old. + /// Guards against the recency-decay-eats-old-content regression that was + /// caught when an earlier change made `last_modified` carry the event time. + #[tokio::test] + async fn test_search_ranking_uses_last_modified_not_event_date() { + let (db, _dir) = test_db().await; + let now_ts = chrono::Utc::now().timestamp(); + let three_years_ago = now_ts - 3 * 365 * 86400; + + // Single row with old event_date + fresh last_modified (mirrors + // benchmark seeds and old-archive imports). + let docs = vec![crate::sources::RawDocument { + content: "Alice met Bob in Tokyo".to_string(), + source_id: "old_event_fresh_import".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified: now_ts, + event_date: Some(three_years_ago), + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]; + db.upsert_documents(docs).await.unwrap(); + + let results = db + .search_memory( + "Tokyo", + 10, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert_eq!(results.len(), 1, "expected one result: {results:?}"); + let r = &results[0]; + assert_eq!(r.last_modified, now_ts); + assert_eq!(r.event_date, Some(three_years_ago)); + // Score must be meaningful. Pre-fix, recency multiplier was + // exp(-0.01 * 1095) ≈ 1.7e-5, crushing any RRF score to ~0. + // With ranking anchored to last_modified=now(), recency=1.0 and + // score should reflect actual RRF×confidence×... product (well above 1e-3). 
+ assert!( + r.score > 0.001, + "score crushed by recency decay despite fresh last_modified: {}", + r.score + ); + } } diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index 84084efe..a2ac296b 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -485,6 +485,10 @@ pub async fn run_e2e_locomo_eval( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::dates::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -698,19 +702,18 @@ pub async fn run_e2e_locomo_eval( /// /// Requires enrichment + distillation to be run first (concepts must exist). /// Call this after seeding + enriching a DB, or use the all-in-one wrapper. -async fn generate_e2e_answers_for_question( +pub(crate) async fn generate_e2e_answers_for_question( db: &MemoryDB, question: &str, ground_truth: &str, category: &str, search_limit: usize, llm: &Arc, + question_date: Option<&str>, ) -> Result, OriginError> { use crate::llm_provider::{strip_think_tags, LlmRequest}; - let system_prompt = "Answer the question using only the provided context. \ - Be specific and concise. Respond in 1-3 sentences." - .to_string(); + let system_prompt = build_e2e_system_prompt(question_date); let mut tuples = Vec::new(); @@ -729,15 +732,23 @@ async fn generate_e2e_answers_for_question( .await?; let flat_context: String = flat_results .iter() - .enumerate() - .map(|(i, r)| format!("{}. 
{}", i + 1, r.content)) + .map(|r| { + format!( + "On {}: {}", + crate::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), + r.content + ) + }) .collect::>() .join("\n"); let flat_tokens = count_tokens(&flat_context); let flat_request = LlmRequest { system_prompt: Some(system_prompt.clone()), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", flat_context, question), + user_prompt: format!( + "Context (each line prefixed with the date the memory was recorded):\n{}\n\nQuestion: {}", + flat_context, question + ), max_tokens: 200, temperature: 0.1, label: Some("e2e_flat".to_string()), @@ -770,8 +781,12 @@ async fn generate_e2e_answers_for_question( // Memory search results if !flat_results.is_empty() { structured_parts.push("## Relevant Memories".to_string()); - for (i, r) in flat_results.iter().enumerate() { - structured_parts.push(format!("{}. {}", i + 1, r.content)); + for r in flat_results.iter() { + structured_parts.push(format!( + "On {}: {}", + crate::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), + r.content + )); } } @@ -780,7 +795,10 @@ async fn generate_e2e_answers_for_question( let structured_request = LlmRequest { system_prompt: Some(system_prompt), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", structured_context, question), + user_prompt: format!( + "Context (each line prefixed with the date the memory was recorded; concept articles are time-spanning):\n{}\n\nQuestion: {}", + structured_context, question + ), max_tokens: 200, temperature: 0.1, label: Some("e2e_structured".to_string()), @@ -860,6 +878,10 @@ pub async fn run_e2e_context_eval( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::dates::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -903,6 +925,7 @@ pub async fn run_e2e_context_eval( category, search_limit, 
&llm, + None, ) .await { @@ -1008,6 +1031,10 @@ pub async fn run_e2e_context_eval_longmemeval( ), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::dates::parse_lme_date, + ), ..Default::default() }) .collect(); @@ -1037,6 +1064,7 @@ pub async fn run_e2e_context_eval_longmemeval( category, search_limit, &llm, + Some(&sample.question_date), ) .await { @@ -1073,10 +1101,23 @@ struct PendingAnswer { context_tokens: usize, } -/// System prompt used for all E2E answer generation. +/// System prompt used for all E2E answer generation (no date anchor). const E2E_SYSTEM_PROMPT: &str = "Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences."; +/// Build the E2E system prompt, prepending the question's "asked on" date when available +/// so the LLM has a temporal anchor for relative-time references in the question. +fn build_e2e_system_prompt(question_date: Option<&str>) -> String { + match question_date { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => E2E_SYSTEM_PROMPT.to_string(), + } +} + /// Build structured context for a question against an enriched DB. /// /// Returns the structured context: search_memory results + concept articles. @@ -1139,8 +1180,12 @@ async fn build_structured_context( } if !results.is_empty() { parts.push("## Relevant Memories".to_string()); - for (i, r) in results.iter().enumerate() { - parts.push(format!("{}. 
{}", i + 1, r.content)); + for r in results.iter() { + parts.push(format!( + "On {}: {}", + crate::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), + r.content + )); } } let structured_context = parts.join("\n\n"); @@ -1234,6 +1279,10 @@ pub async fn run_fullpipeline_locomo_batch( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::locomo::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -1283,6 +1332,14 @@ pub async fn run_fullpipeline_locomo_batch( for sample in &samples { let mut q_count = 0usize; + // Question "asked on" = latest session date in this sample (questions follow the conversation). + // Skipped when no session_date can be parsed; falls back to the date-blind prompt. + let sample_question_date: Option = extract_observations(sample) + .iter() + .filter_map(|m| m.session_date.clone()) + .filter(|d| crate::eval::locomo::parse_locomo_date(d).is_some()) + .max_by_key(|d| crate::eval::locomo::parse_locomo_date(d).unwrap_or(0)); + for qa in &sample.qa { if qa.category == 5 { continue; @@ -1307,7 +1364,7 @@ pub async fn run_fullpipeline_locomo_batch( batch_requests.push(( req_id.clone(), format!("Context:\n{}\n\nQuestion: {}", ctx, qa.question), - Some(E2E_SYSTEM_PROMPT.to_string()), + Some(build_e2e_system_prompt(sample_question_date.as_deref())), 200, )); pending.insert( @@ -1478,6 +1535,10 @@ pub async fn run_fullpipeline_lme_batch( ), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::longmemeval::parse_lme_date, + ), ..Default::default() }) .collect(); @@ -1541,7 +1602,7 @@ pub async fn run_fullpipeline_lme_batch( batch_requests.push(( req_id.clone(), format!("Context:\n{}\n\nQuestion: {}", ctx, sample.question), - 
Some(E2E_SYSTEM_PROMPT.to_string()), + Some(build_e2e_system_prompt(Some(&sample.question_date))), 200, )); pending.insert( @@ -1629,4 +1690,45 @@ pub async fn run_fullpipeline_lme_batch( Ok(finished_tuples) } -// ===== Flat cache loaders ===== +#[cfg(test)] +mod tests { + #[tokio::test] + async fn test_format_ymd_used_in_context() { + assert_eq!(crate::eval::shared::format_ymd(1_681_168_020), "2023-04-10"); + assert_eq!(crate::eval::shared::format_ymd(1_683_554_160), "2023-05-08"); + } + + #[test] + fn test_system_prompt_includes_question_date_when_provided() { + // Mirror the system_prompt construction from generate_e2e_answers_for_question + // to lock in the format. If the function changes, this test should reflect. + let with_date = match Some("2023/04/10 (Mon) 23:07") { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences." + .to_string(), + }; + assert!(with_date.contains("The question was asked on 2023/04/10")); + assert!(with_date.contains("only the provided context")); + } + + #[test] + fn test_system_prompt_omits_when_no_question_date() { + let without_date: String = match None::<&str> { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences." 
+ .to_string(), + }; + assert!(!without_date.contains("question was asked on")); + assert!(without_date.contains("only the provided context")); + } +} diff --git a/crates/origin-core/src/eval/dates.rs b/crates/origin-core/src/eval/dates.rs new file mode 100644 index 00000000..0a2937af --- /dev/null +++ b/crates/origin-core/src/eval/dates.rs @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! Date helpers shared across eval benchmark adapters. +//! +//! Centralises the four date utilities that were previously scattered across +//! `locomo`, `longmemeval`, and `shared`: +//! +//! - [`parse_locomo_date`] — LoCoMo session timestamps ("1:56 pm on 8 May, 2023") +//! - [`parse_lme_date`] — LongMemEval session timestamps ("2023/04/10 (Mon) 23:07") +//! - [`format_ymd`] — Unix-seconds → "YYYY-MM-DD" formatting +//! - [`seed_event_date`] — resolve a benchmark chunk's `event_date` field + +/// Parse a LoCoMo session date like "1:56 pm on 8 May, 2023" into Unix seconds. +/// Returns `None` on parse failure (caller falls back to `now()`). +pub fn parse_locomo_date(s: &str) -> Option { + use chrono::{NaiveDateTime, TimeZone, Utc}; + // The dataset uses " on ". chrono's strftime + // %p needs uppercase AM/PM; LoCoMo uses lowercase. Normalise first. + let normalised = s.replace(" am ", " AM ").replace(" pm ", " PM "); + NaiveDateTime::parse_from_str(&normalised, "%I:%M %p on %d %B, %Y") + .ok() + .and_then(|naive| Utc.from_local_datetime(&naive).single()) + .map(|dt| dt.timestamp()) +} + +/// Parse a LongMemEval `question_date` / `haystack_date` into Unix seconds. +/// Format example: "2023/04/10 (Mon) 23:07". Returns `None` on parse failure +/// (e.g. dataset variants with different formats -- caller falls back to `now()`). 
+pub fn parse_lme_date(s: &str) -> Option { + use chrono::{NaiveDateTime, TimeZone, Utc}; + // Strip the weekday tag in parens: "2023/04/10 (Mon) 23:07" -> "2023/04/10 23:07" + let cleaned: String = s + .split_whitespace() + .filter(|tok| !(tok.starts_with('(') && tok.ends_with(')'))) + .collect::>() + .join(" "); + NaiveDateTime::parse_from_str(&cleaned, "%Y/%m/%d %H:%M") + .ok() + .and_then(|naive| Utc.from_local_datetime(&naive).single()) + .map(|dt| dt.timestamp()) +} + +/// Format a unix-seconds timestamp as ISO-8601 calendar date "YYYY-MM-DD" in UTC. +/// Returns "unknown date" on conversion failure (e.g. malformed timestamp). +pub fn format_ymd(ts: i64) -> String { + use chrono::{TimeZone, Utc}; + Utc.timestamp_opt(ts, 0) + .single() + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "unknown date".to_string()) +} + +/// Resolve `event_date` for a benchmark-seeded chunk: parse the per-session +/// date string with `parser`. Returns `None` if no date is provided, or if +/// parsing fails (with a warning so silent degradation is visible in logs). +/// +/// Used at seed sites to populate `RawDocument.event_date` while +/// `last_modified` stays at `now()` — so search ranking treats benchmark +/// memories as fresh while LLM context still sees the original event date. 
+pub fn seed_event_date(date: Option<&str>, parser: fn(&str) -> Option) -> Option { + let s = date?; + if let Some(ts) = parser(s) { + return Some(ts); + } + log::warn!( + "[eval:dates] failed to parse date {s:?}; event_date set to None — display falls back to last_modified" + ); + None +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── parse_locomo_date ──────────────────────────────────────────────────── + + #[test] + fn test_parse_locomo_date() { + let ts = parse_locomo_date("1:56 pm on 8 May, 2023").expect("should parse"); + // 2023-05-08 13:56 UTC = 1683554160 + assert_eq!(ts, 1_683_554_160); + } + + #[test] + fn test_parse_locomo_date_garbage_returns_none() { + assert!(parse_locomo_date("nonsense").is_none()); + } + + // ── parse_lme_date ─────────────────────────────────────────────────────── + + #[test] + fn test_parse_lme_date_round_trip() { + let ts = parse_lme_date("2023/04/10 (Mon) 23:07").expect("should parse"); + // 2023-04-10 23:07 UTC == 1681168020 + assert_eq!(ts, 1_681_168_020); + } + + #[test] + fn test_parse_lme_date_garbage_returns_none() { + assert!(parse_lme_date("not a date").is_none()); + assert!(parse_lme_date("").is_none()); + } + + // ── format_ymd ─────────────────────────────────────────────────────────── + + #[test] + fn test_format_ymd_round_trip() { + assert_eq!(format_ymd(1_681_168_020), "2023-04-10"); + assert_eq!(format_ymd(1_683_554_160), "2023-05-08"); + assert_eq!(format_ymd(0), "1970-01-01"); + } + + // ── seed_event_date ────────────────────────────────────────────────────── + + #[test] + fn test_seed_event_date_parses_when_date_present() { + assert_eq!( + seed_event_date(Some("2023/04/10 (Mon) 23:07"), parse_lme_date), + Some(1_681_168_020) + ); + } + + #[test] + fn test_seed_event_date_returns_none_when_date_missing() { + assert_eq!(seed_event_date(None, parse_lme_date), None); + } + + #[test] + fn test_seed_event_date_returns_none_when_parser_rejects() { + // malformed string still yields None (with a logged 
warning) rather than + // silently turning into "today" — that was the bug seed_last_modified had. + assert_eq!(seed_event_date(Some("malformed"), parse_lme_date), None); + } +} diff --git a/crates/origin-core/src/eval/locomo.rs b/crates/origin-core/src/eval/locomo.rs index 9cda00ee..60ba7e3b 100644 --- a/crates/origin-core/src/eval/locomo.rs +++ b/crates/origin-core/src/eval/locomo.rs @@ -18,6 +18,11 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::path::Path; +// Bring date helpers into scope for use within this module. +use crate::eval::dates::seed_event_date; +// Re-export so external callers using `crate::eval::locomo::parse_locomo_date` still compile. +pub use crate::eval::dates::parse_locomo_date; + // --------------------------------------------------------------------------- // Data structures // --------------------------------------------------------------------------- @@ -53,6 +58,8 @@ pub struct LocomoMemory { pub session_num: usize, pub dia_id: String, pub sample_id: String, + /// Raw "h:mm am/pm on D Month, YYYY" string for this session, when present. 
+ pub session_date: Option, } // --------------------------------------------------------------------------- @@ -112,12 +119,20 @@ pub fn extract_observations(sample: &LocomoSample) -> Vec { None => continue, }; + let session_date_key = session_key.replace("_observation", "_date_time"); + let session_date = sample + .conversation + .get(&session_date_key) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + memories.push(LocomoMemory { content, speaker: speaker.clone(), session_num, dia_id, sample_id: sample.sample_id.clone(), + session_date, }); } } @@ -418,6 +433,7 @@ pub async fn run_locomo_eval(path: &Path) -> Result { memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -549,6 +565,7 @@ pub async fn run_locomo_eval_reranked( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -674,6 +691,7 @@ pub async fn run_locomo_eval_expanded( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -940,6 +958,7 @@ pub async fn run_locomo_eval_with_gate( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -1415,4 +1434,47 @@ mod tests { // Verify delta printing is present assert!(text.contains("->")); } + + #[tokio::test] + async fn test_locomo_seed_propagates_session_date() { + use crate::sources::RawDocument; + 
+ let last_modified = super::parse_locomo_date("1:56 pm on 8 May, 2023").unwrap(); + + let tmp = tempfile::tempdir().unwrap(); + let db = + crate::db::MemoryDB::new(tmp.path(), std::sync::Arc::new(crate::events::NoopEmitter)) + .await + .unwrap(); + + db.upsert_documents(vec![RawDocument { + content: "Alice told Bob she moved to Tokyo".to_string(), + source_id: "locomo/sample1/dia1".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified, + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]) + .await + .unwrap(); + + let results = db + .search_memory( + "Tokyo", + 5, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert!(!results.is_empty()); + assert_eq!(results[0].last_modified, 1_683_554_160); + assert_eq!(results[0].created_at, 1_683_554_160); + } } diff --git a/crates/origin-core/src/eval/longmemeval.rs b/crates/origin-core/src/eval/longmemeval.rs index c9e968ca..476a3135 100644 --- a/crates/origin-core/src/eval/longmemeval.rs +++ b/crates/origin-core/src/eval/longmemeval.rs @@ -30,6 +30,11 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::path::Path; +// Bring date helpers into scope for use within this module. +use crate::eval::dates::seed_event_date; +// Re-export so external callers using `crate::eval::longmemeval::parse_lme_date` still compile. +pub use crate::eval::dates::parse_lme_date; + // --------------------------------------------------------------------------- // Data structures (matches the JSON schema from HuggingFace) // --------------------------------------------------------------------------- @@ -74,6 +79,8 @@ pub struct LongMemEvalMemory { pub turn_idx: usize, pub has_answer: bool, pub question_id: String, + /// Raw date string from the dataset (e.g. "2023/04/10 (Mon) 23:07"). None for samples missing dates. 
+ pub session_date: Option, } // --------------------------------------------------------------------------- @@ -108,6 +115,7 @@ pub fn extract_memories(sample: &LongMemEvalSample) -> Vec { .zip(sample.haystack_sessions.iter()) .enumerate() { + let session_date = sample.haystack_dates.get(sess_idx).cloned(); for (turn_idx, turn) in session.iter().enumerate() { // Always include user turns (they contain the personal facts). // Include assistant turns only if they have answer evidence, @@ -121,6 +129,7 @@ pub fn extract_memories(sample: &LongMemEvalSample) -> Vec { turn_idx, has_answer: turn.has_answer, question_id: sample.question_id.clone(), + session_date: session_date.clone(), }); } } @@ -423,6 +432,7 @@ pub async fn run_longmemeval_eval(path: &Path) -> Result")); assert!(text.contains("single-session-user")); } + + #[tokio::test] + async fn test_lme_seed_propagates_session_date() { + use crate::sources::RawDocument; + + // Mimic what the runner builds for a single memory. + let mem = super::LongMemEvalMemory { + content: "I moved to Tokyo last summer.".to_string(), + role: "user".to_string(), + session_id: "s1".to_string(), + session_idx: 0, + turn_idx: 0, + has_answer: true, + question_id: "q1".to_string(), + session_date: Some("2023/04/10 (Mon) 23:07".to_string()), + }; + let event_date = + crate::eval::dates::seed_event_date(mem.session_date.as_deref(), super::parse_lme_date); + let now_ts = chrono::Utc::now().timestamp(); + + let tmp = tempfile::tempdir().unwrap(); + let db = + crate::db::MemoryDB::new(tmp.path(), std::sync::Arc::new(crate::events::NoopEmitter)) + .await + .unwrap(); + db.upsert_documents(vec![RawDocument { + content: mem.content.clone(), + source_id: "lme/q1/s1/0".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified: now_ts, + event_date, + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]) + .await + .unwrap(); + + let results = db + 
.search_memory( + "Tokyo", + 5, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert!(!results.is_empty()); + assert_eq!(results[0].last_modified, now_ts); + assert_eq!(results[0].event_date, Some(1_681_168_020)); + } } diff --git a/crates/origin-core/src/eval/mod.rs b/crates/origin-core/src/eval/mod.rs index f0869df0..32b117c4 100644 --- a/crates/origin-core/src/eval/mod.rs +++ b/crates/origin-core/src/eval/mod.rs @@ -2,6 +2,7 @@ //! Memory eval system — quality measurement and feedback capture. pub mod anthropic; +pub mod dates; pub mod judge; pub mod shared; diff --git a/crates/origin-core/src/eval/pipeline.rs b/crates/origin-core/src/eval/pipeline.rs index 76ff2e50..5e1d1a94 100644 --- a/crates/origin-core/src/eval/pipeline.rs +++ b/crates/origin-core/src/eval/pipeline.rs @@ -472,6 +472,10 @@ pub async fn run_locomo_pipeline_eval( memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::dates::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -789,6 +793,10 @@ pub async fn run_longmemeval_pipeline_eval( ), domain: Some("conversation".to_string()), last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( + mem.session_date.as_deref(), + crate::eval::dates::parse_lme_date, + ), ..Default::default() }) .collect(); diff --git a/crates/origin-core/src/eval/shared.rs b/crates/origin-core/src/eval/shared.rs index d4daddeb..f8276286 100644 --- a/crates/origin-core/src/eval/shared.rs +++ b/crates/origin-core/src/eval/shared.rs @@ -39,6 +39,10 @@ pub fn count_tokens(text: &str) -> usize { BPE.encode_with_special_tokens(text).len() } +/// Format a unix-seconds timestamp as ISO-8601 calendar date "YYYY-MM-DD" in UTC. +/// Re-exported from [`crate::eval::dates`] for backward compatibility. 
+pub use crate::eval::dates::format_ymd; + /// Probe on-device batch extraction at different batch sizes. /// Returns vec of (batch_size, input_tokens, response_len, entities_found, observations_found). pub async fn probe_extraction_batch_sizes( diff --git a/crates/origin-core/src/eval/token_efficiency.rs b/crates/origin-core/src/eval/token_efficiency.rs index 816cf0fb..0aeb490f 100644 --- a/crates/origin-core/src/eval/token_efficiency.rs +++ b/crates/origin-core/src/eval/token_efficiency.rs @@ -4458,118 +4458,8 @@ pub async fn run_context_path_eval( } // ===== E2E Answer Quality: flat vs structured context with LLM-as-judge ===== - -/// Run E2E answer quality comparison: flat (search_memory) vs structured (search + concepts). -/// -/// For each LoCoMo question: -/// 1. Build flat context: search_memory top-K concatenated -/// 2. Build structured context: search_memory + concept articles (like chat-context) -/// 3. Generate answers from both contexts using on-device LLM -/// 4. Return JudgmentTuples for offline Claude Haiku judging -/// -/// Requires enrichment + distillation to be run first (concepts must exist). -/// Call this after seeding + enriching a DB, or use the all-in-one wrapper. -async fn generate_e2e_answers_for_question( - db: &MemoryDB, - question: &str, - ground_truth: &str, - category: &str, - search_limit: usize, - llm: &Arc, -) -> Result, OriginError> { - use crate::llm_provider::{strip_think_tags, LlmRequest}; - - let system_prompt = "Answer the question using only the provided context. \ - Be specific and concise. Respond in 1-3 sentences." - .to_string(); - - let mut tuples = Vec::new(); - - // --- Flat context: search_memory only --- - let flat_results = db - .search_memory( - question, - search_limit, - None, - Some("conversation"), - None, - None, - None, - None, - ) - .await?; - let flat_context: String = flat_results - .iter() - .enumerate() - .map(|(i, r)| format!("{}. 
{}", i + 1, r.content)) - .collect::>() - .join("\n"); - let flat_tokens = count_tokens(&flat_context); - - let flat_request = LlmRequest { - system_prompt: Some(system_prompt.clone()), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", flat_context, question), - max_tokens: 200, - temperature: 0.1, - label: Some("e2e_flat".to_string()), - }; - if let Ok(raw) = llm.generate(flat_request).await { - let answer = strip_think_tags(&raw); - tuples.push(JudgmentTuple { - question: question.to_string(), - ground_truth: ground_truth.to_string(), - approach: format!("flat_{}", category), - answer, - context_tokens: flat_tokens, - category: category.to_string(), - }); - } - - // --- Structured context: search_memory + concept articles --- - let mut structured_parts: Vec = Vec::new(); - - // Concept articles (like chat-context's "Compiled Knowledge" section) - let concepts = db.search_concepts(question, 3).await.unwrap_or_default(); - if !concepts.is_empty() { - structured_parts.push("## Compiled Knowledge".to_string()); - for c in &concepts { - let summary = c.summary.as_deref().unwrap_or(""); - structured_parts.push(format!("**{}**: {}\n{}", c.title, summary, c.content)); - } - } - - // Memory search results - if !flat_results.is_empty() { - structured_parts.push("## Relevant Memories".to_string()); - for (i, r) in flat_results.iter().enumerate() { - structured_parts.push(format!("{}. 
{}", i + 1, r.content)); - } - } - - let structured_context = structured_parts.join("\n\n"); - let structured_tokens = count_tokens(&structured_context); - - let structured_request = LlmRequest { - system_prompt: Some(system_prompt), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", structured_context, question), - max_tokens: 200, - temperature: 0.1, - label: Some("e2e_structured".to_string()), - }; - if let Ok(raw) = llm.generate(structured_request).await { - let answer = strip_think_tags(&raw); - tuples.push(JudgmentTuple { - question: question.to_string(), - ground_truth: ground_truth.to_string(), - approach: format!("structured_{}", category), - answer, - context_tokens: structured_tokens, - category: category.to_string(), - }); - } - - Ok(tuples) -} +// `generate_e2e_answers_for_question` is the date-aware version in +// crate::eval::answer_quality; we use it for both LoCoMo and LongMemEval below. /// Run full E2E answer quality eval on LoCoMo: seed, enrich, distill, generate answers. /// @@ -4602,6 +4492,14 @@ pub async fn run_e2e_context_eval( continue; } + // Question "asked on" = latest session date in this sample (questions follow the + // conversation in LoCoMo). Falls back to None if no session_date parses. 
+ let sample_question_date: Option = memories + .iter() + .filter_map(|m| m.session_date.clone()) + .filter(|d| crate::eval::locomo::parse_locomo_date(d).is_some()) + .max_by_key(|d| crate::eval::locomo::parse_locomo_date(d).unwrap_or(0)); + eprintln!( "[e2e_context] Conv {}/{} ({}): {} observations", conv_idx + 1, @@ -4630,7 +4528,10 @@ pub async fn run_e2e_context_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::locomo::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -4667,13 +4568,14 @@ pub async fn run_e2e_context_eval( let category = category_name(qa.category); - match generate_e2e_answers_for_question( + match crate::eval::answer_quality::generate_e2e_answers_for_question( &db, &qa.question, &ground_truth, category, search_limit, &llm, + sample_question_date.as_deref(), ) .await { @@ -4778,7 +4680,10 @@ pub async fn run_e2e_context_eval_longmemeval( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::longmemeval::parse_lme_date, + ), ..Default::default() }) .collect(); @@ -4801,13 +4706,14 @@ pub async fn run_e2e_context_eval_longmemeval( let category = category_name(&sample.question_type); - if let Ok(tuples) = generate_e2e_answers_for_question( + if let Ok(tuples) = crate::eval::answer_quality::generate_e2e_answers_for_question( &db, &sample.question, &ground_truth, category, search_limit, &llm, + Some(&sample.question_date), ) .await { diff --git a/crates/origin-core/src/llm_provider.rs b/crates/origin-core/src/llm_provider.rs index 7b19a8be..97a4c25e 100644 --- a/crates/origin-core/src/llm_provider.rs +++ 
b/crates/origin-core/src/llm_provider.rs @@ -747,8 +747,14 @@ impl LlmProvider for ClaudeCliProvider { args.push(sys.clone()); } + // Scrub ANTHROPIC_API_KEY from the child's env: when present, the CLI + // routes through pay-as-you-go API instead of the user's Max OAuth, so + // a Max-plan eval would silently burn API credits (and fail with "Credit + // balance is too low" when the API key has none). The CLI provider's + // whole purpose is to use Max OAuth, so always remove the override. let mut child = Command::new("claude") .args(&args) + .env_remove("ANTHROPIC_API_KEY") .stdin(std::process::Stdio::piped()) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) diff --git a/crates/origin-core/src/post_ingest.rs b/crates/origin-core/src/post_ingest.rs index 10aca138..f7024c37 100644 --- a/crates/origin-core/src/post_ingest.rs +++ b/crates/origin-core/src/post_ingest.rs @@ -756,6 +756,7 @@ mod tests { content: content.to_string(), url: None, last_modified: chrono::Utc::now().timestamp(), + event_date: None, metadata: std::collections::HashMap::new(), memory_type: Some("fact".to_string()), domain: None, diff --git a/crates/origin-server/src/ingest_batcher.rs b/crates/origin-server/src/ingest_batcher.rs index 89b53335..b5b287eb 100644 --- a/crates/origin-server/src/ingest_batcher.rs +++ b/crates/origin-server/src/ingest_batcher.rs @@ -246,6 +246,7 @@ mod tests { content: content.into(), url: None, last_modified: 0, + event_date: None, metadata: HashMap::new(), memory_type: Some("fact".into()), domain: None, diff --git a/crates/origin-server/src/memory_routes.rs b/crates/origin-server/src/memory_routes.rs index 6afa382b..0a688347 100644 --- a/crates/origin-server/src/memory_routes.rs +++ b/crates/origin-server/src/memory_routes.rs @@ -539,6 +539,7 @@ pub async fn handle_store_memory( content: req.content.clone(), url: None, last_modified: chrono::Utc::now().timestamp(), + event_date: None, metadata: HashMap::new(), memory_type: 
Some(memory_type_str.clone()), domain: final_domain.clone(), diff --git a/crates/origin-types/src/lib.rs b/crates/origin-types/src/lib.rs index 332d3d07..811727de 100644 --- a/crates/origin-types/src/lib.rs +++ b/crates/origin-types/src/lib.rs @@ -102,6 +102,8 @@ mod tests { url: None, chunk_index: 0, last_modified: 1000, + created_at: 1000, + event_date: None, score: 0.9, chunk_type: None, language: None, diff --git a/crates/origin-types/src/memory.rs b/crates/origin-types/src/memory.rs index 60df3c02..b1e01798 100644 --- a/crates/origin-types/src/memory.rs +++ b/crates/origin-types/src/memory.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; /// A search result from hybrid (vector + FTS) search. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct SearchResult { pub id: String, pub content: String, @@ -14,6 +14,16 @@ pub struct SearchResult { pub url: Option, pub chunk_index: i32, pub last_modified: i64, + /// Unix seconds timestamp when the chunk was first inserted. + /// Equal to `last_modified` for benchmark/eval seeds; diverges in real use as memories get re-enriched. + #[serde(default)] + pub created_at: i64, + /// Unix timestamp of when the event the document describes actually happened. + /// Distinct from `last_modified` (ingestion time). `None` = unknown; display code should + /// fall back to `last_modified`. Does NOT influence search ranking — recency decay + /// continues to use `last_modified` so old-but-just-imported content isn't penalised. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub event_date: Option, pub score: f32, #[serde(skip_serializing_if = "Option::is_none")] pub chunk_type: Option, diff --git a/crates/origin-types/src/sources.rs b/crates/origin-types/src/sources.rs index 9fc966db..77d52ffd 100644 --- a/crates/origin-types/src/sources.rs +++ b/crates/origin-types/src/sources.rs @@ -108,8 +108,15 @@ pub struct RawDocument { pub content: String, /// Deep link back to the source (URL, file path) pub url: Option, - /// Unix timestamp of last modification + /// Unix timestamp of last modification (ingestion/edit time — used for recency ranking). pub last_modified: i64, + /// Unix timestamp of when the event the document describes actually happened. + /// Distinct from `last_modified` (ingestion time): a benchmark seed has session_date here + /// and now() in last_modified; an imported old email has the email Date header here and + /// the import time in last_modified. Used for date-aware display in retrieved context. + /// `None` = unknown event time; consumers should fall back to `last_modified`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub event_date: Option, /// Additional metadata pub metadata: HashMap, @@ -183,6 +190,7 @@ impl Default for RawDocument { content: String::new(), url: None, last_modified: 0, + event_date: None, metadata: HashMap::new(), memory_type: None, domain: None,