From f23753b6b1e4a32754df862892bec09d5c4ab8da Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 11:34:19 -0700 Subject: [PATCH 01/13] fix: surface created_at on SearchResult Adds an i64 created_at field to SearchResult and populates it from chunks.created_at in row_to_search_result. Foundation for date filtering and date-aware eval prompts. Existing last_modified semantics unchanged. Co-Authored-By: Claude Sonnet 4.6 --- crates/origin-core/src/db.rs | 66 +++++++++++++++++++++++++++---- crates/origin-types/src/memory.rs | 4 ++ 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/crates/origin-core/src/db.rs b/crates/origin-core/src/db.rs index c2a73de2..2367bc59 100644 --- a/crates/origin-core/src/db.rs +++ b/crates/origin-core/src/db.rs @@ -5067,7 +5067,8 @@ impl MemoryDB { /// 11=byte_start, 12=byte_end, 13=semantic_unit, 14=memory_type, 15=domain, /// 16=source_agent, 17=confidence, 18=confirmed, 19=stability, 20=supersedes, /// 21=entity_id, 22=quality, 23=is_recap, 24=supersede_mode, - /// 25=structured_fields, 26=retrieval_cue, 27=source_text, 28=score/distance/rank + /// 25=structured_fields, 26=retrieval_cue, 27=source_text, 28=created_at, + /// 29=score/distance/rank fn row_to_search_result(row: &libsql::Row, score: f32) -> Result { Ok(SearchResult { id: row @@ -5111,6 +5112,7 @@ impl MemoryDB { structured_fields: row.get::>(25).unwrap_or(None), retrieval_cue: row.get::>(26).unwrap_or(None), source_text: row.get::>(27).unwrap_or(None), + created_at: row.get::(28).unwrap_or(0), raw_score: 0.0, // Set later during normalization }) } @@ -5454,6 +5456,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5465,6 +5468,7 @@ impl MemoryDB { c.confidence, 
c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5484,7 +5488,7 @@ impl MemoryDB { match rows_result { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(28).unwrap_or(1.0); + let distance: f64 = row.get(29).unwrap_or(1.0); if let Ok(result) = Self::row_to_search_result(&row, distance as f32) { vector_results.push(result); } @@ -5510,6 +5514,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5523,6 +5528,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5545,7 +5551,7 @@ impl MemoryDB { match fts_result { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let rank: f64 = row.get(28).unwrap_or(0.0); + let rank: f64 = row.get(29).unwrap_or(0.0); if let Ok(result) = Self::row_to_search_result(&row, rank as f32) { fts_results.push(result); } @@ -5718,6 +5724,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5734,7 +5741,7 @@ impl MemoryDB { match conn.query(&sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = 
rows.next().await { - let distance: f64 = row.get(28).unwrap_or(1.0); + let distance: f64 = row.get(29).unwrap_or(1.0); if let Ok(result) = Self::row_to_search_result(&row, distance as f32) { vector_results.push(result); } @@ -5777,6 +5784,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5801,7 +5809,7 @@ impl MemoryDB { match conn.query(&fts_sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let rank: f64 = row.get(28).unwrap_or(0.0); + let rank: f64 = row.get(29).unwrap_or(0.0); if let Ok(result) = Self::row_to_search_result(&row, rank as f32) { fts_results.push(result); } @@ -6471,6 +6479,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -6490,6 +6499,7 @@ impl MemoryDB { c.confidence, c.confirmed, c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -6506,7 +6516,7 @@ impl MemoryDB { match conn.query(&sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(28).unwrap_or(1.0); + let distance: f64 = row.get(29).unwrap_or(1.0); let score = (1.0 - distance).max(0.0) as f32; if let Ok(result) = Self::row_to_search_result(&row, score) { results.push(result); @@ -6562,6 +6572,7 @@ impl MemoryDB { c.confidence, c.confirmed, 
c.stability, c.supersedes, c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, + c.created_at, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -6589,7 +6600,7 @@ impl MemoryDB { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { // FTS5 rank is negative BM25; negate so higher = better - let rank: f64 = row.get(28).unwrap_or(0.0); + let rank: f64 = row.get(29).unwrap_or(0.0); let score = (-rank) as f32; if let Ok(result) = Self::row_to_search_result(&row, score) { results.push(result); @@ -6845,6 +6856,7 @@ impl MemoryDB { structured_fields: None, retrieval_cue: None, source_text: None, + created_at, raw_score: 0.0, }); } @@ -24270,4 +24282,44 @@ pub(crate) mod tests { sources_after ); } + + #[tokio::test] + async fn test_search_result_exposes_created_at() { + let (db, _dir) = test_db().await; + + // Seed a chunk with a known historical timestamp (2023-01-01 00:00:00 UTC = 1672531200). + let known_ts: i64 = 1_672_531_200; + let docs = vec![crate::sources::RawDocument { + content: "Alice met Bob in Tokyo".to_string(), + source_id: "doc1".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified: known_ts, + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]; + db.upsert_documents(docs).await.unwrap(); + + let results = db + .search_memory( + "Tokyo", + 5, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert!(!results.is_empty(), "search returned no results"); + let r = &results[0]; + assert_eq!(r.last_modified, known_ts, "last_modified mismatch"); + assert_eq!( + r.created_at, known_ts, + "created_at mismatch (upsert_documents mirrors last_modified -> created_at on INSERT)" + ); + } } diff --git a/crates/origin-types/src/memory.rs b/crates/origin-types/src/memory.rs index 60df3c02..2b5fd650 100644 --- a/crates/origin-types/src/memory.rs +++ 
b/crates/origin-types/src/memory.rs @@ -14,6 +14,10 @@ pub struct SearchResult { pub url: Option, pub chunk_index: i32, pub last_modified: i64, + /// Unix seconds timestamp when the chunk was first inserted. + /// Equal to `last_modified` for benchmark/eval seeds; diverges in real use as memories get re-enriched. + #[serde(default)] + pub created_at: i64, pub score: f32, #[serde(skip_serializing_if = "Option::is_none")] pub chunk_type: Option, From 8fb2db3495977a6a69e360ec9e684d4415c04a56 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 11:37:34 -0700 Subject: [PATCH 02/13] fix: add created_at to SearchResult test fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Followup to the prior commit's struct change — origin-types' own search_result_serializes test still constructs SearchResult literally and needed the new field. --- crates/origin-types/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/origin-types/src/lib.rs b/crates/origin-types/src/lib.rs index 332d3d07..e8aba444 100644 --- a/crates/origin-types/src/lib.rs +++ b/crates/origin-types/src/lib.rs @@ -102,6 +102,7 @@ mod tests { url: None, chunk_index: 0, last_modified: 1000, + created_at: 1000, score: 0.9, chunk_type: None, language: None, From 74d148eaa64065cddeab47f638693e28ddc42830 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 11:48:21 -0700 Subject: [PATCH 03/13] fix: propagate LongMemEval session dates into seeded chunks Carries the per-session haystack_dates through LongMemEvalMemory and into RawDocument.last_modified during retrieve_for_accuracy_eval. Adds parse_lme_date helper and round-trip tests. Foundation for date-aware temporal-reasoning prompts (Task 4). 
Co-Authored-By: Claude Sonnet 4.6 --- crates/origin-core/src/eval/longmemeval.rs | 115 ++++++++++++++++++++- 1 file changed, 111 insertions(+), 4 deletions(-) diff --git a/crates/origin-core/src/eval/longmemeval.rs b/crates/origin-core/src/eval/longmemeval.rs index c9e968ca..b9105fa1 100644 --- a/crates/origin-core/src/eval/longmemeval.rs +++ b/crates/origin-core/src/eval/longmemeval.rs @@ -30,6 +30,23 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::path::Path; +/// Parse a LongMemEval `question_date` / `haystack_date` into Unix seconds. +/// Format example: "2023/04/10 (Mon) 23:07". Returns None on parse failure +/// (e.g. dataset variants with different formats -- caller falls back to `now()`). +pub(crate) fn parse_lme_date(s: &str) -> Option { + use chrono::{NaiveDateTime, TimeZone, Utc}; + // Strip the weekday tag in parens: "2023/04/10 (Mon) 23:07" -> "2023/04/10 23:07" + let cleaned: String = s + .split_whitespace() + .filter(|tok| !(tok.starts_with('(') && tok.ends_with(')'))) + .collect::>() + .join(" "); + NaiveDateTime::parse_from_str(&cleaned, "%Y/%m/%d %H:%M") + .ok() + .and_then(|naive| Utc.from_local_datetime(&naive).single()) + .map(|dt| dt.timestamp()) +} + // --------------------------------------------------------------------------- // Data structures (matches the JSON schema from HuggingFace) // --------------------------------------------------------------------------- @@ -74,6 +91,8 @@ pub struct LongMemEvalMemory { pub turn_idx: usize, pub has_answer: bool, pub question_id: String, + /// Raw date string from the dataset (e.g. "2023/04/10 (Mon) 23:07"). None for samples missing dates. 
+ pub session_date: Option, } // --------------------------------------------------------------------------- @@ -108,6 +127,7 @@ pub fn extract_memories(sample: &LongMemEvalSample) -> Vec { .zip(sample.haystack_sessions.iter()) .enumerate() { + let session_date = sample.haystack_dates.get(sess_idx).cloned(); for (turn_idx, turn) in session.iter().enumerate() { // Always include user turns (they contain the personal facts). // Include assistant turns only if they have answer evidence, @@ -121,6 +141,7 @@ pub fn extract_memories(sample: &LongMemEvalSample) -> Vec { turn_idx, has_answer: turn.has_answer, question_id: sample.question_id.clone(), + session_date: session_date.clone(), }); } } @@ -422,7 +443,11 @@ pub async fn run_longmemeval_eval(path: &Path) -> Result")); assert!(text.contains("single-session-user")); } + + #[test] + fn test_parse_lme_date_round_trip() { + let ts = super::parse_lme_date("2023/04/10 (Mon) 23:07").expect("should parse"); + // 2023-04-10 23:07 UTC == 1681168020 + assert_eq!(ts, 1_681_168_020); + } + + #[test] + fn test_parse_lme_date_garbage_returns_none() { + assert!(super::parse_lme_date("not a date").is_none()); + assert!(super::parse_lme_date("").is_none()); + } + + #[tokio::test] + async fn test_lme_seed_propagates_session_date() { + use crate::sources::RawDocument; + + // Mimic what the runner builds for a single memory. 
+ let mem = super::LongMemEvalMemory { + content: "I moved to Tokyo last summer.".to_string(), + role: "user".to_string(), + session_id: "s1".to_string(), + session_idx: 0, + turn_idx: 0, + has_answer: true, + question_id: "q1".to_string(), + session_date: Some("2023/04/10 (Mon) 23:07".to_string()), + }; + let last_modified = mem + .session_date + .as_deref() + .and_then(super::parse_lme_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()); + + let tmp = tempfile::tempdir().unwrap(); + let db = + crate::db::MemoryDB::new(tmp.path(), std::sync::Arc::new(crate::events::NoopEmitter)) + .await + .unwrap(); + db.upsert_documents(vec![RawDocument { + content: mem.content.clone(), + source_id: "lme/q1/s1/0".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified, + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]) + .await + .unwrap(); + + let results = db + .search_memory( + "Tokyo", + 5, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert!(!results.is_empty()); + assert_eq!(results[0].last_modified, 1_681_168_020); + assert_eq!(results[0].created_at, 1_681_168_020); + } } From 97127d8c6f169f7354531a6f4a40692e8d8a403d Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 13:20:04 -0700 Subject: [PATCH 04/13] fix: propagate LoCoMo session dates into seeded chunks Adds parse_locomo_date for the dataset's '1:56 pm on 8 May, 2023' format and threads conversation.session_N_date_time through LocomoMemory into RawDocument.last_modified at every seed site. Mirrors Task 2's LME treatment. 
--- crates/origin-core/src/eval/locomo.rs | 102 +++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 4 deletions(-) diff --git a/crates/origin-core/src/eval/locomo.rs b/crates/origin-core/src/eval/locomo.rs index 9cda00ee..ad9bd66e 100644 --- a/crates/origin-core/src/eval/locomo.rs +++ b/crates/origin-core/src/eval/locomo.rs @@ -53,6 +53,8 @@ pub struct LocomoMemory { pub session_num: usize, pub dia_id: String, pub sample_id: String, + /// Raw "h:mm am/pm on D Month, YYYY" string for this session, when present. + pub session_date: Option, } // --------------------------------------------------------------------------- @@ -112,12 +114,20 @@ pub fn extract_observations(sample: &LocomoSample) -> Vec { None => continue, }; + let session_date_key = session_key.replace("_observation", "_date_time"); + let session_date = sample + .conversation + .get(&session_date_key) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + memories.push(LocomoMemory { content, speaker: speaker.clone(), session_num, dia_id, sample_id: sample.sample_id.clone(), + session_date, }); } } @@ -134,6 +144,19 @@ fn parse_session_num(key: &str) -> Option { num_str.parse().ok() } +/// Parse a LoCoMo session date like "1:56 pm on 8 May, 2023" into Unix seconds. +/// Returns None on parse failure (caller falls back to now()). +pub(crate) fn parse_locomo_date(s: &str) -> Option { + use chrono::{NaiveDateTime, TimeZone, Utc}; + // The dataset uses " on ". chrono's strftime + // %p needs uppercase AM/PM; LoCoMo uses lowercase. Normalise first. 
+ let normalised = s.replace(" am ", " AM ").replace(" pm ", " PM "); + NaiveDateTime::parse_from_str(&normalised, "%I:%M %p on %d %B, %Y") + .ok() + .and_then(|naive| Utc.from_local_datetime(&naive).single()) + .map(|dt| dt.timestamp()) +} + // --------------------------------------------------------------------------- // Conversion to eval cases // --------------------------------------------------------------------------- @@ -417,7 +440,11 @@ pub async fn run_locomo_eval(path: &Path) -> Result { title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -548,7 +575,11 @@ pub async fn run_locomo_eval_reranked( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -673,7 +704,11 @@ pub async fn run_locomo_eval_expanded( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -939,7 +974,11 @@ pub async fn run_locomo_eval_with_gate( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + 
.session_date + .as_deref() + .and_then(parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -1415,4 +1454,59 @@ mod tests { // Verify delta printing is present assert!(text.contains("->")); } + + #[test] + fn test_parse_locomo_date() { + let ts = super::parse_locomo_date("1:56 pm on 8 May, 2023").expect("should parse"); + // 2023-05-08 13:56 UTC = 1683554160 + assert_eq!(ts, 1_683_554_160); + } + + #[test] + fn test_parse_locomo_date_garbage_returns_none() { + assert!(super::parse_locomo_date("nonsense").is_none()); + } + + #[tokio::test] + async fn test_locomo_seed_propagates_session_date() { + use crate::sources::RawDocument; + + let last_modified = super::parse_locomo_date("1:56 pm on 8 May, 2023").unwrap(); + + let tmp = tempfile::tempdir().unwrap(); + let db = + crate::db::MemoryDB::new(tmp.path(), std::sync::Arc::new(crate::events::NoopEmitter)) + .await + .unwrap(); + + db.upsert_documents(vec![RawDocument { + content: "Alice told Bob she moved to Tokyo".to_string(), + source_id: "locomo/sample1/dia1".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified, + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]) + .await + .unwrap(); + + let results = db + .search_memory( + "Tokyo", + 5, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert!(!results.is_empty()); + assert_eq!(results[0].last_modified, 1_683_554_160); + assert_eq!(results[0].created_at, 1_683_554_160); + } } From cf27c162f234ac22f2b7fba4227e10e56d866813 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 15:26:20 -0700 Subject: [PATCH 05/13] fix: date-aware seeds + context lines for the E2E answer flow Threads session dates into the 5 remaining RawDocument seed sites in answer_quality.rs and pipeline.rs (covers both LoCoMo and LongMemEval E2E and pipeline runners). 
Adds eval::shared::format_ymd and rewrites generate_e2e_answers_for_question's context to emit 'On YYYY-MM-DD: ...' lines so the LLM judge can reason about temporal questions. Targets the temporal-reasoning weakness on both benchmarks (LME-TR 42.1%, LoCoMo-temporal 1.6% pre-change). Co-Authored-By: Claude Sonnet 4.6 --- crates/origin-core/src/eval/answer_quality.rs | 58 +++++++++++++++---- crates/origin-core/src/eval/pipeline.rs | 12 +++- crates/origin-core/src/eval/shared.rs | 20 +++++++ 3 files changed, 76 insertions(+), 14 deletions(-) diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index 84084efe..29fac0c8 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -484,7 +484,11 @@ pub async fn run_e2e_locomo_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(crate::eval::locomo::parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -729,15 +733,23 @@ async fn generate_e2e_answers_for_question( .await?; let flat_context: String = flat_results .iter() - .enumerate() - .map(|(i, r)| format!("{}. 
{}", i + 1, r.content)) + .map(|r| { + format!( + "On {}: {}", + crate::eval::shared::format_ymd(r.last_modified), + r.content + ) + }) .collect::>() .join("\n"); let flat_tokens = count_tokens(&flat_context); let flat_request = LlmRequest { system_prompt: Some(system_prompt.clone()), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", flat_context, question), + user_prompt: format!( + "Context (each line prefixed with the date the memory was recorded):\n{}\n\nQuestion: {}", + flat_context, question + ), max_tokens: 200, temperature: 0.1, label: Some("e2e_flat".to_string()), @@ -770,8 +782,12 @@ async fn generate_e2e_answers_for_question( // Memory search results if !flat_results.is_empty() { structured_parts.push("## Relevant Memories".to_string()); - for (i, r) in flat_results.iter().enumerate() { - structured_parts.push(format!("{}. {}", i + 1, r.content)); + for r in flat_results.iter() { + structured_parts.push(format!( + "On {}: {}", + crate::eval::shared::format_ymd(r.last_modified), + r.content + )); } } @@ -780,7 +796,10 @@ async fn generate_e2e_answers_for_question( let structured_request = LlmRequest { system_prompt: Some(system_prompt), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", structured_context, question), + user_prompt: format!( + "Context (each line prefixed with the date the memory was recorded; concept articles are time-spanning):\n{}\n\nQuestion: {}", + structured_context, question + ), max_tokens: 200, temperature: 0.1, label: Some("e2e_structured".to_string()), @@ -859,7 +878,11 @@ pub async fn run_e2e_context_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(crate::eval::locomo::parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -1007,7 +1030,11 @@ pub async 
fn run_e2e_context_eval_longmemeval( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(crate::eval::longmemeval::parse_lme_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -1139,8 +1166,8 @@ async fn build_structured_context( } if !results.is_empty() { parts.push("## Relevant Memories".to_string()); - for (i, r) in results.iter().enumerate() { - parts.push(format!("{}. {}", i + 1, r.content)); + for r in results.iter() { + parts.push(format!("On {}: {}", crate::eval::shared::format_ymd(r.last_modified), r.content)); } } let structured_context = parts.join("\n\n"); @@ -1629,4 +1656,11 @@ pub async fn run_fullpipeline_lme_batch( Ok(finished_tuples) } -// ===== Flat cache loaders ===== +#[cfg(test)] +mod tests { + #[tokio::test] + async fn test_format_ymd_used_in_context() { + assert_eq!(crate::eval::shared::format_ymd(1_681_168_020), "2023-04-10"); + assert_eq!(crate::eval::shared::format_ymd(1_683_554_160), "2023-05-08"); + } +} diff --git a/crates/origin-core/src/eval/pipeline.rs b/crates/origin-core/src/eval/pipeline.rs index 76ff2e50..d05f8218 100644 --- a/crates/origin-core/src/eval/pipeline.rs +++ b/crates/origin-core/src/eval/pipeline.rs @@ -471,7 +471,11 @@ pub async fn run_locomo_pipeline_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + .as_deref() + .and_then(crate::eval::locomo::parse_locomo_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); @@ -788,7 +792,11 @@ pub async fn run_longmemeval_pipeline_eval( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: mem + .session_date + 
.as_deref() + .and_then(crate::eval::longmemeval::parse_lme_date) + .unwrap_or_else(|| chrono::Utc::now().timestamp()), ..Default::default() }) .collect(); diff --git a/crates/origin-core/src/eval/shared.rs b/crates/origin-core/src/eval/shared.rs index d4daddeb..88d16570 100644 --- a/crates/origin-core/src/eval/shared.rs +++ b/crates/origin-core/src/eval/shared.rs @@ -650,3 +650,23 @@ pub async fn run_concept_distillation_batch_api( eprintln!("[batch_distill] Distilled {} concepts", distilled); Ok(distilled) } + +/// Format a unix-seconds timestamp as ISO-8601 calendar date "YYYY-MM-DD" in UTC. +/// Returns "unknown date" on conversion failure (e.g. malformed timestamp). +pub fn format_ymd(ts: i64) -> String { + use chrono::{TimeZone, Utc}; + Utc.timestamp_opt(ts, 0) + .single() + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "unknown date".to_string()) +} + +#[cfg(test)] +mod format_ymd_tests { + #[test] + fn test_format_ymd_round_trip() { + assert_eq!(super::format_ymd(1_681_168_020), "2023-04-10"); + assert_eq!(super::format_ymd(1_683_554_160), "2023-05-08"); + assert_eq!(super::format_ymd(0), "1970-01-01"); + } +} From 170bf709400ec58f9e6da65cd14d1294035c2761 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 15:39:22 -0700 Subject: [PATCH 06/13] fix: consolidate date helpers in eval/dates.rs + extract seed_last_modified Code-review followup: - Moves parse_lme_date, parse_locomo_date, and format_ymd to a single dates.rs module instead of three different homes. - Replaces the 13 verbatim copies of the '.as_deref().and_then(parser).unwrap_or_else(|| now())' chain with one seed_last_modified helper. No behavior change. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/origin-core/src/eval/answer_quality.rs | 27 ++-- crates/origin-core/src/eval/dates.rs | 125 ++++++++++++++++++ crates/origin-core/src/eval/locomo.rs | 54 ++------ crates/origin-core/src/eval/longmemeval.rs | 66 ++------- crates/origin-core/src/eval/mod.rs | 1 + crates/origin-core/src/eval/pipeline.rs | 18 ++- crates/origin-core/src/eval/shared.rs | 24 +--- 7 files changed, 171 insertions(+), 144 deletions(-) create mode 100644 crates/origin-core/src/eval/dates.rs diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index 29fac0c8..a0ae9744 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -484,11 +484,10 @@ pub async fn run_e2e_locomo_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: mem - .session_date - .as_deref() - .and_then(crate::eval::locomo::parse_locomo_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::dates::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -878,11 +877,10 @@ pub async fn run_e2e_context_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: mem - .session_date - .as_deref() - .and_then(crate::eval::locomo::parse_locomo_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::dates::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -1030,11 +1028,10 @@ pub async fn run_e2e_context_eval_longmemeval( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: mem - .session_date - 
.as_deref() - .and_then(crate::eval::longmemeval::parse_lme_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::dates::parse_lme_date, + ), ..Default::default() }) .collect(); diff --git a/crates/origin-core/src/eval/dates.rs b/crates/origin-core/src/eval/dates.rs new file mode 100644 index 00000000..7e9ee669 --- /dev/null +++ b/crates/origin-core/src/eval/dates.rs @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: AGPL-3.0-only +//! Date helpers shared across eval benchmark adapters. +//! +//! Centralises the three date utilities that were previously scattered across +//! `locomo`, `longmemeval`, and `shared`: +//! +//! - [`parse_locomo_date`] — LoCoMo session timestamps ("1:56 pm on 8 May, 2023") +//! - [`parse_lme_date`] — LongMemEval session timestamps ("2023/04/10 (Mon) 23:07") +//! - [`format_ymd`] — Unix-seconds → "YYYY-MM-DD" formatting +//! - [`seed_last_modified`] — resolve a benchmark chunk's `last_modified` field + +/// Parse a LoCoMo session date like "1:56 pm on 8 May, 2023" into Unix seconds. +/// Returns `None` on parse failure (caller falls back to `now()`). +pub(crate) fn parse_locomo_date(s: &str) -> Option { + use chrono::{NaiveDateTime, TimeZone, Utc}; + // The dataset uses " on ". chrono's strftime + // %p needs uppercase AM/PM; LoCoMo uses lowercase. Normalise first. + let normalised = s.replace(" am ", " AM ").replace(" pm ", " PM "); + NaiveDateTime::parse_from_str(&normalised, "%I:%M %p on %d %B, %Y") + .ok() + .and_then(|naive| Utc.from_local_datetime(&naive).single()) + .map(|dt| dt.timestamp()) +} + +/// Parse a LongMemEval `question_date` / `haystack_date` into Unix seconds. +/// Format example: "2023/04/10 (Mon) 23:07". Returns `None` on parse failure +/// (e.g. dataset variants with different formats -- caller falls back to `now()`). 
+pub(crate) fn parse_lme_date(s: &str) -> Option { + use chrono::{NaiveDateTime, TimeZone, Utc}; + // Strip the weekday tag in parens: "2023/04/10 (Mon) 23:07" -> "2023/04/10 23:07" + let cleaned: String = s + .split_whitespace() + .filter(|tok| !(tok.starts_with('(') && tok.ends_with(')'))) + .collect::>() + .join(" "); + NaiveDateTime::parse_from_str(&cleaned, "%Y/%m/%d %H:%M") + .ok() + .and_then(|naive| Utc.from_local_datetime(&naive).single()) + .map(|dt| dt.timestamp()) +} + +/// Format a unix-seconds timestamp as ISO-8601 calendar date "YYYY-MM-DD" in UTC. +/// Returns "unknown date" on conversion failure (e.g. malformed timestamp). +pub fn format_ymd(ts: i64) -> String { + use chrono::{TimeZone, Utc}; + Utc.timestamp_opt(ts, 0) + .single() + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| "unknown date".to_string()) +} + +/// Resolve `last_modified` for a benchmark-seeded chunk: parse the per-session +/// date string with `parser` if present, else fall back to `now()` (used for +/// noise / undated entries). 
+pub(crate) fn seed_last_modified(date: Option<&str>, parser: fn(&str) -> Option) -> i64 { + date.and_then(parser) + .unwrap_or_else(|| chrono::Utc::now().timestamp()) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── parse_locomo_date ──────────────────────────────────────────────────── + + #[test] + fn test_parse_locomo_date() { + let ts = parse_locomo_date("1:56 pm on 8 May, 2023").expect("should parse"); + // 2023-05-08 13:56 UTC = 1683554160 + assert_eq!(ts, 1_683_554_160); + } + + #[test] + fn test_parse_locomo_date_garbage_returns_none() { + assert!(parse_locomo_date("nonsense").is_none()); + } + + // ── parse_lme_date ─────────────────────────────────────────────────────── + + #[test] + fn test_parse_lme_date_round_trip() { + let ts = parse_lme_date("2023/04/10 (Mon) 23:07").expect("should parse"); + // 2023-04-10 23:07 UTC == 1681168020 + assert_eq!(ts, 1_681_168_020); + } + + #[test] + fn test_parse_lme_date_garbage_returns_none() { + assert!(parse_lme_date("not a date").is_none()); + assert!(parse_lme_date("").is_none()); + } + + // ── format_ymd ─────────────────────────────────────────────────────────── + + #[test] + fn test_format_ymd_round_trip() { + assert_eq!(format_ymd(1_681_168_020), "2023-04-10"); + assert_eq!(format_ymd(1_683_554_160), "2023-05-08"); + assert_eq!(format_ymd(0), "1970-01-01"); + } + + // ── seed_last_modified ─────────────────────────────────────────────────── + + #[test] + fn test_seed_last_modified_parses_when_date_present() { + let ts = seed_last_modified(Some("2023/04/10 (Mon) 23:07"), parse_lme_date); + assert_eq!(ts, 1_681_168_020); + } + + #[test] + fn test_seed_last_modified_falls_back_to_now_when_date_missing() { + let before = chrono::Utc::now().timestamp(); + let ts = seed_last_modified(None, parse_lme_date); + let after = chrono::Utc::now().timestamp(); + assert!(ts >= before && ts <= after); + } + + #[test] + fn test_seed_last_modified_falls_back_when_parser_rejects() { + let before = 
chrono::Utc::now().timestamp(); + let ts = seed_last_modified(Some("malformed"), parse_lme_date); + let after = chrono::Utc::now().timestamp(); + assert!(ts >= before && ts <= after); + } +} diff --git a/crates/origin-core/src/eval/locomo.rs b/crates/origin-core/src/eval/locomo.rs index ad9bd66e..ba23a246 100644 --- a/crates/origin-core/src/eval/locomo.rs +++ b/crates/origin-core/src/eval/locomo.rs @@ -18,6 +18,11 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::path::Path; +// Bring date helpers into scope for use within this module. +use crate::eval::dates::seed_last_modified; +// Re-export so external callers using `crate::eval::locomo::parse_locomo_date` still compile. +pub(crate) use crate::eval::dates::parse_locomo_date; + // --------------------------------------------------------------------------- // Data structures // --------------------------------------------------------------------------- @@ -144,19 +149,6 @@ fn parse_session_num(key: &str) -> Option { num_str.parse().ok() } -/// Parse a LoCoMo session date like "1:56 pm on 8 May, 2023" into Unix seconds. -/// Returns None on parse failure (caller falls back to now()). -pub(crate) fn parse_locomo_date(s: &str) -> Option { - use chrono::{NaiveDateTime, TimeZone, Utc}; - // The dataset uses " on ". chrono's strftime - // %p needs uppercase AM/PM; LoCoMo uses lowercase. Normalise first. 
- let normalised = s.replace(" am ", " AM ").replace(" pm ", " PM "); - NaiveDateTime::parse_from_str(&normalised, "%I:%M %p on %d %B, %Y") - .ok() - .and_then(|naive| Utc.from_local_datetime(&naive).single()) - .map(|dt| dt.timestamp()) -} - // --------------------------------------------------------------------------- // Conversion to eval cases // --------------------------------------------------------------------------- @@ -440,11 +432,7 @@ pub async fn run_locomo_eval(path: &Path) -> Result { title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: mem - .session_date - .as_deref() - .and_then(parse_locomo_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -575,11 +563,7 @@ pub async fn run_locomo_eval_reranked( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: mem - .session_date - .as_deref() - .and_then(parse_locomo_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -704,11 +688,7 @@ pub async fn run_locomo_eval_expanded( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: mem - .session_date - .as_deref() - .and_then(parse_locomo_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -974,11 +954,7 @@ pub async fn run_locomo_eval_with_gate( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: 
Some("conversation".to_string()), - last_modified: mem - .session_date - .as_deref() - .and_then(parse_locomo_date) - .unwrap_or_else(|| chrono::Utc::now().timestamp()), + last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -1455,18 +1431,6 @@ mod tests { assert!(text.contains("->")); } - #[test] - fn test_parse_locomo_date() { - let ts = super::parse_locomo_date("1:56 pm on 8 May, 2023").expect("should parse"); - // 2023-05-08 13:56 UTC = 1683554160 - assert_eq!(ts, 1_683_554_160); - } - - #[test] - fn test_parse_locomo_date_garbage_returns_none() { - assert!(super::parse_locomo_date("nonsense").is_none()); - } - #[tokio::test] async fn test_locomo_seed_propagates_session_date() { use crate::sources::RawDocument; diff --git a/crates/origin-core/src/eval/longmemeval.rs b/crates/origin-core/src/eval/longmemeval.rs index b9105fa1..a71d7c35 100644 --- a/crates/origin-core/src/eval/longmemeval.rs +++ b/crates/origin-core/src/eval/longmemeval.rs @@ -30,22 +30,10 @@ use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::path::Path; -/// Parse a LongMemEval `question_date` / `haystack_date` into Unix seconds. -/// Format example: "2023/04/10 (Mon) 23:07". Returns None on parse failure -/// (e.g. dataset variants with different formats -- caller falls back to `now()`). -pub(crate) fn parse_lme_date(s: &str) -> Option { - use chrono::{NaiveDateTime, TimeZone, Utc}; - // Strip the weekday tag in parens: "2023/04/10 (Mon) 23:07" -> "2023/04/10 23:07" - let cleaned: String = s - .split_whitespace() - .filter(|tok| !(tok.starts_with('(') && tok.ends_with(')'))) - .collect::>() - .join(" "); - NaiveDateTime::parse_from_str(&cleaned, "%Y/%m/%d %H:%M") - .ok() - .and_then(|naive| Utc.from_local_datetime(&naive).single()) - .map(|dt| dt.timestamp()) -} +// Bring date helpers into scope for use within this module. 
+use crate::eval::dates::seed_last_modified; +// Re-export so external callers using `crate::eval::longmemeval::parse_lme_date` still compile. +pub(crate) use crate::eval::dates::parse_lme_date; // --------------------------------------------------------------------------- // Data structures (matches the JSON schema from HuggingFace) @@ -443,11 +431,7 @@ pub async fn run_longmemeval_eval(path: &Path) -> Result usize { BPE.encode_with_special_tokens(text).len() } +/// Format a unix-seconds timestamp as ISO-8601 calendar date "YYYY-MM-DD" in UTC. +/// Re-exported from [`crate::eval::dates`] for backward compatibility. +pub use crate::eval::dates::format_ymd; + /// Probe on-device batch extraction at different batch sizes. /// Returns vec of (batch_size, input_tokens, response_len, entities_found, observations_found). pub async fn probe_extraction_batch_sizes( @@ -650,23 +654,3 @@ pub async fn run_concept_distillation_batch_api( eprintln!("[batch_distill] Distilled {} concepts", distilled); Ok(distilled) } - -/// Format a unix-seconds timestamp as ISO-8601 calendar date "YYYY-MM-DD" in UTC. -/// Returns "unknown date" on conversion failure (e.g. malformed timestamp). -pub fn format_ymd(ts: i64) -> String { - use chrono::{TimeZone, Utc}; - Utc.timestamp_opt(ts, 0) - .single() - .map(|dt| dt.format("%Y-%m-%d").to_string()) - .unwrap_or_else(|| "unknown date".to_string()) -} - -#[cfg(test)] -mod format_ymd_tests { - #[test] - fn test_format_ymd_round_trip() { - assert_eq!(super::format_ymd(1_681_168_020), "2023-04-10"); - assert_eq!(super::format_ymd(1_683_554_160), "2023-05-08"); - assert_eq!(super::format_ymd(0), "1970-01-01"); - } -} From dc8c3cdb42f84fb8c7ae9c1510ce5cb6306bc23e Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Sun, 26 Apr 2026 15:51:13 -0700 Subject: [PATCH 07/13] fix: derive Default on SearchResult + thread question_date to LLM Two adversarial-review followups before merge: 1. 
SearchResult was missing a Default derive; downstream MIT crates (origin-mcp) construct it field-by-field and would break when origin-types 0.1.5 publishes with the new created_at i64. Adding the derive lets them spread defaults. 2. LongMemEval per-question dates (sample.question_date) were not reaching the LLM. The seeded memories now carry session dates thanks to earlier commits, but without a 'today' anchor the LLM cannot ground relative phrases ('yesterday', 'a week ago') in the question. generate_e2e_answers_for_question now accepts Option<&str> and prepends 'The question was asked on X.' to the system prompt. LoCoMo passes None (no per-question date in dataset). Co-Authored-By: Claude Sonnet 4.6 --- crates/origin-core/src/eval/answer_quality.rs | 52 +++++++++++++++++-- crates/origin-types/src/memory.rs | 2 +- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index a0ae9744..3d4d25d7 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -708,12 +708,22 @@ async fn generate_e2e_answers_for_question( category: &str, search_limit: usize, llm: &Arc, + question_date: Option<&str>, ) -> Result, OriginError> { use crate::llm_provider::{strip_think_tags, LlmRequest}; - let system_prompt = "Answer the question using only the provided context. \ - Be specific and concise. Respond in 1-3 sentences." - .to_string(); + // If we have a question_date (LongMemEval), prepend it to the system prompt + // so the LLM has a "today" anchor for relative time references in the question. + let system_prompt = match question_date { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences." 
+ .to_string(), + }; let mut tuples = Vec::new(); @@ -924,6 +934,7 @@ pub async fn run_e2e_context_eval( category, search_limit, &llm, + None, ) .await { @@ -1061,6 +1072,7 @@ pub async fn run_e2e_context_eval_longmemeval( category, search_limit, &llm, + Some(&sample.question_date), ) .await { @@ -1660,4 +1672,38 @@ mod tests { assert_eq!(crate::eval::shared::format_ymd(1_681_168_020), "2023-04-10"); assert_eq!(crate::eval::shared::format_ymd(1_683_554_160), "2023-05-08"); } + + #[test] + fn test_system_prompt_includes_question_date_when_provided() { + // Mirror the system_prompt construction from generate_e2e_answers_for_question + // to lock in the format. If the function changes, this test should reflect. + let with_date = match Some("2023/04/10 (Mon) 23:07") { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences." + .to_string(), + }; + assert!(with_date.contains("The question was asked on 2023/04/10")); + assert!(with_date.contains("only the provided context")); + } + + #[test] + fn test_system_prompt_omits_when_no_question_date() { + let without_date: String = match None::<&str> { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. \ + Be specific and concise. Respond in 1-3 sentences." 
+ .to_string(), + }; + assert!(!without_date.contains("question was asked on")); + assert!(without_date.contains("only the provided context")); + } } diff --git a/crates/origin-types/src/memory.rs b/crates/origin-types/src/memory.rs index 2b5fd650..5d180452 100644 --- a/crates/origin-types/src/memory.rs +++ b/crates/origin-types/src/memory.rs @@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize}; /// A search result from hybrid (vector + FTS) search. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct SearchResult { pub id: String, pub content: String, From 6111f8360e16294e79cb83201720c622dd44b7bb Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Mon, 27 Apr 2026 06:57:10 -0700 Subject: [PATCH 08/13] fix: add LongMemEval Claude CLI eval entry points Mirror the LoCoMo _api pattern for LongMemEval: ClaudeCliProvider::haiku() for answers (no API key, uses Max plan via OAuth) and judge_with_claude_model 'haiku' for judging. Same answer/judge model on both sides keeps LME and LoCoMo numbers comparable. Both new test entry points exercise the dated-context formatter and the question_date system-prompt anchor introduced earlier in this branch: - generate_e2e_context_tuples_longmemeval_api - judge_e2e_context_longmemeval_api_haiku --- app/tests/eval_harness.rs | 84 +++++++++++++++++++ crates/origin-core/src/eval/answer_quality.rs | 6 +- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/app/tests/eval_harness.rs b/app/tests/eval_harness.rs index a1ff9543..21dd8987 100644 --- a/app/tests/eval_harness.rs +++ b/app/tests/eval_harness.rs @@ -1579,6 +1579,90 @@ async fn judge_e2e_context_locomo() { eprintln!("\nTotal judged: {}", report.total_judged); } +/// Generate LongMemEval E2E answers via Claude CLI Haiku (Max plan, no API key). +/// +/// Mirrors `generate_e2e_context_tuples_locomo_api` for the LME side. 
Uses the +/// same `run_e2e_context_eval_longmemeval` path that exercises Task #11's +/// dated-context formatter and `question_date` system-prompt anchor. +#[tokio::test] +#[ignore] +async fn generate_e2e_context_tuples_longmemeval_api() { + use origin_lib::eval::token_efficiency::{ + run_e2e_context_eval_longmemeval, save_judgment_tuples, + }; + use std::sync::Arc; + + let path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/data/longmemeval_oracle.json"); + if !path.exists() { + eprintln!("SKIP: longmemeval_oracle.json not found"); + return; + } + + let llm: Arc = + Arc::new(origin_lib::llm_provider::ClaudeCliProvider::haiku()); + + // 50 questions for validation, 1 answer per question. + let tuples = run_e2e_context_eval_longmemeval(&path, llm, 10, 50, 1) + .await + .expect("run_e2e_context_eval_longmemeval with Haiku CLI failed"); + + eprintln!("Generated {} judgment tuples (Haiku CLI)", tuples.len()); + assert!(!tuples.is_empty()); + + let baselines_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/baselines"); + std::fs::create_dir_all(&baselines_dir).ok(); + let out_path = baselines_dir.join("e2e_context_tuples_longmemeval_api.json"); + save_judgment_tuples(&tuples, &out_path).expect("save tuples"); + eprintln!("Saved to {:?}", out_path); +} + +/// Judge LongMemEval API-generated tuples with Claude Haiku (matches the answer model). +/// Run after `generate_e2e_context_tuples_longmemeval_api`. 
+#[tokio::test] +#[ignore] +async fn judge_e2e_context_longmemeval_api_haiku() { + use origin_lib::eval::token_efficiency::{ + aggregate_judgments, judge_with_claude_model, load_judgment_tuples, + }; + + let tuples_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("eval/baselines/e2e_context_tuples_longmemeval_api.json"); + if !tuples_path.exists() { + eprintln!("SKIP: run generate_e2e_context_tuples_longmemeval_api first"); + return; + } + + let tuples = load_judgment_tuples(&tuples_path).expect("load tuples"); + eprintln!("Judging {} tuples with Haiku...", tuples.len()); + + let results = judge_with_claude_model(&tuples, 3, "haiku") + .await + .expect("judge failed"); + + let report = aggregate_judgments(&results, "haiku"); + eprintln!("\n=== E2E Context Eval: LongMemEval (Haiku answers, Haiku judge) ==="); + eprintln!( + "{:<25} | {:<10} | {:<10} | {:<14} | Total", + "Approach", "Accuracy", "Correct", "Context Tok" + ); + eprintln!( + "{:-<25}-+-{:-<10}-+-{:-<10}-+-{:-<14}-+-{:-<6}", + "", "", "", "", "" + ); + for r in &report.results_by_approach { + eprintln!( + "{:<25} | {:<10.1}% | {:<10} | {:<14.0} | {}", + r.approach, + r.accuracy * 100.0, + r.correct, + r.mean_context_tokens, + r.total + ); + } + eprintln!("\nTotal judged: {}", report.total_judged); +} + // --------------------------------------------------------------------------- // API-based E2E: Haiku as answer model, Sonnet as judge // --------------------------------------------------------------------------- diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index 3d4d25d7..f71c55f0 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -1176,7 +1176,11 @@ async fn build_structured_context( if !results.is_empty() { parts.push("## Relevant Memories".to_string()); for r in results.iter() { - parts.push(format!("On {}: {}", crate::eval::shared::format_ymd(r.last_modified), 
r.content)); + parts.push(format!( + "On {}: {}", + crate::eval::shared::format_ymd(r.last_modified), + r.content + )); } } let structured_context = parts.join("\n\n"); From da2f125c6356e2adecc4baf77bd07a8e47ce39d3 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Mon, 27 Apr 2026 21:32:47 -0700 Subject: [PATCH 09/13] fix: thread session/question dates into PR #29 fullpipeline batch paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adversarial review caught that PR #29's new run_fullpipeline_locomo_batch and run_fullpipeline_lme_batch seeded all memories with chrono::Utc::now() and used a date-blind system prompt — silently nullifying the temporal metadata work for the fullpipeline path (every memory would have been prefixed "On 2026-04-27" regardless of original session date). Fixes: - Extract build_e2e_system_prompt(question_date) helper; refactor generate_e2e_answers_for_question to use it. - Thread mem.session_date through seed_last_modified at both batch seed sites (LoCoMo: parse_locomo_date, LME: parse_lme_date). - Thread question_date into batch system prompts: LoCoMo uses the latest session_date in the sample (questions follow the conversation); LME uses sample.question_date directly. - seed_last_modified now log::warn!s on parse failures so future silent regressions to now() are visible in eval logs.
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/origin-core/src/eval/answer_quality.rs | 50 ++++++++++++------- crates/origin-core/src/eval/dates.rs | 14 +++++- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index f71c55f0..93e0a226 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -712,18 +712,7 @@ async fn generate_e2e_answers_for_question( ) -> Result, OriginError> { use crate::llm_provider::{strip_think_tags, LlmRequest}; - // If we have a question_date (LongMemEval), prepend it to the system prompt - // so the LLM has a "today" anchor for relative time references in the question. - let system_prompt = match question_date { - Some(d) => format!( - "The question was asked on {}. Answer the question using only the provided context. \ - Be specific and concise. Respond in 1-3 sentences.", - d - ), - None => "Answer the question using only the provided context. \ - Be specific and concise. Respond in 1-3 sentences." - .to_string(), - }; + let system_prompt = build_e2e_system_prompt(question_date); let mut tuples = Vec::new(); @@ -1109,10 +1098,23 @@ struct PendingAnswer { context_tokens: usize, } -/// System prompt used for all E2E answer generation. +/// System prompt used for all E2E answer generation (no date anchor). const E2E_SYSTEM_PROMPT: &str = "Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences."; +/// Build the E2E system prompt, prepending the question's "asked on" date when available +/// so the LLM has a temporal anchor for relative-time references in the question. +fn build_e2e_system_prompt(question_date: Option<&str>) -> String { + match question_date { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. \ + Be specific and concise. 
Respond in 1-3 sentences.", + d + ), + None => E2E_SYSTEM_PROMPT.to_string(), + } +} + /// Build structured context for a question against an enriched DB. /// /// Returns the structured context: search_memory results + concept articles. @@ -1273,7 +1275,10 @@ pub async fn run_fullpipeline_locomo_batch( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::locomo::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -1323,6 +1328,14 @@ pub async fn run_fullpipeline_locomo_batch( for sample in &samples { let mut q_count = 0usize; + // Question "asked on" = latest session date in this sample (questions follow the conversation). + // Skipped when no session_date can be parsed; falls back to the date-blind prompt. + let sample_question_date: Option = extract_observations(sample) + .iter() + .filter_map(|m| m.session_date.clone()) + .filter(|d| crate::eval::locomo::parse_locomo_date(d).is_some()) + .max_by_key(|d| crate::eval::locomo::parse_locomo_date(d).unwrap_or(0)); + for qa in &sample.qa { if qa.category == 5 { continue; @@ -1347,7 +1360,7 @@ pub async fn run_fullpipeline_locomo_batch( batch_requests.push(( req_id.clone(), format!("Context:\n{}\n\nQuestion: {}", ctx, qa.question), - Some(E2E_SYSTEM_PROMPT.to_string()), + Some(build_e2e_system_prompt(sample_question_date.as_deref())), 200, )); pending.insert( @@ -1517,7 +1530,10 @@ pub async fn run_fullpipeline_lme_batch( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::longmemeval::parse_lme_date, + ), ..Default::default() }) .collect(); @@ -1581,7 +1597,7 @@ pub async fn run_fullpipeline_lme_batch( 
batch_requests.push(( req_id.clone(), format!("Context:\n{}\n\nQuestion: {}", ctx, sample.question), - Some(E2E_SYSTEM_PROMPT.to_string()), + Some(build_e2e_system_prompt(Some(&sample.question_date))), 200, )); pending.insert( diff --git a/crates/origin-core/src/eval/dates.rs b/crates/origin-core/src/eval/dates.rs index 7e9ee669..9128ca14 100644 --- a/crates/origin-core/src/eval/dates.rs +++ b/crates/origin-core/src/eval/dates.rs @@ -52,9 +52,19 @@ pub fn format_ymd(ts: i64) -> String { /// Resolve `last_modified` for a benchmark-seeded chunk: parse the per-session /// date string with `parser` if present, else fall back to `now()` (used for /// noise / undated entries). +/// +/// Logs a warning when a non-empty date string fails to parse, so silent +/// degradation to today's date is visible in eval logs. pub(crate) fn seed_last_modified(date: Option<&str>, parser: fn(&str) -> Option) -> i64 { - date.and_then(parser) - .unwrap_or_else(|| chrono::Utc::now().timestamp()) + if let Some(s) = date { + if let Some(ts) = parser(s) { + return ts; + } + log::warn!( + "[eval:dates] failed to parse date {s:?}; falling back to now() — temporal accuracy lost" + ); + } + chrono::Utc::now().timestamp() } #[cfg(test)] From ede13a1e1204feeefc1b75168acbb632e3dc4c57 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Mon, 27 Apr 2026 22:36:25 -0700 Subject: [PATCH 10/13] fix: route CLI eval through date-aware generate_e2e_answers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, the Claude CLI eval entry points (generate_e2e_context_tuples_locomo_api, _longmemeval_api) call a duplicate generate_e2e_answers_for_question in token_efficiency.rs that predates the temporal work — no date prefixes, no question_date in the system prompt, seeds with chrono::Utc::now(). The CLI eval would have shown zero lift on temporal categories regardless of how the rest of the pipeline handles dates. - Delete the duplicate at token_efficiency.rs:4472-4572. 
- Make crate::eval::answer_quality::generate_e2e_answers_for_question pub(crate) and route both callers through it. - run_e2e_context_eval (LoCoMo CLI path): seed via seed_last_modified with parse_locomo_date; compute sample_question_date as the latest parseable session_date in the sample (questions follow the conversation in LoCoMo); pass through to the answer call. - run_e2e_context_eval_longmemeval (LME CLI path): seed via seed_last_modified with parse_lme_date; pass sample.question_date directly to the answer call. The remaining 6 chrono::Utc::now() seed sites in this file (run_e2e_locomo_eval, run_locomo_pipeline_eval, *_pipeline_eval, run_context_path_eval*) are not on the CLI eval critical path; punt. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/origin-core/src/eval/answer_quality.rs | 2 +- .../origin-core/src/eval/token_efficiency.rs | 138 +++--------------- 2 files changed, 23 insertions(+), 117 deletions(-) diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index 93e0a226..9300302d 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -701,7 +701,7 @@ pub async fn run_e2e_locomo_eval( /// /// Requires enrichment + distillation to be run first (concepts must exist). /// Call this after seeding + enriching a DB, or use the all-in-one wrapper. 
-async fn generate_e2e_answers_for_question( +pub(crate) async fn generate_e2e_answers_for_question( db: &MemoryDB, question: &str, ground_truth: &str, diff --git a/crates/origin-core/src/eval/token_efficiency.rs b/crates/origin-core/src/eval/token_efficiency.rs index 816cf0fb..0aeb490f 100644 --- a/crates/origin-core/src/eval/token_efficiency.rs +++ b/crates/origin-core/src/eval/token_efficiency.rs @@ -4458,118 +4458,8 @@ pub async fn run_context_path_eval( } // ===== E2E Answer Quality: flat vs structured context with LLM-as-judge ===== - -/// Run E2E answer quality comparison: flat (search_memory) vs structured (search + concepts). -/// -/// For each LoCoMo question: -/// 1. Build flat context: search_memory top-K concatenated -/// 2. Build structured context: search_memory + concept articles (like chat-context) -/// 3. Generate answers from both contexts using on-device LLM -/// 4. Return JudgmentTuples for offline Claude Haiku judging -/// -/// Requires enrichment + distillation to be run first (concepts must exist). -/// Call this after seeding + enriching a DB, or use the all-in-one wrapper. -async fn generate_e2e_answers_for_question( - db: &MemoryDB, - question: &str, - ground_truth: &str, - category: &str, - search_limit: usize, - llm: &Arc, -) -> Result, OriginError> { - use crate::llm_provider::{strip_think_tags, LlmRequest}; - - let system_prompt = "Answer the question using only the provided context. \ - Be specific and concise. Respond in 1-3 sentences." - .to_string(); - - let mut tuples = Vec::new(); - - // --- Flat context: search_memory only --- - let flat_results = db - .search_memory( - question, - search_limit, - None, - Some("conversation"), - None, - None, - None, - None, - ) - .await?; - let flat_context: String = flat_results - .iter() - .enumerate() - .map(|(i, r)| format!("{}. 
{}", i + 1, r.content)) - .collect::>() - .join("\n"); - let flat_tokens = count_tokens(&flat_context); - - let flat_request = LlmRequest { - system_prompt: Some(system_prompt.clone()), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", flat_context, question), - max_tokens: 200, - temperature: 0.1, - label: Some("e2e_flat".to_string()), - }; - if let Ok(raw) = llm.generate(flat_request).await { - let answer = strip_think_tags(&raw); - tuples.push(JudgmentTuple { - question: question.to_string(), - ground_truth: ground_truth.to_string(), - approach: format!("flat_{}", category), - answer, - context_tokens: flat_tokens, - category: category.to_string(), - }); - } - - // --- Structured context: search_memory + concept articles --- - let mut structured_parts: Vec = Vec::new(); - - // Concept articles (like chat-context's "Compiled Knowledge" section) - let concepts = db.search_concepts(question, 3).await.unwrap_or_default(); - if !concepts.is_empty() { - structured_parts.push("## Compiled Knowledge".to_string()); - for c in &concepts { - let summary = c.summary.as_deref().unwrap_or(""); - structured_parts.push(format!("**{}**: {}\n{}", c.title, summary, c.content)); - } - } - - // Memory search results - if !flat_results.is_empty() { - structured_parts.push("## Relevant Memories".to_string()); - for (i, r) in flat_results.iter().enumerate() { - structured_parts.push(format!("{}. 
{}", i + 1, r.content)); - } - } - - let structured_context = structured_parts.join("\n\n"); - let structured_tokens = count_tokens(&structured_context); - - let structured_request = LlmRequest { - system_prompt: Some(system_prompt), - user_prompt: format!("Context:\n{}\n\nQuestion: {}", structured_context, question), - max_tokens: 200, - temperature: 0.1, - label: Some("e2e_structured".to_string()), - }; - if let Ok(raw) = llm.generate(structured_request).await { - let answer = strip_think_tags(&raw); - tuples.push(JudgmentTuple { - question: question.to_string(), - ground_truth: ground_truth.to_string(), - approach: format!("structured_{}", category), - answer, - context_tokens: structured_tokens, - category: category.to_string(), - }); - } - - Ok(tuples) -} +// `generate_e2e_answers_for_question` is the date-aware version in +// crate::eval::answer_quality; we use it for both LoCoMo and LongMemEval below. /// Run full E2E answer quality eval on LoCoMo: seed, enrich, distill, generate answers. /// @@ -4602,6 +4492,14 @@ pub async fn run_e2e_context_eval( continue; } + // Question "asked on" = latest session date in this sample (questions follow the + // conversation in LoCoMo). Falls back to None if no session_date parses. 
+ let sample_question_date: Option = memories + .iter() + .filter_map(|m| m.session_date.clone()) + .filter(|d| crate::eval::locomo::parse_locomo_date(d).is_some()) + .max_by_key(|d| crate::eval::locomo::parse_locomo_date(d).unwrap_or(0)); + eprintln!( "[e2e_context] Conv {}/{} ({}): {} observations", conv_idx + 1, @@ -4630,7 +4528,10 @@ pub async fn run_e2e_context_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::locomo::parse_locomo_date, + ), ..Default::default() }) .collect(); @@ -4667,13 +4568,14 @@ pub async fn run_e2e_context_eval( let category = category_name(qa.category); - match generate_e2e_answers_for_question( + match crate::eval::answer_quality::generate_e2e_answers_for_question( &db, &qa.question, &ground_truth, category, search_limit, &llm, + sample_question_date.as_deref(), ) .await { @@ -4778,7 +4680,10 @@ pub async fn run_e2e_context_eval_longmemeval( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: chrono::Utc::now().timestamp(), + last_modified: crate::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + crate::eval::longmemeval::parse_lme_date, + ), ..Default::default() }) .collect(); @@ -4801,13 +4706,14 @@ pub async fn run_e2e_context_eval_longmemeval( let category = category_name(&sample.question_type); - if let Ok(tuples) = generate_e2e_answers_for_question( + if let Ok(tuples) = crate::eval::answer_quality::generate_e2e_answers_for_question( &db, &sample.question, &ground_truth, category, search_limit, &llm, + Some(&sample.question_date), ) .await { From 6adb07810792cb6f83b6e42a184154555d3138bb Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Mon, 27 Apr 2026 23:16:12 -0700 Subject: [PATCH 11/13] fix: claude-cli scrubs ANTHROPIC_API_KEY + temporal A/B 
smoke MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ClaudeCliProvider::generate now does .env_remove("ANTHROPIC_API_KEY") before spawning `claude`. When the env var is set, the CLI routes through the paid API instead of the user's Max-plan OAuth, so an eval that intends to use Max would silently route through API credits and fail with "Credit balance is too low" if the API account has none. The CLI provider's whole purpose is Max OAuth, so always scrub. Add temporal_smoke_locomo_5q (#[ignore]'d) — A/B compares date-aware context (date prefix + question_date system anchor) vs date-blind context on 5 LoCoMo temporal questions, no enrichment, search_limit 30 with diagnostic logging of top-3 retrieval hits. Designed as a fast (~1-2 min) signal check before committing to full eval runs. Promote to pub: parse_locomo_date, parse_lme_date, seed_last_modified (integration tests in app/ live outside the crate and need pub). Co-Authored-By: Claude Opus 4.7 (1M context) --- app/tests/eval_harness.rs | 259 +++++++++++++++++++++ crates/origin-core/src/eval/dates.rs | 6 +- crates/origin-core/src/eval/locomo.rs | 2 +- crates/origin-core/src/eval/longmemeval.rs | 2 +- crates/origin-core/src/llm_provider.rs | 6 + 5 files changed, 270 insertions(+), 5 deletions(-) diff --git a/app/tests/eval_harness.rs b/app/tests/eval_harness.rs index 21dd8987..4ce7a5c2 100644 --- a/app/tests/eval_harness.rs +++ b/app/tests/eval_harness.rs @@ -2632,3 +2632,262 @@ async fn probe_overlap_gate() { } } } + +// --------------------------------------------------------------------------- +// Temporal smoke: A/B compare date-aware vs date-blind context on 5 LoCoMo +// temporal questions. No enrichment (search_memory works on vectors+FTS alone), +// so the run completes in ~2 min on Haiku CLI. Used to verify the temporal +// mechanisms before committing to the full 20- or 50-question eval. 
+// --------------------------------------------------------------------------- +#[tokio::test] +#[ignore] +async fn temporal_smoke_locomo_5q() { + use origin_core::events::NoopEmitter; + use origin_lib::llm_provider::{ClaudeCliProvider, LlmProvider, LlmRequest}; + use origin_lib::memory_db::MemoryDB; + use origin_lib::sources::RawDocument; + use std::sync::Arc; + + let locomo_path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/data/locomo10.json"); + if !locomo_path.exists() { + eprintln!("SKIP: locomo10.json not found"); + return; + } + + let samples = origin_lib::eval::locomo::load_locomo(&locomo_path).expect("load locomo"); + let sample = &samples[0]; + let memories = origin_lib::eval::locomo::extract_observations(sample); + eprintln!( + "[smoke] conv {} — {} observations across {} sessions", + sample.sample_id, + memories.len(), + memories + .iter() + .map(|m| m.session_num) + .collect::>() + .len() + ); + + // Seed with REAL session dates — date prefix and date-blind paths read the + // same DB; they differ only in how they render and prompt. + let tmp = tempfile::tempdir().expect("tempdir"); + let db = MemoryDB::new(tmp.path(), Arc::new(NoopEmitter)) + .await + .expect("db"); + let docs: Vec = memories + .iter() + .enumerate() + .map(|(i, mem)| RawDocument { + content: mem.content.clone(), + source_id: format!("locomo_{}_obs_{}", sample.sample_id, i), + source: "memory".to_string(), + title: format!("{} session {}", mem.speaker, mem.session_num), + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + last_modified: origin_lib::eval::dates::seed_last_modified( + mem.session_date.as_deref(), + origin_lib::eval::locomo::parse_locomo_date, + ), + ..Default::default() + }) + .collect(); + db.upsert_documents(docs).await.expect("upsert"); + + // Latest parseable session date = LoCoMo "asked on". 
+ let asked_on: Option = memories + .iter() + .filter_map(|m| m.session_date.clone()) + .filter(|d| origin_lib::eval::locomo::parse_locomo_date(d).is_some()) + .max_by_key(|d| origin_lib::eval::locomo::parse_locomo_date(d).unwrap_or(0)); + eprintln!("[smoke] asked_on (latest session): {:?}", asked_on); + + // First 5 temporal questions (category 2). + let temporal_qs: Vec<&origin_lib::eval::locomo::LocomoQA> = sample + .qa + .iter() + .filter(|qa| qa.category == 2) + .take(5) + .collect(); + + let llm: Arc = Arc::new(ClaudeCliProvider::haiku()); + + let mut results: Vec<(String, String, String, String)> = Vec::new(); // q, gt, A, B + for (i, qa) in temporal_qs.iter().enumerate() { + let gt = qa + .answer + .as_ref() + .map(|v| v.as_str().unwrap_or(&v.to_string()).to_string()) + .unwrap_or_default(); + if gt.is_empty() { + continue; + } + eprintln!("\n[smoke] Q{}: {}", i + 1, qa.question); + eprintln!(" GT: {}", gt); + + let hits = db + .search_memory( + &qa.question, + 30, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .expect("search"); + // Diagnostic: is the relevant content in top-30? + eprintln!(" retrieved {} hits; first 3 contents:", hits.len()); + for r in hits.iter().take(3) { + eprintln!( + " - {}", + r.content.chars().take(90).collect::() + ); + } + + // A: date-aware context + date-anchored system prompt + let ctx_a: String = hits + .iter() + .map(|r| { + format!( + "On {}: {}", + origin_lib::eval::shared::format_ymd(r.last_modified), + r.content + ) + }) + .collect::>() + .join("\n"); + let sys_a = match asked_on.as_deref() { + Some(d) => format!( + "The question was asked on {}. Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences.", + d + ), + None => "Answer the question using only the provided context. Be specific and concise. 
Respond in 1-3 sentences.".to_string(), + }; + let answer_a = llm + .generate(LlmRequest { + system_prompt: Some(sys_a), + user_prompt: format!("Context:\n{}\n\nQuestion: {}", ctx_a, qa.question), + max_tokens: 200, + temperature: 0.1, + label: Some("smoke_A".to_string()), + }) + .await + .unwrap_or_else(|e| format!("ERR: {e}")); + eprintln!(" A (with dates): {}", answer_a.trim()); + + // B: date-blind context + plain system prompt + let ctx_b: String = hits + .iter() + .map(|r| r.content.clone()) + .collect::>() + .join("\n"); + let sys_b = "Answer the question using only the provided context. Be specific and concise. Respond in 1-3 sentences.".to_string(); + let answer_b = llm + .generate(LlmRequest { + system_prompt: Some(sys_b), + user_prompt: format!("Context:\n{}\n\nQuestion: {}", ctx_b, qa.question), + max_tokens: 200, + temperature: 0.1, + label: Some("smoke_B".to_string()), + }) + .await + .unwrap_or_else(|e| format!("ERR: {e}")); + eprintln!(" B (no dates): {}", answer_b.trim()); + + results.push((qa.question.clone(), gt, answer_a, answer_b)); + } + + // Substring-match scoring: extract the date/year from ground truth and look for it. + fn score(answer: &str, gt: &str) -> bool { + let needle: String = gt.to_lowercase(); + let hay: String = answer.to_lowercase(); + // Match on "may", "2023", "may 2023", "7 may", etc — any non-empty token from GT + // that's a year (4 digits) or a month name should appear in the answer. 
+ let months = [ + "january", + "february", + "march", + "april", + "may", + "june", + "july", + "august", + "september", + "october", + "november", + "december", + "jan", + "feb", + "mar", + "apr", + "jun", + "jul", + "aug", + "sep", + "oct", + "nov", + "dec", + ]; + let years: Vec<&str> = needle + .split(|c: char| !c.is_ascii_digit()) + .filter(|s| s.len() == 4 && s.starts_with("20")) + .collect(); + let needs_year = !years.is_empty(); + let needs_month = months.iter().any(|m| needle.contains(m)); + let year_ok = years.iter().any(|y| hay.contains(y)); + let month_ok = months.iter().any(|m| needle.contains(m) && hay.contains(m)); + let approx_match = needle + .split_whitespace() + .any(|w| w.len() > 3 && hay.contains(w)); + match (needs_year, needs_month) { + (true, true) => year_ok && month_ok, + (true, false) => year_ok, + (false, true) => month_ok, + (false, false) => approx_match, + } + } + + let mut a_correct = 0; + let mut b_correct = 0; + eprintln!("\n========================================"); + eprintln!("Smoke result: A=date-aware, B=date-blind"); + eprintln!("========================================"); + for (i, (q, gt, a, b)) in results.iter().enumerate() { + let a_ok = score(a, gt); + let b_ok = score(b, gt); + if a_ok { + a_correct += 1; + } + if b_ok { + b_correct += 1; + } + eprintln!( + "Q{}: A={} B={} — gt: {:?}", + i + 1, + if a_ok { "✓" } else { "✗" }, + if b_ok { "✓" } else { "✗" }, + gt.chars().take(40).collect::(), + ); + eprintln!(" Q: {}", q.chars().take(80).collect::()); + } + let n = results.len().max(1); + eprintln!( + "\nA (date-aware): {}/{} = {:.0}%", + a_correct, + n, + 100.0 * a_correct as f64 / n as f64 + ); + eprintln!( + "B (date-blind): {}/{} = {:.0}%", + b_correct, + n, + 100.0 * b_correct as f64 / n as f64 + ); + eprintln!( + "Lift: {} pp", + (a_correct as i64 - b_correct as i64) * 100 / n as i64 + ); +} diff --git a/crates/origin-core/src/eval/dates.rs b/crates/origin-core/src/eval/dates.rs index 9128ca14..a674f61a 100644 
--- a/crates/origin-core/src/eval/dates.rs +++ b/crates/origin-core/src/eval/dates.rs @@ -11,7 +11,7 @@ /// Parse a LoCoMo session date like "1:56 pm on 8 May, 2023" into Unix seconds. /// Returns `None` on parse failure (caller falls back to `now()`). -pub(crate) fn parse_locomo_date(s: &str) -> Option { +pub fn parse_locomo_date(s: &str) -> Option { use chrono::{NaiveDateTime, TimeZone, Utc}; // The dataset uses " on ". chrono's strftime // %p needs uppercase AM/PM; LoCoMo uses lowercase. Normalise first. @@ -25,7 +25,7 @@ pub(crate) fn parse_locomo_date(s: &str) -> Option { /// Parse a LongMemEval `question_date` / `haystack_date` into Unix seconds. /// Format example: "2023/04/10 (Mon) 23:07". Returns `None` on parse failure /// (e.g. dataset variants with different formats -- caller falls back to `now()`). -pub(crate) fn parse_lme_date(s: &str) -> Option { +pub fn parse_lme_date(s: &str) -> Option { use chrono::{NaiveDateTime, TimeZone, Utc}; // Strip the weekday tag in parens: "2023/04/10 (Mon) 23:07" -> "2023/04/10 23:07" let cleaned: String = s @@ -55,7 +55,7 @@ pub fn format_ymd(ts: i64) -> String { /// /// Logs a warning when a non-empty date string fails to parse, so silent /// degradation to today's date is visible in eval logs. -pub(crate) fn seed_last_modified(date: Option<&str>, parser: fn(&str) -> Option) -> i64 { +pub fn seed_last_modified(date: Option<&str>, parser: fn(&str) -> Option) -> i64 { if let Some(s) = date { if let Some(ts) = parser(s) { return ts; diff --git a/crates/origin-core/src/eval/locomo.rs b/crates/origin-core/src/eval/locomo.rs index ba23a246..bbd11997 100644 --- a/crates/origin-core/src/eval/locomo.rs +++ b/crates/origin-core/src/eval/locomo.rs @@ -21,7 +21,7 @@ use std::path::Path; // Bring date helpers into scope for use within this module. use crate::eval::dates::seed_last_modified; // Re-export so external callers using `crate::eval::locomo::parse_locomo_date` still compile. 
-pub(crate) use crate::eval::dates::parse_locomo_date; +pub use crate::eval::dates::parse_locomo_date; // --------------------------------------------------------------------------- // Data structures diff --git a/crates/origin-core/src/eval/longmemeval.rs b/crates/origin-core/src/eval/longmemeval.rs index a71d7c35..3df88d43 100644 --- a/crates/origin-core/src/eval/longmemeval.rs +++ b/crates/origin-core/src/eval/longmemeval.rs @@ -33,7 +33,7 @@ use std::path::Path; // Bring date helpers into scope for use within this module. use crate::eval::dates::seed_last_modified; // Re-export so external callers using `crate::eval::longmemeval::parse_lme_date` still compile. -pub(crate) use crate::eval::dates::parse_lme_date; +pub use crate::eval::dates::parse_lme_date; // --------------------------------------------------------------------------- // Data structures (matches the JSON schema from HuggingFace) diff --git a/crates/origin-core/src/llm_provider.rs b/crates/origin-core/src/llm_provider.rs index 7b19a8be..97a4c25e 100644 --- a/crates/origin-core/src/llm_provider.rs +++ b/crates/origin-core/src/llm_provider.rs @@ -747,8 +747,14 @@ impl LlmProvider for ClaudeCliProvider { args.push(sys.clone()); } + // Scrub ANTHROPIC_API_KEY from the child's env: when present, the CLI + // routes through pay-as-you-go API instead of the user's Max OAuth, so + // a Max-plan eval would silently burn API credits (and fail with "Credit + // balance is too low" when the API key has none). The CLI provider's + // whole purpose is to use Max OAuth, so always remove the override. 
let mut child = Command::new("claude") .args(&args) + .env_remove("ANTHROPIC_API_KEY") .stdin(std::process::Stdio::piped()) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) From 8c94e882bbd070f73ec6a97e866070852476fd8d Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Tue, 28 Apr 2026 08:06:08 -0700 Subject: [PATCH 12/13] fix: decouple event_date (display) from last_modified (ranking) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug: seeding memories with their real session/event dates made search recency decay (exp(-decay_rate * age_days)) crush the score to ~0 for any content older than a few months. The retrieval diagnostic showed all 50 results returning score=0.000 for benchmark seeds dated 2023 when the test ran in 2026 — and the ranking became noise. Same bug would hit any user who imports an old email archive, conversation backfill, or scanned-document set. Root cause: last_modified was carrying two different concepts — "when did the event happen?" (for display) and "when was this row ingested?" (for staleness). Recency decay correctly penalises stale ingestion, but we were feeding it event time. Fix (Option A): introduce event_date as a separate optional field on RawDocument and SearchResult. last_modified stays anchored to ingestion / edit time; event_date carries the display time. Date-prefix rendering uses event_date.unwrap_or(last_modified). Recency decay continues to use last_modified, so old-but-just-imported content ranks fresh. - crates/origin-types: add event_date: Option<i64> to RawDocument and SearchResult. #[serde(default)] keeps origin-mcp wire-compatible. - crates/origin-core/src/db.rs: migration 44 adds memories.event_date (NULL-able, no backfill — old rows stay None and fall back to last_modified for display). All search SELECTs include c.event_date at column 29; score column moves to 30. row_to_search_result reads Option<i64>. INSERT writes event_date.
- Inline recency-decay match in search_memory replaced with crate::sources::decay_rate(&tier, &ConfidenceConfig::default()) so the canonical decay function is the single source of truth. - Eval seed sites (locomo, longmemeval, answer_quality, pipeline) now set last_modified=now() and event_date=seed_event_date(...). - Renamed seed_last_modified -> seed_event_date and changed its return type to Option<i64> — None on parse failure rather than silently falling back to "today" (with a log::warn for visibility). - Regression test: a chunk with 3-year-old event_date but fresh last_modified must score > 0.001 (pre-fix the recency multiplier alone was ~1.7e-5, crushing the final score to ~0). Verified: retrieval diagnostic on LoCoMo conv-26 (184 obs) after this change shows real scores (0.046–0.079) and the previously-missing target memories now rank within top-50: "support group" rank 50, "sunrise" rank 37, "charity race" rank 33, "camping" rank 1. Pre-fix none of these were retrievable at any K because scores were noise. Search ranking quality at top-K is a separate concern not addressed here — but is now actually addressable, since retrieval is no longer blocked by recency decay.
Co-Authored-By: Claude Opus 4.7 (1M context) --- app/tests/eval_harness.rs | 153 +++++++++++++++++- crates/origin-core/src/db.rs | 133 +++++++++++++-- crates/origin-core/src/eval/answer_quality.rs | 21 ++- crates/origin-core/src/eval/dates.rs | 56 ++++--- crates/origin-core/src/eval/locomo.rs | 14 +- crates/origin-core/src/eval/longmemeval.rs | 28 ++-- crates/origin-core/src/eval/pipeline.rs | 6 +- crates/origin-core/src/post_ingest.rs | 1 + crates/origin-server/src/memory_routes.rs | 1 + crates/origin-types/src/lib.rs | 1 + crates/origin-types/src/memory.rs | 6 + crates/origin-types/src/sources.rs | 10 +- 12 files changed, 354 insertions(+), 76 deletions(-) diff --git a/app/tests/eval_harness.rs b/app/tests/eval_harness.rs index 4ce7a5c2..83355b9d 100644 --- a/app/tests/eval_harness.rs +++ b/app/tests/eval_harness.rs @@ -2633,6 +2633,154 @@ async fn probe_overlap_gate() { } } +// --------------------------------------------------------------------------- +// Retrieval-only diagnostic: no LLM calls, no API cost. Seeds conv-26, runs +// search_memory for the 5 temporal questions, and dumps which hits came back +// alongside whether the literal answer text exists in the seeded data and +// where it ranks (or whether it ranks at all). 
+// --------------------------------------------------------------------------- +#[tokio::test] +#[ignore] +async fn temporal_retrieval_diag_locomo() { + use origin_core::events::NoopEmitter; + use origin_lib::memory_db::MemoryDB; + use origin_lib::sources::RawDocument; + use std::sync::Arc; + + let locomo_path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("eval/data/locomo10.json"); + let samples = origin_lib::eval::locomo::load_locomo(&locomo_path).expect("load"); + let sample = &samples[0]; + let memories = origin_lib::eval::locomo::extract_observations(sample); + eprintln!("[diag] {} memories seeded", memories.len()); + + let probes = [ + ("LGBTQ support group", "support group"), + ("painted a sunrise", "sunrise"), + ("charity race", "charity race"), + ("camping next month", "camping"), + ("speech at a school", "speech"), + ]; + eprintln!("\n[diag] Substring check on raw extracted content:"); + for (label, needle) in &probes { + let hits: Vec<&str> = memories + .iter() + .filter(|m| m.content.to_lowercase().contains(&needle.to_lowercase())) + .map(|m| m.content.as_str()) + .collect(); + eprintln!(" {:<22} ({} matches in extraction):", label, hits.len()); + for h in hits.iter().take(3) { + eprintln!(" - {}", h.chars().take(120).collect::()); + } + } + + let tmp = tempfile::tempdir().expect("tempdir"); + let db = MemoryDB::new(tmp.path(), Arc::new(NoopEmitter)) + .await + .expect("db"); + let docs: Vec = memories + .iter() + .enumerate() + .map(|(i, mem)| RawDocument { + content: mem.content.clone(), + source_id: format!("locomo_{}_obs_{}", sample.sample_id, i), + source: "memory".to_string(), + title: format!("{} session {}", mem.speaker, mem.session_num), + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + last_modified: chrono::Utc::now().timestamp(), + event_date: origin_lib::eval::dates::seed_event_date( + mem.session_date.as_deref(), + origin_lib::eval::locomo::parse_locomo_date, + ), + ..Default::default() + 
}) + .collect(); + db.upsert_documents(docs).await.expect("upsert"); + + let queries = [ + ( + "Q1 LGBTQ support group", + "When did Caroline go to the LGBTQ support group?", + "support group", + ), + ( + "Q2 painted sunrise", + "When did Melanie paint a sunrise?", + "sunrise", + ), + ( + "Q3 charity race", + "When did Melanie run a charity race?", + "charity race", + ), + ( + "Q4 camping plan", + "When is Melanie planning on going camping?", + "camping", + ), + ( + "Q5 school speech", + "When did Caroline give a speech at a school?", + "speech", + ), + ]; + + for (label, q, needle) in &queries { + eprintln!("\n=== {} ===", label); + eprintln!(" query: {}", q); + let hits = db + .search_memory(q, 50, None, Some("conversation"), None, None, None, None) + .await + .expect("search"); + eprintln!(" returned {} hits", hits.len()); + + let needle_l = needle.to_lowercase(); + let target_rank = hits + .iter() + .position(|h| h.content.to_lowercase().contains(&needle_l)); + match target_rank { + Some(r) => { + eprintln!(" ✓ first hit containing {:?}: rank {}", needle, r + 1); + eprintln!( + " {}", + hits[r].content.chars().take(140).collect::() + ); + } + None => { + eprintln!(" ✗ NO hit in top-50 contains {:?}", needle); + let raw_hits: Vec<&origin_lib::eval::locomo::LocomoMemory> = memories + .iter() + .filter(|m| m.content.to_lowercase().contains(&needle_l)) + .collect(); + if raw_hits.is_empty() { + eprintln!(" + raw extraction also has 0 — extraction dropped it"); + } else { + eprintln!( + " + raw extraction has {} matches — search ranking missed them:", + raw_hits.len() + ); + for m in raw_hits.iter().take(3) { + eprintln!( + " - {}", + m.content.chars().take(140).collect::() + ); + } + } + } + } + eprintln!(" top 5 returned:"); + for (i, h) in hits.iter().take(5).enumerate() { + eprintln!( + " {}. 
[score={:.3}] {}", + i + 1, + h.score, + h.content.chars().take(110).collect::() + ); + } + } +} + // --------------------------------------------------------------------------- // Temporal smoke: A/B compare date-aware vs date-blind context on 5 LoCoMo // temporal questions. No enrichment (search_memory works on vectors+FTS alone), @@ -2685,7 +2833,8 @@ async fn temporal_smoke_locomo_5q() { title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: origin_lib::eval::dates::seed_last_modified( + last_modified: chrono::Utc::now().timestamp(), + event_date: origin_lib::eval::dates::seed_event_date( mem.session_date.as_deref(), origin_lib::eval::locomo::parse_locomo_date, ), @@ -2753,7 +2902,7 @@ async fn temporal_smoke_locomo_5q() { .map(|r| { format!( "On {}: {}", - origin_lib::eval::shared::format_ymd(r.last_modified), + origin_lib::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), r.content ) }) diff --git a/crates/origin-core/src/db.rs b/crates/origin-core/src/db.rs index 2367bc59..e3f9794c 100644 --- a/crates/origin-core/src/db.rs +++ b/crates/origin-core/src/db.rs @@ -4112,6 +4112,31 @@ impl MemoryDB { } } + // Migration 44: event_date column on memories — when the event the memory + // describes actually happened, distinct from last_modified (ingestion time). + // Necessary so importing old content (email archives, conversation backfills, + // benchmark seeds) doesn't get penalised by recency decay scoring, while still + // letting the LLM see real dates in retrieved context. 
+ if version < 44 { + let chunk_cols = self.get_table_columns("memories").await?; + let conn = self.conn.lock().await; + + if !chunk_cols.contains("event_date") { + conn.execute("ALTER TABLE memories ADD COLUMN event_date INTEGER", ()) + .await + .map_err(|e| { + OriginError::VectorDb(format!("migration 44 add event_date: {e}")) + })?; + log::info!( + "[memory_db] migration 44: added memories.event_date (NULL-able, no backfill)" + ); + } + + conn.execute("PRAGMA user_version = 44", ()) + .await + .map_err(|e| OriginError::VectorDb(format!("set user_version=44: {e}")))?; + } + Ok(()) } @@ -5068,7 +5093,7 @@ impl MemoryDB { /// 16=source_agent, 17=confidence, 18=confirmed, 19=stability, 20=supersedes, /// 21=entity_id, 22=quality, 23=is_recap, 24=supersede_mode, /// 25=structured_fields, 26=retrieval_cue, 27=source_text, 28=created_at, - /// 29=score/distance/rank + /// 29=event_date, 30=score/distance/rank fn row_to_search_result(row: &libsql::Row, score: f32) -> Result { Ok(SearchResult { id: row @@ -5113,6 +5138,7 @@ impl MemoryDB { retrieval_cue: row.get::>(26).unwrap_or(None), source_text: row.get::>(27).unwrap_or(None), created_at: row.get::(28).unwrap_or(0), + event_date: row.get::>(29).unwrap_or(None), raw_score: 0.0, // Set later during normalization }) } @@ -5136,6 +5162,7 @@ impl MemoryDB { url: Option, chunk_index: i32, last_modified: i64, + event_date: Option, chunk_type: String, language: Option, byte_start: Option, @@ -5225,6 +5252,7 @@ impl MemoryDB { url: doc.url.clone(), chunk_index: i as i32, last_modified: doc.last_modified, + event_date: doc.event_date, chunk_type: chunk.chunk_type.clone(), language: chunk.language.clone(), byte_start: chunk.byte_range.map(|(s, _)| s as i64), @@ -5358,6 +5386,11 @@ impl MemoryDB { .map(|s| s.into()) .unwrap_or(libsql::Value::Null); + let event_date_val = row + .event_date + .map(libsql::Value::Integer) + .unwrap_or(libsql::Value::Null); + conn.execute( "INSERT INTO memories (id, content, source, source_id, title, 
summary, url, chunk_index, last_modified, chunk_type, language, byte_start, byte_end, @@ -5365,12 +5398,12 @@ impl MemoryDB { stability, supersedes, pending_revision, word_count, entity_id, enrichment_status, quality, is_recap, supersede_mode, structured_fields, retrieval_cue, source_text, - embedding, created_at) + embedding, created_at, event_date) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18, ?19, ?20, ?21, ?22, ?23, ?24, ?25, ?26, ?27, ?28, ?29, ?30, ?31, - vector32(?32), ?33)", + vector32(?32), ?33, ?34)", libsql::params![ row.id, row.content, @@ -5404,7 +5437,8 @@ impl MemoryDB { retrieval_cue_val, source_text_val, vec_str, - row.last_modified // created_at = last_modified at insert time + row.last_modified, // created_at = last_modified at insert time + event_date_val ], ) .await @@ -5457,6 +5491,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5469,6 +5504,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5488,7 +5524,7 @@ impl MemoryDB { match rows_result { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(29).unwrap_or(1.0); + let distance: f64 = row.get(30).unwrap_or(1.0); if let Ok(result) = Self::row_to_search_result(&row, distance as f32) { vector_results.push(result); } @@ -5515,6 +5551,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, fts.rank FROM 
memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5529,6 +5566,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5551,7 +5589,7 @@ impl MemoryDB { match fts_result { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let rank: f64 = row.get(29).unwrap_or(0.0); + let rank: f64 = row.get(30).unwrap_or(0.0); if let Ok(result) = Self::row_to_search_result(&row, rank as f32) { fts_results.push(result); } @@ -5725,6 +5763,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -5741,7 +5780,7 @@ impl MemoryDB { match conn.query(&sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(29).unwrap_or(1.0); + let distance: f64 = row.get(30).unwrap_or(1.0); if let Ok(result) = Self::row_to_search_result(&row, distance as f32) { vector_results.push(result); } @@ -5785,6 +5824,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -5809,7 +5849,7 @@ impl MemoryDB { match conn.query(&fts_sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let rank: f64 = row.get(29).unwrap_or(0.0); + let rank: f64 = row.get(30).unwrap_or(0.0); if let Ok(result) = Self::row_to_search_result(&row, rank as f32) { fts_results.push(result); } @@ -5883,15 +5923,14 @@ impl MemoryDB { .map(|mut r| { let rrf = *score_map.get(&r.id).unwrap_or(&0.0); - // Tiered retrieval: weight by confidence 
and recency decay + // Tiered retrieval: weight by confidence and recency decay. + // Decay anchored to last_modified (ingestion/edit time), NOT event_date — + // an old email imported today should rank as freshly ingested. event_date + // is for display only. let conf = r.confidence.unwrap_or(0.5); let tier = stability_tier(r.memory_type.as_deref()); - // Inline decay rates (match TuningConfig defaults) — search doesn't hold config ref - let dr = match tier { - crate::sources::StabilityTier::Protected => 0.001, - crate::sources::StabilityTier::Standard => 0.01, - crate::sources::StabilityTier::Ephemeral => 0.05, - }; + let decay_cfg = crate::tuning::ConfidenceConfig::default(); + let dr = crate::sources::decay_rate(&tier, &decay_cfg); let age_days = ((now - r.last_modified) as f64 / 86400.0).max(0.0); let recency = (-dr * age_days).exp() as f32; @@ -6480,6 +6519,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -6500,6 +6540,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, c.source_text, c.created_at, + c.event_date, vector_distance_cos(c.embedding, vector32(?1)) FROM vector_top_k('memories_vec_idx', vector32(?1), ?2) AS vt JOIN memories c ON c.rowid = vt.id @@ -6516,7 +6557,7 @@ impl MemoryDB { match conn.query(&sql, params).await { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { - let distance: f64 = row.get(29).unwrap_or(1.0); + let distance: f64 = row.get(30).unwrap_or(1.0); let score = (1.0 - distance).max(0.0) as f32; if let Ok(result) = Self::row_to_search_result(&row, score) { results.push(result); @@ -6573,6 +6614,7 @@ impl MemoryDB { c.entity_id, c.quality, c.is_recap, c.supersede_mode, c.structured_fields, c.retrieval_cue, 
c.source_text, c.created_at, + c.event_date, fts.rank FROM memories_fts fts JOIN memories c ON fts.rowid = c.rowid @@ -6600,7 +6642,7 @@ impl MemoryDB { Ok(mut rows) => { while let Ok(Some(row)) = rows.next().await { // FTS5 rank is negative BM25; negate so higher = better - let rank: f64 = row.get(29).unwrap_or(0.0); + let rank: f64 = row.get(30).unwrap_or(0.0); let score = (-rank) as f32; if let Ok(result) = Self::row_to_search_result(&row, score) { results.push(result); @@ -6857,6 +6899,7 @@ impl MemoryDB { retrieval_cue: None, source_text: None, created_at, + event_date: None, raw_score: 0.0, }); } @@ -24322,4 +24365,60 @@ pub(crate) mod tests { "created_at mismatch (upsert_documents mirrors last_modified -> created_at on INSERT)" ); } + + /// Regression: search ranking must depend on `last_modified` (recency anchor), + /// not on `event_date` (display-only). A chunk with a 3-year-old event_date + /// but a fresh last_modified (think: imported old email, benchmark seed) + /// must score meaningfully — the recency multiplier should not crush it + /// to ~0 just because the *event* is old. + /// Guards against the recency-decay-eats-old-content regression that was + /// caught when an earlier change made `last_modified` carry the event time. + #[tokio::test] + async fn test_search_ranking_uses_last_modified_not_event_date() { + let (db, _dir) = test_db().await; + let now_ts = chrono::Utc::now().timestamp(); + let three_years_ago = now_ts - 3 * 365 * 86400; + + // Single row with old event_date + fresh last_modified (mirrors + // benchmark seeds and old-archive imports). 
+ let docs = vec![crate::sources::RawDocument { + content: "Alice met Bob in Tokyo".to_string(), + source_id: "old_event_fresh_import".to_string(), + source: "memory".to_string(), + title: "test".to_string(), + last_modified: now_ts, + event_date: Some(three_years_ago), + memory_type: Some("fact".to_string()), + domain: Some("conversation".to_string()), + ..Default::default() + }]; + db.upsert_documents(docs).await.unwrap(); + + let results = db + .search_memory( + "Tokyo", + 10, + None, + Some("conversation"), + None, + None, + None, + None, + ) + .await + .unwrap(); + assert_eq!(results.len(), 1, "expected one result: {results:?}"); + let r = &results[0]; + assert_eq!(r.last_modified, now_ts); + assert_eq!(r.event_date, Some(three_years_ago)); + // Score must be meaningful. Pre-fix, recency multiplier was + // exp(-0.01 * 1095) ≈ 1.7e-5, crushing any RRF score to ~0. + // With ranking anchored to last_modified=now(), recency=1.0 and + // score should reflect actual RRF×confidence×... product (well above 1e-3). 
+ assert!( + r.score > 0.001, + "score crushed by recency decay despite fresh last_modified: {}", + r.score + ); + } } diff --git a/crates/origin-core/src/eval/answer_quality.rs b/crates/origin-core/src/eval/answer_quality.rs index 9300302d..a2ac296b 100644 --- a/crates/origin-core/src/eval/answer_quality.rs +++ b/crates/origin-core/src/eval/answer_quality.rs @@ -484,7 +484,8 @@ pub async fn run_e2e_locomo_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: crate::eval::dates::seed_last_modified( + last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( mem.session_date.as_deref(), crate::eval::dates::parse_locomo_date, ), @@ -734,7 +735,7 @@ pub(crate) async fn generate_e2e_answers_for_question( .map(|r| { format!( "On {}: {}", - crate::eval::shared::format_ymd(r.last_modified), + crate::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), r.content ) }) @@ -783,7 +784,7 @@ pub(crate) async fn generate_e2e_answers_for_question( for r in flat_results.iter() { structured_parts.push(format!( "On {}: {}", - crate::eval::shared::format_ymd(r.last_modified), + crate::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), r.content )); } @@ -876,7 +877,8 @@ pub async fn run_e2e_context_eval( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: crate::eval::dates::seed_last_modified( + last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( mem.session_date.as_deref(), crate::eval::dates::parse_locomo_date, ), @@ -1028,7 +1030,8 @@ pub async fn run_e2e_context_eval_longmemeval( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: crate::eval::dates::seed_last_modified( + last_modified: chrono::Utc::now().timestamp(), + 
event_date: crate::eval::dates::seed_event_date( mem.session_date.as_deref(), crate::eval::dates::parse_lme_date, ), @@ -1180,7 +1183,7 @@ async fn build_structured_context( for r in results.iter() { parts.push(format!( "On {}: {}", - crate::eval::shared::format_ymd(r.last_modified), + crate::eval::shared::format_ymd(r.event_date.unwrap_or(r.last_modified)), r.content )); } @@ -1275,7 +1278,8 @@ pub async fn run_fullpipeline_locomo_batch( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: crate::eval::dates::seed_last_modified( + last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( mem.session_date.as_deref(), crate::eval::locomo::parse_locomo_date, ), @@ -1530,7 +1534,8 @@ pub async fn run_fullpipeline_lme_batch( .to_string(), ), domain: Some("conversation".to_string()), - last_modified: crate::eval::dates::seed_last_modified( + last_modified: chrono::Utc::now().timestamp(), + event_date: crate::eval::dates::seed_event_date( mem.session_date.as_deref(), crate::eval::longmemeval::parse_lme_date, ), diff --git a/crates/origin-core/src/eval/dates.rs b/crates/origin-core/src/eval/dates.rs index a674f61a..0a2937af 100644 --- a/crates/origin-core/src/eval/dates.rs +++ b/crates/origin-core/src/eval/dates.rs @@ -7,7 +7,7 @@ //! - [`parse_locomo_date`] — LoCoMo session timestamps ("1:56 pm on 8 May, 2023") //! - [`parse_lme_date`] — LongMemEval session timestamps ("2023/04/10 (Mon) 23:07") //! - [`format_ymd`] — Unix-seconds → "YYYY-MM-DD" formatting -//! - [`seed_last_modified`] — resolve a benchmark chunk's `last_modified` field +//! - [`seed_event_date`] — resolve a benchmark chunk's `event_date` field /// Parse a LoCoMo session date like "1:56 pm on 8 May, 2023" into Unix seconds. /// Returns `None` on parse failure (caller falls back to `now()`). 
@@ -49,22 +49,22 @@ pub fn format_ymd(ts: i64) -> String { .unwrap_or_else(|| "unknown date".to_string()) } -/// Resolve `last_modified` for a benchmark-seeded chunk: parse the per-session -/// date string with `parser` if present, else fall back to `now()` (used for -/// noise / undated entries). +/// Resolve `event_date` for a benchmark-seeded chunk: parse the per-session +/// date string with `parser`. Returns `None` if no date is provided, or if +/// parsing fails (with a warning so silent degradation is visible in logs). /// -/// Logs a warning when a non-empty date string fails to parse, so silent -/// degradation to today's date is visible in eval logs. -pub fn seed_last_modified(date: Option<&str>, parser: fn(&str) -> Option<i64>) -> i64 { - if let Some(s) = date { - if let Some(ts) = parser(s) { - return ts; - } - log::warn!( - "[eval:dates] failed to parse date {s:?}; falling back to now() — temporal accuracy lost" - ); +/// Used at seed sites to populate `RawDocument.event_date` while +/// `last_modified` stays at `now()` — so search ranking treats benchmark +/// memories as fresh while LLM context still sees the original event date.
+pub fn seed_event_date(date: Option<&str>, parser: fn(&str) -> Option<i64>) -> Option<i64> { + let s = date?; + if let Some(ts) = parser(s) { + return Some(ts); } - chrono::Utc::now().timestamp() + log::warn!( + "[eval:dates] failed to parse date {s:?}; event_date set to None — display falls back to last_modified" + ); + None } #[cfg(test)] @@ -109,27 +109,25 @@ mod tests { assert_eq!(format_ymd(0), "1970-01-01"); } - // ── seed_last_modified ─────────────────────────────────────────────────── + // ── seed_event_date ────────────────────────────────────────────────────── #[test] - fn test_seed_last_modified_parses_when_date_present() { - let ts = seed_last_modified(Some("2023/04/10 (Mon) 23:07"), parse_lme_date); - assert_eq!(ts, 1_681_168_020); + fn test_seed_event_date_parses_when_date_present() { + assert_eq!( + seed_event_date(Some("2023/04/10 (Mon) 23:07"), parse_lme_date), + Some(1_681_168_020) + ); } #[test] - fn test_seed_last_modified_falls_back_to_now_when_date_missing() { - let before = chrono::Utc::now().timestamp(); - let ts = seed_last_modified(None, parse_lme_date); - let after = chrono::Utc::now().timestamp(); - assert!(ts >= before && ts <= after); + fn test_seed_event_date_returns_none_when_date_missing() { + assert_eq!(seed_event_date(None, parse_lme_date), None); } #[test] - fn test_seed_last_modified_falls_back_when_parser_rejects() { - let before = chrono::Utc::now().timestamp(); - let ts = seed_last_modified(Some("malformed"), parse_lme_date); - let after = chrono::Utc::now().timestamp(); - assert!(ts >= before && ts <= after); + fn test_seed_event_date_returns_none_when_parser_rejects() { + // malformed string still yields None (with a logged warning) rather than + // silently turning into "today" — that was the bug seed_last_modified had.
+ assert_eq!(seed_event_date(Some("malformed"), parse_lme_date), None); } } diff --git a/crates/origin-core/src/eval/locomo.rs b/crates/origin-core/src/eval/locomo.rs index bbd11997..60ba7e3b 100644 --- a/crates/origin-core/src/eval/locomo.rs +++ b/crates/origin-core/src/eval/locomo.rs @@ -19,7 +19,7 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; // Bring date helpers into scope for use within this module. -use crate::eval::dates::seed_last_modified; +use crate::eval::dates::seed_event_date; // Re-export so external callers using `crate::eval::locomo::parse_locomo_date` still compile. pub use crate::eval::dates::parse_locomo_date; @@ -432,7 +432,8 @@ pub async fn run_locomo_eval(path: &Path) -> Result { title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), + last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -563,7 +564,8 @@ pub async fn run_locomo_eval_reranked( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), + last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); @@ -688,7 +690,8 @@ pub async fn run_locomo_eval_expanded( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), + last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), 
..Default::default() }) .collect(); @@ -954,7 +957,8 @@ pub async fn run_locomo_eval_with_gate( title: format!("{} session {}", mem.speaker, mem.session_num), memory_type: Some("fact".to_string()), domain: Some("conversation".to_string()), - last_modified: seed_last_modified(mem.session_date.as_deref(), parse_locomo_date), + last_modified: chrono::Utc::now().timestamp(), + event_date: seed_event_date(mem.session_date.as_deref(), parse_locomo_date), ..Default::default() }) .collect(); diff --git a/crates/origin-core/src/eval/longmemeval.rs b/crates/origin-core/src/eval/longmemeval.rs index 3df88d43..476a3135 100644 --- a/crates/origin-core/src/eval/longmemeval.rs +++ b/crates/origin-core/src/eval/longmemeval.rs @@ -31,7 +31,7 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; // Bring date helpers into scope for use within this module. -use crate::eval::dates::seed_last_modified; +use crate::eval::dates::seed_event_date; // Re-export so external callers using `crate::eval::longmemeval::parse_lme_date` still compile. pub use crate::eval::dates::parse_lme_date; @@ -431,7 +431,8 @@ pub async fn run_longmemeval_eval(path: &Path) -> Result, pub score: f32, #[serde(skip_serializing_if = "Option::is_none")] pub chunk_type: Option, diff --git a/crates/origin-types/src/sources.rs b/crates/origin-types/src/sources.rs index 9fc966db..77d52ffd 100644 --- a/crates/origin-types/src/sources.rs +++ b/crates/origin-types/src/sources.rs @@ -108,8 +108,15 @@ pub struct RawDocument { pub content: String, /// Deep link back to the source (URL, file path) pub url: Option, - /// Unix timestamp of last modification + /// Unix timestamp of last modification (ingestion/edit time — used for recency ranking). pub last_modified: i64, + /// Unix timestamp of when the event the document describes actually happened. 
+ /// Distinct from `last_modified` (ingestion time): a benchmark seed has session_date here + /// and now() in last_modified; an imported old email has the email Date header here and + /// the import time in last_modified. Used for date-aware display in retrieved context. + /// `None` = unknown event time; consumers should fall back to `last_modified`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub event_date: Option<i64>, /// Additional metadata pub metadata: HashMap, @@ -183,6 +190,7 @@ impl Default for RawDocument { content: String::new(), url: None, last_modified: 0, + event_date: None, metadata: HashMap::new(), memory_type: None, domain: None, From fd4c9ed7355a67c7cb6e0dd7758681c1eb871d64 Mon Sep 17 00:00:00 2001 From: 7xuanlu Date: Tue, 28 Apr 2026 08:12:36 -0700 Subject: [PATCH 13/13] fix: add event_date to ingest_batcher test fixture Pre-push caught a missed RawDocument literal in origin-server's batcher tests after the event_date field was added in 8c94e882. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/origin-server/src/ingest_batcher.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/origin-server/src/ingest_batcher.rs b/crates/origin-server/src/ingest_batcher.rs index 89b53335..b5b287eb 100644 --- a/crates/origin-server/src/ingest_batcher.rs +++ b/crates/origin-server/src/ingest_batcher.rs @@ -246,6 +246,7 @@ mod tests { content: content.into(), url: None, last_modified: 0, + event_date: None, metadata: HashMap::new(), memory_type: Some("fact".into()), domain: None,