From bc5f982bb5464ac881dd4f15da2df73bc886fa52 Mon Sep 17 00:00:00 2001
From: "Andrei G." <k05h31@gmail.com>
Date: Fri, 20 Mar 2026 15:59:09 +0100
Subject: [PATCH] test(memory): add Ollama integration stubs for semantic cache
 (#2035)

Three #[ignore] integration tests in crates/zeph-memory/tests/ollama_integration.rs
verify ResponseCache put_with_embedding/get_semantic roundtrip, semantic rephrase hits
at threshold 0.80, and threshold boundary separation using live qwen3-embedding.
---
 .../zeph-memory/tests/ollama_integration.rs   | 163 ++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 crates/zeph-memory/tests/ollama_integration.rs
diff --git a/crates/zeph-memory/tests/ollama_integration.rs b/crates/zeph-memory/tests/ollama_integration.rs
new file mode 100644
index 00000000..94f589c1
--- /dev/null
+++ b/crates/zeph-memory/tests/ollama_integration.rs
@@ -0,0 +1,163 @@
+// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
+// SPDX-License-Identifier: MIT OR Apache-2.0
+
+use zeph_llm::ollama::OllamaProvider;
+use zeph_llm::provider::LlmProvider;
+use zeph_memory::response_cache::ResponseCache;
+use zeph_memory::sqlite::SqliteStore;
+
+const OLLAMA_BASE_URL: &str = "http://localhost:11434";
+const CHAT_MODEL: &str = "qwen3:8b";
+const EMBEDDING_MODEL: &str = "qwen3-embedding";
+
+// Cold Ollama model starts can take 30+ seconds on first embed call.
+async fn setup_cache_with_ollama() -> (ResponseCache, OllamaProvider) {
+    let store = SqliteStore::new(":memory:")
+        .await
+        .expect("in-memory SQLite must open");
+    let pool = store.pool().clone();
+    let cache = ResponseCache::new(pool, 3600);
+    let provider = OllamaProvider::new(OLLAMA_BASE_URL, CHAT_MODEL.into(), EMBEDDING_MODEL.into());
+    (cache, provider)
+}
+
+#[tokio::test]
+#[ignore = "requires running Ollama instance with qwen3-embedding model"]
+async fn with_ollama_embedding() {
+    // Roundtrip: embed a query, store via put_with_embedding, retrieve with the same
+    // embedding. Identical vectors must yield score ~1.0.
+    let (cache, provider) = setup_cache_with_ollama().await;
+
+    let query = "What is the Rust programming language?";
+    let embedding = provider
+        .embed(query)
+        .await
+        .expect("Ollama embed must succeed");
+
+    assert!(!embedding.is_empty(), "embedding must not be empty");
+    assert!(
+        embedding.len() > 100,
+        "embedding must have more than 100 dimensions"
+    );
+    assert!(
+        embedding.iter().all(|v| v.is_finite()),
+        "all embedding values must be finite"
+    );
+
+    cache
+        .put_with_embedding(
+            "k1",
+            "Rust is a systems programming language",
+            CHAT_MODEL,
+            &embedding,
+            EMBEDDING_MODEL,
+        )
+        .await
+        .expect("put_with_embedding must succeed");
+
+    let result = cache
+        .get_semantic(&embedding, EMBEDDING_MODEL, 0.95, 10)
+        .await
+        .expect("get_semantic must succeed");
+
+    let (response, score) = result.expect("identical embedding must produce a cache hit");
+    assert_eq!(response, "Rust is a systems programming language");
+    assert!(
+        (score - 1.0_f32).abs() < 1e-5,
+        "identical embedding must yield score ~1.0, got {score}"
+    );
+}
+
+#[tokio::test]
+#[ignore = "requires running Ollama instance with qwen3-embedding model"]
+async fn hit_on_rephrase() {
+    // Threshold 0.80 provides margin for embedding model version variance.
+    // Semantically equivalent rephrases typically score 0.85–0.98 with qwen3-embedding.
+    let (cache, provider) = setup_cache_with_ollama().await;
+
+    let original = "What is the capital of France?";
+    let embedding_original = provider
+        .embed(original)
+        .await
+        .expect("Ollama embed must succeed for original query");
+
+    cache
+        .put_with_embedding(
+            "k1",
+            "Paris is the capital of France",
+            CHAT_MODEL,
+            &embedding_original,
+            EMBEDDING_MODEL,
+        )
+        .await
+        .expect("put_with_embedding must succeed");
+
+    let rephrase = "Tell me the capital city of France";
+    let embedding_rephrase = provider
+        .embed(rephrase)
+        .await
+        .expect("Ollama embed must succeed for rephrased query");
+
+    let result = cache
+        .get_semantic(&embedding_rephrase, EMBEDDING_MODEL, 0.80, 10)
+        .await
+        .expect("get_semantic must succeed");
+
+    let (_response, score) = result.expect("rephrase must hit semantic cache at threshold 0.80");
+    assert!(
+        score > 0.80,
+        "rephrase similarity must exceed 0.80, got {score}"
+    );
+}
+
+#[tokio::test]
+#[ignore = "requires running Ollama instance with qwen3-embedding model"]
+async fn threshold_boundary() {
+    // Verify that threshold correctly separates hits from misses.
+    // An unrelated query (Rust ownership vs pasta recipes) must miss at 0.95
+    // but hit at 0.0 since cosine similarity >= 0.0 for any stored entry.
+    let (cache, provider) = setup_cache_with_ollama().await;
+
+    let tech_query = "Explain Rust ownership and borrowing";
+    let embedding_tech = provider
+        .embed(tech_query)
+        .await
+        .expect("Ollama embed must succeed for tech query");
+
+    cache
+        .put_with_embedding(
+            "k1",
+            "Rust ownership ensures memory safety without GC",
+            CHAT_MODEL,
+            &embedding_tech,
+            EMBEDDING_MODEL,
+        )
+        .await
+        .expect("put_with_embedding must succeed");
+
+    let unrelated = "Best Italian pasta recipes for beginners";
+    let embedding_unrelated = provider
+        .embed(unrelated)
+        .await
+        .expect("Ollama embed must succeed for unrelated query");
+
+    let miss = cache
+        .get_semantic(&embedding_unrelated, EMBEDDING_MODEL, 0.95, 10)
+        .await
+        .expect("get_semantic must succeed");
+    assert!(
+        miss.is_none(),
+        "unrelated query must not hit cache at threshold 0.95"
+    );
+
+    // Threshold 0.0 guarantees a hit: cosine similarity for any real embedding pair
+    // stored in the cache is >= 0.0 for typical non-adversarial inputs.
+    let hit = cache
+        .get_semantic(&embedding_unrelated, EMBEDDING_MODEL, 0.0, 10)
+        .await
+        .expect("get_semantic must succeed");
+    assert!(
+        hit.is_some(),
+        "any non-negative similarity must pass threshold 0.0"
+    );
+}