From 36fbd9aa8e55d1d5de8f017bd758e3a731853e3c Mon Sep 17 00:00:00 2001 From: Abdulkadirklc <104304986+Abdulkadirklc@users.noreply.github.com> Date: Tue, 31 Mar 2026 22:15:49 +0300 Subject: [PATCH 1/5] feat(recall): add proof_count boost to combined scoring Observations with more supporting evidence now rank slightly higher in recall results. proof_count is threaded through the retrieval pipeline and applied as a multiplicative boost in reranking: - types.py: add proof_count field to RetrievalResult - retrieval.py: include proof_count in SELECT columns - reranking.py: add log1p-normalized proof_count boost (alpha=0.1) The boost uses the same multiplicative pattern as recency and temporal signals. proof_count=1 is neutral, proof_count=50 gives ~+5% boost. Non-observation fact types are unaffected (neutral 0.5). --- .../hindsight_api/engine/search/reranking.py | 39 +++++++++++++++---- .../hindsight_api/engine/search/retrieval.py | 2 +- .../hindsight_api/engine/search/types.py | 2 + 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/search/reranking.py b/hindsight-api-slim/hindsight_api/engine/search/reranking.py index 7d0c37cfe..0937e5702 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/reranking.py +++ b/hindsight-api-slim/hindsight_api/engine/search/reranking.py @@ -2,6 +2,7 @@ Cross-encoder neural reranking for search results. """ +import math from datetime import datetime, timezone from .types import MergedCandidate, ScoredResult @@ -13,6 +14,7 @@ # so the max combined boost is (1 + alpha/2)^2 ≈ +21% and min is (1 - alpha/2)^2 ≈ -19%. _RECENCY_ALPHA: float = 0.2 _TEMPORAL_ALPHA: float = 0.2 +_PROOF_COUNT_ALPHA: float = 0.1 # Conservative: max ±5% for evidence strength def apply_combined_scoring( @@ -20,28 +22,38 @@ def apply_combined_scoring( now: datetime, recency_alpha: float = _RECENCY_ALPHA, temporal_alpha: float = _TEMPORAL_ALPHA, + proof_count_alpha: float = _PROOF_COUNT_ALPHA, ) -> None: """Apply combined scoring to a list of ScoredResults in-place. - Uses the cross-encoder score as the primary relevance signal, with recency - and temporal proximity applied as multiplicative boosts. This ensures the - influence of these secondary signals is always proportional to the base - relevance score, regardless of the cross-encoder model's score calibration. + Uses the cross-encoder score as the primary relevance signal, with recency, + temporal proximity, and proof count applied as multiplicative boosts. This + ensures the influence of these secondary signals is always proportional to + the base relevance score, regardless of the cross-encoder model's score + calibration. Formula:: - recency_boost = 1 + recency_alpha * (recency - 0.5) # in [1-α/2, 1+α/2] - temporal_boost = 1 + temporal_alpha * (temporal - 0.5) # in [1-α/2, 1+α/2] - combined_score = cross_encoder_score_normalized * recency_boost * temporal_boost + recency_boost = 1 + recency_alpha * (recency - 0.5) # in [1-α/2, 1+α/2] + temporal_boost = 1 + temporal_alpha * (temporal - 0.5) # in [1-α/2, 1+α/2] + proof_count_boost = 1 + proof_count_alpha * (proof_norm - 0.5) # in [1-α/2, 1+α/2] + combined_score = CE_normalized * recency_boost * temporal_boost * proof_count_boost + + proof_norm maps proof_count to [0, 1] using log1p normalization: + proof_count=1 → 0.5 (neutral), proof_count=5 → 0.72, proof_count=50 → 0.93 Temporal proximity is treated as neutral (0.5) when not set by temporal retrieval, so temporal_boost collapses to 1.0 for non-temporal queries. + Proof count is treated as neutral (0.5) when not available (non-observation facts), + so proof_count_boost collapses to 1.0 for world/experience/opinion facts. + Args: scored_results: Results from the cross-encoder reranker. Mutated in place. now: Current UTC datetime for recency calculation. recency_alpha: Max relative recency adjustment (default 0.2 → ±10%). temporal_alpha: Max relative temporal adjustment (default 0.2 → ±10%). + proof_count_alpha: Max relative proof count adjustment (default 0.1 → ±5%). """ if now.tzinfo is None: now = now.replace(tzinfo=UTC) @@ -59,13 +71,24 @@ def apply_combined_scoring( # Temporal proximity: meaningful only for temporal queries; neutral otherwise. sr.temporal = sr.retrieval.temporal_proximity if sr.retrieval.temporal_proximity is not None else 0.5 + # Proof count: log-normalized evidence strength; neutral for non-observations. + # log1p(1)/log1p(100) ≈ 0.15 → maps to neutral 0.5 after centering. + # log1p(50)/log1p(100) ≈ 0.85 → strong boost. + proof_count = sr.retrieval.proof_count + if proof_count is not None and proof_count >= 1: + proof_norm = math.log1p(proof_count) / math.log1p(100) # → [0, 1] + proof_norm = min(proof_norm, 1.0) + else: + proof_norm = 0.5 # Neutral + # RRF: kept at 0.0 for trace continuity but excluded from scoring. # RRF is batch-relative (min-max normalised) and redundant after reranking. sr.rrf_normalized = 0.0 recency_boost = 1.0 + recency_alpha * (sr.recency - 0.5) temporal_boost = 1.0 + temporal_alpha * (sr.temporal - 0.5) - sr.combined_score = sr.cross_encoder_score_normalized * recency_boost * temporal_boost + proof_count_boost = 1.0 + proof_count_alpha * (proof_norm - 0.5) + sr.combined_score = sr.cross_encoder_score_normalized * recency_boost * temporal_boost * proof_count_boost sr.weight = sr.combined_score diff --git a/hindsight-api-slim/hindsight_api/engine/search/retrieval.py b/hindsight-api-slim/hindsight_api/engine/search/retrieval.py index bca52afdf..2e03eb0e6 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/retrieval.py +++ b/hindsight-api-slim/hindsight_api/engine/search/retrieval.py @@ -148,7 +148,7 @@ async def retrieve_semantic_bm25_combined( cols = ( "id, text, context, event_date, occurred_start, occurred_end, mentioned_at, " - "fact_type, document_id, chunk_id, tags, metadata" + "fact_type, document_id, chunk_id, tags, metadata, proof_count" ) table = fq_table("memory_units") diff --git a/hindsight-api-slim/hindsight_api/engine/search/types.py b/hindsight-api-slim/hindsight_api/engine/search/types.py index 90edcf0ee..6a63f5c5f 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/types.py +++ b/hindsight-api-slim/hindsight_api/engine/search/types.py @@ -48,6 +48,7 @@ class RetrievalResult: chunk_id: str | None = None tags: list[str] | None = None # Visibility scope tags metadata: dict[str, str] | None = None # User-provided metadata + proof_count: int | None = None # Number of supporting memories (observations only) # Retrieval-specific scores (only one will be set depending on retrieval method) similarity: float | None = None # Semantic retrieval @@ -72,6 +73,7 @@ def from_db_row(cls, row: dict[str, Any]) -> "RetrievalResult": chunk_id=row.get("chunk_id"), tags=row.get("tags"), metadata=row.get("metadata"), + proof_count=row.get("proof_count"), similarity=row.get("similarity"), bm25_score=row.get("bm25_score"), activation=row.get("activation"), From 24e7e6a1fb22b013b0850cf78cd200ddf4335f07 Mon Sep 17 00:00:00 2001 From: Abdulkadirklc <104304986+Abdulkadirklc@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:58:37 +0300 Subject: [PATCH 2/5] fix(retrieval): Apply proof_count boost to graph and temporal retrieval, normalize scaling --- .../engine/search/graph_retrieval.py | 4 +- .../hindsight_api/engine/search/reranking.py | 17 ++-- .../hindsight_api/engine/search/retrieval.py | 4 +- .../tests/test_reranking_proof_count.py | 94 +++++++++++++++++++ 4 files changed, 108 insertions(+), 11 deletions(-) create mode 100644 hindsight-api-slim/tests/test_reranking_proof_count.py diff --git a/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py b/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py index 7cdde9290..e5cf27d8a 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py +++ b/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py @@ -175,7 +175,7 @@ async def _retrieve_with_conn( entry_points = await conn.fetch( f""" SELECT id, text, context, event_date, occurred_start, occurred_end, - mentioned_at, fact_type, document_id, chunk_id, tags, + mentioned_at, fact_type, document_id, chunk_id, tags, proof_count, 1 - (embedding <=> $1::vector) AS similarity FROM {fq_table("memory_units")} WHERE bank_id = $2 @@ -230,7 +230,7 @@ async def _retrieve_with_conn( neighbors = await conn.fetch( f""" SELECT mu.id, mu.text, mu.context, mu.occurred_start, mu.occurred_end, - mu.mentioned_at, mu.fact_type, + mu.mentioned_at, mu.fact_type, mu.proof_count, mu.document_id, mu.chunk_id, mu.tags, mu.metadata, ml.weight, ml.link_type, ml.from_unit_id FROM {fq_table("memory_links")} ml diff --git a/hindsight-api-slim/hindsight_api/engine/search/reranking.py b/hindsight-api-slim/hindsight_api/engine/search/reranking.py index 0937e5702..e684ef8ed 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/reranking.py +++ b/hindsight-api-slim/hindsight_api/engine/search/reranking.py @@ -39,8 +39,10 @@ def apply_combined_scoring( proof_count_boost = 1 + proof_count_alpha * (proof_norm - 0.5) # in [1-α/2, 1+α/2] combined_score = CE_normalized * recency_boost * temporal_boost * proof_count_boost - proof_norm maps proof_count to [0, 1] using log1p normalization: - proof_count=1 → 0.5 (neutral), proof_count=5 → 0.72, proof_count=50 → 0.93 + proof_norm maps proof_count using natural log1p without an arbitrary cap: + proof_count=1 → base log1p(1)=0.693 (difference=0.0 → neutral multiplier) + proof_count=10 → log1p(10)=2.39 (gives boost) + proof_count=100 → log1p(100)=4.61 (strong boost) Temporal proximity is treated as neutral (0.5) when not set by temporal retrieval, so temporal_boost collapses to 1.0 for non-temporal queries. @@ -72,14 +74,15 @@ def apply_combined_scoring( sr.temporal = sr.retrieval.temporal_proximity if sr.retrieval.temporal_proximity is not None else 0.5 # Proof count: log-normalized evidence strength; neutral for non-observations. - # log1p(1)/log1p(100) ≈ 0.15 → maps to neutral 0.5 after centering. - # log1p(50)/log1p(100) ≈ 0.85 → strong boost. proof_count = sr.retrieval.proof_count if proof_count is not None and proof_count >= 1: - proof_norm = math.log1p(proof_count) / math.log1p(100) # → [0, 1] - proof_norm = min(proof_norm, 1.0) + # We don't cap this at an arbitrary number like 100 anymore. + # log1p naturally dampens large numbers: + # count=1 -> 0.69, count=50 -> 3.93, count=500 -> 6.21 + proof_norm = math.log1p(proof_count) else: - proof_norm = 0.5 # Neutral + # Neutral baseline for log1p is log1p(1) = 0.693 + proof_norm = math.log1p(1) # RRF: kept at 0.0 for trace continuity but excluded from scoring. # RRF is batch-relative (min-max normalised) and redundant after reranking. diff --git a/hindsight-api-slim/hindsight_api/engine/search/retrieval.py b/hindsight-api-slim/hindsight_api/engine/search/retrieval.py index 2e03eb0e6..0538b58dd 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/retrieval.py +++ b/hindsight-api-slim/hindsight_api/engine/search/retrieval.py @@ -343,7 +343,7 @@ async def retrieve_temporal_combined( {groups_clause} ), sim_ranked AS ( - SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, mu.metadata, + SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, mu.fact_type, mu.proof_count, mu.document_id, mu.chunk_id, mu.tags, mu.metadata, 1 - (mu.embedding <=> $1::vector) AS similarity, ROW_NUMBER() OVER (PARTITION BY mu.fact_type ORDER BY mu.embedding <=> $1::vector) AS sim_rn FROM date_ranked dr @@ -351,7 +351,7 @@ async def retrieve_temporal_combined( WHERE dr.rn <= 50 AND (1 - (mu.embedding <=> $1::vector)) >= $6 ) - SELECT id, text, context, event_date, occurred_start, occurred_end, mentioned_at, fact_type, document_id, chunk_id, tags, metadata, similarity + SELECT id, text, context, event_date, occurred_start, occurred_end, mentioned_at, fact_type, proof_count, document_id, chunk_id, tags, metadata, similarity FROM sim_ranked WHERE sim_rn <= 10 """, diff --git a/hindsight-api-slim/tests/test_reranking_proof_count.py b/hindsight-api-slim/tests/test_reranking_proof_count.py new file mode 100644 index 000000000..2f76a29c6 --- /dev/null +++ b/hindsight-api-slim/tests/test_reranking_proof_count.py @@ -0,0 +1,94 @@ +""" +Unit tests for proof_count boost in reranking. +""" + +from datetime import datetime, timezone +import pytest +from uuid import uuid4 + +from hindsight_api.engine.search.types import RetrievalResult, MergedCandidate, ScoredResult +from hindsight_api.engine.search.reranking import apply_combined_scoring + +UTC = timezone.utc + +def create_mock_scored_result(proof_count: int | None = None, ce_score: float = 0.8) -> ScoredResult: + """Helper to create a minimal ScoredResult suitable for scoring tests.""" + retrieval = RetrievalResult( + id=uuid4(), + text="Test mock fact", + fact_type="observation" if proof_count is not None else "world", + document_id=uuid4(), + chunk_id=uuid4(), + embedding=[0.1]*384, + similarity=0.9, + proof_count=proof_count, + # Default neutral dates for testing so only proof_count changes score + occurred_start=datetime.now(UTC), + occurred_end=datetime.now(UTC) + ) + candidate = MergedCandidate( + id=retrieval.id, + retrieval=retrieval, + semantic_rank=1, + bm25_rank=1, + rrf_score=0.1 + ) + return ScoredResult( + candidate=candidate, + cross_encoder_score=ce_score, + cross_encoder_score_normalized=ce_score, + weight=ce_score, + ) + +def test_proof_count_neutral_when_none(): + """Test that when proof_count is None (e.g. non-observation), it gets neutral 0.5 norm.""" + sr = create_mock_scored_result(proof_count=None, ce_score=0.8) + now = datetime.now(UTC) + + apply_combined_scoring([sr], now, proof_count_alpha=0.1) + + # Neutral multiplier means score shouldn't be boosted by proof_count + # Since recency is neutral (just created) and temporal is neutral, score should remain unchanged + assert sr.combined_score == pytest.approx(0.8, rel=1e-3) + +def test_proof_count_neutral_at_one(): + """Test that proof_count=1 gives neutral multiplier.""" + sr = create_mock_scored_result(proof_count=1, ce_score=0.8) + now = datetime.now(UTC) + + apply_combined_scoring([sr], now, proof_count_alpha=0.1) + + # proof_count=1 -> log1p(1) = 0.693, base log1p(1) = 0.693 -> difference 0.0 -> multipler 1.0 + assert sr.combined_score == pytest.approx(0.8, rel=1e-3) + +def test_proof_count_increases_with_higher_counts(): + """Test that higher proof counts yield strictly higher scores.""" + now = datetime.now(UTC) + + # Create results with increasing proof counts + sr_5 = create_mock_scored_result(proof_count=5, ce_score=0.8) + sr_50 = create_mock_scored_result(proof_count=50, ce_score=0.8) + sr_100 = create_mock_scored_result(proof_count=100, ce_score=0.8) + + # Process them + apply_combined_scoring([sr_5, sr_50, sr_100], now, proof_count_alpha=0.1) + + # Assure scores strictly increase + assert sr_5.combined_score > 0.8 + assert sr_50.combined_score > sr_5.combined_score + assert sr_100.combined_score > sr_50.combined_score + +def test_proof_count_no_hardcoded_cap_at_100(): + """Test that observations with counts > 100 continue to scale up (no log1p(100) cap).""" + now = datetime.now(UTC) + + # If capped at 100, these would both get identical scores + sr_100 = create_mock_scored_result(proof_count=100, ce_score=0.8) + sr_500 = create_mock_scored_result(proof_count=500, ce_score=0.8) + sr_1000 = create_mock_scored_result(proof_count=1000, ce_score=0.8) + + apply_combined_scoring([sr_100, sr_500, sr_1000], now, proof_count_alpha=0.1) + + # Must strictly increase, not plateau + assert sr_500.combined_score > sr_100.combined_score + assert sr_1000.combined_score > sr_500.combined_score From ba4021e5284a754c89e09f16b63b993f48751b47 Mon Sep 17 00:00:00 2001 From: Abdulkadirklc <104304986+Abdulkadirklc@users.noreply.github.com> Date: Wed, 1 Apr 2026 13:11:01 +0300 Subject: [PATCH 3/5] fix(retrieval): correct proof_norm math to zero-center at count 1 --- .../hindsight_api/engine/search/reranking.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/search/reranking.py b/hindsight-api-slim/hindsight_api/engine/search/reranking.py index e684ef8ed..f4d83c238 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/reranking.py +++ b/hindsight-api-slim/hindsight_api/engine/search/reranking.py @@ -39,10 +39,10 @@ def apply_combined_scoring( proof_count_boost = 1 + proof_count_alpha * (proof_norm - 0.5) # in [1-α/2, 1+α/2] combined_score = CE_normalized * recency_boost * temporal_boost * proof_count_boost - proof_norm maps proof_count using natural log1p without an arbitrary cap: - proof_count=1 → base log1p(1)=0.693 (difference=0.0 → neutral multiplier) - proof_count=10 → log1p(10)=2.39 (gives boost) - proof_count=100 → log1p(100)=4.61 (strong boost) + proof_norm maps proof_count using a smooth logarithmic curve centered at 0.5: + proof_count=1 → 0.5 + 0 = 0.5 (neutral multiplier) + proof_count=150 → 0.5 + 0.5 = 1.0 (strong boost) + proof_count=22000 → 0.5 + 1.0 = 1.5 (very strong boost) Temporal proximity is treated as neutral (0.5) when not set by temporal retrieval, so temporal_boost collapses to 1.0 for non-temporal queries. @@ -76,13 +76,15 @@ def apply_combined_scoring( # Proof count: log-normalized evidence strength; neutral for non-observations. proof_count = sr.retrieval.proof_count if proof_count is not None and proof_count >= 1: - # We don't cap this at an arbitrary number like 100 anymore. - # log1p naturally dampens large numbers: - # count=1 -> 0.69, count=50 -> 3.93, count=500 -> 6.21 - proof_norm = math.log1p(proof_count) + # We don't cap this at an arbitrary number. + # We scale naturally so that: + # count=1 -> 0.5 (neutral baseline) + # count=150 -> 1.0 (+5% boost) + # count=22000 -> 1.5 (+10% boost) + proof_norm = 0.5 + (math.log(proof_count) / 10.0) else: - # Neutral baseline for log1p is log1p(1) = 0.693 - proof_norm = math.log1p(1) + # Neutral baseline is precisely 0.5, ensuring neutral multiplier (1.0) + proof_norm = 0.5 # RRF: kept at 0.0 for trace continuity but excluded from scoring. # RRF is batch-relative (min-max normalised) and redundant after reranking. From 3aac487b91c91efa29e116376fa644f7100e7a7a Mon Sep 17 00:00:00 2001 From: Abdulkadirklc <104304986+Abdulkadirklc@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:14:26 +0300 Subject: [PATCH 4/5] fix(retrieval): Apply proof_count boost to link_expansion retrieval --- .../engine/search/link_expansion_retrieval.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/search/link_expansion_retrieval.py b/hindsight-api-slim/hindsight_api/engine/search/link_expansion_retrieval.py index 3b3f434ad..32e81c8e2 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/link_expansion_retrieval.py +++ b/hindsight-api-slim/hindsight_api/engine/search/link_expansion_retrieval.py @@ -59,7 +59,7 @@ async def _find_semantic_seeds( rows = await conn.fetch( f""" SELECT id, text, context, event_date, occurred_start, occurred_end, - mentioned_at, fact_type, document_id, chunk_id, tags, + mentioned_at, fact_type, document_id, chunk_id, tags, proof_count, 1 - (embedding <=> $1::vector) AS similarity FROM {fq_table("memory_units")} WHERE bank_id = $2 @@ -274,7 +274,7 @@ async def _expand_combined( -- Score = COUNT(DISTINCT shared entities), mapped to [0,1] via tanh. SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, - mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, + mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, mu.proof_count, COUNT(DISTINCT ue_seed.entity_id)::float AS score, 'entity'::text AS source FROM {ue} ue_seed @@ -298,14 +298,14 @@ async def _expand_combined( SELECT id, text, context, event_date, occurred_start, occurred_end, mentioned_at, - fact_type, document_id, chunk_id, tags, + fact_type, document_id, chunk_id, tags, proof_count, MAX(weight) AS score, 'semantic'::text AS source FROM ( SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, - mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, + mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, mu.proof_count, ml.weight FROM {ml} ml JOIN {mu} mu ON mu.id = ml.to_unit_id @@ -317,7 +317,7 @@ async def _expand_combined( SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, - mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, + mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, mu.proof_count, ml.weight FROM {ml} ml JOIN {mu} mu ON mu.id = ml.from_unit_id @@ -328,7 +328,7 @@ async def _expand_combined( ) sem_raw GROUP BY id, text, context, event_date, occurred_start, occurred_end, mentioned_at, - fact_type, document_id, chunk_id, tags + fact_type, document_id, chunk_id, tags, proof_count ORDER BY score DESC LIMIT $3 ), @@ -339,7 +339,7 @@ async def _expand_combined( SELECT DISTINCT ON (mu.id) mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, - mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, + mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, mu.proof_count, ml.weight AS score, 'causal'::text AS source FROM {ml} ml @@ -429,7 +429,7 @@ async def _expand_observations( SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, - mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, + mu.fact_type, mu.document_id, mu.chunk_id, mu.tags, mu.proof_count, (SELECT COUNT(DISTINCT s) FROM unnest(mu.source_memory_ids) s WHERE s = ANY(ca.source_ids))::float AS score FROM {fq_table("memory_units")} mu, connected_array ca WHERE mu.fact_type = 'observation' @@ -453,13 +453,13 @@ async def _expand_observations( SELECT id, text, context, event_date, occurred_start, occurred_end, mentioned_at, - fact_type, document_id, chunk_id, tags, + fact_type, document_id, chunk_id, tags, proof_count, MAX(weight) AS score, 'semantic'::text AS source FROM ( SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, mu.fact_type, mu.document_id, - mu.chunk_id, mu.tags, ml.weight + mu.chunk_id, mu.tags, mu.proof_count, ml.weight FROM {ml} ml JOIN {mu} mu ON mu.id = ml.to_unit_id WHERE ml.from_unit_id = ANY($1::uuid[]) AND ml.link_type = 'semantic' AND mu.fact_type = 'observation' @@ -467,21 +467,21 @@ async def _expand_observations( UNION ALL SELECT mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, mu.fact_type, mu.document_id, - mu.chunk_id, mu.tags, ml.weight + mu.chunk_id, mu.tags, mu.proof_count, ml.weight FROM {ml} ml JOIN {mu} mu ON mu.id = ml.from_unit_id WHERE ml.to_unit_id = ANY($1::uuid[]) AND ml.link_type = 'semantic' AND mu.fact_type = 'observation' AND mu.id != ALL($1::uuid[]) ) sem_raw GROUP BY id, text, context, event_date, occurred_start, occurred_end, - mentioned_at, fact_type, document_id, chunk_id, tags + mentioned_at, fact_type, document_id, chunk_id, tags, proof_count ORDER BY score DESC LIMIT $2 ), causal_expanded AS ( SELECT DISTINCT ON (mu.id) mu.id, mu.text, mu.context, mu.event_date, mu.occurred_start, mu.occurred_end, mu.mentioned_at, mu.fact_type, mu.document_id, - mu.chunk_id, mu.tags, ml.weight AS score, 'causal'::text AS source + mu.chunk_id, mu.tags, mu.proof_count, ml.weight AS score, 'causal'::text AS source FROM {ml} ml JOIN {mu} mu ON ml.to_unit_id = mu.id WHERE ml.from_unit_id = ANY($1::uuid[]) AND ml.link_type IN ('causes', 'caused_by', 'enables', 'prevents') From ad425c2879a483dae85712575e65077cecc20eba Mon Sep 17 00:00:00 2001 From: Abdulkadirklc <104304986+Abdulkadirklc@users.noreply.github.com> Date: Thu, 2 Apr 2026 19:44:44 +0300 Subject: [PATCH 5/5] fix: remove BFS zombie, clamp proof_norm to [0,1], fix test comment (log1p->math.log) --- .../engine/search/graph_retrieval.py | 211 ------------------ .../hindsight_api/engine/search/reranking.py | 14 +- .../tests/test_reranking_proof_count.py | 20 +- 3 files changed, 16 insertions(+), 229 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py b/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py index c5af33ecc..377b83cb3 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py +++ b/hindsight-api-slim/hindsight_api/engine/search/graph_retrieval.py @@ -65,214 +65,3 @@ async def retrieve( Tuple of (List of RetrievalResult with activation scores, optional timing info) """ pass -class BFSGraphRetriever(GraphRetriever): - """ - Graph retrieval using BFS-style spreading activation. - - Starting from semantic entry points, spreads activation through - the memory graph (entity, temporal, causal links) using breadth-first - traversal with decaying activation. - - This is the original Hindsight graph retrieval algorithm. - """ - - def __init__( - self, - entry_point_limit: int = 5, - entry_point_threshold: float = 0.5, - activation_decay: float = 0.8, - min_activation: float = 0.1, - batch_size: int = 20, - ): - """ - Initialize BFS graph retriever. - - Args: - entry_point_limit: Maximum number of entry points to start from - entry_point_threshold: Minimum semantic similarity for entry points - activation_decay: Decay factor per hop (activation *= decay) - min_activation: Minimum activation to continue spreading - batch_size: Number of nodes to process per batch (for neighbor fetching) - """ - self.entry_point_limit = entry_point_limit - self.entry_point_threshold = entry_point_threshold - self.activation_decay = activation_decay - self.min_activation = min_activation - self.batch_size = batch_size - - @property - def name(self) -> str: - return "bfs" - - async def retrieve( - self, - pool, - query_embedding_str: str, - bank_id: str, - fact_type: str, - budget: int, - query_text: str | None = None, - semantic_seeds: list[RetrievalResult] | None = None, - temporal_seeds: list[RetrievalResult] | None = None, - adjacency=None, # Not used by BFS - tags: list[str] | None = None, - tags_match: TagsMatch = "any", - tag_groups: list[TagGroup] | None = None, - ) -> tuple[list[RetrievalResult], MPFPTimings | None]: - """ - Retrieve facts using BFS spreading activation. - - Algorithm: - 1. Find entry points (top semantic matches above threshold) - 2. BFS traversal: visit neighbors, propagate decaying activation - 3. Boost causal links (causes, enables, prevents) - 4. Return visited nodes up to budget - - Note: BFS finds its own entry points via embedding search. - The semantic_seeds, temporal_seeds, and adjacency parameters are accepted - for interface compatibility but not used. - """ - async with acquire_with_retry(pool) as conn: - results = await self._retrieve_with_conn( - conn, - query_embedding_str, - bank_id, - fact_type, - budget, - tags=tags, - tags_match=tags_match, - tag_groups=tag_groups, - ) - return results, None - - async def _retrieve_with_conn( - self, - conn, - query_embedding_str: str, - bank_id: str, - fact_type: str, - budget: int, - tags: list[str] | None = None, - tags_match: TagsMatch = "any", - tag_groups: list[TagGroup] | None = None, - ) -> list[RetrievalResult]: - """Internal implementation with connection.""" - from .tags import build_tag_groups_where_clause, build_tags_where_clause_simple - - tags_clause = build_tags_where_clause_simple(tags, 6, match=tags_match) - tag_groups_param_start = 6 + (1 if tags else 0) - groups_clause, groups_params, _ = build_tag_groups_where_clause(tag_groups, tag_groups_param_start) - params = [query_embedding_str, bank_id, fact_type, self.entry_point_threshold, self.entry_point_limit] - if tags: - params.append(tags) - params.extend(groups_params) - - # Step 1: Find entry points - entry_points = await conn.fetch( - f""" - SELECT id, text, context, event_date, occurred_start, occurred_end, - mentioned_at, fact_type, document_id, chunk_id, tags, proof_count, - 1 - (embedding <=> $1::vector) AS similarity - FROM {fq_table("memory_units")} - WHERE bank_id = $2 - AND embedding IS NOT NULL - AND fact_type = $3 - AND (1 - (embedding <=> $1::vector)) >= $4 - {tags_clause} - {groups_clause} - ORDER BY embedding <=> $1::vector - LIMIT $5 - """, - *params, - ) - - if not entry_points: - logger.debug( - f"[BFS] No entry points found for fact_type={fact_type} (tags={tags}, tags_match={tags_match})" - ) - return [] - - logger.debug( - f"[BFS] Found {len(entry_points)} entry points for fact_type={fact_type} " - f"(tags={tags}, tags_match={tags_match})" - ) - - # Step 2: BFS spreading activation - visited = set() - results = [] - queue = [(RetrievalResult.from_db_row(dict(r)), r["similarity"]) for r in entry_points] - budget_remaining = budget - - while queue and budget_remaining > 0: - # Collect a batch of nodes to process - batch_nodes = [] - batch_activations = {} - - while queue and len(batch_nodes) < self.batch_size and budget_remaining > 0: - current, activation = queue.pop(0) - unit_id = current.id - - if unit_id not in visited: - visited.add(unit_id) - budget_remaining -= 1 - current.activation = activation - results.append(current) - batch_nodes.append(current.id) - batch_activations[unit_id] = activation - - # Batch fetch neighbors - if batch_nodes and budget_remaining > 0: - max_neighbors = len(batch_nodes) * 20 - neighbors = await conn.fetch( - f""" - SELECT mu.id, mu.text, mu.context, mu.occurred_start, mu.occurred_end, - mu.mentioned_at, mu.fact_type, mu.proof_count, - mu.document_id, mu.chunk_id, mu.tags, mu.metadata, - ml.weight, ml.link_type, ml.from_unit_id - FROM {fq_table("memory_links")} ml - JOIN {fq_table("memory_units")} mu ON ml.to_unit_id = mu.id - WHERE ml.from_unit_id = ANY($1::uuid[]) - AND ml.weight >= $2 - AND mu.fact_type = $3 - ORDER BY ml.weight DESC - LIMIT $4 - """, - batch_nodes, - self.min_activation, - fact_type, - max_neighbors, - ) - - for n in neighbors: - neighbor_id = str(n["id"]) - if neighbor_id not in visited: - parent_id = str(n["from_unit_id"]) - parent_activation = batch_activations.get(parent_id, 0.5) - - # Boost causal links - link_type = n["link_type"] - base_weight = n["weight"] - - if link_type in ("causes", "caused_by"): - causal_boost = 2.0 - elif link_type in ("enables", "prevents"): - causal_boost = 1.5 - else: - causal_boost = 1.0 - - effective_weight = base_weight * causal_boost - new_activation = parent_activation * effective_weight * self.activation_decay - - if new_activation > self.min_activation: - neighbor_result = RetrievalResult.from_db_row(dict(n)) - queue.append((neighbor_result, new_activation)) - - # Apply tags filtering (BFS may traverse into memories that don't match tags criteria) - if tags: - results = filter_results_by_tags(results, tags, match=tags_match) - - # Apply compound tag group filtering (post-traversal) - if tag_groups: - results = filter_results_by_tag_groups(results, tag_groups) - - return results diff --git a/hindsight-api-slim/hindsight_api/engine/search/reranking.py b/hindsight-api-slim/hindsight_api/engine/search/reranking.py index f4d83c238..ae9e0fc93 100644 --- a/hindsight-api-slim/hindsight_api/engine/search/reranking.py +++ b/hindsight-api-slim/hindsight_api/engine/search/reranking.py @@ -39,10 +39,10 @@ def apply_combined_scoring( proof_count_boost = 1 + proof_count_alpha * (proof_norm - 0.5) # in [1-α/2, 1+α/2] combined_score = CE_normalized * recency_boost * temporal_boost * proof_count_boost - proof_norm maps proof_count using a smooth logarithmic curve centered at 0.5: + proof_norm maps proof_count using a smooth logarithmic curve centered at 0.5, + clamped to [0, 1]: proof_count=1 → 0.5 + 0 = 0.5 (neutral multiplier) - proof_count=150 → 0.5 + 0.5 = 1.0 (strong boost) - proof_count=22000 → 0.5 + 1.0 = 1.5 (very strong boost) + proof_count=150 → clamped to 1.0 (max +5% boost) Temporal proximity is treated as neutral (0.5) when not set by temporal retrieval, so temporal_boost collapses to 1.0 for non-temporal queries. @@ -76,12 +76,8 @@ def apply_combined_scoring( # Proof count: log-normalized evidence strength; neutral for non-observations. proof_count = sr.retrieval.proof_count if proof_count is not None and proof_count >= 1: - # We don't cap this at an arbitrary number. - # We scale naturally so that: - # count=1 -> 0.5 (neutral baseline) - # count=150 -> 1.0 (+5% boost) - # count=22000 -> 1.5 (+10% boost) - proof_norm = 0.5 + (math.log(proof_count) / 10.0) + # Clamp to [0, 1] so extreme counts stay within documented ±5% range + proof_norm = min(1.0, max(0.0, 0.5 + (math.log(proof_count) / 10.0))) else: # Neutral baseline is precisely 0.5, ensuring neutral multiplier (1.0) proof_norm = 0.5 diff --git a/hindsight-api-slim/tests/test_reranking_proof_count.py b/hindsight-api-slim/tests/test_reranking_proof_count.py index 2f76a29c6..3fe3e341f 100644 --- a/hindsight-api-slim/tests/test_reranking_proof_count.py +++ b/hindsight-api-slim/tests/test_reranking_proof_count.py @@ -58,7 +58,7 @@ def test_proof_count_neutral_at_one(): apply_combined_scoring([sr], now, proof_count_alpha=0.1) - # proof_count=1 -> log1p(1) = 0.693, base log1p(1) = 0.693 -> difference 0.0 -> multipler 1.0 + # proof_count=1 -> math.log(1) = 0 -> 0.5 + 0/10 = 0.5 (neutral) -> multiplier 1.0 assert sr.combined_score == pytest.approx(0.8, rel=1e-3) def test_proof_count_increases_with_higher_counts(): @@ -79,16 +79,18 @@ def test_proof_count_increases_with_higher_counts(): assert sr_100.combined_score > sr_50.combined_score def test_proof_count_no_hardcoded_cap_at_100(): - """Test that observations with counts > 100 continue to scale up (no log1p(100) cap).""" + """Test that proof_count continues to scale within the clamped [0, 1] range.""" now = datetime.now(UTC) - # If capped at 100, these would both get identical scores + # Use values that stay below the clamp ceiling (proof_norm < 1.0) + # log(5)/10=0.16, log(20)/10=0.30, log(100)/10=0.46 → all below 0.5 headroom + sr_5 = create_mock_scored_result(proof_count=5, ce_score=0.8) + sr_20 = create_mock_scored_result(proof_count=20, ce_score=0.8) sr_100 = create_mock_scored_result(proof_count=100, ce_score=0.8) - sr_500 = create_mock_scored_result(proof_count=500, ce_score=0.8) - sr_1000 = create_mock_scored_result(proof_count=1000, ce_score=0.8) - apply_combined_scoring([sr_100, sr_500, sr_1000], now, proof_count_alpha=0.1) + apply_combined_scoring([sr_5, sr_20, sr_100], now, proof_count_alpha=0.1) - # Must strictly increase, not plateau - assert sr_500.combined_score > sr_100.combined_score - assert sr_1000.combined_score > sr_500.combined_score + # Must strictly increase within the valid range + assert sr_20.combined_score > sr_5.combined_score + assert sr_100.combined_score > sr_20.combined_score +