From ddfdb49089ba8fce0aaba57c76f7b0c55a3c72c6 Mon Sep 17 00:00:00 2001
From: Friedrich Lindenberg <friedrich@opensanctions.org>
Date: Tue, 7 Apr 2026 08:17:06 +0200
Subject: [PATCH 1/3] Introduce boosts for typed ES matches

---
 yente/search/queries.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/yente/search/queries.py b/yente/search/queries.py
index 774abada..9f86515e 100644
--- a/yente/search/queries.py
+++ b/yente/search/queries.py
@@ -25,6 +25,18 @@
     {"entity_id": {"order": "asc", "unmapped_type": "keyword"}},
 ]
 
+# Boost factors for non-name property types in entity queries, reflecting their
+# relative importance in the LogicV2 scoring algorithm. Identifiers are near-
+# deterministic match signals (0.85-0.98 weight in LogicV2); dates, phones and
+# emails are highly discriminating; countries are only modestly informative.
+TYPE_BOOSTS = {
+    registry.identifier: 8.0,
+    registry.date: 3.0,
+    registry.phone: 3.0,
+    registry.email: 3.0,
+    registry.country: 1.5,
+}
+
 # Boost factors for symbol categories to demote low-information name parts.
 SYMBOL_BOOSTS = {
     Symbol.Category.NUMERIC: 1.4,
@@ -201,7 +213,8 @@ def entity_query(
             query = {"match": {prop.type.group: value}}
             shoulds.append(query)
         elif prop.type.group is not None:
-            shoulds.append(tq(prop.type.group, value))
+            boost = TYPE_BOOSTS.get(prop.type, 1.0)
+            shoulds.append(tq(prop.type.group, value, boost))
 
     return filter_query(
         dataset,

From a7023a990703d9b5c8b8b09501a2a7e7ea2678a8 Mon Sep 17 00:00:00 2001
From: Friedrich Lindenberg <friedrich@opensanctions.org>
Date: Tue, 7 Apr 2026 08:27:42 +0200
Subject: [PATCH 2/3] Align ES query weighting with LogicV2 scoring priorities

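Follow-up to the TYPE_BOOSTS added in the previous commit (boosts for
non-name property types: identifiers, dates, phones, emails, countries,
reflecting their relative importance in LogicV2).
Align SYMBOL_BOOSTS with LogicV2's SYM_WEIGHTS: lower the NUMERIC boost
(1.4 -> 1.3), correct the inverted LOCATION boost (1.1 -> 0.8), reduce the
SYMBOL weight (0.8 -> 0.3), and add the NICK and DOMAIN categories.

Also add plans/scoring-early-stopping.md, a research note on early-stopping
heuristics for candidate scoring in the /match pipeline.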

Refs #1093, #1011

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plans/scoring-early-stopping.md | 289 ++++++++++++++++++++++++++++++++
 yente/search/queries.py         |   8 +-
 2 files changed, 294 insertions(+), 3 deletions(-)
 create mode 100644 plans/scoring-early-stopping.md

diff --git a/plans/scoring-early-stopping.md b/plans/scoring-early-stopping.md
new file mode 100644
index 00000000..11752dca
--- /dev/null
+++ b/plans/scoring-early-stopping.md
@@ -0,0 +1,289 @@
+---
+description: Heuristics to reduce the number of candidates scored in the /match pipeline
+date: 2026-04-07
+tags: [scoring, performance, matching, issue-1011]
+---
+
+# Early stopping heuristics for candidate scoring
+
+GitHub issue: opensanctions/yente#1011
+
+## Problem
+
+The `/match` endpoint retrieves `limit * MATCH_CANDIDATES` (default 5 * 10 = 50) candidates
+from Elasticsearch and scores **every one** with the full algorithm (LogicV2). Users can
+request up to 500 results, meaning up to 5,000 scoring calls per query. The scoring algorithm
+itself isn't terribly slow — yente just invokes it far too often on candidates that will never
+make it into the response.
+
+## Research data
+
+Analysis of three production log samples (30,000 rows, ~20,800 valid scoring entries, 418
+unique queries, 2026-04-07). Mean ~50 candidates scored per query.
+
+### Most scoring work is wasted
+
+| Metric | Value |
+|---|---|
+| Total scoring calls | 20,772 |
+| Scores < 0.5 (below cutoff) | 82.2% |
+| Scores < 0.3 (clearly wasted) | 47.9% |
+| Scores >= 0.7 (match threshold) | 1.0% |
+| Queries with zero candidates >= 0.5 | 49.3% |
+| Queries with zero candidates >= 0.7 | 84.4% |
+
+About half of all queries produce no candidates above 0.5, and 84% produce no matches
+(>= 0.7). Yet we score all ~50 candidates for every query.
+
+### ES ranking vs algo score correlation
+
+ES ranking is a **weak** predictor of algo score. The best algo-scored result appears at:
+
+| Within top N ES results | % of queries |
+|---|---|
+| Top 1 | 23.2% |
+| Top 3 | 35.9% |
+| Top 5 | 44.7% |
+| Top 10 | 63.4% |
+| Top 20 | 83.3% |
+| Top 50 | 98.3% |
+
+Mean algo score by ES rank bucket (ranks 0-49 contain the bulk of data):
+
+| ES rank bucket | Count | Mean algo score | % with algo >= 0.5 |
+|---|---|---|---|
+| 0-9 | 4,056 | 0.358 | 26.9% |
+| 10-19 | 4,092 | 0.326 | 19.5% |
+| 20-29 | 3,957 | 0.310 | 18.0% |
+| 30-39 | 3,818 | 0.311 | 16.5% |
+| 40-49 | 3,737 | 0.297 | 12.2% |
+| 50+ | 1,112 | ~0.19 | 0.0% |
+
+Key observation: within the first 50 candidates, algo scores decline gently (0.36 → 0.30
+mean) but good results appear at every rank. ES does a good job excluding truly irrelevant
+candidates (rank 50+), but within the top 50 it cannot reliably distinguish good from bad.
+
+### Early stopping simulation
+
+"Stop scoring after N consecutive candidates with algo score below threshold":
+
+| Threshold | Patience | Scoring calls saved | Meaningful best results missed (out of 418) |
+|---|---|---|---|
+| 0.3 | 3 | 50.8% | 22 |
+| 0.3 | 5 | 42.0% | 12 |
+| 0.3 | 7 | 36.9% | 9 |
+| 0.3 | 10 | 31.8% | 5 |
+| 0.3 | 15 | 23.0% | 4 |
+
+Simple early stopping with patience=10 saves ~32% of scoring calls and misses 5 out of
+418 queries (1.2%).
+
+### Adaptive patience
+
+When a query has already produced a score above a trigger value, increase patience to
+avoid cutting off queries that have real matches buried deeper in the candidate list:
+
+| Base patience | Boosted patience | Trigger | Saved | Missed (out of 418) |
+|---|---|---|---|---|
+| 5 | 10 | >= 0.4 | 33.3% | 7 |
+| 5 | 15 | >= 0.4 | 30.6% | 6 |
+| 5 | 20 | >= 0.4 | 27.9% | 5 |
+| 5 | 25 | >= 0.4 | 27.0% | 5 |
+
+Adaptive patience helps: queries with no real matches stop early (patience=5, saves the
+most work), while queries with promising candidates keep looking longer. The approach
+`base=5, boost=20, trigger>=0.4` saves ~28% of scoring calls and misses 5 out of 418
+queries (1.2%).
+
+### Missed results profile
+
+With the recommended adaptive settings (base=5, boost=20, trigger>=0.4, min_candidates=10),
+the 5 missed results are:
+
+| Best score | At ES rank | Stopped after | Total candidates |
+|---|---|---|---|
+| 0.667 | 31 | 16 | 49 |
+| 0.583 | 9 | 12 | 46 |
+| 0.565 | 21 | 10 | 49 |
+| 0.543 | 23 | 10 | 48 |
+| 0.512 | 43 | 10 | 97 |
+
+These are all sub-threshold results (< 0.7) that would appear in the response list with
+`match: false`. The highest missed score is 0.667. For screening use cases where only
+`match: true` matters, the quality impact is effectively zero.
+
+### Index score floor
+
+Adding a minimum ES index score before scoring a candidate provides marginal benefit:
+
+| Index score floor | Candidates scored | Good results missed (algo >= 0.5) |
+|---|---|---|
+| >= 5 | 96.7% | 0 |
+| >= 10 | 81.3% | 2 |
+| >= 15 | 39.4% | 7 |
+
+Since most candidates already have index_score > 5, this doesn't help much. The early
+stopping heuristic is more effective.
+
+### Why MATCH_CANDIDATES=10 is correct (and not the right lever)
+
+The 10x multiplier controls **recall** — how many ES candidates we fetch to ensure the
+best algo-scored result is in the pool. The data shows it's well-calibrated:
+
+| MATCH_CANDIDATES equivalent | ES top N (limit=5) | Best result found |
+|---|---|---|
+| 1x | Top 5 | 44.7% |
+| 2x | Top 10 | 63.4% |
+| 4x | Top 20 | 83.3% |
+| **10x** | **Top 50** | **98.3%** |
+
+Reducing the multiplier would lose real results. And within the 50-candidate window, good
+results are spread across all rank buckets — there's no safe truncation point:
+
+| ES rank bucket | % with algo >= 0.5 |
+|---|---|
+| 0-9 | 26.9% |
+| 10-19 | 19.5% |
+| 20-29 | 18.0% |
+| 30-39 | 16.5% |
+| 40-49 | 12.2% |
+
+However, **49.3% of queries have zero candidates above 0.5**. For those queries, the
+multiplier is pure waste — we fetch and score 50 candidates to return nothing. The
+multiplier is calibrated for the ~50% of queries where matches exist, and the other ~50%
+pay the full cost for no benefit.
+
+The multiplier and early stopping solve different problems: the multiplier controls
+**recall** (keep it at 10x), early stopping controls **wasted compute** (stop scoring
+when it's clearly pointless). Together they preserve result quality while cutting scoring
+work by ~28%.
+
+## Proposed approach
+
+### Consecutive-low early stopping with adaptive patience
+
+Add early stopping logic to `score_results()` in `yente/scoring.py`. After scoring each
+candidate, track how many consecutive candidates have scored below a low threshold. Once
+patience is exhausted, stop scoring remaining candidates. When a promising score has been
+seen, use a higher patience to keep searching.
+
+```python
+async def score_results(
+    algorithm: Type[ScoringAlgorithm],
+    entity: Entity,
+    results: Iterable[Tuple[Entity, float]],
+    threshold: float = settings.SCORE_THRESHOLD,
+    cutoff: float = 0.0,
+    limit: Optional[int] = None,
+    config: ScoringConfig = ScoringConfig.defaults(),
+) -> Tuple[int, List[ScoredEntityResponse]]:
+    scored: List[ScoredEntityResponse] = []
+    matches = 0
+    consecutive_low = 0
+    seen_promising = False
+    for rank, (result, index_score) in enumerate(results):
+        scoring = algorithm.compare(query=entity, result=result, config=config)
+        # ... existing logging and sleep ...
+        response = ScoredEntityResponse.from_entity_result(result, scoring, threshold)
+
+        # Track consecutive low scores for early stopping
+        if response.score > settings.SCORE_EARLY_STOP_THRESHOLD:
+            consecutive_low = 0
+        else:
+            consecutive_low += 1
+
+        # Adaptive patience: extend search when we've seen a promising result
+        if response.score >= settings.SCORE_EARLY_STOP_BOOST_TRIGGER:
+            seen_promising = True
+
+        if response.score <= cutoff:
+            continue
+        if response.match:
+            matches += 1
+        scored.append(response)
+
+        # Early stopping: if we've seen enough consecutive low scores, stop.
+        effective_patience = (
+            settings.SCORE_EARLY_STOP_BOOSTED_PATIENCE if seen_promising
+            else settings.SCORE_EARLY_STOP_PATIENCE
+        )
+        if (consecutive_low >= effective_patience
+                and rank >= settings.SCORE_EARLY_STOP_MIN_CANDIDATES):
+            log.info(
+                "Early stopping after %d consecutive low scores at rank %d",
+                consecutive_low, rank,
+            )
+            break
+
+    scored = sorted(scored, key=lambda r: r.score, reverse=True)
+    if limit is not None:
+        scored = scored[:limit]
+    return matches, scored
+```
+
+Note: the `consecutive_low` counter and `seen_promising` flag are updated before the
+`cutoff` filter — a candidate that's below `cutoff` but above the early-stop threshold
+should still reset the counter.
+
+### New settings in `yente/settings.py`
+
+```python
+SCORE_EARLY_STOP_THRESHOLD: float = 0.3     # scores below this count as "low"
+SCORE_EARLY_STOP_PATIENCE: int = 5          # consecutive low scores before stopping
+SCORE_EARLY_STOP_BOOSTED_PATIENCE: int = 20 # patience after seeing a promising score
+SCORE_EARLY_STOP_BOOST_TRIGGER: float = 0.4 # score that triggers boosted patience
+SCORE_EARLY_STOP_MIN_CANDIDATES: int = 10   # always score at least this many
+```
+
+These should be configurable via environment variables (like other yente settings) so
+operators can tune or disable early stopping without code changes. Setting patience to a
+very high value (e.g., 9999) effectively disables it.
+
+### Not recommended: Index score floor
+
+Skip candidates below a minimum ES index score. The data shows this is less effective
+than early stopping and risks missing good results at lower thresholds. Could be combined
+with the above but adds complexity for marginal gain.
+
+## Testing
+
+- Unit tests: mock algorithm that returns predetermined scores; verify early stopping
+  triggers at the right rank and that results are not lost.
+- Compare `/match` output with and without early stopping on a representative query set
+  to validate that result quality is preserved.
+
+## Risks
+
+- **Missed results**: With adaptive patience (base=5, boosted=20, trigger=0.4), the
+  simulation shows 5 missed results out of 418 queries (1.2%). All are sub-threshold
+  (highest is 0.667, below the 0.7 match threshold). For screening use cases where only
+  `match: true` matters, the quality impact is effectively zero.
+- **Query-dependent behavior**: Some entity types or datasets may have different score
+  distributions. The min_candidates guard (always score at least 10) mitigates this.
+- **Sensitivity to candidate ordering**: Early stopping depends on ES returning candidates
+  in a roughly score-correlated order. If ES ranking degrades (e.g., after index changes),
+  more good results could be missed. The boosted patience provides a buffer for queries
+  where ES and algo scoring clearly diverge.
+
+## Follow-up: raising MATCH_CANDIDATES
+
+Once early stopping is in place, the cost model changes: fetching more candidates from ES
+is cheap, and early stopping caps how many actually get scored. This makes it tempting to
+raise MATCH_CANDIDATES (currently 10) as insurance against the weak ES/algo correlation.
+
+**The data doesn't strongly justify it.** Queries in our sample that fetched beyond 50
+candidates show 0% with algo >= 0.5 past rank 50 — ES relevance drops off hard. And 98.3%
+of best results already fall within the top 50. The remaining 1.7% have best scores below
+0.5 (not meaningful misses).
+
+**The ES/algo divergence is real but bounded.** Per-query Spearman correlation between
+index_score and algo_score has a median of 0.42, with 21.7% of queries showing negative
+correlation. Top-5 overlap between ES and algo rankings is only 35%. The worst observed
+inversion: best algo result (0.592) at ES rank 153. However, even in these worst cases the
+buried results are sub-threshold (< 0.7). The ES query construction (name boosting,
+fuzziness, phonetic matching) would have to substantially fail for a true match to land
+beyond rank 50.
+
+**Recommendation:** Ship early stopping first and measure in production. If the miss rate
+is acceptable, a modest bump (e.g., to 15x) is cheap insurance and worth trying — but
+don't expect a measurable quality improvement based on what we see today.
diff --git a/yente/search/queries.py b/yente/search/queries.py
index 9f86515e..96bd57c8 100644
--- a/yente/search/queries.py
+++ b/yente/search/queries.py
@@ -39,10 +39,12 @@
 
 # Boost factors for symbol categories to demote low-information name parts.
 SYMBOL_BOOSTS = {
-    Symbol.Category.NUMERIC: 1.4,
-    Symbol.Category.LOCATION: 1.1,
+    Symbol.Category.NUMERIC: 1.3,
+    Symbol.Category.LOCATION: 0.8,
     Symbol.Category.ORG_CLASS: 0.7,
-    Symbol.Category.SYMBOL: 0.8,
+    Symbol.Category.SYMBOL: 0.3,
+    Symbol.Category.NICK: 0.8,
+    Symbol.Category.DOMAIN: 0.7,
 }
 
 

From b32efa602a3df3c0736bd8848ce9da782ae50624 Mon Sep 17 00:00:00 2001
From: Friedrich Lindenberg <friedrich@opensanctions.org>
Date: Tue, 7 Apr 2026 08:29:45 +0200
Subject: [PATCH 3/3] Remove plans/ from tracking

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plans/scoring-early-stopping.md | 289 --------------------------------
 1 file changed, 289 deletions(-)
 delete mode 100644 plans/scoring-early-stopping.md

diff --git a/plans/scoring-early-stopping.md b/plans/scoring-early-stopping.md
deleted file mode 100644
index 11752dca..00000000
--- a/plans/scoring-early-stopping.md
+++ /dev/null
@@ -1,289 +0,0 @@
----
-description: Heuristics to reduce the number of candidates scored in the /match pipeline
-date: 2026-04-07
-tags: [scoring, performance, matching, issue-1011]
----
-
-# Early stopping heuristics for candidate scoring
-
-GitHub issue: opensanctions/yente#1011
-
-## Problem
-
-The `/match` endpoint retrieves `limit * MATCH_CANDIDATES` (default 5 * 10 = 50) candidates
-from Elasticsearch and scores **every one** with the full algorithm (LogicV2). Users can
-request up to 500 results, meaning up to 5,000 scoring calls per query. The scoring algorithm
-itself isn't terribly slow — yente just invokes it far too often on candidates that will never
-make it into the response.
-
-## Research data
-
-Analysis of three production log samples (30,000 rows, ~20,800 valid scoring entries, 418
-unique queries, 2026-04-07). Mean ~50 candidates scored per query.
-
-### Most scoring work is wasted
-
-| Metric | Value |
-|---|---|
-| Total scoring calls | 20,772 |
-| Scores < 0.5 (below cutoff) | 82.2% |
-| Scores < 0.3 (clearly wasted) | 47.9% |
-| Scores >= 0.7 (match threshold) | 1.0% |
-| Queries with zero candidates >= 0.5 | 49.3% |
-| Queries with zero candidates >= 0.7 | 84.4% |
-
-About half of all queries produce no candidates above 0.5, and 84% produce no matches
-(>= 0.7). Yet we score all ~50 candidates for every query.
-
-### ES ranking vs algo score correlation
-
-ES ranking is a **weak** predictor of algo score. The best algo-scored result appears at:
-
-| Within top N ES results | % of queries |
-|---|---|
-| Top 1 | 23.2% |
-| Top 3 | 35.9% |
-| Top 5 | 44.7% |
-| Top 10 | 63.4% |
-| Top 20 | 83.3% |
-| Top 50 | 98.3% |
-
-Mean algo score by ES rank bucket (ranks 0-49 contain the bulk of data):
-
-| ES rank bucket | Count | Mean algo score | % with algo >= 0.5 |
-|---|---|---|---|
-| 0-9 | 4,056 | 0.358 | 26.9% |
-| 10-19 | 4,092 | 0.326 | 19.5% |
-| 20-29 | 3,957 | 0.310 | 18.0% |
-| 30-39 | 3,818 | 0.311 | 16.5% |
-| 40-49 | 3,737 | 0.297 | 12.2% |
-| 50+ | 1,112 | ~0.19 | 0.0% |
-
-Key observation: within the first 50 candidates, algo scores decline gently (0.36 → 0.30
-mean) but good results appear at every rank. ES does a good job excluding truly irrelevant
-candidates (rank 50+), but within the top 50 it cannot reliably distinguish good from bad.
-
-### Early stopping simulation
-
-"Stop scoring after N consecutive candidates with algo score below threshold":
-
-| Threshold | Patience | Scoring calls saved | Meaningful best results missed (out of 418) |
-|---|---|---|---|
-| 0.3 | 3 | 50.8% | 22 |
-| 0.3 | 5 | 42.0% | 12 |
-| 0.3 | 7 | 36.9% | 9 |
-| 0.3 | 10 | 31.8% | 5 |
-| 0.3 | 15 | 23.0% | 4 |
-
-Simple early stopping with patience=10 saves ~32% of scoring calls and misses 5 out of
-418 queries (1.2%).
-
-### Adaptive patience
-
-When a query has already produced a score above a trigger value, increase patience to
-avoid cutting off queries that have real matches buried deeper in the candidate list:
-
-| Base patience | Boosted patience | Trigger | Saved | Missed (out of 418) |
-|---|---|---|---|---|
-| 5 | 10 | >= 0.4 | 33.3% | 7 |
-| 5 | 15 | >= 0.4 | 30.6% | 6 |
-| 5 | 20 | >= 0.4 | 27.9% | 5 |
-| 5 | 25 | >= 0.4 | 27.0% | 5 |
-
-Adaptive patience helps: queries with no real matches stop early (patience=5, saves the
-most work), while queries with promising candidates keep looking longer. The approach
-`base=5, boost=20, trigger>=0.4` saves ~28% of scoring calls and misses 5 out of 418
-queries (1.2%).
-
-### Missed results profile
-
-With the recommended adaptive settings (base=5, boost=20, trigger>=0.4, min_candidates=10),
-the 5 missed results are:
-
-| Best score | At ES rank | Stopped after | Total candidates |
-|---|---|---|---|
-| 0.667 | 31 | 16 | 49 |
-| 0.583 | 9 | 12 | 46 |
-| 0.565 | 21 | 10 | 49 |
-| 0.543 | 23 | 10 | 48 |
-| 0.512 | 43 | 10 | 97 |
-
-These are all sub-threshold results (< 0.7) that would appear in the response list with
-`match: false`. The highest missed score is 0.667. For screening use cases where only
-`match: true` matters, the quality impact is effectively zero.
-
-### Index score floor
-
-Adding a minimum ES index score before scoring a candidate provides marginal benefit:
-
-| Index score floor | Candidates scored | Good results missed (algo >= 0.5) |
-|---|---|---|
-| >= 5 | 96.7% | 0 |
-| >= 10 | 81.3% | 2 |
-| >= 15 | 39.4% | 7 |
-
-Since most candidates already have index_score > 5, this doesn't help much. The early
-stopping heuristic is more effective.
-
-### Why MATCH_CANDIDATES=10 is correct (and not the right lever)
-
-The 10x multiplier controls **recall** — how many ES candidates we fetch to ensure the
-best algo-scored result is in the pool. The data shows it's well-calibrated:
-
-| MATCH_CANDIDATES equivalent | ES top N (limit=5) | Best result found |
-|---|---|---|
-| 1x | Top 5 | 44.7% |
-| 2x | Top 10 | 63.4% |
-| 4x | Top 20 | 83.3% |
-| **10x** | **Top 50** | **98.3%** |
-
-Reducing the multiplier would lose real results. And within the 50-candidate window, good
-results are spread across all rank buckets — there's no safe truncation point:
-
-| ES rank bucket | % with algo >= 0.5 |
-|---|---|
-| 0-9 | 26.9% |
-| 10-19 | 19.5% |
-| 20-29 | 18.0% |
-| 30-39 | 16.5% |
-| 40-49 | 12.2% |
-
-However, **49.3% of queries have zero candidates above 0.5**. For those queries, the
-multiplier is pure waste — we fetch and score 50 candidates to return nothing. The
-multiplier is calibrated for the ~50% of queries where matches exist, and the other ~50%
-pay the full cost for no benefit.
-
-The multiplier and early stopping solve different problems: the multiplier controls
-**recall** (keep it at 10x), early stopping controls **wasted compute** (stop scoring
-when it's clearly pointless). Together they preserve result quality while cutting scoring
-work by ~28%.
-
-## Proposed approach
-
-### Consecutive-low early stopping with adaptive patience
-
-Add early stopping logic to `score_results()` in `yente/scoring.py`. After scoring each
-candidate, track how many consecutive candidates have scored below a low threshold. Once
-patience is exhausted, stop scoring remaining candidates. When a promising score has been
-seen, use a higher patience to keep searching.
-
-```python
-async def score_results(
-    algorithm: Type[ScoringAlgorithm],
-    entity: Entity,
-    results: Iterable[Tuple[Entity, float]],
-    threshold: float = settings.SCORE_THRESHOLD,
-    cutoff: float = 0.0,
-    limit: Optional[int] = None,
-    config: ScoringConfig = ScoringConfig.defaults(),
-) -> Tuple[int, List[ScoredEntityResponse]]:
-    scored: List[ScoredEntityResponse] = []
-    matches = 0
-    consecutive_low = 0
-    seen_promising = False
-    for rank, (result, index_score) in enumerate(results):
-        scoring = algorithm.compare(query=entity, result=result, config=config)
-        # ... existing logging and sleep ...
-        response = ScoredEntityResponse.from_entity_result(result, scoring, threshold)
-
-        # Track consecutive low scores for early stopping
-        if response.score > settings.SCORE_EARLY_STOP_THRESHOLD:
-            consecutive_low = 0
-        else:
-            consecutive_low += 1
-
-        # Adaptive patience: extend search when we've seen a promising result
-        if response.score >= settings.SCORE_EARLY_STOP_BOOST_TRIGGER:
-            seen_promising = True
-
-        if response.score <= cutoff:
-            continue
-        if response.match:
-            matches += 1
-        scored.append(response)
-
-        # Early stopping: if we've seen enough consecutive low scores, stop.
-        effective_patience = (
-            settings.SCORE_EARLY_STOP_BOOSTED_PATIENCE if seen_promising
-            else settings.SCORE_EARLY_STOP_PATIENCE
-        )
-        if (consecutive_low >= effective_patience
-                and rank >= settings.SCORE_EARLY_STOP_MIN_CANDIDATES):
-            log.info(
-                "Early stopping after %d consecutive low scores at rank %d",
-                consecutive_low, rank,
-            )
-            break
-
-    scored = sorted(scored, key=lambda r: r.score, reverse=True)
-    if limit is not None:
-        scored = scored[:limit]
-    return matches, scored
-```
-
-Note: the `consecutive_low` counter and `seen_promising` flag are updated before the
-`cutoff` filter — a candidate that's below `cutoff` but above the early-stop threshold
-should still reset the counter.
-
-### New settings in `yente/settings.py`
-
-```python
-SCORE_EARLY_STOP_THRESHOLD: float = 0.3     # scores below this count as "low"
-SCORE_EARLY_STOP_PATIENCE: int = 5          # consecutive low scores before stopping
-SCORE_EARLY_STOP_BOOSTED_PATIENCE: int = 20 # patience after seeing a promising score
-SCORE_EARLY_STOP_BOOST_TRIGGER: float = 0.4 # score that triggers boosted patience
-SCORE_EARLY_STOP_MIN_CANDIDATES: int = 10   # always score at least this many
-```
-
-These should be configurable via environment variables (like other yente settings) so
-operators can tune or disable early stopping without code changes. Setting patience to a
-very high value (e.g., 9999) effectively disables it.
-
-### Not recommended: Index score floor
-
-Skip candidates below a minimum ES index score. The data shows this is less effective
-than early stopping and risks missing good results at lower thresholds. Could be combined
-with the above but adds complexity for marginal gain.
-
-## Testing
-
-- Unit tests: mock algorithm that returns predetermined scores; verify early stopping
-  triggers at the right rank and that results are not lost.
-- Compare `/match` output with and without early stopping on a representative query set
-  to validate that result quality is preserved.
-
-## Risks
-
-- **Missed results**: With adaptive patience (base=5, boosted=20, trigger=0.4), the
-  simulation shows 5 missed results out of 418 queries (1.2%). All are sub-threshold
-  (highest is 0.667, below the 0.7 match threshold). For screening use cases where only
-  `match: true` matters, the quality impact is effectively zero.
-- **Query-dependent behavior**: Some entity types or datasets may have different score
-  distributions. The min_candidates guard (always score at least 10) mitigates this.
-- **Sensitivity to candidate ordering**: Early stopping depends on ES returning candidates
-  in a roughly score-correlated order. If ES ranking degrades (e.g., after index changes),
-  more good results could be missed. The boosted patience provides a buffer for queries
-  where ES and algo scoring clearly diverge.
-
-## Follow-up: raising MATCH_CANDIDATES
-
-Once early stopping is in place, the cost model changes: fetching more candidates from ES
-is cheap, and early stopping caps how many actually get scored. This makes it tempting to
-raise MATCH_CANDIDATES (currently 10) as insurance against the weak ES/algo correlation.
-
-**The data doesn't strongly justify it.** Queries in our sample that fetched beyond 50
-candidates show 0% with algo >= 0.5 past rank 50 — ES relevance drops off hard. And 98.3%
-of best results already fall within the top 50. The remaining 1.7% have best scores below
-0.5 (not meaningful misses).
-
-**The ES/algo divergence is real but bounded.** Per-query Spearman correlation between
-index_score and algo_score has a median of 0.42, with 21.7% of queries showing negative
-correlation. Top-5 overlap between ES and algo rankings is only 35%. The worst observed
-inversion: best algo result (0.592) at ES rank 153. However, even in these worst cases the
-buried results are sub-threshold (< 0.7). The ES query construction (name boosting,
-fuzziness, phonetic matching) would have to substantially fail for a true match to land
-beyond rank 50.
-
-**Recommendation:** Ship early stopping first and measure in production. If the miss rate
-is acceptable, a modest bump (e.g., to 15x) is cheap insurance and worth trying — but
-don't expect a measurable quality improvement based on what we see today.