From ddfdb49089ba8fce0aaba57c76f7b0c55a3c72c6 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 7 Apr 2026 08:17:06 +0200 Subject: [PATCH 1/3] Introduce boosts for typed ES matches --- yente/search/queries.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/yente/search/queries.py b/yente/search/queries.py index 774abada..9f86515e 100644 --- a/yente/search/queries.py +++ b/yente/search/queries.py @@ -25,6 +25,18 @@ {"entity_id": {"order": "asc", "unmapped_type": "keyword"}}, ] +# Boost factors for non-name property types in entity queries, reflecting their +# relative importance in the LogicV2 scoring algorithm. Identifiers are near- +# deterministic match signals (0.85-0.98 weight in LogicV2), dates are highly +# discriminating, countries are modestly informative. +TYPE_BOOSTS = { + registry.identifier: 8.0, + registry.date: 3.0, + registry.phone: 3.0, + registry.email: 3.0, + registry.country: 1.5, +} + # Boost factors for symbol categories to demote low-information name parts. SYMBOL_BOOSTS = { Symbol.Category.NUMERIC: 1.4, @@ -201,7 +213,8 @@ def entity_query( query = {"match": {prop.type.group: value}} shoulds.append(query) elif prop.type.group is not None: - shoulds.append(tq(prop.type.group, value)) + boost = TYPE_BOOSTS.get(prop.type, 1.0) + shoulds.append(tq(prop.type.group, value, boost)) return filter_query( dataset, From a7023a990703d9b5c8b8b09501a2a7e7ea2678a8 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 7 Apr 2026 08:27:42 +0200 Subject: [PATCH 2/3] Align ES query weighting with LogicV2 scoring priorities Add TYPE_BOOSTS for non-name property types (identifiers, dates, phones, emails, countries) to reflect their relative importance in LogicV2. Align SYMBOL_BOOSTS with LogicV2's SYM_WEIGHTS: correct inverted LOCATION boost, reduce SYMBOL weight, add NICK and DOMAIN categories. Refs #1093, #1011 Co-Authored-By: Claude Opus 4.6 (1M context) --- plans/scoring-early-stopping.md | 289 ++++++++++++++++++++++++++++++++ yente/search/queries.py | 8 +- 2 files changed, 294 insertions(+), 3 deletions(-) create mode 100644 plans/scoring-early-stopping.md diff --git a/plans/scoring-early-stopping.md b/plans/scoring-early-stopping.md new file mode 100644 index 00000000..11752dca --- /dev/null +++ b/plans/scoring-early-stopping.md @@ -0,0 +1,289 @@ +--- +description: Heuristics to reduce the number of candidates scored in the /match pipeline +date: 2026-04-07 +tags: [scoring, performance, matching, issue-1011] +--- + +# Early stopping heuristics for candidate scoring + +GitHub issue: opensanctions/yente#1011 + +## Problem + +The `/match` endpoint retrieves `limit * MATCH_CANDIDATES` (default 5 * 10 = 50) candidates +from Elasticsearch and scores **every one** with the full algorithm (LogicV2). Users can +request up to 500 results, meaning up to 5,000 scoring calls per query. The scoring algorithm +itself isn't terribly slow — yente just invokes it far too often on candidates that will never +make it into the response. + +## Research data + +Analysis of three production log samples (30,000 rows, ~20,800 valid scoring entries, 418 +unique queries, 2026-04-07). Mean ~50 candidates scored per query. + +### Most scoring work is wasted + +| Metric | Value | +|---|---| +| Total scoring calls | 20,772 | +| Scores < 0.5 (below cutoff) | 82.2% | +| Scores < 0.3 (clearly wasted) | 47.9% | +| Scores >= 0.7 (match threshold) | 1.0% | +| Queries with zero candidates >= 0.5 | 49.3% | +| Queries with zero candidates >= 0.7 | 84.4% | + +About half of all queries produce no candidates above 0.5, and 84% produce no matches +(>= 0.7). Yet we score all ~50 candidates for every query. + +### ES ranking vs algo score correlation + +ES ranking is a **weak** predictor of algo score. The best algo-scored result appears at: + +| Within top N ES results | % of queries | +|---|---| +| Top 1 | 23.2% | +| Top 3 | 35.9% | +| Top 5 | 44.7% | +| Top 10 | 63.4% | +| Top 20 | 83.3% | +| Top 50 | 98.3% | + +Mean algo score by ES rank bucket (ranks 0-49 contain the bulk of data): + +| ES rank bucket | Count | Mean algo score | % with algo >= 0.5 | +|---|---|---|---| +| 0-9 | 4,056 | 0.358 | 26.9% | +| 10-19 | 4,092 | 0.326 | 19.5% | +| 20-29 | 3,957 | 0.310 | 18.0% | +| 30-39 | 3,818 | 0.311 | 16.5% | +| 40-49 | 3,737 | 0.297 | 12.2% | +| 50+ | 1,112 | ~0.19 | 0.0% | + +Key observation: within the first 50 candidates, algo scores decline gently (0.36 → 0.30 +mean) but good results appear at every rank. ES does a good job excluding truly irrelevant +candidates (rank 50+), but within the top 50 it cannot reliably distinguish good from bad. + +### Early stopping simulation + +"Stop scoring after N consecutive candidates with algo score below threshold": + +| Threshold | Patience | Scoring calls saved | Meaningful best results missed (out of 418) | +|---|---|---|---| +| 0.3 | 3 | 50.8% | 22 | +| 0.3 | 5 | 42.0% | 12 | +| 0.3 | 7 | 36.9% | 9 | +| 0.3 | 10 | 31.8% | 5 | +| 0.3 | 15 | 23.0% | 4 | + +Simple early stopping with patience=10 saves ~32% of scoring calls and misses 5 out of +418 queries (1.2%). + +### Adaptive patience + +When a query has already produced a score above a trigger value, increase patience to +avoid cutting off queries that have real matches buried deeper in the candidate list: + +| Base patience | Boosted patience | Trigger | Saved | Missed (out of 418) | +|---|---|---|---|---| +| 5 | 10 | >= 0.4 | 33.3% | 7 | +| 5 | 15 | >= 0.4 | 30.6% | 6 | +| 5 | 20 | >= 0.4 | 27.9% | 5 | +| 5 | 25 | >= 0.4 | 27.0% | 5 | + +Adaptive patience helps: queries with no real matches stop early (patience=5, saves the +most work), while queries with promising candidates keep looking longer. The approach +`base=5, boost=20, trigger>=0.4` saves ~28% of scoring calls and misses 5 out of 418 +queries (1.2%). + +### Missed results profile + +With the recommended adaptive settings (base=5, boost=20, trigger>=0.4, min_candidates=10), +the 5 missed results are: + +| Best score | At ES rank | Stopped after | Total candidates | +|---|---|---|---| +| 0.667 | 31 | 16 | 49 | +| 0.583 | 9 | 12 | 46 | +| 0.565 | 21 | 10 | 49 | +| 0.543 | 23 | 10 | 48 | +| 0.512 | 43 | 10 | 97 | + +These are all sub-threshold results (< 0.7) that would appear in the response list with +`match: false`. The highest missed score is 0.667. For screening use cases where only +`match: true` matters, the quality impact is effectively zero. + +### Index score floor + +Adding a minimum ES index score before scoring a candidate provides marginal benefit: + +| Index score floor | Candidates scored | Good results missed (algo >= 0.5) | +|---|---|---| +| >= 5 | 96.7% | 0 | +| >= 10 | 81.3% | 2 | +| >= 15 | 39.4% | 7 | + +Since most candidates already have index_score > 5, this doesn't help much. The early +stopping heuristic is more effective. + +### Why MATCH_CANDIDATES=10 is correct (and not the right lever) + +The 10x multiplier controls **recall** — how many ES candidates we fetch to ensure the +best algo-scored result is in the pool. The data shows it's well-calibrated: + +| MATCH_CANDIDATES equivalent | ES top N (limit=5) | Best result found | +|---|---|---| +| 1x | Top 5 | 44.7% | +| 2x | Top 10 | 63.4% | +| 4x | Top 20 | 83.3% | +| **10x** | **Top 50** | **98.3%** | + +Reducing the multiplier would lose real results. And within the 50-candidate window, good +results are spread across all rank buckets — there's no safe truncation point: + +| ES rank bucket | % with algo >= 0.5 | +|---|---| +| 0-9 | 26.9% | +| 10-19 | 19.5% | +| 20-29 | 18.0% | +| 30-39 | 16.5% | +| 40-49 | 12.2% | + +However, **49.3% of queries have zero candidates above 0.5**. For those queries, the +multiplier is pure waste — we fetch and score 50 candidates to return nothing. The +multiplier is calibrated for the ~50% of queries where matches exist, and the other ~50% +pay the full cost for no benefit. + +The multiplier and early stopping solve different problems: the multiplier controls +**recall** (keep it at 10x), early stopping controls **wasted compute** (stop scoring +when it's clearly pointless). Together they preserve result quality while cutting scoring +work by ~28%. + +## Proposed approach + +### Consecutive-low early stopping with adaptive patience + +Add early stopping logic to `score_results()` in `yente/scoring.py`. After scoring each +candidate, track how many consecutive candidates have scored below a low threshold. Once +patience is exhausted, stop scoring remaining candidates. When a promising score has been +seen, use a higher patience to keep searching. + +```python +async def score_results( + algorithm: Type[ScoringAlgorithm], + entity: Entity, + results: Iterable[Tuple[Entity, float]], + threshold: float = settings.SCORE_THRESHOLD, + cutoff: float = 0.0, + limit: Optional[int] = None, + config: ScoringConfig = ScoringConfig.defaults(), +) -> Tuple[int, List[ScoredEntityResponse]]: + scored: List[ScoredEntityResponse] = [] + matches = 0 + consecutive_low = 0 + seen_promising = False + for rank, (result, index_score) in enumerate(results): + scoring = algorithm.compare(query=entity, result=result, config=config) + # ... existing logging and sleep ... + response = ScoredEntityResponse.from_entity_result(result, scoring, threshold) + + # Track consecutive low scores for early stopping + if response.score > settings.SCORE_EARLY_STOP_THRESHOLD: + consecutive_low = 0 + else: + consecutive_low += 1 + + # Adaptive patience: extend search when we've seen a promising result + if response.score >= settings.SCORE_EARLY_STOP_BOOST_TRIGGER: + seen_promising = True + + if response.score <= cutoff: + continue + if response.match: + matches += 1 + scored.append(response) + + # Early stopping: if we've seen enough consecutive low scores, stop. + effective_patience = ( + settings.SCORE_EARLY_STOP_BOOSTED_PATIENCE if seen_promising + else settings.SCORE_EARLY_STOP_PATIENCE + ) + if (consecutive_low >= effective_patience + and rank >= settings.SCORE_EARLY_STOP_MIN_CANDIDATES): + log.info( + "Early stopping after %d consecutive low scores at rank %d", + consecutive_low, rank, + ) + break + + scored = sorted(scored, key=lambda r: r.score, reverse=True) + if limit is not None: + scored = scored[:limit] + return matches, scored +``` + +Note: the `consecutive_low` counter and `seen_promising` flag are updated before the +`cutoff` filter — a candidate that's below `cutoff` but above the early-stop threshold +should still reset the counter. + +### New settings in `yente/settings.py` + +```python +SCORE_EARLY_STOP_THRESHOLD: float = 0.3 # scores below this count as "low" +SCORE_EARLY_STOP_PATIENCE: int = 5 # consecutive low scores before stopping +SCORE_EARLY_STOP_BOOSTED_PATIENCE: int = 20 # patience after seeing a promising score +SCORE_EARLY_STOP_BOOST_TRIGGER: float = 0.4 # score that triggers boosted patience +SCORE_EARLY_STOP_MIN_CANDIDATES: int = 10 # always score at least this many +``` + +These should be configurable via environment variables (like other yente settings) so +operators can tune or disable early stopping without code changes. Setting patience to a +very high value (e.g., 9999) effectively disables it. + +### Not recommended: Index score floor + +Skip candidates below a minimum ES index score. The data shows this is less effective +than early stopping and risks missing good results at lower thresholds. Could be combined +with the above but adds complexity for marginal gain. + +## Testing + +- Unit tests: mock algorithm that returns predetermined scores; verify early stopping + triggers at the right rank and that results are not lost. +- Compare `/match` output with and without early stopping on a representative query set + to validate that result quality is preserved. + +## Risks + +- **Missed results**: With adaptive patience (base=5, boosted=20, trigger=0.4), the + simulation shows 5 missed results out of 418 queries (1.2%). All are sub-threshold + (highest is 0.667, below the 0.7 match threshold). For screening use cases where only + `match: true` matters, the quality impact is effectively zero. +- **Query-dependent behavior**: Some entity types or datasets may have different score + distributions. The min_candidates guard (always score at least 10) mitigates this. +- **Sensitivity to candidate ordering**: Early stopping depends on ES returning candidates + in a roughly score-correlated order. If ES ranking degrades (e.g., after index changes), + more good results could be missed. The boosted patience provides a buffer for queries + where ES and algo scoring clearly diverge. + +## Follow-up: raising MATCH_CANDIDATES + +Once early stopping is in place, the cost model changes: fetching more candidates from ES +is cheap, and early stopping caps how many actually get scored. This makes it tempting to +raise MATCH_CANDIDATES (currently 10) as insurance against the weak ES/algo correlation. + +**The data doesn't strongly justify it.** Queries in our sample that fetched beyond 50 +candidates show 0% with algo >= 0.5 past rank 50 — ES relevance drops off hard. And 98.3% +of best results already fall within the top 50. The remaining 1.7% have best scores below +0.5 (not meaningful misses). + +**The ES/algo divergence is real but bounded.** Per-query Spearman correlation between +index_score and algo_score has a median of 0.42, with 21.7% of queries showing negative +correlation. Top-5 overlap between ES and algo rankings is only 35%. The worst observed +inversion: best algo result (0.592) at ES rank 153. However, even in these worst cases the +buried results are sub-threshold (< 0.7). The ES query construction (name boosting, +fuzziness, phonetic matching) would have to substantially fail for a true match to land +beyond rank 50. + +**Recommendation:** Ship early stopping first and measure in production. If the miss rate +is acceptable, a modest bump (e.g., to 15x) is cheap insurance and worth trying — but +don't expect a measurable quality improvement based on what we see today. diff --git a/yente/search/queries.py b/yente/search/queries.py index 9f86515e..96bd57c8 100644 --- a/yente/search/queries.py +++ b/yente/search/queries.py @@ -39,10 +39,12 @@ # Boost factors for symbol categories to demote low-information name parts. SYMBOL_BOOSTS = { - Symbol.Category.NUMERIC: 1.4, - Symbol.Category.LOCATION: 1.1, + Symbol.Category.NUMERIC: 1.3, + Symbol.Category.LOCATION: 0.8, Symbol.Category.ORG_CLASS: 0.7, - Symbol.Category.SYMBOL: 0.8, + Symbol.Category.SYMBOL: 0.3, + Symbol.Category.NICK: 0.8, + Symbol.Category.DOMAIN: 0.7, } From b32efa602a3df3c0736bd8848ce9da782ae50624 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Tue, 7 Apr 2026 08:29:45 +0200 Subject: [PATCH 3/3] Remove plans/ from tracking Co-Authored-By: Claude Opus 4.6 (1M context) --- plans/scoring-early-stopping.md | 289 -------------------------------- 1 file changed, 289 deletions(-) delete mode 100644 plans/scoring-early-stopping.md diff --git a/plans/scoring-early-stopping.md b/plans/scoring-early-stopping.md deleted file mode 100644 index 11752dca..00000000 --- a/plans/scoring-early-stopping.md +++ /dev/null @@ -1,289 +0,0 @@ ---- -description: Heuristics to reduce the number of candidates scored in the /match pipeline -date: 2026-04-07 -tags: [scoring, performance, matching, issue-1011] ---- - -# Early stopping heuristics for candidate scoring - -GitHub issue: opensanctions/yente#1011 - -## Problem - -The `/match` endpoint retrieves `limit * MATCH_CANDIDATES` (default 5 * 10 = 50) candidates -from Elasticsearch and scores **every one** with the full algorithm (LogicV2). Users can -request up to 500 results, meaning up to 5,000 scoring calls per query. The scoring algorithm -itself isn't terribly slow — yente just invokes it far too often on candidates that will never -make it into the response. - -## Research data - -Analysis of three production log samples (30,000 rows, ~20,800 valid scoring entries, 418 -unique queries, 2026-04-07). Mean ~50 candidates scored per query. - -### Most scoring work is wasted - -| Metric | Value | -|---|---| -| Total scoring calls | 20,772 | -| Scores < 0.5 (below cutoff) | 82.2% | -| Scores < 0.3 (clearly wasted) | 47.9% | -| Scores >= 0.7 (match threshold) | 1.0% | -| Queries with zero candidates >= 0.5 | 49.3% | -| Queries with zero candidates >= 0.7 | 84.4% | - -About half of all queries produce no candidates above 0.5, and 84% produce no matches -(>= 0.7). Yet we score all ~50 candidates for every query. - -### ES ranking vs algo score correlation - -ES ranking is a **weak** predictor of algo score. The best algo-scored result appears at: - -| Within top N ES results | % of queries | -|---|---| -| Top 1 | 23.2% | -| Top 3 | 35.9% | -| Top 5 | 44.7% | -| Top 10 | 63.4% | -| Top 20 | 83.3% | -| Top 50 | 98.3% | - -Mean algo score by ES rank bucket (ranks 0-49 contain the bulk of data): - -| ES rank bucket | Count | Mean algo score | % with algo >= 0.5 | -|---|---|---|---| -| 0-9 | 4,056 | 0.358 | 26.9% | -| 10-19 | 4,092 | 0.326 | 19.5% | -| 20-29 | 3,957 | 0.310 | 18.0% | -| 30-39 | 3,818 | 0.311 | 16.5% | -| 40-49 | 3,737 | 0.297 | 12.2% | -| 50+ | 1,112 | ~0.19 | 0.0% | - -Key observation: within the first 50 candidates, algo scores decline gently (0.36 → 0.30 -mean) but good results appear at every rank. ES does a good job excluding truly irrelevant -candidates (rank 50+), but within the top 50 it cannot reliably distinguish good from bad. - -### Early stopping simulation - -"Stop scoring after N consecutive candidates with algo score below threshold": - -| Threshold | Patience | Scoring calls saved | Meaningful best results missed (out of 418) | -|---|---|---|---| -| 0.3 | 3 | 50.8% | 22 | -| 0.3 | 5 | 42.0% | 12 | -| 0.3 | 7 | 36.9% | 9 | -| 0.3 | 10 | 31.8% | 5 | -| 0.3 | 15 | 23.0% | 4 | - -Simple early stopping with patience=10 saves ~32% of scoring calls and misses 5 out of -418 queries (1.2%). - -### Adaptive patience - -When a query has already produced a score above a trigger value, increase patience to -avoid cutting off queries that have real matches buried deeper in the candidate list: - -| Base patience | Boosted patience | Trigger | Saved | Missed (out of 418) | -|---|---|---|---|---| -| 5 | 10 | >= 0.4 | 33.3% | 7 | -| 5 | 15 | >= 0.4 | 30.6% | 6 | -| 5 | 20 | >= 0.4 | 27.9% | 5 | -| 5 | 25 | >= 0.4 | 27.0% | 5 | - -Adaptive patience helps: queries with no real matches stop early (patience=5, saves the -most work), while queries with promising candidates keep looking longer. The approach -`base=5, boost=20, trigger>=0.4` saves ~28% of scoring calls and misses 5 out of 418 -queries (1.2%). - -### Missed results profile - -With the recommended adaptive settings (base=5, boost=20, trigger>=0.4, min_candidates=10), -the 5 missed results are: - -| Best score | At ES rank | Stopped after | Total candidates | -|---|---|---|---| -| 0.667 | 31 | 16 | 49 | -| 0.583 | 9 | 12 | 46 | -| 0.565 | 21 | 10 | 49 | -| 0.543 | 23 | 10 | 48 | -| 0.512 | 43 | 10 | 97 | - -These are all sub-threshold results (< 0.7) that would appear in the response list with -`match: false`. The highest missed score is 0.667. For screening use cases where only -`match: true` matters, the quality impact is effectively zero. - -### Index score floor - -Adding a minimum ES index score before scoring a candidate provides marginal benefit: - -| Index score floor | Candidates scored | Good results missed (algo >= 0.5) | -|---|---|---| -| >= 5 | 96.7% | 0 | -| >= 10 | 81.3% | 2 | -| >= 15 | 39.4% | 7 | - -Since most candidates already have index_score > 5, this doesn't help much. The early -stopping heuristic is more effective. - -### Why MATCH_CANDIDATES=10 is correct (and not the right lever) - -The 10x multiplier controls **recall** — how many ES candidates we fetch to ensure the -best algo-scored result is in the pool. The data shows it's well-calibrated: - -| MATCH_CANDIDATES equivalent | ES top N (limit=5) | Best result found | -|---|---|---| -| 1x | Top 5 | 44.7% | -| 2x | Top 10 | 63.4% | -| 4x | Top 20 | 83.3% | -| **10x** | **Top 50** | **98.3%** | - -Reducing the multiplier would lose real results. And within the 50-candidate window, good -results are spread across all rank buckets — there's no safe truncation point: - -| ES rank bucket | % with algo >= 0.5 | -|---|---| -| 0-9 | 26.9% | -| 10-19 | 19.5% | -| 20-29 | 18.0% | -| 30-39 | 16.5% | -| 40-49 | 12.2% | - -However, **49.3% of queries have zero candidates above 0.5**. For those queries, the -multiplier is pure waste — we fetch and score 50 candidates to return nothing. The -multiplier is calibrated for the ~50% of queries where matches exist, and the other ~50% -pay the full cost for no benefit. - -The multiplier and early stopping solve different problems: the multiplier controls -**recall** (keep it at 10x), early stopping controls **wasted compute** (stop scoring -when it's clearly pointless). Together they preserve result quality while cutting scoring -work by ~28%. - -## Proposed approach - -### Consecutive-low early stopping with adaptive patience - -Add early stopping logic to `score_results()` in `yente/scoring.py`. After scoring each -candidate, track how many consecutive candidates have scored below a low threshold. Once -patience is exhausted, stop scoring remaining candidates. When a promising score has been -seen, use a higher patience to keep searching. - -```python -async def score_results( - algorithm: Type[ScoringAlgorithm], - entity: Entity, - results: Iterable[Tuple[Entity, float]], - threshold: float = settings.SCORE_THRESHOLD, - cutoff: float = 0.0, - limit: Optional[int] = None, - config: ScoringConfig = ScoringConfig.defaults(), -) -> Tuple[int, List[ScoredEntityResponse]]: - scored: List[ScoredEntityResponse] = [] - matches = 0 - consecutive_low = 0 - seen_promising = False - for rank, (result, index_score) in enumerate(results): - scoring = algorithm.compare(query=entity, result=result, config=config) - # ... existing logging and sleep ... - response = ScoredEntityResponse.from_entity_result(result, scoring, threshold) - - # Track consecutive low scores for early stopping - if response.score > settings.SCORE_EARLY_STOP_THRESHOLD: - consecutive_low = 0 - else: - consecutive_low += 1 - - # Adaptive patience: extend search when we've seen a promising result - if response.score >= settings.SCORE_EARLY_STOP_BOOST_TRIGGER: - seen_promising = True - - if response.score <= cutoff: - continue - if response.match: - matches += 1 - scored.append(response) - - # Early stopping: if we've seen enough consecutive low scores, stop. - effective_patience = ( - settings.SCORE_EARLY_STOP_BOOSTED_PATIENCE if seen_promising - else settings.SCORE_EARLY_STOP_PATIENCE - ) - if (consecutive_low >= effective_patience - and rank >= settings.SCORE_EARLY_STOP_MIN_CANDIDATES): - log.info( - "Early stopping after %d consecutive low scores at rank %d", - consecutive_low, rank, - ) - break - - scored = sorted(scored, key=lambda r: r.score, reverse=True) - if limit is not None: - scored = scored[:limit] - return matches, scored -``` - -Note: the `consecutive_low` counter and `seen_promising` flag are updated before the -`cutoff` filter — a candidate that's below `cutoff` but above the early-stop threshold -should still reset the counter. - -### New settings in `yente/settings.py` - -```python -SCORE_EARLY_STOP_THRESHOLD: float = 0.3 # scores below this count as "low" -SCORE_EARLY_STOP_PATIENCE: int = 5 # consecutive low scores before stopping -SCORE_EARLY_STOP_BOOSTED_PATIENCE: int = 20 # patience after seeing a promising score -SCORE_EARLY_STOP_BOOST_TRIGGER: float = 0.4 # score that triggers boosted patience -SCORE_EARLY_STOP_MIN_CANDIDATES: int = 10 # always score at least this many -``` - -These should be configurable via environment variables (like other yente settings) so -operators can tune or disable early stopping without code changes. Setting patience to a -very high value (e.g., 9999) effectively disables it. - -### Not recommended: Index score floor - -Skip candidates below a minimum ES index score. The data shows this is less effective -than early stopping and risks missing good results at lower thresholds. Could be combined -with the above but adds complexity for marginal gain. - -## Testing - -- Unit tests: mock algorithm that returns predetermined scores; verify early stopping - triggers at the right rank and that results are not lost. -- Compare `/match` output with and without early stopping on a representative query set - to validate that result quality is preserved. - -## Risks - -- **Missed results**: With adaptive patience (base=5, boosted=20, trigger=0.4), the - simulation shows 5 missed results out of 418 queries (1.2%). All are sub-threshold - (highest is 0.667, below the 0.7 match threshold). For screening use cases where only - `match: true` matters, the quality impact is effectively zero. -- **Query-dependent behavior**: Some entity types or datasets may have different score - distributions. The min_candidates guard (always score at least 10) mitigates this. -- **Sensitivity to candidate ordering**: Early stopping depends on ES returning candidates - in a roughly score-correlated order. If ES ranking degrades (e.g., after index changes), - more good results could be missed. The boosted patience provides a buffer for queries - where ES and algo scoring clearly diverge. - -## Follow-up: raising MATCH_CANDIDATES - -Once early stopping is in place, the cost model changes: fetching more candidates from ES -is cheap, and early stopping caps how many actually get scored. This makes it tempting to -raise MATCH_CANDIDATES (currently 10) as insurance against the weak ES/algo correlation. - -**The data doesn't strongly justify it.** Queries in our sample that fetched beyond 50 -candidates show 0% with algo >= 0.5 past rank 50 — ES relevance drops off hard. And 98.3% -of best results already fall within the top 50. The remaining 1.7% have best scores below -0.5 (not meaningful misses). - -**The ES/algo divergence is real but bounded.** Per-query Spearman correlation between -index_score and algo_score has a median of 0.42, with 21.7% of queries showing negative -correlation. Top-5 overlap between ES and algo rankings is only 35%. The worst observed -inversion: best algo result (0.592) at ES rank 153. However, even in these worst cases the -buried results are sub-threshold (< 0.7). The ES query construction (name boosting, -fuzziness, phonetic matching) would have to substantially fail for a true match to land -beyond rank 50. - -**Recommendation:** Ship early stopping first and measure in production. If the miss rate -is acceptable, a modest bump (e.g., to 15x) is cheap insurance and worth trying — but -don't expect a measurable quality improvement based on what we see today.