opensanctions · leonhandreke · Mar 31, 2026 · jbothma · Apr 2, 2026
diff --git a/contrib/validation_report/fixtures/negatives_global.csv b/contrib/validation_report/fixtures/negatives_global.csv
diff --git a/contrib/validation_report/fixtures/negatives_us.csv b/contrib/validation_report/fixtures/negatives_us.csv
diff --git a/contrib/validation_report/fixtures/positives_un_treated.csv b/contrib/validation_report/fixtures/positives_un_treated.csv
diff --git a/contrib/validation_report/fixtures/positives_un_untreated.csv b/contrib/validation_report/fixtures/positives_un_untreated.csv
diff --git a/contrib/validation_report/fixtures/positives_us_congress_untreated.csv b/contrib/validation_report/fixtures/positives_us_congress_untreated.csv
diff --git a/contrib/validation_report/generate_report_data.py b/contrib/validation_report/generate_report_data.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""Validation report: compare logic-v1 vs logic-v2 against fixture CSVs via the yente HTTP API."""
+
+import csv
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import click
+import httpx
+
+# Module-level logger; main() attaches the handler/format via logging.basicConfig.
+log = logging.getLogger(__name__)
+
+# Matching algorithms to compare; main() runs every fixture once per entry.
+ALGORITHMS = ["logic-v1", "logic-v2"]
+# Number of person queries sent in a single POST by run_fixture().
+BATCH_SIZE = 100
+# Directory next to this script that main() globs for fixture "*.csv" files.
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+
+@dataclass
+class PersonRecord:
+    """One person row of a fixture CSV, as built by read_person_csv().
+
+    Each attribute is read from the CSV column of the same name.
+    """
+
+    full_name: str
+    first_name: str
+    # None (rather than "") when the CSV column is empty or missing.
+    middle_name: str | None
+    last_name: str
+    gender: str
+    date_of_birth: str
+    place_of_birth: str
+    nationality: str
+
+
+def read_person_csv(path: Path) -> list[PersonRecord]:
+    persons: list[PersonRecord] = []
+    with open(path, newline="", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            persons.append(
+                PersonRecord(
+                    full_name=row.get("full_name", ""),
+                    first_name=row.get("first_name", ""),
+                    middle_name=row.get("middle_name") or None,
+                    last_name=row.get("last_name", ""),
+                    gender=row.get("gender", ""),
+                    date_of_birth=row.get("date_of_birth", ""),
+                    place_of_birth=row.get("place_of_birth", ""),
+                    nationality=row.get("nationality", ""),
+                )
+            )
+    return persons
+
+
+def person_to_query(person: PersonRecord) -> dict[str, Any]:
+    props: dict[str, list[str]] = {}
+    if person.full_name:
+        props["name"] = [person.full_name]
+    if person.first_name:
+        props["firstName"] = [person.first_name]
+    if person.middle_name:
+        props["middleName"] = [person.middle_name]
+    if person.last_name:
+        props["lastName"] = [person.last_name]
+    if person.date_of_birth:
+        props["birthDate"] = [person.date_of_birth]
+    if person.nationality:
+        props["nationality"] = [person.nationality]
+    if person.place_of_birth:
+        props["birthPlace"] = [person.place_of_birth]
+    if person.gender:
+        props["gender"] = [person.gender]
+    return {"schema": "Person", "properties": props}
+
+
+def run_fixture(
+    client: httpx.Client,
+    persons: list[PersonRecord],
+    algorithm: str,
+    base_url: str,
+    dataset: str,
+    fixture_name: str,
+) -> list[dict[str, Any]]:
+    """Returns a list of {score, match} dicts, one per person (top result only)."""
+    results: list[dict[str, Any]] = []
+    total_persons = len(persons)
+
+    for batch_start in range(0, total_persons, BATCH_SIZE):
+        batch = persons[batch_start : batch_start + BATCH_SIZE]
+        batch_end = min(batch_start + BATCH_SIZE, total_persons)
+        log.info("[%s] %s: %d/%d", algorithm, fixture_name, batch_end, total_persons)
+        queries = {f"q{i}": person_to_query(p) for i, p in enumerate(batch)}
+        resp = client.post(
+            f"{base_url}/match/{dataset}",
+            json={"queries": queries},
+            params={"algorithm": algorithm, "limit": 5},
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        for key in queries:
+            hits = data["responses"][key].get("results", [])
+            if hits:
+                top = hits[0]
+                results.append({"score": top["score"], "match": top["match"]})
+            else:
+                results.append({"score": 0.0, "match": False})
+
+    return results
+
+
+@click.command()
+@click.option("--dataset", default="default", show_default=True)
+@click.option("--base-url", default="http://localhost:8000", show_default=True)
+@click.option(
+    "--output",
+    default="report.json",
+    show_default=True,
+    type=click.Path(),
+    help="Path to write the JSON report.",
+)
+def main(dataset: str, base_url: str, output: str) -> None:
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    fixtures = sorted(FIXTURES_DIR.glob("*.csv"))
+    if not fixtures:
+        log.error("No fixture CSVs found in %s", FIXTURES_DIR)
+        raise SystemExit(1)
+
+    report: dict[str, dict[str, list[dict[str, Any]]]] = {}
+
+    with httpx.Client(timeout=60.0) as client:
+        for fixture_path in fixtures:
+            persons = read_person_csv(fixture_path)
+            log.info("fixture: %s (%d persons)", fixture_path.name, len(persons))
+            report[fixture_path.name] = {}
+            for algo in ALGORITHMS:
+                report[fixture_path.name][algo] = run_fixture(
+                    client, persons, algo, base_url, dataset, fixture_path.name
+                )
+
+    output_path = Path(output)
+    output_path.write_text(json.dumps(report, indent=2))
+    log.info("report written to %s", output_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/contrib/validation_report/render_report_data.py b/contrib/validation_report/render_report_data.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""Render a JSON report data file into an HTML report."""
+
+import csv
+import hashlib
+import json
+import statistics
+import subprocess
+import sys
+from datetime import date
+from pathlib import Path
+from typing import Any
+
+import click
+from jinja2 import Environment, FileSystemLoader
+from markdown_it import MarkdownIt
+
+# Fixture CSVs listed in the report (globbed as "*.csv" by main()).
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+# Jinja2 template directory; main() loads "report.md.j2" from here.
+TEMPLATES_DIR = Path(__file__).parent / "templates"
+# Stylesheets that main() inlines into the generated HTML.
+RESOURCES_DIR = Path(__file__).parent / "resources"
+
+FIXTURE_DESCRIPTIONS: dict[str, str] = {
+    "negatives_global.csv": (
+        "Global true-negatives from an internal OpenSanctions reference dataset of synthetic person records. "
+        "Generated with multi-cultural name diversity, geographic correlation, and realistic field variations "
+        "to test false-positive rates across diverse global naming conventions."
+    ),
+    "negatives_us.csv": (
+        "US true-negatives from an internal OpenSanctions reference dataset of synthetic person records. "
+        "Reflects US-based individuals including cultural mixing (e.g. US nationals with international name origins) "
+        "to test false-positive rates for US-centric screening."
+    ),
+    "positives_un_treated.csv": (
+        "Generated from the `un_sc_sanctions` dataset, treated version: minor typos and name reshuffles applied. These are true-positives."
+    ),
+    "positives_un_untreated.csv": (
+        "Generated from the `un_sc_sanctions`. These are true-positives."
+    ),
+    "positives_us_congress_untreated.csv": (
+        "Generated from the `us_congress` dataset. These are true-positives."
+    ),
+}
+
+
+def md5(path: Path) -> str:
+    h = hashlib.md5()
+    h.update(path.read_bytes())
+    return h.hexdigest()
+
+
+def compute_stats(entries: list[dict[str, Any]]) -> dict[str, Any]:
+    scores = [e["score"] for e in entries]
+    matches = sum(1 for e in entries if e["match"])
+    total = len(entries)
+    return {
+        "mean_top_score": statistics.mean(scores) if scores else 0.0,
+        "matches": matches,
+        "total": total,
+        "pct_matches": (matches / total * 100) if total else 0.0,
+    }
+
+
+@click.command()
+@click.argument("input_json", type=click.Path(exists=True))
+@click.option("--dataset", default="default", show_default=True)
+@click.option(
+    "--output",
+    default="report.html",
+    show_default=True,
+    type=click.Path(),
+    help="Path to write the HTML report.",
+)
+def main(input_json: str, dataset: str, output: str) -> None:
+    data: dict[str, dict[str, list[dict[str, Any]]]] = json.loads(
+        Path(input_json).read_text()
+    )
+
+    fixtures = []
+    for fixture_path in sorted(FIXTURES_DIR.glob("*.csv")):
+        lines = fixture_path.read_text(encoding="utf-8").splitlines()
+        records = max(0, len(lines) - 1)  # subtract header
+        fixtures.append(
+            {
+                "name": fixture_path.name,
+                "description": FIXTURE_DESCRIPTIONS.get(fixture_path.name, ""),
+                "md5": md5(fixture_path),
+                "records": records,
+            }
+        )
+
+    results = {
+        fixture_name: {algo: compute_stats(entries) for algo, entries in algos.items()}
+        for fixture_name, algos in data.items()
+    }
+
+    pip_freeze = subprocess.check_output(["pip", "freeze"], text=True).strip()
+    yente_git_version = subprocess.check_output(
+        ["git", "describe", "--tags", "--always", "--dirty"],
+        text=True,
+        cwd=Path(__file__).parent,
+    ).strip()
+
+    env = Environment(
+        loader=FileSystemLoader(TEMPLATES_DIR), keep_trailing_newline=True
+    )
+    md_rendered = env.get_template("report.md.j2").render(
+        date=date.today().isoformat(),
+        dataset=dataset,
+        fixtures=fixtures,
+        results=results,
+        pip_freeze=pip_freeze,
+        yente_git_version=yente_git_version,
+    )
+
+    html_body = MarkdownIt("gfm-like").render(md_rendered)
+    html = (
+        f"<!doctype html>\n<html>\n<head>\n<meta charset='utf-8'>\n"
+        f"<style>\n{(RESOURCES_DIR / 'github-markdown.css').read_text()}\n</style>\n"
+        f"<style>\n{(RESOURCES_DIR / 'report.css').read_text()}\n</style>\n"
+        f"</head>\n<body class='markdown-body'>\n{html_body}</body>\n</html>\n"
+    )
+
+    output_path = Path(output)
+    output_path.write_text(html, encoding="utf-8")
+    print(f"report written to {output_path}")
+
+
+if __name__ == "__main__":
+    main()