Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,001 changes: 1,001 additions & 0 deletions contrib/validation_report/fixtures/negatives_global.csv

Large diffs are not rendered by default.

1,001 changes: 1,001 additions & 0 deletions contrib/validation_report/fixtures/negatives_us.csv

Large diffs are not rendered by default.

684 changes: 684 additions & 0 deletions contrib/validation_report/fixtures/positives_un_treated.csv

Large diffs are not rendered by default.

684 changes: 684 additions & 0 deletions contrib/validation_report/fixtures/positives_un_untreated.csv

Large diffs are not rendered by default.

1,001 changes: 1,001 additions & 0 deletions contrib/validation_report/fixtures/positives_us_congress_untreated.csv

Large diffs are not rendered by default.

143 changes: 143 additions & 0 deletions contrib/validation_report/generate_report_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Validation report: compare logic-v1 vs logic-v2 against fixture CSVs via the yente HTTP API."""

import csv
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import click
import httpx

log = logging.getLogger(__name__)

# The two scoring algorithms compared against each other in the report.
ALGORITHMS = ["logic-v1", "logic-v2"]
# Number of person queries bundled into a single /match request.
BATCH_SIZE = 100
# Fixture CSVs live in a directory next to this script.
FIXTURES_DIR = Path(__file__).parent / "fixtures"


@dataclass
class PersonRecord:
    """One row of a fixture CSV: the person attributes used to build a match query.

    All fields are kept as raw CSV strings (no date parsing or normalisation);
    empty cells become empty strings, except middle_name which becomes None.
    """

    full_name: str
    first_name: str
    middle_name: str | None  # None when the CSV cell is empty
    last_name: str
    gender: str
    date_of_birth: str  # raw string from the CSV, not parsed
    place_of_birth: str
    nationality: str


def read_person_csv(path: Path) -> list[PersonRecord]:
    """Load a fixture CSV into a list of PersonRecord instances.

    Missing columns default to the empty string; a blank middle name is
    normalised to None.
    """
    with open(path, newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        return [
            PersonRecord(
                full_name=row.get("full_name", ""),
                first_name=row.get("first_name", ""),
                middle_name=row.get("middle_name") or None,
                last_name=row.get("last_name", ""),
                gender=row.get("gender", ""),
                date_of_birth=row.get("date_of_birth", ""),
                place_of_birth=row.get("place_of_birth", ""),
                nationality=row.get("nationality", ""),
            )
            for row in reader
        ]


def person_to_query(person: PersonRecord) -> dict[str, Any]:
    """Build a yente match-query payload from a person record.

    Only non-empty fields are included, and every property value is wrapped
    in a single-element list as the API expects.
    """
    # (record attribute, query property) pairs, in the order the properties
    # should appear in the payload.
    field_to_prop = (
        ("full_name", "name"),
        ("first_name", "firstName"),
        ("middle_name", "middleName"),
        ("last_name", "lastName"),
        ("date_of_birth", "birthDate"),
        ("nationality", "nationality"),
        ("place_of_birth", "birthPlace"),
        ("gender", "gender"),
    )
    props: dict[str, list[str]] = {
        prop: [value]
        for attr, prop in field_to_prop
        if (value := getattr(person, attr))
    }
    return {"schema": "Person", "properties": props}


def run_fixture(
    client: httpx.Client,
    persons: list[PersonRecord],
    algorithm: str,
    base_url: str,
    dataset: str,
    fixture_name: str,
) -> list[dict[str, Any]]:
    """Returns a list of {score, match} dicts, one per person (top result only).

    Persons are sent to the yente /match endpoint in batches of BATCH_SIZE;
    a person with no results at all is recorded as score 0.0 / no match.
    """
    out: list[dict[str, Any]] = []
    count = len(persons)

    for start in range(0, count, BATCH_SIZE):
        chunk = persons[start : start + BATCH_SIZE]
        done = min(start + BATCH_SIZE, count)
        log.info("[%s] %s: %d/%d", algorithm, fixture_name, done, count)

        # Keys q0..qN identify each query within the batch response.
        queries = {f"q{idx}": person_to_query(p) for idx, p in enumerate(chunk)}
        response = client.post(
            f"{base_url}/match/{dataset}",
            json={"queries": queries},
            params={"algorithm": algorithm, "limit": 5},
        )
        response.raise_for_status()
        payload = response.json()

        for key in queries:
            candidates = payload["responses"][key].get("results", [])
            if not candidates:
                out.append({"score": 0.0, "match": False})
            else:
                best = candidates[0]
                out.append({"score": best["score"], "match": best["match"]})

    return out


@click.command()
@click.option("--dataset", default="default", show_default=True)
@click.option("--base-url", default="http://localhost:8000", show_default=True)
@click.option(
    "--output",
    default="report.json",
    show_default=True,
    type=click.Path(),
    help="Path to write the JSON report.",
)
def main(dataset: str, base_url: str, output: str) -> None:
    """Run every fixture CSV against each algorithm and write a JSON report.

    The report maps fixture filename -> algorithm -> list of
    {score, match} entries (one entry per person, top result only).
    Exits with status 1 when no fixture CSVs are found.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    fixtures = sorted(FIXTURES_DIR.glob("*.csv"))
    if not fixtures:
        log.error("No fixture CSVs found in %s", FIXTURES_DIR)
        raise SystemExit(1)

    report: dict[str, dict[str, list[dict[str, Any]]]] = {}

    with httpx.Client(timeout=60.0) as client:
        for fixture_path in fixtures:
            persons = read_person_csv(fixture_path)
            log.info("fixture: %s (%d persons)", fixture_path.name, len(persons))
            report[fixture_path.name] = {}
            for algo in ALGORITHMS:
                report[fixture_path.name][algo] = run_fixture(
                    client, persons, algo, base_url, dataset, fixture_path.name
                )

    output_path = Path(output)
    # Explicit UTF-8: write_text otherwise uses the platform's locale
    # encoding, which can fail on non-ASCII names on some systems.
    output_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
    log.info("report written to %s", output_path)


if __name__ == "__main__":
    main()
127 changes: 127 additions & 0 deletions contrib/validation_report/render_report_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
"""Render a JSON report data file into an HTML report."""

import hashlib
import json
import statistics
import subprocess
import sys
from datetime import date
from pathlib import Path
from typing import Any

import click
from jinja2 import Environment, FileSystemLoader
from markdown_it import MarkdownIt

# Sibling directories resolved relative to this script, so the renderer
# works regardless of the current working directory.
FIXTURES_DIR = Path(__file__).parent / "fixtures"
TEMPLATES_DIR = Path(__file__).parent / "templates"
RESOURCES_DIR = Path(__file__).parent / "resources"

# Human-readable provenance notes for each fixture CSV, keyed by filename.
# These are rendered verbatim into the report so readers know what each
# fixture contains without inspecting the CSVs.
FIXTURE_DESCRIPTIONS: dict[str, str] = {
    "negatives_global.csv": (
        "Global true-negatives from an internal OpenSanctions reference dataset of synthetic person records. "
        "Generated with multi-cultural name diversity, geographic correlation, and realistic field variations "
        "to test false-positive rates across diverse global naming conventions."
    ),
    "negatives_us.csv": (
        "US true-negatives from an internal OpenSanctions reference dataset of synthetic person records. "
        "Reflects US-based individuals including cultural mixing (e.g. US nationals with international name origins) "
        "to test false-positive rates for US-centric screening."
    ),
    "positives_un_treated.csv": (
        "Generated from the `un_sc_sanctions` dataset, treated version: minor typos and name reshuffles applied. These are true-positives."
    ),
    "positives_un_untreated.csv": (
        # Fixed: the sentence previously read "Generated from the
        # `un_sc_sanctions`." with the word "dataset" missing.
        "Generated from the `un_sc_sanctions` dataset. These are true-positives."
    ),
    "positives_us_congress_untreated.csv": (
        "Generated from the `us_congress` dataset. These are true-positives."
    ),
}


def md5(path: Path) -> str:
    """Return the hex MD5 digest of *path*'s contents.

    Reads the file in fixed-size chunks instead of loading it all at once,
    so arbitrarily large fixture files do not have to fit in memory.
    MD5 is used here purely as a content fingerprint for the report, not
    for anything security-sensitive.
    """
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


def compute_stats(entries: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-person {score, match} entries into summary statistics.

    Returns the mean top score, the absolute and percentage match counts,
    and the total number of entries. Empty input yields zeros throughout.
    """
    total_count = len(entries)
    match_count = sum(1 for entry in entries if entry["match"])
    score_values = [entry["score"] for entry in entries]
    return {
        "mean_top_score": statistics.mean(score_values) if score_values else 0.0,
        "matches": match_count,
        "total": total_count,
        "pct_matches": (match_count / total_count * 100) if total_count else 0.0,
    }


@click.command()
@click.argument("input_json", type=click.Path(exists=True))
@click.option("--dataset", default="default", show_default=True)
@click.option(
    "--output",
    default="report.html",
    show_default=True,
    type=click.Path(),
    help="Path to write the HTML report.",
)
def main(input_json: str, dataset: str, output: str) -> None:
    """Render the JSON report data into a standalone HTML report.

    Reads per-fixture/per-algorithm result entries from INPUT_JSON,
    aggregates them with compute_stats(), renders the Markdown report
    template, and writes a single HTML file with the CSS inlined.
    """
    data: dict[str, dict[str, list[dict[str, Any]]]] = json.loads(
        Path(input_json).read_text(encoding="utf-8")
    )

    # Describe each fixture CSV (description, checksum, record count) so the
    # report documents exactly which inputs were used.
    fixtures = []
    for fixture_path in sorted(FIXTURES_DIR.glob("*.csv")):
        lines = fixture_path.read_text(encoding="utf-8").splitlines()
        records = max(0, len(lines) - 1)  # subtract the header row
        fixtures.append(
            {
                "name": fixture_path.name,
                "description": FIXTURE_DESCRIPTIONS.get(fixture_path.name, ""),
                "md5": md5(fixture_path),
                "records": records,
            }
        )

    results = {
        fixture_name: {algo: compute_stats(entries) for algo, entries in algos.items()}
        for fixture_name, algos in data.items()
    }

    # Use the current interpreter's pip (`python -m pip`) rather than whatever
    # `pip` happens to be on PATH, so the recorded environment matches the
    # interpreter actually running this script.
    pip_freeze = subprocess.check_output(
        [sys.executable, "-m", "pip", "freeze"], text=True
    ).strip()
    yente_git_version = subprocess.check_output(
        ["git", "describe", "--tags", "--always", "--dirty"],
        text=True,
        cwd=Path(__file__).parent,
    ).strip()

    env = Environment(
        loader=FileSystemLoader(TEMPLATES_DIR), keep_trailing_newline=True
    )
    md_rendered = env.get_template("report.md.j2").render(
        date=date.today().isoformat(),
        dataset=dataset,
        fixtures=fixtures,
        results=results,
        pip_freeze=pip_freeze,
        yente_git_version=yente_git_version,
    )

    # "gfm-like" enables GitHub-flavoured Markdown extensions (e.g. tables)
    # on top of CommonMark when rendering the template output.
    html_body = MarkdownIt("gfm-like").render(md_rendered)
    html = (
        f"<!doctype html>\n<html>\n<head>\n<meta charset='utf-8'>\n"
        f"<style>\n{(RESOURCES_DIR / 'github-markdown.css').read_text()}\n</style>\n"
        f"<style>\n{(RESOURCES_DIR / 'report.css').read_text()}\n</style>\n"
        f"</head>\n<body class='markdown-body'>\n{html_body}</body>\n</html>\n"
    )

    output_path = Path(output)
    output_path.write_text(html, encoding="utf-8")
    print(f"report written to {output_path}")


if __name__ == "__main__":
    main()
Loading
Loading