Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
8284665
filtered neighbours gets persisted
RobinL Feb 15, 2026
05f3c21
Merge pull request #2933 from moj-analytical-services/persist-filtere…
RobinL Feb 15, 2026
f71f382
4.0.14 release and changelog
RobinL Feb 15, 2026
3ee699f
Merge pull request #2934 from moj-analytical-services/4_0_14_release
RobinL Feb 15, 2026
8dc6c93
faster join to blocked pairs
RobinL Feb 16, 2026
997eddd
faster join to blocked pairs
RobinL Feb 16, 2026
853c0f2
clean up
RobinL Feb 16, 2026
53021d0
pass correct args
RobinL Feb 17, 2026
f450ac9
Merge pull request #2936 from moj-analytical-services/faster_join_to_…
RobinL Feb 17, 2026
b599a01
4_0_15 release
RobinL Feb 17, 2026
0265ce1
Merge pull request #2938 from moj-analytical-services/4_0_15_release
RobinL Feb 17, 2026
3fd85e2
Merge branch 'master' into merge/splink_4_to_5
RobinL Feb 18, 2026
bdedf1f
Merge pull request #2939 from moj-analytical-services/merge/splink_4_…
RobinL Feb 18, 2026
e626bf6
remove orphaned chart
ADBond Feb 17, 2026
bbf2568
cl detailed record as dataclass
ADBond Feb 17, 2026
31c2d8c
comparison name in cl detailed rec
ADBond Feb 17, 2026
e3205ce
detailed record dataclass in comparison
ADBond Feb 17, 2026
7eb4d05
specialised detailed record for settings
ADBond Feb 17, 2026
f7537d8
em iteration record subclass
ADBond Feb 17, 2026
201bfcc
comparison_match_weights_chart using dataclasses directly
ADBond Feb 17, 2026
8fefe1e
test comparison_match_weight_charts
ADBond Feb 17, 2026
6b814c5
match weight charts using dataclass
ADBond Feb 17, 2026
6360bac
mu charts using dataclass
ADBond Feb 17, 2026
21a4ea5
import only for type checking
ADBond Feb 18, 2026
acbec49
avoid kw only
ADBond Feb 18, 2026
cbc429f
iteration charts with dataclasses
ADBond Feb 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Removed salting mechanism as it is no longer required for parallelisation in DuckDB [#2849](https://github.com/moj-analytical-services/splink/pull/2849)
- `pandas` and `numpy` are no longer required dependencies [#2883](https://github.com/moj-analytical-services/splink/pull/2883)

## [4.0.15] - 2026-02-17

### Changed

* Faster two_dataset_link_only joins when joining small table to large in duckdb by @RobinL in https://github.com/moj-analytical-services/splink/pull/2936

## [4.0.14] - 2026-02-15

### Changed

* Two dataset link only exploding blocking rule optimisation by @RobinL in https://github.com/moj-analytical-services/splink/pull/2931
* Filtered neighbours gets persisted by @RobinL in https://github.com/moj-analytical-services/splink/pull/2933

## [4.0.13] - 2026-02-12

### Fixed
Expand Down
88 changes: 51 additions & 37 deletions splink/internals/charts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,24 @@
import json
import math
import os
from typing import TYPE_CHECKING, Any, Dict, Union
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Protocol, Union

from splink.internals.misc import read_resource
from splink.internals.waterfall_chart import records_to_waterfall_data

if TYPE_CHECKING:
from altair import SchemaBase

from splink.internals.comparison_level import ComparisonLevelDetailedRecord
from splink.internals.em_training_session import (
ModelParameterIterationDetailedRecord,
)
from splink.internals.settings import ModelParameterDetailedRecord
else:
SchemaBase = None
# type alias:
ChartReturnType = Union[Dict[Any, Any], SchemaBase]
ChartReturnType = Union[dict[Any, Any], SchemaBase]


def load_chart_definition(filename):
Expand Down Expand Up @@ -41,6 +48,14 @@ def altair_or_json(
return chart_dict


class AsDictable(Protocol):
    # Structural (duck-typed) interface: any object exposing an ``as_dict``
    # serialisation method satisfies it — used to type the detailed-record
    # dataclasses fed into chart data without importing them here.
    def as_dict(self) -> dict[str, Any]: ...


def list_items_as_dicts(lst: Iterable[AsDictable]) -> list[dict[str, Any]]:
    """Serialise each record to a plain dict via its ``as_dict`` method.

    Used to turn lists of detailed-record dataclasses into the
    JSON-compatible ``values`` list of a vega-lite chart definition.
    """
    # List comprehension over map(lambda ...): same result, clearer intent.
    return [item.as_dict() for item in lst]


iframe_message = """
To view in Jupyter you can use the following command:

Expand Down Expand Up @@ -86,46 +101,53 @@ def save_offline_chart(
print(iframe_message.format(filename=filename)) # noqa: T201


def match_weights_chart(records, as_dict=False):
def match_weights_chart(
records: list[ModelParameterDetailedRecord], as_dict: bool = False
) -> ChartReturnType:
chart_path = "match_weights_interactive_history.json"
chart = load_chart_definition(chart_path)

# Remove iteration history since this is a static chart
del chart["params"]
del chart["transform"]

records = [r for r in records if r["comparison_vector_value"] != -1]
chart["data"]["values"] = records
records = [r for r in records if r.comparison_vector_value != -1]

bayes_factors = [
abs(l2bf)
for r in records
if (l2bf := r["log2_bayes_factor"]) is not None and not math.isinf(l2bf)
if (l2bf := r.log2_bayes_factor) is not None and not math.isinf(l2bf)
]
max_value = math.ceil(max(bayes_factors))
chart["data"]["values"] = list_items_as_dicts(records)

chart["vconcat"][0]["encoding"]["x"]["scale"]["domain"] = [-max_value, max_value]
chart["vconcat"][1]["encoding"]["x"]["scale"]["domain"] = [-max_value, max_value]

return altair_or_json(chart, as_dict=as_dict)


def comparison_match_weights_chart(records, as_dict=False):
def comparison_match_weights_chart(
records: list[ComparisonLevelDetailedRecord], as_dict: bool = False
) -> ChartReturnType:
chart_path = "match_weights_interactive_history.json"
chart = load_chart_definition(chart_path)

# Remove iteration history since this is a static chart
del chart["vconcat"][0]
# TODO: some render issue if we remove empty top panel, so leave for now
# del chart["vconcat"][0]
del chart["params"]
del chart["transform"]

chart["title"]["text"] = "Comparison summary"
records = [r for r in records if r["comparison_vector_value"] != -1]
chart["data"]["values"] = records
records = [r for r in records if r.comparison_vector_value != -1]
chart["data"]["values"] = list_items_as_dicts(records)
return altair_or_json(chart, as_dict=as_dict)


def m_u_parameters_chart(records, as_dict=False):
def m_u_parameters_chart(
records: list[ModelParameterDetailedRecord], as_dict: bool = False
) -> ChartReturnType:
chart_path = "m_u_parameters_interactive_history.json"
chart = load_chart_definition(chart_path)

Expand All @@ -136,10 +158,10 @@ def m_u_parameters_chart(records, as_dict=False):
records = [
r
for r in records
if r["comparison_vector_value"] != -1
and r["comparison_name"] != "probability_two_random_records_match"
if r.comparison_vector_value != -1
and r.comparison_name != "probability_two_random_records_match"
]
chart["data"]["values"] = records
chart["data"]["values"] = list_items_as_dicts(records)
return altair_or_json(chart, as_dict=as_dict)


Expand All @@ -151,39 +173,45 @@ def probability_two_random_records_match_iteration_chart(records, as_dict=False)
return altair_or_json(chart, as_dict=as_dict)


def match_weights_interactive_history_chart(records, as_dict=False, blocking_rule=None):
def match_weights_interactive_history_chart(
records: list[ModelParameterIterationDetailedRecord],
as_dict: bool = False,
blocking_rule: str | None = None,
) -> ChartReturnType:
chart_path = "match_weights_interactive_history.json"
chart = load_chart_definition(chart_path)

chart["title"]["subtitle"] = f"Training session blocked on {blocking_rule}"

records = [r for r in records if r["comparison_vector_value"] != -1]
chart["data"]["values"] = records
records = [r for r in records if r.comparison_vector_value != -1]

max_iteration = 0
for r in records:
max_iteration = max(r["iteration"], max_iteration)
max_iteration = max(r.iteration, max_iteration)
chart["data"]["values"] = list_items_as_dicts(records)

chart["params"][0]["bind"]["max"] = max_iteration
chart["params"][0]["value"] = max_iteration
return altair_or_json(chart, as_dict=as_dict)


def m_u_parameters_interactive_history_chart(records, as_dict=False):
def m_u_parameters_interactive_history_chart(
records: list[ModelParameterIterationDetailedRecord], as_dict: bool = False
) -> ChartReturnType:
chart_path = "m_u_parameters_interactive_history.json"
chart = load_chart_definition(chart_path)
records = [
r
for r in records
if r["comparison_vector_value"] != -1
and r["comparison_name"] != "probability_two_random_records_match"
if r.comparison_vector_value != -1
and r.comparison_name != "probability_two_random_records_match"
]
chart["data"]["values"] = records

max_iteration = 0
for r in records:
max_iteration = max(r["iteration"], max_iteration)
max_iteration = max(r.iteration, max_iteration)

chart["data"]["values"] = list_items_as_dicts(records)
chart["params"][0]["bind"]["max"] = max_iteration
chart["params"][0]["value"] = max_iteration
return altair_or_json(chart, as_dict=as_dict)
Expand Down Expand Up @@ -349,20 +377,6 @@ def parameter_estimate_comparisons(records, as_dict=False):
return altair_or_json(chart, as_dict=as_dict)


def missingness_chart(records, as_dict=False):
    """Build the per-column missingness chart from the supplied records.

    Each record is a dict carrying at least ``total_record_count``; the
    first record's count is used in every layer title.
    """
    chart = load_chart_definition("missingness.json")
    chart["data"]["values"] = records

    total = records[0]["total_record_count"]
    title = f"Missingness per column out of {total:,.0f} records"
    for layer in chart["layer"]:
        layer["title"] = title

    return altair_or_json(chart, as_dict=as_dict)


def unlinkables_chart(
records,
x_col="match_weight",
Expand Down
17 changes: 9 additions & 8 deletions splink/internals/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
join_list_with_commas_final_and,
)

from .comparison_level import ComparisonLevel, _default_m_values, _default_u_values
from .comparison_level import (
ComparisonLevel,
ComparisonLevelDetailedRecord,
_default_m_values,
_default_u_values,
)

# https://stackoverflow.com/questions/39740632/python-type-hinting-without-cyclic-imports
if TYPE_CHECKING:
Expand Down Expand Up @@ -360,15 +365,11 @@ def _is_trained(self):
return self._all_m_are_trained and self._all_u_are_trained

@property
def _as_detailed_records(self) -> list[dict[str, Any]]:
def _as_detailed_records(self) -> list[ComparisonLevelDetailedRecord]:
records = []
for cl in self.comparison_levels:
record = {}
record["comparison_name"] = self.output_column_name
record = {
**record,
**cl._as_detailed_record(self._num_levels, self.comparison_levels),
}
record = cl._as_detailed_record(self._num_levels, self.comparison_levels)
record.comparison_name = self.output_column_name
records.append(record)
return records

Expand Down
87 changes: 54 additions & 33 deletions splink/internals/comparison_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import math
import re
from copy import copy
from dataclasses import asdict, dataclass
from statistics import median
from textwrap import dedent
from typing import Any, Optional, Union, cast
Expand Down Expand Up @@ -116,6 +117,34 @@ def _default_u_values(num_levels: int) -> list[float]:
return u_vals


@dataclass
class ComparisonLevelDetailedRecord:
    """Flat record describing a single comparison level, for charting outputs."""

    sql_condition: str | None
    label_for_charts: str

    # Term-frequency adjustment settings; the column is None when the
    # level has no tf adjustment.
    has_tf_adjustments: bool
    tf_adjustment_column: str | None
    tf_adjustment_weight: float | None

    is_null_level: bool

    # m/u probabilities and their human-readable descriptions.
    # The probabilities are set to None for null levels, where these
    # parameters are not defined.
    m_probability: float | None
    u_probability: float | None
    m_probability_description: str | None
    u_probability_description: str | None

    bayes_factor: float | None
    log2_bayes_factor: float
    bayes_factor_description: str

    comparison_vector_value: int
    # num_levels - 1 for the owning comparison.
    max_comparison_vector_value: int
    # Name of the owning comparison; constructed as None and filled in
    # by the caller (see Comparison._as_detailed_records).
    comparison_name: str | None

    def as_dict(self) -> dict[str, Any]:
        # Plain-dict form, e.g. for use as chart data values.
        return asdict(self)


class ComparisonLevel:
"""Each ComparisonLevel defines a gradation (category) of similarity within a
`Comparison`.
Expand Down Expand Up @@ -735,39 +764,31 @@ def _as_completed_dict(self):

def _as_detailed_record(
self, comparison_num_levels: int, comparison_levels: list[ComparisonLevel]
) -> dict[str, Any]:
) -> ComparisonLevelDetailedRecord:
"A detailed representation of this level to describe it in charting outputs"
output: dict[str, Any] = {}
output["sql_condition"] = self.sql_condition
output["label_for_charts"] = self._label_for_charts_no_duplicates(
comparison_levels
return ComparisonLevelDetailedRecord(
sql_condition=self.sql_condition,
label_for_charts=self._label_for_charts_no_duplicates(comparison_levels),
has_tf_adjustments=self._has_tf_adjustments,
tf_adjustment_column=(
self._tf_adjustment_input_column.input_name
if self._has_tf_adjustments
else None
),
tf_adjustment_weight=self._tf_adjustment_weight,
is_null_level=self.is_null_level,
m_probability=self.m_probability if not self.is_null_level else None,
u_probability=self.u_probability if not self.is_null_level else None,
m_probability_description=self._m_probability_description,
u_probability_description=self._u_probability_description,
bayes_factor=self._bayes_factor,
log2_bayes_factor=self._log2_bayes_factor,
bayes_factor_description=self._bayes_factor_description,
comparison_vector_value=self.comparison_vector_value,
max_comparison_vector_value=comparison_num_levels - 1,
comparison_name=None,
)

if not self._is_null_level:
output["m_probability"] = self.m_probability
output["u_probability"] = self.u_probability

output["m_probability_description"] = self._m_probability_description
output["u_probability_description"] = self._u_probability_description

output["has_tf_adjustments"] = self._has_tf_adjustments
if self._has_tf_adjustments:
output["tf_adjustment_column"] = self._tf_adjustment_input_column.input_name
else:
output["tf_adjustment_column"] = None
output["tf_adjustment_weight"] = self._tf_adjustment_weight

output["is_null_level"] = self.is_null_level
output["bayes_factor"] = self._bayes_factor
output["log2_bayes_factor"] = self._log2_bayes_factor
output["comparison_vector_value"] = self.comparison_vector_value
output["max_comparison_vector_value"] = comparison_num_levels - 1
output["bayes_factor_description"] = self._bayes_factor_description
output["m_probability_description"] = self._m_probability_description
output["u_probability_description"] = self._u_probability_description

return output

def _parameter_estimates_as_records(
self, comparison_num_levels: int, comparison_levels: list[ComparisonLevel]
) -> list[dict[str, Any]]:
Expand All @@ -786,9 +807,9 @@ def _parameter_estimates_as_records(
else:
record["estimated_probability_as_log_odds"] = None

record["sql_condition"] = cl_record["sql_condition"]
record["comparison_level_label"] = cl_record["label_for_charts"]
record["comparison_vector_value"] = cl_record["comparison_vector_value"]
record["sql_condition"] = cl_record.sql_condition
record["comparison_level_label"] = cl_record.label_for_charts
record["comparison_vector_value"] = cl_record.comparison_vector_value
output_records.append(record)

return output_records
Expand Down
Loading