From 7df00248496b3e5c7655380b8db2adc7dd4c3ea3 Mon Sep 17 00:00:00 2001
From: Samanvya Tripathi
Date: Wed, 25 Mar 2026 20:45:31 -0400
Subject: [PATCH] fix(eval): exclude booleans from parsed benchmark metrics

bool is a subclass of int in Python, so isinstance(True, (int, float))
returns True. Boolean metadata from lm-eval results was leaking into
benchmark metrics as 1/0. Added explicit bool exclusion.

Fixes #29
---
 src/alignrl/eval.py |  5 ++++-
 tests/test_eval.py  | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/alignrl/eval.py b/src/alignrl/eval.py
index b031bf9..2b60888 100644
--- a/src/alignrl/eval.py
+++ b/src/alignrl/eval.py
@@ -28,7 +28,10 @@ def parse_results(raw: dict[str, Any], model_name: str, stage: str) -> EvalResul
     """Parse lm-evaluation-harness output into EvalResult."""
     benchmarks: dict[str, dict[str, float]] = {}
     for task_name, metrics in raw.get("results", {}).items():
-        benchmarks[task_name] = {k: v for k, v in metrics.items() if isinstance(v, (int, float))}
+        benchmarks[task_name] = {
+            k: v for k, v in metrics.items()
+            if isinstance(v, (int, float)) and not isinstance(v, bool)
+        }
     return EvalResult(model_name=model_name, stage=stage, benchmarks=benchmarks)
 
 
diff --git a/tests/test_eval.py b/tests/test_eval.py
index 031023e..14aee44 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -171,3 +171,18 @@ def test_filters_non_numeric(self) -> None:
     def test_no_results_key(self) -> None:
         result = parse_results({}, model_name="test", stage="base")
         assert result.benchmarks == {}
+
+    def test_filters_booleans(self) -> None:
+        raw = {
+            "results": {
+                "gsm8k": {
+                    "exact_match": 0.5,
+                    "has_config": True,
+                    "is_valid": False,
+                }
+            }
+        }
+        result = parse_results(raw, model_name="test", stage="base")
+        assert "exact_match" in result.benchmarks["gsm8k"]
+        assert "has_config" not in result.benchmarks["gsm8k"]
+        assert "is_valid" not in result.benchmarks["gsm8k"]