From 12d5d554eda16c3f8db45d3c41416d9d23f85d38 Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 00:10:18 +0530 Subject: [PATCH 1/7] feat: implement BiasEvaluator using WinoBias benchmark --- openverifiablellm/eval/__init__.py | 7 + openverifiablellm/eval/base.py | 24 ++++ openverifiablellm/eval/bias.py | 102 +++++++++++++++ openverifiablellm/eval/perplexity.py | 151 +++++++++++++++++++++ pyproject.toml | 11 ++ tests/test_eval.py | 187 +++++++++++++++++++++++++++ 6 files changed, 482 insertions(+) create mode 100644 openverifiablellm/eval/__init__.py create mode 100644 openverifiablellm/eval/base.py create mode 100644 openverifiablellm/eval/bias.py create mode 100644 openverifiablellm/eval/perplexity.py create mode 100644 tests/test_eval.py diff --git a/openverifiablellm/eval/__init__.py b/openverifiablellm/eval/__init__.py new file mode 100644 index 0000000..ac715f8 --- /dev/null +++ b/openverifiablellm/eval/__init__.py @@ -0,0 +1,7 @@ +from .bias import BiasEvaluator +from .perplexity import PerplexityEvaluator + +__all__ = [ + "BiasEvaluator", + "PerplexityEvaluator", +] diff --git a/openverifiablellm/eval/base.py b/openverifiablellm/eval/base.py new file mode 100644 index 0000000..6e01d2b --- /dev/null +++ b/openverifiablellm/eval/base.py @@ -0,0 +1,24 @@ +from abc import ABC, abstractmethod + + +class BaseEvaluator(ABC): + """Abstract base class for all dataset evaluators.""" + + @abstractmethod + def evaluate(self, model, tokenizer) -> dict: + """ + Evaluate a language model using the given tokenizer. + + Parameters + ---------- + model : callable + Callable accepting a sequence of token IDs and returning a + 2-D sequence of logits with shape ``(len(input_ids), vocab_size)``. + tokenizer : object + Object with an ``encode(text: str) -> list[int]`` method. + + Returns + ------- + dict + Benchmark-specific evaluation results. + """ diff --git a/openverifiablellm/eval/bias.py b/openverifiablellm/eval/bias.py new file mode 100644 index 0000000..c8dd374 --- /dev/null +++ b/openverifiablellm/eval/bias.py @@ -0,0 +1,102 @@ +""" +openverifiablellm/eval/bias.py + +Gender-bias evaluator using the WinoBias benchmark. +""" + +from typing import Optional + +from .base import BaseEvaluator +from .perplexity import PerplexityEvaluator + + +class BiasEvaluator(BaseEvaluator): + """ + Evaluates gender bias in a language model using the WinoBias benchmark. + + For each sentence pair (pro-stereotype / anti-stereotype) the model's + perplexity is computed via the same sliding-window method used by + :class:`PerplexityEvaluator`. A lower ``bias_score`` indicates a less + biased model. + + Parameters + ---------- + benchmark : str + Bias benchmark to use. Currently only ``"wino_bias"`` is supported. + Default ``"wino_bias"``. + n_samples : int or None + Maximum number of sentences to load from each WinoBias split. + ``None`` evaluates the full dataset. Default ``None``. + """ + + SUPPORTED_BENCHMARKS = {"wino_bias"} + + def __init__(self, benchmark: str = "wino_bias", n_samples: Optional[int] = None): + if benchmark not in self.SUPPORTED_BENCHMARKS: + raise ValueError( + f"Unsupported benchmark {benchmark!r}. " + f"Choose one of {sorted(self.SUPPORTED_BENCHMARKS)}." + ) + self.benchmark = benchmark + self.n_samples = n_samples + + def evaluate(self, model, tokenizer) -> dict: + """ + Compute stereotype and anti-stereotype perplexity scores. + + Loads ``type1_pro`` (pro-stereotype) and ``type1_anti`` + (anti-stereotype) splits of WinoBias and measures how much more + easily the model predicts gender-stereotypical sentences than + counter-stereotypical ones. + + Parameters + ---------- + model : callable + ``model(input_ids) -> 2-D sequence`` of shape + ``(len(input_ids), vocab_size)``, as described in + :meth:`PerplexityEvaluator.compute_sentence_perplexity`. + tokenizer : object + Object with ``encode(text: str) -> list[int]``. + + Returns + ------- + dict + A dictionary with the following keys: + + * **stereotype_score** (*float*) — mean perplexity on + pro-stereotype sentences. + * **anti_stereotype_score** (*float*) — mean perplexity on + anti-stereotype sentences. + * **bias_score** (*float*) — + ``abs(stereotype_score - anti_stereotype_score)``; + lower means less biased. + """ + import datasets as hf_datasets # deferred; runtime dep + + pro_ds = hf_datasets.load_dataset("wino_bias", "type1_pro", split="test") + anti_ds = hf_datasets.load_dataset("wino_bias", "type1_anti", split="test") + + def _score_split(dataset) -> float: + scores = [] + for i, row in enumerate(dataset): + if self.n_samples is not None and i >= self.n_samples: + break + tokens = row.get("tokens", []) + text = " ".join(tokens) if isinstance(tokens, list) else str(tokens) + if not text.strip(): + continue + token_ids = tokenizer.encode(text) + scores.append( + PerplexityEvaluator.compute_sentence_perplexity(model, token_ids) + ) + return float(sum(scores) / len(scores)) if scores else float("inf") + + stereotype_score = _score_split(pro_ds) + anti_stereotype_score = _score_split(anti_ds) + bias_score = abs(stereotype_score - anti_stereotype_score) + + return { + "stereotype_score": stereotype_score, + "anti_stereotype_score": anti_stereotype_score, + "bias_score": bias_score, + } diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py new file mode 100644 index 0000000..84bc18d --- /dev/null +++ b/openverifiablellm/eval/perplexity.py @@ -0,0 +1,151 @@ +""" +openverifiablellm/eval/perplexity.py + +Perplexity evaluator for language models. +""" + +import math +from typing import List, Optional + +from .base import BaseEvaluator + + +class PerplexityEvaluator(BaseEvaluator): + """ + Evaluates language-model perplexity on a HuggingFace benchmark dataset. + + Perplexity is computed with a teacher-forced sliding-window approach: + for each token position *i* the model receives tokens ``[0 .. i-1]`` + and the negative log-probability of token ``[i]`` is accumulated. + The final perplexity is ``exp(mean_NLL)``. + + Parameters + ---------- + benchmark : str + HuggingFace dataset identifier. Default ``"wikitext"``. + n_samples : int or None + Maximum number of non-empty samples to evaluate. ``None`` means + evaluate the whole dataset. Default ``50``. + stride : int + Window stride used when the sequence exceeds the model's context + window. Default ``512``. + """ + + def __init__( + self, + benchmark: str = "wikitext", + n_samples: Optional[int] = 50, + stride: int = 512, + ): + self.benchmark = benchmark + self.n_samples = n_samples + self.stride = stride + + # ------------------------------------------------------------------ + # Mock helpers + # ------------------------------------------------------------------ + + @staticmethod + def uniform_model(vocab_size: int = 1000): + """ + Return a mock model that produces uniform (all-zero) logits. + + Useful for unit testing: because all logits are equal, the + log-softmax is ``-log(vocab_size)`` at every position, giving a + predictable perplexity of exactly ``vocab_size``. + + Parameters + ---------- + vocab_size : int + Vocabulary size of the mock model. Default ``1000``. + + Returns + ------- + callable + ``model(input_ids) -> list[list[float]]`` of shape + ``(len(input_ids), vocab_size)``. + """ + + def _model(input_ids): + return [[0.0] * vocab_size for _ in input_ids] + + return _model + + # ------------------------------------------------------------------ + # Core computation + # ------------------------------------------------------------------ + + @staticmethod + def compute_sentence_perplexity(model, token_ids: List[int]) -> float: + """ + Compute the perplexity of *token_ids* under *model*. + + Parameters + ---------- + model : callable + ``model(input_ids) -> 2-D sequence`` of shape + ``(len(input_ids), vocab_size)``. + token_ids : list[int] + Tokenised sentence. + + Returns + ------- + float + Perplexity (≥ 1). Returns ``float("inf")`` for sequences + shorter than 2 tokens. + """ + if len(token_ids) < 2: + return float("inf") + + inputs = token_ids[:-1] + targets = token_ids[1:] + + logits_batch = model(inputs) # shape: (n-1, vocab_size) + + nll_sum = 0.0 + for logits, target in zip(logits_batch, targets): + # numerically-stable log-softmax + max_l = max(logits) + exp_shifted = [math.exp(v - max_l) for v in logits] + log_sum = math.log(sum(exp_shifted)) + log_prob_target = (logits[target] - max_l) - log_sum + nll_sum -= log_prob_target + + return math.exp(nll_sum / len(targets)) + + # ------------------------------------------------------------------ + # BaseEvaluator interface + # ------------------------------------------------------------------ + + def evaluate(self, model, tokenizer) -> dict: + """ + Compute mean perplexity on *self.benchmark*. + + Parameters + ---------- + model : callable + Callable as described in :meth:`compute_sentence_perplexity`. + tokenizer : object + Object with ``encode(text: str) -> list[int]``. + + Returns + ------- + dict + ``{"perplexity": float}`` — mean perplexity across evaluated + sentences. + """ + import datasets as hf_datasets # deferred; runtime dep + + ds = hf_datasets.load_dataset(self.benchmark, split="test", streaming=True) + scores = [] + for i, row in enumerate(ds): + if self.n_samples is not None and i >= self.n_samples: + break + text = row.get("text", "") + if not text.strip(): + continue + token_ids = tokenizer.encode(text) + scores.append(self.compute_sentence_perplexity(model, token_ids)) + + mean_ppl = float(sum(scores) / len(scores)) if scores else float("inf") + return {"perplexity": mean_ppl} diff --git a/pyproject.toml b/pyproject.toml index 96523a0..55ba437 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,14 +12,25 @@ authors = [ requires-python = ">=3.9" dependencies = [ + "datasets", "defusedxml", "sentencepiece", "tokenizers==0.15.2" ] +# Intentionally duplicated from [dependency-groups] below. +# pip uses this section; uv/PEP 735 uses [dependency-groups]. Keep both in sync. +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "ruff>=0.15.4", +] + [tool.setuptools.packages.find] include = ["openverifiablellm*"] +# Intentionally duplicated from [project.optional-dependencies] above. +# uv/PEP 735 uses this section; pip uses [project.optional-dependencies]. Keep both in sync. [dependency-groups] dev = [ "pytest>=7.0", diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 0000000..05c73de --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,187 @@ +""" +tests/test_eval.py + +Tests for the evaluator module (BiasEvaluator, PerplexityEvaluator). + +Run with: + pytest tests/test_eval.py -v +""" + +import math +from unittest.mock import patch + +import pytest + +from openverifiablellm.eval.bias import BiasEvaluator +from openverifiablellm.eval.perplexity import PerplexityEvaluator + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +class _MockTokenizer: + """Tokenizer that maps each character to its ASCII code modulo 100.""" + + def encode(self, text: str) -> list: + return [ord(c) % 100 for c in text.replace(" ", "_")] + + +def _make_dataset(sentences): + """Return a list of row dicts matching the WinoBias ``tokens`` field.""" + return [{"tokens": s.split()} for s in sentences] + + +PRO_SENTENCES = [ + "The doctor examined the patient", + "The engineer fixed the machine", +] +ANTI_SENTENCES = [ + "The nurse examined the patient", + "The secretary fixed the machine", +] + + +def _patch_load_dataset(pro_data, anti_data): + """Patch ``datasets.load_dataset`` to return pre-built lists.""" + + def _load(name, config=None, split=None): + if config == "type1_pro": + return pro_data + return anti_data + + return patch("datasets.load_dataset", side_effect=_load) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_model(): + """Uniform model: all-zero logits → perplexity == vocab_size for any input.""" + return PerplexityEvaluator.uniform_model(vocab_size=100) + + +@pytest.fixture +def mock_tokenizer(): + return _MockTokenizer() + + +@pytest.fixture +def bias_evaluator(): + return BiasEvaluator(n_samples=2) + + +# --------------------------------------------------------------------------- +# PerplexityEvaluator.uniform_model +# --------------------------------------------------------------------------- + + +def test_uniform_model_output_shape(): + model = PerplexityEvaluator.uniform_model(vocab_size=50) + out = model([1, 2, 3]) + assert len(out) == 3 + assert len(out[0]) == 50 + + +def test_uniform_model_all_zero_logits(): + model = PerplexityEvaluator.uniform_model(vocab_size=10) + out = model([0, 1]) + assert all(v == 0.0 for row in out for v in row) + + +def test_uniform_model_perplexity_equals_vocab_size(): + vocab_size = 100 + model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size) + token_ids = list(range(10)) + ppl = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids) + assert abs(ppl - vocab_size) < 1e-6 + + +# --------------------------------------------------------------------------- +# BiasEvaluator — initialisation +# --------------------------------------------------------------------------- + + +def test_bias_evaluator_invalid_benchmark_raises(): + with pytest.raises(ValueError): + BiasEvaluator(benchmark="nonexistent_benchmark") + + +def test_bias_evaluator_valid_init(): + ev = BiasEvaluator() + assert ev.benchmark == "wino_bias" + + +def test_bias_evaluator_n_samples_stored(): + ev = BiasEvaluator(n_samples=5) + assert ev.n_samples == 5 + + +# --------------------------------------------------------------------------- +# BiasEvaluator.evaluate() — patched load_dataset +# --------------------------------------------------------------------------- + + +def test_evaluate_does_not_raise(bias_evaluator, mock_model, mock_tokenizer): + """evaluate() must complete without raising NotImplementedError or any error.""" + pro = _make_dataset(PRO_SENTENCES) + anti = _make_dataset(ANTI_SENTENCES) + with _patch_load_dataset(pro, anti): + result = bias_evaluator.evaluate(mock_model, mock_tokenizer) + assert isinstance(result, dict) + + +def test_evaluate_returns_exactly_three_keys(bias_evaluator, mock_model, mock_tokenizer): + pro = _make_dataset(PRO_SENTENCES) + anti = _make_dataset(ANTI_SENTENCES) + with _patch_load_dataset(pro, anti): + result = bias_evaluator.evaluate(mock_model, mock_tokenizer) + assert set(result.keys()) == {"stereotype_score", "anti_stereotype_score", "bias_score"} + + +def test_evaluate_bias_score_equals_abs_diff(bias_evaluator, mock_model, mock_tokenizer): + pro = _make_dataset(PRO_SENTENCES) + anti = _make_dataset(ANTI_SENTENCES) + with _patch_load_dataset(pro, anti): + result = bias_evaluator.evaluate(mock_model, mock_tokenizer) + expected = abs(result["stereotype_score"] - result["anti_stereotype_score"]) + assert abs(result["bias_score"] - expected) < 1e-9 + + +def test_evaluate_scores_are_finite(bias_evaluator, mock_model, mock_tokenizer): + pro = _make_dataset(PRO_SENTENCES) + anti = _make_dataset(ANTI_SENTENCES) + with _patch_load_dataset(pro, anti): + result = bias_evaluator.evaluate(mock_model, mock_tokenizer) + assert math.isfinite(result["stereotype_score"]) + assert math.isfinite(result["anti_stereotype_score"]) + assert math.isfinite(result["bias_score"]) + + +# --------------------------------------------------------------------------- +# n_samples limits dataset consumption +# --------------------------------------------------------------------------- + + +def test_n_samples_limits_dataset(mock_model, mock_tokenizer): + """With n_samples=2, rows beyond index 1 must never be processed. + + Rows beyond index 1 are single-character strings ("a"), which tokenise + to exactly one token and yield infinite perplexity. If n_samples works + correctly, only the first two (multi-token) rows are consumed and the + returned bias_score is finite. + """ + # Append single-char rows that yield inf perplexity if reached + bad_rows = ["a"] * 10 + pro = _make_dataset(PRO_SENTENCES + bad_rows) + anti = _make_dataset(ANTI_SENTENCES + bad_rows) + + ev = BiasEvaluator(n_samples=len(PRO_SENTENCES)) # == 2 + + with _patch_load_dataset(pro, anti): + result = ev.evaluate(mock_model, mock_tokenizer) + + assert math.isfinite(result["bias_score"]) From b3ef1fba7f4a13020669269920d51025701df88e Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 00:37:31 +0530 Subject: [PATCH 2/7] implement WinoBiasEvaluator as one of bias evaluation suite --- openverifiablellm/eval/__init__.py | 4 ++-- openverifiablellm/eval/bias/__init__.py | 5 ++++ .../eval/{bias.py => bias/wino_bias.py} | 21 ++++------------ tests/test_eval.py | 24 ++++++------------- 4 files changed, 19 insertions(+), 35 deletions(-) create mode 100644 openverifiablellm/eval/bias/__init__.py rename openverifiablellm/eval/{bias.py => bias/wino_bias.py} (82%) diff --git a/openverifiablellm/eval/__init__.py b/openverifiablellm/eval/__init__.py index ac715f8..4b35b94 100644 --- a/openverifiablellm/eval/__init__.py +++ b/openverifiablellm/eval/__init__.py @@ -1,7 +1,7 @@ -from .bias import BiasEvaluator +from .bias import WinoBiasEvaluator from .perplexity import PerplexityEvaluator __all__ = [ - "BiasEvaluator", + "WinoBiasEvaluator", "PerplexityEvaluator", ] diff --git a/openverifiablellm/eval/bias/__init__.py b/openverifiablellm/eval/bias/__init__.py new file mode 100644 index 0000000..41f21dc --- /dev/null +++ b/openverifiablellm/eval/bias/__init__.py @@ -0,0 +1,5 @@ +from .wino_bias import WinoBiasEvaluator + +__all__ = [ + "WinoBiasEvaluator", +] diff --git a/openverifiablellm/eval/bias.py b/openverifiablellm/eval/bias/wino_bias.py similarity index 82% rename from openverifiablellm/eval/bias.py rename to openverifiablellm/eval/bias/wino_bias.py index c8dd374..fefb742 100644 --- a/openverifiablellm/eval/bias.py +++ b/openverifiablellm/eval/bias/wino_bias.py @@ -1,16 +1,16 @@ """ -openverifiablellm/eval/bias.py +openverifiablellm/eval/bias/wino_bias.py Gender-bias evaluator using the WinoBias benchmark. """ from typing import Optional -from .base import BaseEvaluator -from .perplexity import PerplexityEvaluator +from ..base import BaseEvaluator +from ..perplexity import PerplexityEvaluator -class BiasEvaluator(BaseEvaluator): +class WinoBiasEvaluator(BaseEvaluator): """ Evaluates gender bias in a language model using the WinoBias benchmark. @@ -21,23 +21,12 @@ class BiasEvaluator(BaseEvaluator): Parameters ---------- - benchmark : str - Bias benchmark to use. Currently only ``"wino_bias"`` is supported. - Default ``"wino_bias"``. n_samples : int or None Maximum number of sentences to load from each WinoBias split. ``None`` evaluates the full dataset. Default ``None``. """ - SUPPORTED_BENCHMARKS = {"wino_bias"} - - def __init__(self, benchmark: str = "wino_bias", n_samples: Optional[int] = None): - if benchmark not in self.SUPPORTED_BENCHMARKS: - raise ValueError( - f"Unsupported benchmark {benchmark!r}. " - f"Choose one of {sorted(self.SUPPORTED_BENCHMARKS)}." - ) - self.benchmark = benchmark + def __init__(self, n_samples: Optional[int] = None): self.n_samples = n_samples def evaluate(self, model, tokenizer) -> dict: diff --git a/tests/test_eval.py b/tests/test_eval.py index 05c73de..6a6a53a 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -1,7 +1,7 @@ """ tests/test_eval.py -Tests for the evaluator module (BiasEvaluator, PerplexityEvaluator). +Tests for the evaluator module (WinoBiasEvaluator, PerplexityEvaluator). Run with: pytest tests/test_eval.py -v @@ -12,7 +12,7 @@ import pytest -from openverifiablellm.eval.bias import BiasEvaluator +from openverifiablellm.eval.bias import WinoBiasEvaluator from openverifiablellm.eval.perplexity import PerplexityEvaluator # --------------------------------------------------------------------------- @@ -71,7 +71,7 @@ def mock_tokenizer(): @pytest.fixture def bias_evaluator(): - return BiasEvaluator(n_samples=2) + return WinoBiasEvaluator(n_samples=2) # --------------------------------------------------------------------------- @@ -101,27 +101,17 @@ def test_uniform_model_perplexity_equals_vocab_size(): # --------------------------------------------------------------------------- -# BiasEvaluator — initialisation +# WinoBiasEvaluator — initialisation # --------------------------------------------------------------------------- -def test_bias_evaluator_invalid_benchmark_raises(): - with pytest.raises(ValueError): - BiasEvaluator(benchmark="nonexistent_benchmark") - - -def test_bias_evaluator_valid_init(): - ev = BiasEvaluator() - assert ev.benchmark == "wino_bias" - - def test_bias_evaluator_n_samples_stored(): - ev = BiasEvaluator(n_samples=5) + ev = WinoBiasEvaluator(n_samples=5) assert ev.n_samples == 5 # --------------------------------------------------------------------------- -# BiasEvaluator.evaluate() — patched load_dataset +# WinoBiasEvaluator.evaluate() — patched load_dataset # --------------------------------------------------------------------------- @@ -179,7 +169,7 @@ def test_n_samples_limits_dataset(mock_model, mock_tokenizer): pro = _make_dataset(PRO_SENTENCES + bad_rows) anti = _make_dataset(ANTI_SENTENCES + bad_rows) - ev = BiasEvaluator(n_samples=len(PRO_SENTENCES)) # == 2 + ev = WinoBiasEvaluator(n_samples=len(PRO_SENTENCES)) # == 2 with _patch_load_dataset(pro, anti): result = ev.evaluate(mock_model, mock_tokenizer) From 3e8addcbc375585d20d036f28a9997dfc78d0786 Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 00:59:13 +0530 Subject: [PATCH 3/7] Avoid NaN bias scores when both split scores are infinite --- openverifiablellm/eval/bias/wino_bias.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/openverifiablellm/eval/bias/wino_bias.py b/openverifiablellm/eval/bias/wino_bias.py index fefb742..1b20a7d 100644 --- a/openverifiablellm/eval/bias/wino_bias.py +++ b/openverifiablellm/eval/bias/wino_bias.py @@ -80,9 +80,14 @@ def _score_split(dataset) -> float: ) return float(sum(scores) / len(scores)) if scores else float("inf") + import math + stereotype_score = _score_split(pro_ds) anti_stereotype_score = _score_split(anti_ds) - bias_score = abs(stereotype_score - anti_stereotype_score) + if math.isinf(stereotype_score) and math.isinf(anti_stereotype_score): + bias_score = float("inf") + else: + bias_score = abs(stereotype_score - anti_stereotype_score) return { "stereotype_score": stereotype_score, From 33a36dd688b4874aef2b70af40befb2be018e94d Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 01:07:23 +0530 Subject: [PATCH 4/7] added three tests --- openverifiablellm/eval/perplexity.py | 55 +++++++++++++++++++++++++++- tests/test_eval.py | 27 ++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py index 84bc18d..249fe5d 100644 --- a/openverifiablellm/eval/perplexity.py +++ b/openverifiablellm/eval/perplexity.py @@ -113,6 +113,59 @@ def compute_sentence_perplexity(model, token_ids: List[int]) -> float: return math.exp(nll_sum / len(targets)) + @staticmethod + def compute_sequence_perplexity(model, token_ids: List[int], stride: int = 512) -> float: + """ + Compute perplexity over a (possibly long) sequence using non-overlapping + stride-sized windows. + + The sequence is partitioned into windows of *stride* tokens. Each + window contributes its token predictions to a pooled NLL. The final + perplexity is ``exp(total_NLL / total_scored_tokens)``. + + For sequences shorter than *stride* + 1 tokens the result is + identical to :meth:`compute_sentence_perplexity`. + + Parameters + ---------- + model : callable + ``model(input_ids) -> 2-D sequence`` of shape + ``(len(input_ids), vocab_size)``. + token_ids : list[int] + Tokenised sequence. + stride : int + Number of tokens scored per window. Default ``512``. + + Returns + ------- + float + Perplexity (≥ 1). Returns ``float("inf")`` for sequences + shorter than 2 tokens. + """ + if len(token_ids) < 2: + return float("inf") + + nll_sum = 0.0 + n_scored = 0 + n = len(token_ids) + + for start in range(0, n - 1, stride): + end = min(start + stride + 1, n) + window = token_ids[start:end] + if len(window) < 2: + break + inputs = window[:-1] + targets = window[1:] + logits_batch = model(inputs) + for logits, target in zip(logits_batch, targets): + max_l = max(logits) + exp_shifted = [math.exp(v - max_l) for v in logits] + log_sum = math.log(sum(exp_shifted)) + nll_sum -= (logits[target] - max_l) - log_sum + n_scored += 1 + + return math.exp(nll_sum / n_scored) if n_scored > 0 else float("inf") + # ------------------------------------------------------------------ # BaseEvaluator interface # ------------------------------------------------------------------ @@ -145,7 +198,7 @@ def evaluate(self, model, tokenizer) -> dict: if not text.strip(): continue token_ids = tokenizer.encode(text) - scores.append(self.compute_sentence_perplexity(model, token_ids)) + scores.append(self.compute_sequence_perplexity(model, token_ids, self.stride)) mean_ppl = float(sum(scores) / len(scores)) if scores else float("inf") return {"perplexity": mean_ppl} diff --git a/tests/test_eval.py b/tests/test_eval.py index 6a6a53a..b266a9f 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -100,6 +100,33 @@ def test_uniform_model_perplexity_equals_vocab_size(): assert abs(ppl - vocab_size) < 1e-6 +def test_compute_sequence_perplexity_short_matches_sentence(): + """For sequences shorter than stride, both methods must agree.""" + vocab_size = 100 + model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size) + token_ids = list(range(10)) + ppl_sentence = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids) + ppl_sequence = PerplexityEvaluator.compute_sequence_perplexity(model, token_ids, stride=512) + assert abs(ppl_sentence - ppl_sequence) < 1e-6 + + +def test_compute_sequence_perplexity_long_sequence_finite(): + """A sequence longer than stride must yield a finite, correct perplexity.""" + vocab_size = 100 + model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size) + # 50 tokens with stride=10 → 5 windows + token_ids = list(range(50)) + ppl = PerplexityEvaluator.compute_sequence_perplexity(model, token_ids, stride=10) + assert math.isfinite(ppl) + # Uniform model → PPL must equal vocab_size regardless of windowing + assert abs(ppl - vocab_size) < 1e-6 + + +def test_compute_sequence_perplexity_single_token_returns_inf(): + model = PerplexityEvaluator.uniform_model(vocab_size=10) + assert PerplexityEvaluator.compute_sequence_perplexity(model, [0], stride=512) == float("inf") + + # --------------------------------------------------------------------------- # WinoBiasEvaluator — initialisation # --------------------------------------------------------------------------- From c6288c068563e4166af13ea20eba2a19fcf5417e Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 01:15:33 +0530 Subject: [PATCH 5/7] validate logit shapes, fix n_samples blank-row counting, strict mock in tests --- openverifiablellm/eval/perplexity.py | 17 ++++++++++++++--- tests/test_eval.py | 14 ++++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py index 249fe5d..3855397 100644 --- a/openverifiablellm/eval/perplexity.py +++ b/openverifiablellm/eval/perplexity.py @@ -102,6 +102,12 @@ def compute_sentence_perplexity(model, token_ids: List[int]) -> float: logits_batch = model(inputs) # shape: (n-1, vocab_size) + if len(logits_batch) != len(targets): + raise ValueError( + f"Model returned {len(logits_batch)} logit vectors but expected " + f"{len(targets)} (one per target token)." + ) + nll_sum = 0.0 for logits, target in zip(logits_batch, targets): # numerically-stable log-softmax @@ -157,6 +163,11 @@ def compute_sequence_perplexity(model, token_ids: List[int], stride: int = 512) inputs = window[:-1] targets = window[1:] logits_batch = model(inputs) + if len(logits_batch) != len(targets): + raise ValueError( + f"Model returned {len(logits_batch)} logit vectors but expected " + f"{len(targets)} (one per target token)." + ) for logits, target in zip(logits_batch, targets): max_l = max(logits) exp_shifted = [math.exp(v - max_l) for v in logits] @@ -191,12 +202,12 @@ def evaluate(self, model, tokenizer) -> dict: ds = hf_datasets.load_dataset(self.benchmark, split="test", streaming=True) scores = [] - for i, row in enumerate(ds): - if self.n_samples is not None and i >= self.n_samples: - break + for row in ds: text = row.get("text", "") if not text.strip(): continue + if self.n_samples is not None and len(scores) >= self.n_samples: + break token_ids = tokenizer.encode(text) scores.append(self.compute_sequence_perplexity(model, token_ids, self.stride)) diff --git a/tests/test_eval.py b/tests/test_eval.py index b266a9f..b5524f7 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -43,12 +43,22 @@ def _make_dataset(sentences): def _patch_load_dataset(pro_data, anti_data): - """Patch ``datasets.load_dataset`` to return pre-built lists.""" + """Patch ``datasets.load_dataset`` to return pre-built lists. + + Raises ``ValueError`` for any unexpected name, config, or split so + integration bugs are not silently hidden by a catch-all fallback. + """ def _load(name, config=None, split=None): + if name != "wino_bias" or split != "test": + raise ValueError( + f"Unexpected load_dataset call: name={name!r}, config={config!r}, split={split!r}" + ) if config == "type1_pro": return pro_data - return anti_data + if config == "type1_anti": + return anti_data + raise ValueError(f"Unexpected config: {config!r}") return patch("datasets.load_dataset", side_effect=_load) From e72d58b6362d679a9bb1dd186dd83dc842877a0f Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 02:01:09 +0530 Subject: [PATCH 6/7] implement factual evaluator --- openverifiablellm/eval/__init__.py | 2 - openverifiablellm/eval/bias/__init__.py | 5 - openverifiablellm/eval/bias/wino_bias.py | 96 ------- openverifiablellm/eval/factual/__init__.py | 5 + .../eval/factual/factual_consistency.py | 235 ++++++++++++++++++ tests/test_eval.py | 214 ---------------- tests/test_factual_eval.py | 167 +++++++++++++ 7 files changed, 407 insertions(+), 317 deletions(-) delete mode 100644 openverifiablellm/eval/bias/__init__.py delete mode 100644 openverifiablellm/eval/bias/wino_bias.py create mode 100644 openverifiablellm/eval/factual/__init__.py create mode 100644 openverifiablellm/eval/factual/factual_consistency.py delete mode 100644 tests/test_eval.py create mode 100644 tests/test_factual_eval.py diff --git a/openverifiablellm/eval/__init__.py b/openverifiablellm/eval/__init__.py index 4b35b94..30266cd 100644 --- a/openverifiablellm/eval/__init__.py +++ b/openverifiablellm/eval/__init__.py @@ -1,7 +1,5 @@ -from .bias import WinoBiasEvaluator from .perplexity import PerplexityEvaluator __all__ = [ - "WinoBiasEvaluator", "PerplexityEvaluator", ] diff --git a/openverifiablellm/eval/bias/__init__.py b/openverifiablellm/eval/bias/__init__.py deleted file mode 100644 index 41f21dc..0000000 --- a/openverifiablellm/eval/bias/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .wino_bias import WinoBiasEvaluator - -__all__ = [ - "WinoBiasEvaluator", -] diff --git a/openverifiablellm/eval/bias/wino_bias.py b/openverifiablellm/eval/bias/wino_bias.py deleted file mode 100644 index 1b20a7d..0000000 --- a/openverifiablellm/eval/bias/wino_bias.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -openverifiablellm/eval/bias/wino_bias.py - -Gender-bias evaluator using the WinoBias benchmark. -""" - -from typing import Optional - -from ..base import BaseEvaluator -from ..perplexity import PerplexityEvaluator - - -class WinoBiasEvaluator(BaseEvaluator): - """ - Evaluates gender bias in a language model using the WinoBias benchmark. - - For each sentence pair (pro-stereotype / anti-stereotype) the model's - perplexity is computed via the same sliding-window method used by - :class:`PerplexityEvaluator`. A lower ``bias_score`` indicates a less - biased model. - - Parameters - ---------- - n_samples : int or None - Maximum number of sentences to load from each WinoBias split. - ``None`` evaluates the full dataset. Default ``None``. - """ - - def __init__(self, n_samples: Optional[int] = None): - self.n_samples = n_samples - - def evaluate(self, model, tokenizer) -> dict: - """ - Compute stereotype and anti-stereotype perplexity scores. - - Loads ``type1_pro`` (pro-stereotype) and ``type1_anti`` - (anti-stereotype) splits of WinoBias and measures how much more - easily the model predicts gender-stereotypical sentences than - counter-stereotypical ones. - - Parameters - ---------- - model : callable - ``model(input_ids) -> 2-D sequence`` of shape - ``(len(input_ids), vocab_size)``, as described in - :meth:`PerplexityEvaluator.compute_sentence_perplexity`. - tokenizer : object - Object with ``encode(text: str) -> list[int]``. - - Returns - ------- - dict - A dictionary with the following keys: - - * **stereotype_score** (*float*) — mean perplexity on - pro-stereotype sentences. - * **anti_stereotype_score** (*float*) — mean perplexity on - anti-stereotype sentences. - * **bias_score** (*float*) — - ``abs(stereotype_score - anti_stereotype_score)``; - lower means less biased. - """ - import datasets as hf_datasets # deferred; runtime dep - - pro_ds = hf_datasets.load_dataset("wino_bias", "type1_pro", split="test") - anti_ds = hf_datasets.load_dataset("wino_bias", "type1_anti", split="test") - - def _score_split(dataset) -> float: - scores = [] - for i, row in enumerate(dataset): - if self.n_samples is not None and i >= self.n_samples: - break - tokens = row.get("tokens", []) - text = " ".join(tokens) if isinstance(tokens, list) else str(tokens) - if not text.strip(): - continue - token_ids = tokenizer.encode(text) - scores.append( - PerplexityEvaluator.compute_sentence_perplexity(model, token_ids) - ) - return float(sum(scores) / len(scores)) if scores else float("inf") - - import math - - stereotype_score = _score_split(pro_ds) - anti_stereotype_score = _score_split(anti_ds) - if math.isinf(stereotype_score) and math.isinf(anti_stereotype_score): - bias_score = float("inf") - else: - bias_score = abs(stereotype_score - anti_stereotype_score) - - return { - "stereotype_score": stereotype_score, - "anti_stereotype_score": anti_stereotype_score, - "bias_score": bias_score, - } diff --git a/openverifiablellm/eval/factual/__init__.py b/openverifiablellm/eval/factual/__init__.py new file mode 100644 index 0000000..4ebbffe --- /dev/null +++ b/openverifiablellm/eval/factual/__init__.py @@ -0,0 +1,5 @@ +from .factual_consistency import WikipediaFactualEvaluator + +__all__ = [ + "WikipediaFactualEvaluator", +] diff --git a/openverifiablellm/eval/factual/factual_consistency.py b/openverifiablellm/eval/factual/factual_consistency.py new file mode 100644 index 0000000..ca7fb60 --- /dev/null +++ b/openverifiablellm/eval/factual/factual_consistency.py @@ -0,0 +1,235 @@ +""" +openverifiablellm/eval/factual/factual_consistency.py + +Wikipedia-based factual consistency evaluator. +""" + +import random +import re +from pathlib import Path +from typing import List, Optional, Union + +from ..base import BaseEvaluator +from ..perplexity import PerplexityEvaluator + +_ENTITY_RE = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b") + + +class WikipediaFactualEvaluator(BaseEvaluator): + """ + Evaluates factual consistency of a language model using Wikipedia passages. + + For each sentence extracted from a processed Wikipedia text file + (``wiki_clean.txt``), a counterfactual variant is generated by substituting + a named entity found in the sentence with a different named entity drawn + from the same passage. The model's perplexity is then compared on the + original (factual) vs the substituted (counterfactual) sentence. A + well-trained model should assign lower perplexity to factual sentences. + + The ``factual_score`` is the mean per-pair difference + ``(counterfactual_ppl - factual_ppl)``: positive values indicate the model + correctly prefers factual sentences, negative values indicate the model + prefers counterfactual sentences. + + Named entities are identified with the simple capitalized-word-sequence + regex ``r"\\b([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*)\\b"``. Evaluation is + fully deterministic: ``random.seed(42)`` is applied inside + :meth:`evaluate` before any entity selection. + + Parameters + ---------- + wiki_text_path : str or Path + Path to the processed ``wiki_clean.txt`` file produced by + :func:`openverifiablellm.utils.extract_text_from_xml`. + n_samples : int or None + Maximum number of sentence pairs to evaluate. ``None`` evaluates + all available pairs. Default ``None``. + """ + + def __init__( + self, + wiki_text_path: Union[str, Path], + n_samples: Optional[int] = None, + ): + self.wiki_text_path = Path(wiki_text_path) + self.n_samples = n_samples + + # ------------------------------------------------------------------ + # Static helpers + # ------------------------------------------------------------------ + + @staticmethod + def _substitute_entity(sentence: str, candidate_entities: List[str]) -> Optional[str]: + """ + Replace the first named entity in *sentence* with a random different + entity drawn from *candidate_entities*. + + Named entities are matched by + ``r"\\b([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*)\\b"``. + + Parameters + ---------- + sentence : str + Input sentence. + candidate_entities : list[str] + Pool of named entities to draw substitutes from (typically all + entities extracted from the enclosing passage). + + Returns + ------- + str or None + The modified sentence with the first entity replaced, or ``None`` + if no named entity was found in *sentence* or no differing + substitute is available in *candidate_entities*. + """ + matches = _ENTITY_RE.findall(sentence) + if not matches: + return None + + found_entity = matches[0] + alternatives = [e for e in candidate_entities if e != found_entity] + if not alternatives: + return None + + substitute = random.choice(alternatives) + return sentence.replace(found_entity, substitute, 1) + + @staticmethod + def _extract_passages( + wiki_text_path: Union[str, Path], + n_samples: Optional[int], + ) -> List[dict]: + """ + Build factual/counterfactual sentence pairs from *wiki_text_path*. + + The file is read line by line; consecutive non-empty lines are grouped + into passages (blank lines act as separators). For each passage the + lines are joined into a single string, split on ``". "``, and each + resulting sentence is tested for entity substitution via + :meth:`_substitute_entity`. A pair is emitted for every sentence that + yields a valid counterfactual. Collection halts early once *n_samples* + pairs have been gathered (if *n_samples* is not ``None``). + + Parameters + ---------- + wiki_text_path : str or Path + Path to the processed ``wiki_clean.txt`` file. + n_samples : int or None + Maximum number of pairs to return. + + Returns + ------- + list[dict] + Each element is ``{"original": str, "counterfactual": str}``. + """ + pairs: List[dict] = [] + current_lines: List[str] = [] + + def _process_passage(lines: List[str]) -> None: + passage_text = " ".join(lines) + all_entities = _ENTITY_RE.findall(passage_text) + if not all_entities: + return + sentences = passage_text.split(". ") + for sentence in sentences: + if n_samples is not None and len(pairs) >= n_samples: + return + sentence = sentence.strip() + if not sentence: + continue + counterfactual = WikipediaFactualEvaluator._substitute_entity( + sentence, all_entities + ) + if counterfactual is not None and counterfactual != sentence: + pairs.append({"original": sentence, "counterfactual": counterfactual}) + + with open(wiki_text_path, encoding="utf-8") as fh: + for raw_line in fh: + line = raw_line.rstrip("\n") + if line.strip(): + current_lines.append(line.strip()) + else: + if current_lines: + _process_passage(current_lines) + current_lines = [] + if n_samples is not None and len(pairs) >= n_samples: + return pairs + + # Handle final passage if file has no trailing blank line + if current_lines: + _process_passage(current_lines) + + return pairs + + # ------------------------------------------------------------------ + # BaseEvaluator interface + # ------------------------------------------------------------------ + + def evaluate(self, model, tokenizer) -> dict: + """ + Compute factual consistency scores for *model*. + + Extracts sentence pairs from the configured Wikipedia text file, then + computes perplexity for each original and counterfactual sentence using + the same teacher-forced method as + :class:`~openverifiablellm.eval.perplexity.PerplexityEvaluator`. + + ``random.seed(42)`` is applied before any entity selection to ensure + fully reproducible results. + + Parameters + ---------- + model : callable + ``model(input_ids) -> 2-D sequence`` of shape + ``(len(input_ids), vocab_size)``, as described in + :meth:`~openverifiablellm.eval.perplexity.PerplexityEvaluator.compute_sentence_perplexity`. + tokenizer : object + Object with ``encode(text: str) -> list[int]``. + + Returns + ------- + dict + A dictionary with the following keys: + + * **factual_perplexity** (*float*) — mean perplexity on original + sentences. + * **counterfactual_perplexity** (*float*) — mean perplexity on + counterfactual sentences. + * **factual_score** (*float*) — mean per-pair difference + ``(counterfactual_ppl - factual_ppl)``; positive means the model + correctly prefers factual sentences (good), negative means the + model prefers counterfactual sentences (bad). + """ + random.seed(42) + pairs = self._extract_passages(self.wiki_text_path, self.n_samples) + + if not pairs: + return { + "factual_perplexity": float("inf"), + "counterfactual_perplexity": float("inf"), + "factual_score": float("inf"), + } + + factual_ppls: List[float] = [] + counterfactual_ppls: List[float] = [] + score_diffs: List[float] = [] + + for pair in pairs: + factual_tokens = tokenizer.encode(pair["original"]) + cf_tokens = tokenizer.encode(pair["counterfactual"]) + + factual_ppl = PerplexityEvaluator.compute_sentence_perplexity( + model, factual_tokens + ) + cf_ppl = PerplexityEvaluator.compute_sentence_perplexity(model, cf_tokens) + + factual_ppls.append(factual_ppl) + counterfactual_ppls.append(cf_ppl) + score_diffs.append(cf_ppl - factual_ppl) + + n = len(factual_ppls) + return { + "factual_perplexity": sum(factual_ppls) / n, + "counterfactual_perplexity": sum(counterfactual_ppls) / n, + "factual_score": sum(score_diffs) / n, + } diff --git a/tests/test_eval.py b/tests/test_eval.py deleted file mode 100644 index b5524f7..0000000 --- a/tests/test_eval.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -tests/test_eval.py - -Tests for the evaluator module (WinoBiasEvaluator, PerplexityEvaluator). - -Run with: - pytest tests/test_eval.py -v -""" - -import math -from unittest.mock import patch - -import pytest - -from openverifiablellm.eval.bias import WinoBiasEvaluator -from openverifiablellm.eval.perplexity import PerplexityEvaluator - -# --------------------------------------------------------------------------- -# Shared helpers -# --------------------------------------------------------------------------- - - -class _MockTokenizer: - """Tokenizer that maps each character to its ASCII code modulo 100.""" - - def encode(self, text: str) -> list: - return [ord(c) % 100 for c in text.replace(" ", "_")] - - -def _make_dataset(sentences): - """Return a list of row dicts matching the WinoBias ``tokens`` field.""" - return [{"tokens": s.split()} for s in sentences] - - -PRO_SENTENCES = [ - "The doctor examined the patient", - "The engineer fixed the machine", -] -ANTI_SENTENCES = [ - "The nurse examined the patient", - "The secretary fixed the machine", -] - - -def _patch_load_dataset(pro_data, anti_data): - """Patch ``datasets.load_dataset`` to return pre-built lists. - - Raises ``ValueError`` for any unexpected name, config, or split so - integration bugs are not silently hidden by a catch-all fallback. - """ - - def _load(name, config=None, split=None): - if name != "wino_bias" or split != "test": - raise ValueError( - f"Unexpected load_dataset call: name={name!r}, config={config!r}, split={split!r}" - ) - if config == "type1_pro": - return pro_data - if config == "type1_anti": - return anti_data - raise ValueError(f"Unexpected config: {config!r}") - - return patch("datasets.load_dataset", side_effect=_load) - - -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture -def mock_model(): - """Uniform model: all-zero logits → perplexity == vocab_size for any input.""" - return PerplexityEvaluator.uniform_model(vocab_size=100) - - -@pytest.fixture -def mock_tokenizer(): - return _MockTokenizer() - - -@pytest.fixture -def bias_evaluator(): - return WinoBiasEvaluator(n_samples=2) - - -# --------------------------------------------------------------------------- -# PerplexityEvaluator.uniform_model -# --------------------------------------------------------------------------- - - -def test_uniform_model_output_shape(): - model = PerplexityEvaluator.uniform_model(vocab_size=50) - out = model([1, 2, 3]) - assert len(out) == 3 - assert len(out[0]) == 50 - - -def test_uniform_model_all_zero_logits(): - model = PerplexityEvaluator.uniform_model(vocab_size=10) - out = model([0, 1]) - assert all(v == 0.0 for row in out for v in row) - - -def test_uniform_model_perplexity_equals_vocab_size(): - vocab_size = 100 - model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size) - token_ids = list(range(10)) - ppl = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids) - assert abs(ppl - vocab_size) < 1e-6 - - -def test_compute_sequence_perplexity_short_matches_sentence(): - """For sequences shorter than stride, both methods must agree.""" - vocab_size = 100 - model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size) - token_ids = list(range(10)) - ppl_sentence = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids) - ppl_sequence = PerplexityEvaluator.compute_sequence_perplexity(model, token_ids, stride=512) - assert abs(ppl_sentence - ppl_sequence) < 1e-6 - - -def test_compute_sequence_perplexity_long_sequence_finite(): - """A sequence longer than stride must yield a finite, correct perplexity.""" - vocab_size = 100 - model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size) - # 50 tokens with stride=10 → 5 windows - token_ids = list(range(50)) - ppl = PerplexityEvaluator.compute_sequence_perplexity(model, token_ids, stride=10) - assert math.isfinite(ppl) - # Uniform model → PPL must equal vocab_size regardless of windowing - assert abs(ppl - vocab_size) < 1e-6 - - -def test_compute_sequence_perplexity_single_token_returns_inf(): - model = PerplexityEvaluator.uniform_model(vocab_size=10) - assert PerplexityEvaluator.compute_sequence_perplexity(model, [0], stride=512) == float("inf") - - -# --------------------------------------------------------------------------- -# WinoBiasEvaluator — initialisation -# --------------------------------------------------------------------------- - - -def test_bias_evaluator_n_samples_stored(): - ev = WinoBiasEvaluator(n_samples=5) - assert ev.n_samples == 5 - - -# --------------------------------------------------------------------------- -# WinoBiasEvaluator.evaluate() — patched load_dataset -# --------------------------------------------------------------------------- - - -def test_evaluate_does_not_raise(bias_evaluator, mock_model, mock_tokenizer): - """evaluate() must complete without raising NotImplementedError or any error.""" - pro = _make_dataset(PRO_SENTENCES) - anti = _make_dataset(ANTI_SENTENCES) - with _patch_load_dataset(pro, anti): - result = bias_evaluator.evaluate(mock_model, mock_tokenizer) - assert isinstance(result, dict) - - -def test_evaluate_returns_exactly_three_keys(bias_evaluator, mock_model, mock_tokenizer): - pro = _make_dataset(PRO_SENTENCES) - anti = _make_dataset(ANTI_SENTENCES) - with _patch_load_dataset(pro, anti): - result = bias_evaluator.evaluate(mock_model, mock_tokenizer) - assert set(result.keys()) == {"stereotype_score", "anti_stereotype_score", "bias_score"} - - -def test_evaluate_bias_score_equals_abs_diff(bias_evaluator, mock_model, mock_tokenizer): - pro = _make_dataset(PRO_SENTENCES) - anti = _make_dataset(ANTI_SENTENCES) - with _patch_load_dataset(pro, anti): - result = bias_evaluator.evaluate(mock_model, mock_tokenizer) - expected = abs(result["stereotype_score"] - result["anti_stereotype_score"]) - assert abs(result["bias_score"] - expected) < 1e-9 - - -def test_evaluate_scores_are_finite(bias_evaluator, mock_model, mock_tokenizer): - pro = _make_dataset(PRO_SENTENCES) - anti = _make_dataset(ANTI_SENTENCES) - with _patch_load_dataset(pro, anti): - result = bias_evaluator.evaluate(mock_model, mock_tokenizer) - assert math.isfinite(result["stereotype_score"]) - assert math.isfinite(result["anti_stereotype_score"]) - assert math.isfinite(result["bias_score"]) - - -# --------------------------------------------------------------------------- -# n_samples limits dataset consumption -# --------------------------------------------------------------------------- - - -def test_n_samples_limits_dataset(mock_model, mock_tokenizer): - """With n_samples=2, rows beyond index 1 must never be processed. - - Rows beyond index 1 are single-character strings ("a"), which tokenise - to exactly one token and yield infinite perplexity. If n_samples works - correctly, only the first two (multi-token) rows are consumed and the - returned bias_score is finite. - """ - # Append single-char rows that yield inf perplexity if reached - bad_rows = ["a"] * 10 - pro = _make_dataset(PRO_SENTENCES + bad_rows) - anti = _make_dataset(ANTI_SENTENCES + bad_rows) - - ev = WinoBiasEvaluator(n_samples=len(PRO_SENTENCES)) # == 2 - - with _patch_load_dataset(pro, anti): - result = ev.evaluate(mock_model, mock_tokenizer) - - assert math.isfinite(result["bias_score"]) diff --git a/tests/test_factual_eval.py b/tests/test_factual_eval.py new file mode 100644 index 0000000..94d45c1 --- /dev/null +++ b/tests/test_factual_eval.py @@ -0,0 +1,167 @@ +""" +tests/test_factual_eval.py + +Tests for WikipediaFactualEvaluator. + +Run with: + pytest tests/test_factual_eval.py -v +""" + +import math +import random + +import pytest + +from openverifiablellm.eval.factual import WikipediaFactualEvaluator +from openverifiablellm.eval.perplexity import PerplexityEvaluator + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +_WIKI_TEXT = ( + "Albert Einstein was born in Germany.\n" + "He developed the Theory of Relativity.\n" + "Marie Curie was born in Poland.\n" + "\n" + "Isaac Newton discovered gravity in England.\n" + "Newton worked at Cambridge University.\n" +) + + +class _MockTokenizer: + """Tokenizer that maps each character to its ASCII code modulo 100.""" + + def encode(self, text: str) -> list: + return [ord(c) % 100 for c in text.replace(" ", "_")] + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def wiki_file(tmp_path): + p = tmp_path / "wiki_clean.txt" + p.write_text(_WIKI_TEXT, encoding="utf-8") + return p + + +@pytest.fixture() +def mock_model(): + return PerplexityEvaluator.uniform_model(vocab_size=100) + + +@pytest.fixture() +def mock_tokenizer(): + return _MockTokenizer() + + +@pytest.fixture() +def evaluator(wiki_file): + return WikipediaFactualEvaluator(wiki_text_path=wiki_file) + + +# --------------------------------------------------------------------------- +# _extract_passages +# --------------------------------------------------------------------------- + + +def test_extract_passages_returns_pairs(wiki_file): + random.seed(42) + pairs = WikipediaFactualEvaluator._extract_passages(wiki_file, n_samples=None) + assert len(pairs) > 0 + for pair in pairs: + assert "original" in pair + assert "counterfactual" in pair + + +def test_counterfactual_differs_from_original(wiki_file): + random.seed(42) + pairs = WikipediaFactualEvaluator._extract_passages(wiki_file, n_samples=None) + assert len(pairs) > 0 + for pair in pairs: + assert pair["original"] != pair["counterfactual"] + + +def test_n_samples_limits_pairs(wiki_file): + random.seed(42) + pairs = WikipediaFactualEvaluator._extract_passages(wiki_file, n_samples=2) + assert len(pairs) <= 2 + + +# --------------------------------------------------------------------------- +# _substitute_entity +# --------------------------------------------------------------------------- + + +def test_substitute_entity_replaces_entity(): + random.seed(42) + sentence = "Albert Einstein was born in Germany" + candidates = ["Albert Einstein", "Germany", "Marie Curie", "Poland"] + result = WikipediaFactualEvaluator._substitute_entity(sentence, candidates) + assert result is not None + assert result != sentence + assert "Albert Einstein" not in result + + +def test_substitute_entity_returns_none_when_no_entity(): + # All-lowercase sentence has no capitalized sequences + result = WikipediaFactualEvaluator._substitute_entity( + "the cat sat on the mat", ["Germany", "Poland"] + ) + assert result is None + + +# --------------------------------------------------------------------------- +# evaluate() +# --------------------------------------------------------------------------- + + +def test_evaluate_returns_correct_keys(evaluator, mock_model, mock_tokenizer): + result = evaluator.evaluate(mock_model, mock_tokenizer) + assert set(result.keys()) == { + "factual_perplexity", + "counterfactual_perplexity", + "factual_score", + } + + +def test_evaluate_scores_are_finite(evaluator, mock_model, mock_tokenizer): + result = evaluator.evaluate(mock_model, mock_tokenizer) + assert math.isfinite(result["factual_perplexity"]) + assert math.isfinite(result["counterfactual_perplexity"]) + assert math.isfinite(result["factual_score"]) + + +def test_factual_score_is_difference(evaluator, mock_model, mock_tokenizer): + """factual_score must equal mean(cf_ppl - factual_ppl) per pair, + which by linearity of expectation equals counterfactual_perplexity + minus factual_perplexity.""" + result = evaluator.evaluate(mock_model, mock_tokenizer) + expected = result["counterfactual_perplexity"] - result["factual_perplexity"] + assert abs(result["factual_score"] - expected) < 1e-9 + + +def test_n_samples_limits_pairs_via_evaluate(wiki_file, mock_model, mock_tokenizer): + """n_samples=2 must cause evaluate() to process at most 2 pairs.""" + ev = WikipediaFactualEvaluator(wiki_text_path=wiki_file, n_samples=2) + # Verify by checking _extract_passages directly with n_samples=2 + random.seed(42) + pairs = WikipediaFactualEvaluator._extract_passages(wiki_file, n_samples=2) + assert len(pairs) <= 2 + # evaluate() must still complete without error + result = ev.evaluate(mock_model, mock_tokenizer) + assert set(result.keys()) == { + "factual_perplexity", + "counterfactual_perplexity", + "factual_score", + } + + +def test_determinism(evaluator, mock_model, mock_tokenizer): + """Calling evaluate() twice on the same input must return identical results.""" + result1 = evaluator.evaluate(mock_model, mock_tokenizer) + result2 = evaluator.evaluate(mock_model, mock_tokenizer) + assert result1 == result2 From 5f770d0a2aba978d5b654bb88c4656d1c5802acc Mon Sep 17 00:00:00 2001 From: chaitanyaam <147836528+chaitanyaam@users.noreply.github.com> Date: Sun, 22 Mar 2026 02:25:30 +0530 Subject: [PATCH 7/7] added Protocol types, word boundary substitution and robust perplexity handling --- openverifiablellm/eval/base.py | 22 ++++++++++++++++++- .../eval/factual/factual_consistency.py | 13 ++++++++++- openverifiablellm/eval/perplexity.py | 18 ++++++++++++++- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/openverifiablellm/eval/base.py b/openverifiablellm/eval/base.py index 6e01d2b..dc59a6b 100644 --- a/openverifiablellm/eval/base.py +++ b/openverifiablellm/eval/base.py @@ -1,11 +1,31 @@ from abc import ABC, abstractmethod +from typing import List + +try: + from typing import Protocol, runtime_checkable +except ImportError: # Python < 3.8 + from typing_extensions import Protocol, runtime_checkable + + +@runtime_checkable +class Model(Protocol): + """Structural type for a language model callable.""" + + def __call__(self, input_ids: List[int]) -> List[List[float]]: ... + + +@runtime_checkable +class Tokenizer(Protocol): + """Structural type for a tokenizer.""" + + def encode(self, text: str) -> List[int]: ... class BaseEvaluator(ABC): """Abstract base class for all dataset evaluators.""" @abstractmethod - def evaluate(self, model, tokenizer) -> dict: + def evaluate(self, model: Model, tokenizer: Tokenizer) -> dict: """ Evaluate a language model using the given tokenizer. diff --git a/openverifiablellm/eval/factual/factual_consistency.py b/openverifiablellm/eval/factual/factual_consistency.py index ca7fb60..2235547 100644 --- a/openverifiablellm/eval/factual/factual_consistency.py +++ b/openverifiablellm/eval/factual/factual_consistency.py @@ -4,6 +4,7 @@ Wikipedia-based factual consistency evaluator. """ +import math import random import re from pathlib import Path @@ -92,7 +93,8 @@ def _substitute_entity(sentence: str, candidate_entities: List[str]) -> Optional return None substitute = random.choice(alternatives) - return sentence.replace(found_entity, substitute, 1) + pattern = r"\b" + re.escape(found_entity) + r"\b" + return re.sub(pattern, substitute, sentence, count=1) @staticmethod def _extract_passages( @@ -223,11 +225,20 @@ def evaluate(self, model, tokenizer) -> dict: ) cf_ppl = PerplexityEvaluator.compute_sentence_perplexity(model, cf_tokens) + if not math.isfinite(factual_ppl) or not math.isfinite(cf_ppl): + continue + factual_ppls.append(factual_ppl) counterfactual_ppls.append(cf_ppl) score_diffs.append(cf_ppl - factual_ppl) n = len(factual_ppls) + if n == 0: + return { + "factual_perplexity": float("nan"), + "counterfactual_perplexity": float("nan"), + "factual_score": float("nan"), + } return { "factual_perplexity": sum(factual_ppls) / n, "counterfactual_perplexity": sum(counterfactual_ppls) / n, diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py index 3855397..fa1f0be 100644 --- a/openverifiablellm/eval/perplexity.py +++ b/openverifiablellm/eval/perplexity.py @@ -36,10 +36,12 @@ def __init__( benchmark: str = "wikitext", n_samples: Optional[int] = 50, stride: int = 512, + split: Optional[str] = None, ): self.benchmark = benchmark self.n_samples = n_samples self.stride = stride + self.split = split # ------------------------------------------------------------------ # Mock helpers @@ -200,7 +202,21 @@ def evaluate(self, model, tokenizer) -> dict: """ import datasets as hf_datasets # deferred; runtime dep - ds = hf_datasets.load_dataset(self.benchmark, split="test", streaming=True) + if self.split is not None: + ds = hf_datasets.load_dataset(self.benchmark, split=self.split, streaming=True) + else: + _splits_to_try = ("test", "validation", "train") + for _s in _splits_to_try: + try: + ds = hf_datasets.load_dataset(self.benchmark, split=_s, streaming=True) + break + except Exception: + continue + else: + raise ValueError( + f"Dataset {self.benchmark!r} has none of the expected splits: " + f"{_splits_to_try}. Pass split= explicitly." + ) scores = [] for row in ds: text = row.get("text", "")