diff --git a/pyrit/identifiers/component_identifier.py b/pyrit/identifiers/component_identifier.py index 501d16dbb..027fed85b 100644 --- a/pyrit/identifiers/component_identifier.py +++ b/pyrit/identifiers/component_identifier.py @@ -113,6 +113,7 @@ class ComponentIdentifier: KEY_CLASS_NAME: ClassVar[str] = "class_name" KEY_CLASS_MODULE: ClassVar[str] = "class_module" KEY_HASH: ClassVar[str] = "hash" + KEY_EVAL_HASH: ClassVar[str] = "eval_hash" KEY_PYRIT_VERSION: ClassVar[str] = "pyrit_version" KEY_CHILDREN: ClassVar[str] = "children" LEGACY_KEY_TYPE: ClassVar[str] = "__type__" @@ -127,19 +128,52 @@ class ComponentIdentifier: #: Named child identifiers for compositional identity (e.g., a scorer's target). children: dict[str, Union[ComponentIdentifier, list[ComponentIdentifier]]] = field(default_factory=dict) #: Content-addressed SHA256 hash computed from class, params, and children. - hash: str = field(init=False, compare=False) + #: When ``None`` (the default), it is computed automatically in ``__post_init__``. + #: Pass an explicit value to preserve a pre-computed hash (e.g. from DB storage + #: where params may have been truncated). + hash: Optional[str] = field(default=None, compare=False) #: Version tag for storage. Not included in hash. pyrit_version: str = field(default_factory=lambda: pyrit.__version__, compare=False) + #: Evaluation hash. Computed by EvaluationIdentifier subclasses (e.g. ScorerEvaluationIdentifier) + #: and attached to the identifier so it is always available via ``to_dict()``. + #: Survives DB round-trips even when param values are truncated. + eval_hash: Optional[str] = field(default=None, compare=False) def __post_init__(self) -> None: - """Compute the content-addressed hash at creation time.""" - hash_dict = _build_hash_dict( + """Compute the content-addressed hash at creation time if not already provided.""" + if self.hash is None: + hash_dict = _build_hash_dict( + class_name=self.class_name, + class_module=self.class_module, + params=self.params, + children=self.children, + ) + object.__setattr__(self, "hash", config_hash(hash_dict)) + + def with_eval_hash(self, eval_hash: str) -> ComponentIdentifier: + """ + Return a new frozen ComponentIdentifier with ``eval_hash`` set. + + The original ``hash`` is preserved (important for identifiers + reconstructed from truncated DB data where recomputation would + produce a wrong hash). + + Args: + eval_hash: The evaluation hash to attach. + + Returns: + A new ComponentIdentifier identical to this one but with + ``eval_hash`` set to the given value. + """ + return ComponentIdentifier( class_name=self.class_name, class_module=self.class_module, params=self.params, children=self.children, + hash=self.hash, + pyrit_version=self.pyrit_version, + eval_hash=eval_hash, ) - object.__setattr__(self, "hash", config_hash(hash_dict)) @property def short_hash(self) -> str: @@ -258,6 +292,9 @@ def to_dict(self, *, max_value_length: Optional[int] = None) -> dict[str, Any]: self.KEY_PYRIT_VERSION: self.pyrit_version, } + if self.eval_hash is not None: + result[self.KEY_EVAL_HASH] = self.eval_hash + for key, value in self.params.items(): result[key] = self._truncate_value(value=value, max_length=max_value_length) @@ -324,6 +361,7 @@ def from_dict(cls, data: dict[str, Any]) -> ComponentIdentifier: class_module = data.pop(cls.KEY_CLASS_MODULE, None) or data.pop(cls.LEGACY_KEY_MODULE, None) or "unknown" stored_hash = data.pop(cls.KEY_HASH, None) + stored_eval_hash = data.pop(cls.KEY_EVAL_HASH, None) pyrit_version = data.pop(cls.KEY_PYRIT_VERSION, pyrit.__version__) # Reconstruct children @@ -332,22 +370,16 @@ def from_dict(cls, data: dict[str, Any]) -> ComponentIdentifier: # Everything remaining is a param params = data - identifier = cls( + return cls( class_name=class_name, class_module=class_module, params=params, children=children, + hash=stored_hash, pyrit_version=pyrit_version, + eval_hash=stored_eval_hash, ) - # Preserve stored hash if available — the stored hash was computed from - # untruncated data and is the correct identity. Recomputing from - # potentially truncated DB values would produce a wrong hash. - if stored_hash: - object.__setattr__(identifier, "hash", stored_hash) - - return identifier - def get_child(self, key: str) -> Optional[ComponentIdentifier]: """ Get a single child by key. diff --git a/pyrit/identifiers/evaluation_identifier.py b/pyrit/identifiers/evaluation_identifier.py index 98d338ead..6df3192cf 100644 --- a/pyrit/identifiers/evaluation_identifier.py +++ b/pyrit/identifiers/evaluation_identifier.py @@ -170,12 +170,22 @@ class EvaluationIdentifier(ABC): CHILD_EVAL_RULES: ClassVar[dict[str, ChildEvalRule]] def __init__(self, identifier: ComponentIdentifier) -> None: - """Wrap a ComponentIdentifier and eagerly compute its eval hash.""" + """ + Wrap a ComponentIdentifier and resolve its eval hash. + + If the identifier carries an ``eval_hash`` (preserved from a prior + DB round-trip or set by the scorer), that value is used directly. + Otherwise the eval hash is computed from the identifier's params + and children using the subclass's ``CHILD_EVAL_RULES``. + """ self._identifier = identifier - self._eval_hash = compute_eval_hash( - identifier, - child_eval_rules=self.CHILD_EVAL_RULES, - ) + if identifier.eval_hash is not None: + self._eval_hash = identifier.eval_hash + else: + self._eval_hash = compute_eval_hash( + identifier, + child_eval_rules=self.CHILD_EVAL_RULES, + ) @property def identifier(self) -> ComponentIdentifier: diff --git a/pyrit/memory/memory_models.py b/pyrit/memory/memory_models.py index e9c83b930..9376768bd 100644 --- a/pyrit/memory/memory_models.py +++ b/pyrit/memory/memory_models.py @@ -33,6 +33,10 @@ import pyrit from pyrit.common.utils import to_sha256 from pyrit.identifiers.component_identifier import ComponentIdentifier +from pyrit.identifiers.evaluation_identifier import ( + AtomicAttackEvaluationIdentifier, + ScorerEvaluationIdentifier, +) from pyrit.models import ( AttackOutcome, AttackResult, @@ -51,6 +55,8 @@ SeedType, ) +logger = logging.getLogger(__name__) + # Default pyrit_version for database records created before version tracking was added LEGACY_PYRIT_VERSION = "<0.10.0" @@ -398,7 +404,14 @@ def __init__(self, *, entry: Score): self.score_metadata = entry.score_metadata # Normalize to ComponentIdentifier (handles dict with deprecation warning) then convert to dict for JSON storage normalized_scorer = ComponentIdentifier.normalize(entry.scorer_class_identifier) - self.scorer_class_identifier = normalized_scorer.to_dict(max_value_length=MAX_IDENTIFIER_VALUE_LENGTH) + # Ensure eval_hash is set before truncation so it survives the DB round-trip + if normalized_scorer.eval_hash is None: + normalized_scorer = normalized_scorer.with_eval_hash( + ScorerEvaluationIdentifier(normalized_scorer).eval_hash + ) + self.scorer_class_identifier = normalized_scorer.to_dict( + max_value_length=MAX_IDENTIFIER_VALUE_LENGTH, + ) self.prompt_request_response_id = entry.message_piece_id if entry.message_piece_id else None self.timestamp = entry.timestamp # Store in both columns for backward compatibility @@ -770,8 +783,15 @@ def __init__(self, *, entry: AttackResult): self.attack_identifier = ( _attack_strategy_id.to_dict(max_value_length=MAX_IDENTIFIER_VALUE_LENGTH) if _attack_strategy_id else {} ) + # Ensure eval_hash is set before truncation so it survives the DB round-trip + if entry.atomic_attack_identifier and entry.atomic_attack_identifier.eval_hash is None: + entry.atomic_attack_identifier = entry.atomic_attack_identifier.with_eval_hash( + AtomicAttackEvaluationIdentifier(entry.atomic_attack_identifier).eval_hash + ) self.atomic_attack_identifier = ( - entry.atomic_attack_identifier.to_dict(max_value_length=MAX_IDENTIFIER_VALUE_LENGTH) + entry.atomic_attack_identifier.to_dict( + max_value_length=MAX_IDENTIFIER_VALUE_LENGTH, + ) if entry.atomic_attack_identifier else None ) @@ -974,9 +994,16 @@ def __init__(self, *, entry: ScenarioResult): self.objective_target_identifier = entry.objective_target_identifier.to_dict( max_value_length=MAX_IDENTIFIER_VALUE_LENGTH ) - # Convert ComponentIdentifier to dict for JSON storage + # Ensure eval_hash is set before truncation so it survives the DB round-trip. + if entry.objective_scorer_identifier and entry.objective_scorer_identifier.eval_hash is None: + entry.objective_scorer_identifier = entry.objective_scorer_identifier.with_eval_hash( + ScorerEvaluationIdentifier(entry.objective_scorer_identifier).eval_hash + ) + self.objective_scorer_identifier = ( - entry.objective_scorer_identifier.to_dict(max_value_length=MAX_IDENTIFIER_VALUE_LENGTH) + entry.objective_scorer_identifier.to_dict( + max_value_length=MAX_IDENTIFIER_VALUE_LENGTH, + ) if entry.objective_scorer_identifier else None ) diff --git a/pyrit/scenario/core/atomic_attack.py b/pyrit/scenario/core/atomic_attack.py index 81bea9fe2..ec43e2802 100644 --- a/pyrit/scenario/core/atomic_attack.py +++ b/pyrit/scenario/core/atomic_attack.py @@ -19,6 +19,7 @@ from pyrit.executor.attack import AttackExecutor, AttackStrategy from pyrit.executor.attack.core.attack_executor import AttackExecutorResult from pyrit.identifiers import build_atomic_attack_identifier +from pyrit.identifiers.evaluation_identifier import AtomicAttackEvaluationIdentifier from pyrit.memory import CentralMemory from pyrit.memory.memory_models import MAX_IDENTIFIER_VALUE_LENGTH from pyrit.models import AttackResult, SeedAttackGroup @@ -251,13 +252,19 @@ def _enrich_atomic_attack_identifiers(self, *, results: AttackExecutorResult[Att seed_group=self._seed_groups[idx], ) - # Persist the enriched identifier back to the database + # Persist the enriched identifier back to the database. + # Set eval_hash before truncation so it survives the DB round-trip. + if result.atomic_attack_identifier.eval_hash is None: + result.atomic_attack_identifier = result.atomic_attack_identifier.with_eval_hash( + AtomicAttackEvaluationIdentifier(result.atomic_attack_identifier).eval_hash + ) + if result.attack_result_id: memory.update_attack_result_by_id( attack_result_id=result.attack_result_id, update_fields={ "atomic_attack_identifier": result.atomic_attack_identifier.to_dict( - max_value_length=MAX_IDENTIFIER_VALUE_LENGTH + max_value_length=MAX_IDENTIFIER_VALUE_LENGTH, ), }, ) diff --git a/pyrit/score/float_scale/float_scale_scorer.py b/pyrit/score/float_scale/float_scale_scorer.py index 126dd909f..af39cf5be 100644 --- a/pyrit/score/float_scale/float_scale_scorer.py +++ b/pyrit/score/float_scale/float_scale_scorer.py @@ -59,7 +59,7 @@ def get_scorer_metrics(self) -> Optional["HarmScorerMetrics"]: return None return find_harm_metrics_by_eval_hash( - eval_hash=self.get_eval_hash(), + eval_hash=self.get_identifier().eval_hash, harm_category=self.evaluation_file_mapping.harm_category, ) diff --git a/pyrit/score/scorer.py b/pyrit/score/scorer.py index c1ad1910a..b18a1802a 100644 --- a/pyrit/score/scorer.py +++ b/pyrit/score/scorer.py @@ -23,7 +23,7 @@ pyrit_json_retry, remove_markdown_json, ) -from pyrit.identifiers import ComponentIdentifier, Identifiable +from pyrit.identifiers import ComponentIdentifier, Identifiable, ScorerEvaluationIdentifier from pyrit.memory import CentralMemory, MemoryInterface from pyrit.models import ( ChatMessageRole, @@ -70,21 +70,21 @@ def __init__(self, *, validator: ScorerPromptValidator): """ self._validator = validator - def get_eval_hash(self) -> str: + def get_identifier(self) -> ComponentIdentifier: """ - Compute a behavioral equivalence hash for evaluation grouping. + Get the scorer's identifier with eval_hash always attached. - Delegates to ``ScorerEvaluationIdentifier`` which filters target children - (prompt_target, converter_target) to behavioral params only, so the same - scorer configuration on different deployments produces the same eval hash. + Overrides the base ``Identifiable.get_identifier()`` so that + ``to_dict()`` always emits the ``eval_hash`` key. Returns: - str: A hex-encoded SHA256 hash suitable for eval registry keying. + ComponentIdentifier: The identity with ``eval_hash`` set. """ - # Deferred import to avoid circular dependency (evaluation_identifier → identifiers → …) - from pyrit.identifiers.evaluation_identifier import ScorerEvaluationIdentifier - - return ScorerEvaluationIdentifier(self.get_identifier()).eval_hash + identifier = super().get_identifier() + if identifier.eval_hash is None: + identifier = identifier.with_eval_hash(ScorerEvaluationIdentifier(identifier).eval_hash) + self._identifier = identifier + return identifier @property def scorer_type(self) -> ScoreType: diff --git a/pyrit/score/scorer_evaluation/scorer_evaluator.py b/pyrit/score/scorer_evaluation/scorer_evaluator.py index da08aa0df..df032c9ff 100644 --- a/pyrit/score/scorer_evaluation/scorer_evaluator.py +++ b/pyrit/score/scorer_evaluation/scorer_evaluator.py @@ -275,7 +275,7 @@ def _should_skip_evaluation( - (False, None) if should run evaluation """ try: - scorer_hash = self.scorer.get_eval_hash() + scorer_hash = self.scorer.get_identifier().eval_hash # Determine if this is a harm or objective evaluation metrics_type = MetricsType.OBJECTIVE if isinstance(self.scorer, TrueFalseScorer) else MetricsType.HARM @@ -489,7 +489,7 @@ def _write_metrics_to_registry( replace_evaluation_results( file_path=result_file_path, scorer_identifier=self.scorer.get_identifier(), - eval_hash=self.scorer.get_eval_hash(), + eval_hash=self.scorer.get_identifier().eval_hash, metrics=metrics, ) except Exception as e: diff --git a/pyrit/score/true_false/true_false_scorer.py b/pyrit/score/true_false/true_false_scorer.py index 671dd5797..9074b7917 100644 --- a/pyrit/score/true_false/true_false_scorer.py +++ b/pyrit/score/true_false/true_false_scorer.py @@ -94,7 +94,7 @@ def get_scorer_metrics(self) -> Optional["ObjectiveScorerMetrics"]: if not result_file.exists(): return None - return find_objective_metrics_by_eval_hash(eval_hash=self.get_eval_hash(), file_path=result_file) + return find_objective_metrics_by_eval_hash(eval_hash=self.get_identifier().eval_hash, file_path=result_file) async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]: """ diff --git a/tests/unit/identifiers/test_component_identifier.py b/tests/unit/identifiers/test_component_identifier.py index ccf186a14..569fb42bf 100644 --- a/tests/unit/identifiers/test_component_identifier.py +++ b/tests/unit/identifiers/test_component_identifier.py @@ -544,6 +544,100 @@ def test_roundtrip_with_list_children(self): assert isinstance(recon_converters, list) assert len(recon_converters) == 2 + def test_roundtrip_preserves_eval_hash(self): + """Test that eval_hash is preserved through to_dict -> from_dict round-trip.""" + expected_eval_hash = "abc123" * 10 + "abcd" # 64 chars + original = ComponentIdentifier( + class_name="Scorer", + class_module="pyrit.score", + params={"system_prompt": "Score the response"}, + ).with_eval_hash(expected_eval_hash) + d = original.to_dict() + assert d["eval_hash"] == expected_eval_hash + + reconstructed = ComponentIdentifier.from_dict(d) + assert reconstructed.eval_hash == expected_eval_hash + + def test_roundtrip_eval_hash_survives_truncation(self): + """Regression test: eval_hash computed before truncation is preserved after round-trip. + + This is the core bug fix — long params get truncated in to_dict(), which would + cause eval_hash recomputation to produce a wrong hash. By storing eval_hash in + the dict, it survives truncation. + """ + long_prompt = "You are a scorer that evaluates responses. " * 20 # >80 chars + eval_hash_before_truncation = "correct_eval_hash_" + "0" * 46 # 64 chars + original = ComponentIdentifier( + class_name="SelfAskTrueFalseScorer", + class_module="pyrit.score", + params={"system_prompt_template": long_prompt}, + ).with_eval_hash(eval_hash_before_truncation) + + # Serialize with truncation (simulates DB storage) + truncated_dict = original.to_dict(max_value_length=80) + # Params are truncated + assert truncated_dict["system_prompt_template"].endswith("...") + # But eval_hash is preserved + assert truncated_dict["eval_hash"] == eval_hash_before_truncation + + # Deserialize + reconstructed = ComponentIdentifier.from_dict(truncated_dict) + # eval_hash is available on the reconstructed identifier + assert reconstructed.eval_hash == eval_hash_before_truncation + # And it's NOT in params (from_dict pops it as a reserved key) + assert "eval_hash" not in reconstructed.params + + def test_roundtrip_no_eval_hash_when_not_set(self): + """Test that eval_hash is None when not set on the identifier.""" + original = ComponentIdentifier( + class_name="Test", + class_module="mod", + params={"key": "value"}, + ) + d = original.to_dict() + assert "eval_hash" not in d + + reconstructed = ComponentIdentifier.from_dict(d) + assert reconstructed.eval_hash is None + + def test_to_dict_includes_eval_hash_from_prior_roundtrip(self): + """Test that to_dict re-emits eval_hash from a prior round-trip.""" + eval_hash = "deadbeef" * 8 # 64 chars + original = ComponentIdentifier( + class_name="Test", + class_module="mod", + ).with_eval_hash(eval_hash) + d1 = original.to_dict() + reconstructed = ComponentIdentifier.from_dict(d1) + + # Re-serialize — eval_hash should be emitted + d2 = reconstructed.to_dict() + assert d2["eval_hash"] == eval_hash + + def test_double_roundtrip_preserves_eval_hash_and_identity_hash(self): + """Test that both eval_hash and identity hash survive retrieve → re-store → retrieve.""" + long_prompt = "Score the response carefully. " * 20 + original = ComponentIdentifier( + class_name="Scorer", + class_module="pyrit.score", + params={"system_prompt": long_prompt}, + ) + original_hash = original.hash + eval_hash = "eval_" + "a1b2c3d4" * 7 + "a1b2c3" # 64 chars + original = original.with_eval_hash(eval_hash) + + # First round-trip: store with truncation + d1 = original.to_dict(max_value_length=80) + r1 = ComponentIdentifier.from_dict(d1) + assert r1.hash == original_hash + assert r1.eval_hash == eval_hash + + # Second round-trip: re-store (simulating retrieve → use → re-store) + d2 = r1.to_dict(max_value_length=80) + r2 = ComponentIdentifier.from_dict(d2) + assert r2.hash == original_hash + assert r2.eval_hash == eval_hash + class TestComponentIdentifierNormalize: """Tests for normalize class method.""" diff --git a/tests/unit/identifiers/test_evaluation_identifier.py b/tests/unit/identifiers/test_evaluation_identifier.py index cf62299f9..69eda9d48 100644 --- a/tests/unit/identifiers/test_evaluation_identifier.py +++ b/tests/unit/identifiers/test_evaluation_identifier.py @@ -222,3 +222,100 @@ class CustomIdentity(EvaluationIdentifier): }, ) assert identity.eval_hash == expected + + def test_uses_eval_hash_when_available(self): + """Test that EvaluationIdentifier uses eval_hash instead of recomputing.""" + stored_hash = "stored_eval_hash_value_" + "0" * 42 # 64 chars + cid = ComponentIdentifier( + class_name="Scorer", + class_module="pyrit.score", + params={"system_prompt": "truncated..."}, + ).with_eval_hash(stored_hash) + + identity = _StubEvaluationIdentifier(cid) + assert identity.eval_hash == stored_hash + + def test_computes_eval_hash_when_not_set(self): + """Test that eval_hash is computed normally when eval_hash is None.""" + cid = ComponentIdentifier( + class_name="Scorer", + class_module="pyrit.score", + params={"threshold": 0.5}, + ) + assert cid.eval_hash is None + + identity = _StubEvaluationIdentifier(cid) + expected = compute_eval_hash(cid, child_eval_rules=_StubEvaluationIdentifier.CHILD_EVAL_RULES) + assert identity.eval_hash == expected + + def test_truncation_roundtrip_preserves_eval_hash(self): + """Regression test: eval_hash survives DB round-trip with param truncation. + + This is the core scenario for the bug fix. A scorer with a long system_prompt + gets stored to the DB with truncation. The eval_hash computed from the untruncated + identifier is included in to_dict(). After from_dict() reconstruction, the + EvaluationIdentifier should use the stored eval_hash (not recompute from truncated params). + """ + # Build a scorer identifier with a long system_prompt and a target child + long_prompt = "Evaluate whether the response achieves the objective. " * 10 + target_child = ComponentIdentifier( + class_name="OpenAIChatTarget", + class_module="pyrit.prompt_target", + params={"model_name": "gpt-4o", "endpoint": "https://api.openai.com", "temperature": 0.0}, + ) + scorer_id = ComponentIdentifier( + class_name="SelfAskTrueFalseScorer", + class_module="pyrit.score", + params={"system_prompt_template": long_prompt}, + children={"prompt_target": target_child}, + ) + + # Compute eval_hash from the untruncated identifier (the correct hash) + correct_eval_hash = compute_eval_hash(scorer_id, child_eval_rules=_CHILD_EVAL_RULES) + scorer_id = scorer_id.with_eval_hash(correct_eval_hash) + + # Simulate DB storage: serialize with truncation + truncated_dict = scorer_id.to_dict(max_value_length=80) + + # Verify params are actually truncated + assert truncated_dict["system_prompt_template"].endswith("...") + + # Reconstruct from truncated dict (simulates DB read) + reconstructed = ComponentIdentifier.from_dict(truncated_dict) + + # The reconstructed identifier has truncated params, so recomputing would give wrong hash + recomputed = compute_eval_hash(reconstructed, child_eval_rules=_CHILD_EVAL_RULES) + assert recomputed != correct_eval_hash, "Truncated params should produce different eval_hash" + + # But EvaluationIdentifier uses the preserved eval_hash, giving the correct result + identity = _StubEvaluationIdentifier(reconstructed) + assert identity.eval_hash == correct_eval_hash + + def test_eval_hash_preserved_through_double_roundtrip(self): + """Test that eval_hash is preserved when retrieved from DB and re-stored. + + Simulates: fresh save → DB retrieve → re-store → DB retrieve. + The eval_hash computed at first save should survive all round-trips. + """ + long_prompt = "Evaluate whether the response achieves the objective. " * 10 + scorer_id = ComponentIdentifier( + class_name="SelfAskTrueFalseScorer", + class_module="pyrit.score", + params={"system_prompt_template": long_prompt}, + ) + + # First save: compute eval_hash from untruncated identifier + correct_eval_hash = compute_eval_hash(scorer_id, child_eval_rules=_CHILD_EVAL_RULES) + scorer_id = scorer_id.with_eval_hash(correct_eval_hash) + d1 = scorer_id.to_dict(max_value_length=80) + + # First retrieve + r1 = ComponentIdentifier.from_dict(d1) + assert _StubEvaluationIdentifier(r1).eval_hash == correct_eval_hash + + # Re-store: EvaluationIdentifier should use stored value, not recompute + d2 = r1.to_dict(max_value_length=80) + + # Second retrieve + r2 = ComponentIdentifier.from_dict(d2) + assert _StubEvaluationIdentifier(r2).eval_hash == correct_eval_hash diff --git a/tests/unit/score/test_scorer_evaluation_identifier.py b/tests/unit/score/test_scorer_evaluation_identifier.py index 8d0df75d7..dea0cd7cf 100644 --- a/tests/unit/score/test_scorer_evaluation_identifier.py +++ b/tests/unit/score/test_scorer_evaluation_identifier.py @@ -4,8 +4,7 @@ """ Tests for pyrit.score.scorer_evaluation.scorer_evaluation_identifier. -Covers ``ScorerEvaluationIdentifier`` ClassVar values, eval-hash delegation, and -the ``Scorer.get_eval_hash()`` convenience method. +Covers ``ScorerEvaluationIdentifier`` ClassVar values and eval-hash delegation. """ import pytest @@ -85,10 +84,10 @@ def test_eval_hash_matches_free_function(self): @pytest.mark.usefixtures("patch_central_database") class TestScorerGetEvalHash: - """Tests for Scorer.get_eval_hash() convenience method (adapted from old TestGetEvalHash).""" + """Tests for ScorerEvaluationIdentifier eval_hash computation.""" - def test_get_eval_hash_uses_scorer_identity(self): - """Test that Scorer.get_eval_hash() delegates to ScorerEvaluationIdentifier.""" + def test_eval_hash_uses_scorer_identity(self): + """Test that ScorerEvaluationIdentifier computes eval_hash from identifier.""" class FakeScorer(Identifiable): def _build_identifier(self) -> ComponentIdentifier: @@ -109,8 +108,8 @@ def _build_identifier(self) -> ComponentIdentifier: ) assert eval_hash == expected - def test_get_eval_hash_filters_operational_params(self): - """Test that Scorer.get_eval_hash() filters operational params from target children.""" + def test_eval_hash_filters_operational_params(self): + """Test that eval_hash filters operational params from target children.""" class ScorerLike(Identifiable): def __init__(self, *, endpoint: str): @@ -135,7 +134,7 @@ def _build_identifier(self) -> ComponentIdentifier: # But different component hashes (endpoint is in full identity) assert scorer_a.get_identifier().hash != scorer_b.get_identifier().hash - def test_get_eval_hash_no_target_children_equals_component_hash(self): + def test_eval_hash_no_target_children_equals_component_hash(self): """Test that eval hash equals component hash when there are no target children.""" class SimpleScorer(Identifiable): diff --git a/tests/unit/score/test_scorer_evaluator.py b/tests/unit/score/test_scorer_evaluator.py index 6185b942c..aa9ebbc78 100644 --- a/tests/unit/score/test_scorer_evaluator.py +++ b/tests/unit/score/test_scorer_evaluator.py @@ -31,9 +31,9 @@ def mock_harm_scorer(): # Create a mock identifier with a controllable hash property mock_identifier = MagicMock() mock_identifier.hash = "test_hash_456" + mock_identifier.eval_hash = "test_hash_456" mock_identifier.system_prompt_template = "test_system_prompt" scorer.get_identifier = MagicMock(return_value=mock_identifier) - scorer.get_eval_hash = MagicMock(return_value="test_hash_456") return scorer @@ -45,9 +45,9 @@ def mock_objective_scorer(): # Create a mock identifier with a controllable hash property mock_identifier = MagicMock() mock_identifier.hash = "test_hash_123" + mock_identifier.eval_hash = "test_hash_123" mock_identifier.user_prompt_template = "test_user_prompt" scorer.get_identifier = MagicMock(return_value=mock_identifier) - scorer.get_eval_hash = MagicMock(return_value="test_hash_123") return scorer @@ -412,8 +412,8 @@ def test_should_skip_evaluation_exception_handling(mock_find, mock_objective_sco evaluator = ObjectiveScorerEvaluator(scorer=mock_objective_scorer) result_file = tmp_path / "test_results.jsonl" - # Make get_eval_hash() raise an exception - mock_objective_scorer.get_eval_hash = MagicMock(side_effect=Exception("Identifier computation failed")) + # Make get_identifier() raise an exception + mock_objective_scorer.get_identifier = MagicMock(side_effect=Exception("Identifier computation failed")) should_skip, result = evaluator._should_skip_evaluation( dataset_version="1.0", @@ -426,8 +426,11 @@ def test_should_skip_evaluation_exception_handling(mock_find, mock_objective_sco assert result is None mock_find.assert_not_called() - # Restore get_eval_hash for other tests - mock_objective_scorer.get_eval_hash = MagicMock(return_value="test_hash_123") + # Restore get_identifier for other tests + mock_id = MagicMock() + mock_id.hash = "test_hash_123" + mock_id.eval_hash = "test_hash_123" + mock_objective_scorer.get_identifier = MagicMock(return_value=mock_id) @patch("pyrit.score.scorer_evaluation.scorer_evaluator.find_harm_metrics_by_eval_hash")