|
22 | 22 | from dataclasses import asdict |
23 | 23 | from pathlib import Path |
24 | 24 |
|
25 | | -from pyrit.common.path import SCORER_EVALS_HARM_PATH, SCORER_EVALS_OBJECTIVE_PATH |
| 25 | +from pyrit.common.path import SCORER_EVALS_HARM_PATH, SCORER_EVALS_TRUE_FALSE_PATH |
26 | 26 | from pyrit.prompt_target import OpenAIChatTarget |
27 | 27 | from pyrit.score import ( |
28 | 28 | LikertScalePaths, |
|
33 | 33 | SelfAskRefusalScorer, |
34 | 34 | SelfAskTrueFalseScorer, |
35 | 35 | ) |
36 | | -from pyrit.score.true_false.self_ask_true_false_scorer import TRUE_FALSE_QUESTIONS_PATH |
37 | 36 | from pyrit.setup import IN_MEMORY, initialize_pyrit |
38 | 37 |
|
39 | 38 | initialize_pyrit(memory_db_type=IN_MEMORY) |
40 | 39 | target = OpenAIChatTarget() |
41 | 40 |
|
42 | | - |
43 | 41 | # %% [markdown] |
44 | 42 | # ## Running Harm Scorer Evaluation |
45 | 43 | # The following cell demonstrates how to run a custom harm scoring evaluation on a `Scorer` using a dataset of LLM assistant responses and human labels stored as a CSV file. Here, we use a small sample `hate_speech` dataset of assistant responses and human scores and run a harm evaluation using the `SelfAskLikertScorer` with a Likert scale for hate speech. Sample datasets and saved metrics can be found in the `datasets/score/scorer_evals` directory. |
46 | 44 |
|
| 45 | +# %% [markdown] |
| 46 | +# ### Understanding Harm Scorer Metrics |
| 47 | + |
| 48 | +# %% [markdown] |
| 49 | +# When evaluating harm scorers, several metrics are computed to assess how well the model scores align with human evaluations: |
| 50 | +# |
| 51 | +# **Error Metrics:** |
| 52 | +# - **`mean_absolute_error` (MAE)**: The average absolute difference between model scores and the mean of human scores (gold label). Lower values indicate better alignment. For example, an MAE of 0.15 means the model's scores differ from human scores by 0.15 on average (on a 0-1 scale). |
| 53 | +# - **`mae_standard_error`**: The standard error of the MAE, used to calculate confidence intervals. Indicates the precision of the MAE estimate. |
| 54 | +# |
| 55 | +# **Statistical Significance:** |
| 56 | +# - **`t_statistic`**: From a one-sample t-test comparing model scores to human scores. A high positive value suggests the model systematically scores higher than humans; a high negative value suggests it scores lower. |
| 57 | +# - **`p_value`**: The probability of observing the difference between model and human scores by chance. Values < 0.05 typically indicate statistically significant differences. |
| 58 | +# |
| 59 | +# **Inter-Rater Reliability (Krippendorff's Alpha):** |
| 60 | +# |
| 61 | +# Krippendorff's alpha measures agreement between evaluators, ranging from -1.0 to 1.0: |
| 62 | +# - **1.0**: Perfect agreement |
| 63 | +# - **0.8-1.0**: Strong agreement |
| 64 | +# - **0.6-0.8**: Moderate agreement |
| 65 | +# - **0.0**: Agreement equivalent to chance |
| 66 | +# - **< 0.0**: Systematic disagreement |
| 67 | +# |
| 68 | +# Three alpha values are reported: |
| 69 | +# - **`krippendorff_alpha_humans`**: Agreement among human evaluators only. Measures consistency of the human-labeled "ground truth." Only computed when multiple human raters exist. |
| 70 | +# - **`krippendorff_alpha_model`**: Agreement among multiple model scoring trials. Measures model consistency/reliability. Only computed when `num_scorer_trials > 1`. |
| 71 | +# - **`krippendorff_alpha_combined`**: Agreement across all evaluators (humans + model trials combined). Measures how well the model aligns with the pool of human evaluators. |
| 72 | + |
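
To make these definitions concrete, here is a minimal, purely illustrative sketch of how such metrics can be computed for a toy set of scores. It is not PyRIT's internal implementation (whose exact formulas may differ); it assumes `numpy`, `scipy`, and the third-party `krippendorff` package are available.

```python
# Illustrative sketch only -- not PyRIT's internal implementation.
import numpy as np
import krippendorff  # pip install krippendorff
from scipy import stats

# Toy data: 3 human raters and 1 model trial scoring 5 responses on a 0-1 scale.
human_scores = np.array([
    [0.00, 0.25, 0.50, 1.00, 0.75],  # human rater 1
    [0.00, 0.25, 0.75, 1.00, 0.75],  # human rater 2
    [0.25, 0.25, 0.50, 0.75, 1.00],  # human rater 3
])
model_scores = np.array([0.00, 0.50, 0.50, 1.00, 0.75])  # one scorer trial

gold = human_scores.mean(axis=0)  # mean human score per response ("gold label")

# Error metrics
abs_err = np.abs(model_scores - gold)
mae = abs_err.mean()
mae_standard_error = abs_err.std(ddof=1) / np.sqrt(len(abs_err))

# One-sample t-test of the per-response differences (model - gold) against 0
t_statistic, p_value = stats.ttest_1samp(model_scores - gold, popmean=0.0)

# Krippendorff's alpha (interval level): each row is a rater, each column a response.
# With a single model trial, alpha among model trials cannot be computed.
alpha_humans = krippendorff.alpha(reliability_data=human_scores, level_of_measurement="interval")
alpha_combined = krippendorff.alpha(
    reliability_data=np.vstack([human_scores, model_scores]), level_of_measurement="interval"
)

print(f"MAE={mae:.3f} (SE={mae_standard_error:.3f}), t={t_statistic:.2f}, p={p_value:.3f}")
print(f"alpha_humans={alpha_humans:.2f}, alpha_combined={alpha_combined:.2f}")
```
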
| 73 | +# %% [markdown] |
| 74 | +# ### Multiple Evaluators |
| 75 | +# |
| 76 | +# With multiple human evaluators, we can measure inter-rater reliability among the evaluators, as shown below: |
| 77 | + |
47 | 78 | # %% |
48 | 79 | target = OpenAIChatTarget() |
49 | 80 | likert_scorer = SelfAskLikertScorer(chat_target=target, likert_scale_path=LikertScalePaths.HATE_SPEECH_SCALE.value) |
50 | 81 |
|
51 | 82 | # factory method that creates a HarmScorerEvaluator in this case since metrics_type is HARM. |
52 | 83 | evaluator = ScorerEvaluator.from_scorer(scorer=likert_scorer, metrics_type=MetricsType.HARM) |
53 | | -csv_path = f"{str(SCORER_EVALS_HARM_PATH)}/SAMPLE_hate_speech.csv" |
| 84 | +csv_path = f"{str(SCORER_EVALS_HARM_PATH)}/mini_hate_speech.csv" |
54 | 85 | # Uncomment the line below to use the full dataset of approx 200 entries |
55 | | -# csv_path = f"{str(SCORER_EVALS_HARM_PATH)}/hate_speech.csv" |
| 86 | +# csv_path = f"{str(SCORER_EVALS_HARM_PATH)}/hate_speech_multi_score.csv" |
56 | 87 |
|
57 | | -# The dataset_name defaults to "SAMPLE_hate_speech" but can be manually set to any string. Each response is graded once by the scorer. |
| 88 | +# The dataset_name defaults to "mini_hate_speech" but can be manually set to any string. Each response is graded once by the scorer. |
58 | 89 | metrics = await evaluator.run_evaluation_from_csv_async( # type:ignore |
59 | 90 | csv_path=csv_path, |
60 | 91 | assistant_response_col_name="assistant_response", |
61 | 92 | human_label_col_names=["human_score_1", "human_score_2", "human_score_3"], |
62 | 93 | objective_or_harm_col_name="category", |
63 | 94 | num_scorer_trials=1, |
| 95 | + dataset_name="mini_hate_speech", |
| 96 | + assistant_response_data_type_col_name=None, |
64 | 97 | ) |
65 | 98 |
|
66 | | -# Metrics are saved to datasets/score/scorer_evals/harm/SAMPLE_hate_speech_SelfAskLikertScorer_metrics.json |
67 | | -# Results from the model scoring trials are saved to datasets/score/scorer_evals/harm/SAMPLE_hate_speech_SelfAskLikertScorer_scoring_results.csv |
| 99 | +# Metrics are saved to datasets/score/scorer_evals/harm/results/mini_hate_speech_metrics.json |
| 100 | +# Results from the model scoring trials are saved to datasets/score/scorer_evals/harm/results/mini_hate_speech_scoring_results.csv |
68 | 101 | asdict(metrics) |
69 | 102 |
|
| 103 | +# %% [markdown] |
| 104 | +# ### Single Evaluator |
| 105 | +# |
| 106 | +# The sample datasets below have only one human evaluator, so the inter-rater reliability metric among human evaluators (`krippendorff_alpha_humans`) is not computed. |
| 107 | + |
| 108 | +# %% |
| 109 | +from pyrit.score.scorer_evaluation.config_eval_datasets import get_harm_eval_datasets |
| 110 | + |
| 111 | +harm_categories_to_evaluate = ["sexual_content"] |
| 112 | + |
| 113 | +for harm_category in harm_categories_to_evaluate: |
| 114 | + harm_category_map = get_harm_eval_datasets(category=harm_category, metrics_type="harm") |
| 115 | + |
| 116 | + eval_rubric_path = harm_category_map["evaluation_rubric_file_path"] |
| 117 | + csv_path = str(Path(harm_category_map["dataset_file_path"])) |
| 118 | + |
| 119 | + likert_scorer = SelfAskLikertScorer(chat_target=target, likert_scale_path=eval_rubric_path) |
| 120 | + |
| 121 | + evaluator = ScorerEvaluator.from_scorer(scorer=likert_scorer, metrics_type=MetricsType.HARM) |
| 122 | + |
| 123 | + # assistant_response_data_type_col_name is optional and can be used to specify the type of data for each response in the assistant response column. |
| 124 | + metrics = await evaluator.run_evaluation_from_csv_async( # type:ignore |
| 125 | + csv_path=csv_path, |
| 126 | + assistant_response_col_name="assistant_response", |
| 127 | + human_label_col_names=["normalized_score_1"], |
| 128 | + objective_or_harm_col_name="category", |
| 129 | + num_scorer_trials=1, |
| 130 | + assistant_response_data_type_col_name=None, |
| 131 | + dataset_name=harm_category_map["dataset_name"], |
| 132 | + ) |
| 133 | + |
| 134 | + print("Evaluation for harm category:", harm_category) |
| 135 | + print(asdict(metrics)) |
| 136 | + |
70 | 137 | # %% [markdown] |
71 | 138 | # ## Retrieving Metrics |
72 | | -# You can retrieve the metrics from the above evaluation by calling the `get_scorer_metrics` from the `ScorerEvaluator` class or directly from the `Scorer` class and passing in the `dataset_name` (which in this case is `SAMPLE_hate_speech`). This will throw an error if evaluation has not yet been run on that dataset. |
| 139 | +# You can retrieve the metrics from the above evaluation by calling `get_scorer_metrics` on the `ScorerEvaluator` class or directly on the `Scorer` class, passing in the `dataset_name` (which in this case is `mini_hate_speech`). This will throw an error if an evaluation has not yet been run on that dataset. |
73 | 140 |
|
74 | 141 | # %% |
75 | 142 | # Either call works for fetching the hate_speech metrics |
76 | | -evaluator.get_scorer_metrics(dataset_name="SAMPLE_hate_speech") |
77 | | -likert_scorer.get_scorer_metrics(dataset_name="SAMPLE_hate_speech", metrics_type=MetricsType.HARM) |
| 143 | +evaluator.get_scorer_metrics(dataset_name="mini_hate_speech") |
| 144 | +likert_scorer.get_scorer_metrics(dataset_name="mini_hate_speech", metrics_type=MetricsType.HARM) |
78 | 145 |
|
79 | 146 | # Retrieve metrics for the full hate_speech dataset that have already been computed and saved by the PyRIT team. |
80 | 147 | # full_metrics = likert_scorer.get_scorer_metrics(dataset_name="hate_speech") |
|
83 | 150 | # ## Running Objective Scorer Evaluation |
84 | 151 | # The following cell demonstrates how to run a custom objective evaluation on a `Scorer` using a dataset of LLM assistant responses and human labels stored as a CSV file. This is much like the previous example, except that we use the `SelfAskRefusalScorer`, which simply determines whether or not the model response was a refusal. |
85 | 152 |
|
| 153 | +# %% [markdown] |
| 154 | +# ### Understanding Objective Scorer Metrics |
| 155 | + |
| 156 | +# %% [markdown] |
| 157 | +# When evaluating objective (true/false) scorers, the following metrics are computed using the normalized human score as the gold label: |
| 158 | +# |
| 159 | +# - **`accuracy`**: The proportion of responses where the model's overall score matches the human overall score. Ranges from 0.0 to 1.0, where 1.0 means perfect agreement. |
| 160 | +# - **`accuracy_standard_error`**: The standard error of the accuracy estimate, useful for constructing confidence intervals. |
| 161 | +# - **`precision`**: Of all responses the model labeled as positive (True), what proportion were actually positive according to humans? High precision means few false positives. |
| 162 | +# - **`recall`**: Of all responses that were actually positive according to humans, what proportion did the model correctly identify? High recall means few false negatives. |
| 163 | +# - **`f1_score`**: The harmonic mean of precision and recall, providing a balanced measure of the model's performance. Ranges from 0.0 to 1.0. |
| 164 | +# |
| 165 | +# **Example Interpretation:** |
| 166 | +# If a refusal scorer has accuracy=0.92, precision=0.95, recall=0.88, and f1_score=0.91, this means: |
| 167 | +# - The model agrees with the human normalized score 92% of the time |
| 168 | +# - When the model says "this is a refusal," it's correct 95% of the time |
| 169 | +# - The model catches 88% of actual refusals (missing 12%) |
| 170 | +# - Overall performance is strong (F1=0.91) |
| 171 | + |
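
As a concrete reference for how these quantities relate, the snippet below computes them from toy boolean labels. It is a minimal sketch, not PyRIT's internal implementation (e.g., the exact standard-error estimate may differ).

```python
# Illustrative sketch only -- not PyRIT's internal implementation.
import numpy as np

# Toy data: human gold labels and model predictions for 10 responses (True = refusal).
human = np.array([1, 1, 1, 1, 0, 0, 0, 0, 1, 0], dtype=bool)
model = np.array([1, 1, 1, 0, 0, 0, 0, 1, 1, 0], dtype=bool)

tp = np.sum(model & human)   # true positives
fp = np.sum(model & ~human)  # false positives
fn = np.sum(~model & human)  # false negatives

accuracy = np.mean(model == human)
# Standard error of a proportion (one common estimate)
accuracy_standard_error = np.sqrt(accuracy * (1 - accuracy) / len(human))
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * precision * recall / (precision + recall)

print(f"accuracy={accuracy:.2f} (SE={accuracy_standard_error:.2f}), "
      f"precision={precision:.2f}, recall={recall:.2f}, f1={f1_score:.2f}")
```
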
86 | 172 | # %% |
87 | 173 | refusal_scorer = SelfAskRefusalScorer(chat_target=target) |
88 | 174 |
|
89 | 175 | # factory method that creates an ObjectiveScorerEvaluator in this case because the refusal scorer is a true/false scorer. |
90 | 176 | evaluator = ScorerEvaluator.from_scorer(scorer=refusal_scorer) |
91 | | -csv_path = f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_09_22_2025/SAMPLE_mixed_objective_refusal.csv" |
| 177 | +csv_path = f"{str(SCORER_EVALS_TRUE_FALSE_PATH)}/mini_refusal.csv" |
92 | 178 | # Uncomment the line below to use the full dataset of approx 200 entries |
93 | | -# csv_path = f"{str(SCORER_EVALS_OBJECTIVE_PATH)}//evaluation_datasets_09_22_2025/mixed_objective_refusal.csv" |
| 179 | +# csv_path = f"{str(SCORER_EVALS_TRUE_FALSE_PATH)}/refusal.csv" |
94 | 180 |
|
95 | 181 | # assistant_response_data_type_col_name is optional and can be used to specify the type of data for each response in the assistant response column. |
96 | 182 | metrics = await evaluator.run_evaluation_from_csv_async( # type:ignore |
97 | 183 | csv_path=csv_path, |
98 | 184 | assistant_response_col_name="assistant_response", |
99 | | - human_label_col_names=["human_score"], |
| 185 | + human_label_col_names=["normalized_score"], |
100 | 186 | objective_or_harm_col_name="objective", |
101 | 187 | assistant_response_data_type_col_name="data_type", |
102 | 188 | num_scorer_trials=1, |
103 | 189 | ) |
104 | 190 |
|
105 | | -refusal_scorer.get_scorer_metrics(dataset_name="SAMPLE_mixed_objective_refusal") |
| 191 | +refusal_scorer.get_scorer_metrics(dataset_name="mini_refusal") |
106 | 192 |
|
107 | 193 | # Retrieve metrics for the full refusal scorer dataset that have already been computed and saved by the PyRIT team. |
108 | | -# full_metrics = likert_scorer.get_scorer_metrics(dataset_name="mixed_objective_refusal") |
| 194 | +# full_metrics = refusal_scorer.get_scorer_metrics(dataset_name="refusal") |
109 | 195 |
|
110 | 196 | # %% [markdown] |
111 | 197 | # In this case we use a `true_false_scorer`, which determines whether or not the responses are harmful according to their harm category. |
112 | 198 |
|
113 | 199 | # %% |
114 | | -# Map harm categories to their respective dataset file paths and evaluation rubric file paths |
115 | | -# This allows for easy extension to new harm categories by or editing entries in this map. |
116 | | -# This also allows for easy editing of dataset file paths and evaluation rubric file paths without changing code logic. |
117 | | -harm_category_map = { |
118 | | - "exploit": { |
119 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/exploit_eval_dataset.csv", |
120 | | - "dataset_name": "exploit_eval_dataset", |
121 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "exploits.yaml")), |
122 | | - }, |
123 | | - "hate_speech": { |
124 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/hate_speech_eval_dataset.csv", |
125 | | - "dataset_name": "hate_speech_eval_dataset", |
126 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "hate_speech.yaml")), |
127 | | - }, |
128 | | - "information_integrity": { |
129 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/information_integrity_eval_dataset.csv", |
130 | | - "dataset_name": "information_integrity_eval_dataset", |
131 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "information_integrity.yaml")), |
132 | | - }, |
133 | | - "privacy": { |
134 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/privacy_eval_dataset.csv", |
135 | | - "dataset_name": "privacy_eval_dataset", |
136 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "privacy.yaml")), |
137 | | - }, |
138 | | - "self-harm": { |
139 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/self_harm_eval_dataset.csv", |
140 | | - "dataset_name": "self_harm_eval_dataset", |
141 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "self-harm.yaml")), |
142 | | - }, |
143 | | - "sexual_content": { |
144 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/sexual_content_eval_dataset.csv", |
145 | | - "dataset_name": "sexual_content_eval_dataset", |
146 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "sexual_content.yaml")), |
147 | | - }, |
148 | | - "violence": { |
149 | | - "dataset_file_path": f"{str(SCORER_EVALS_OBJECTIVE_PATH)}/evaluation_datasets_10_21_2025/violence_eval_dataset.csv", |
150 | | - "dataset_name": "violence_eval_dataset", |
151 | | - "evaluation_rubric_file_path": str(Path(TRUE_FALSE_QUESTIONS_PATH, "violence.yaml")), |
152 | | - }, |
153 | | -} |
| 200 | +from pyrit.score.scorer_evaluation.config_eval_datasets import get_harm_eval_datasets |
154 | 201 |
|
155 | 202 | # set this list to the categories you want to evaluate |
156 | 203 | harm_categories_to_evaluate = ["information_integrity"] |
157 | 204 |
|
158 | 205 | for harm_category in harm_categories_to_evaluate: |
159 | | - if harm_category not in harm_category_map: |
160 | | - raise ValueError( |
161 | | - f"Harm category '{harm_category}' not found in harm_category_map. Please add it to the map with the appropriate dataset and rubric file paths." |
162 | | - ) |
163 | | - eval_rubric_path = harm_category_map[harm_category]["evaluation_rubric_file_path"] |
164 | | - csv_path = str(Path(harm_category_map[harm_category]["dataset_file_path"])) |
165 | | - dataset_name = harm_category_map[harm_category]["dataset_name"] |
| 206 | + harm_category_map = get_harm_eval_datasets(category=harm_category, metrics_type="objective") |
| 207 | + eval_rubric_path = harm_category_map["evaluation_rubric_file_path"] |
| 208 | + csv_path = str(Path(harm_category_map["dataset_file_path"])) |
| 209 | + dataset_name = harm_category_map["dataset_name"] |
166 | 210 |
|
167 | 211 | true_false_scorer = SelfAskTrueFalseScorer(true_false_question_path=Path(eval_rubric_path), chat_target=target) |
168 | 212 |
|
169 | 213 | evaluator: ObjectiveScorerEvaluator = ScorerEvaluator.from_scorer(scorer=true_false_scorer) # type: ignore |
170 | 214 |
|
171 | | - # assistant_response_data_type_col_name is optional and can be used to specify the type of data for each response in the assistant response column. |
172 | 215 | metrics = await evaluator.run_evaluation_from_csv_async( # type:ignore |
173 | 216 | csv_path=csv_path, |
174 | 217 | assistant_response_col_name="assistant_response", |
175 | | - human_label_col_names=["human_score"], |
| 218 | + human_label_col_names=["normalized_score"], |
176 | 219 | objective_or_harm_col_name="objective", |
177 | 220 | assistant_response_data_type_col_name="data_type", |
178 | 221 | num_scorer_trials=1, |
|