From 61b61d4d6616f1137db9ab6886c843aa3467913d Mon Sep 17 00:00:00 2001
From: Jonathan Bnayahu
Date: Thu, 11 Sep 2025 14:11:51 +0300
Subject: [PATCH 1/5] Initial version of the safety benchmark, plus various
 fixes to the subsets.

Signed-off-by: Jonathan Bnayahu
---
 prepare/benchmarks/safety.py                  | 14 +++++++++++++
 prepare/cards/attaq.py                        |  5 +++--
 prepare/cards/safety/airbench2024.py          |  1 +
 prepare/cards/safety/mlcommons_ailuminate.py  |  4 +++-
 src/unitxt/catalog/benchmarks/safety.json     | 21 +++++++++++++++++++
 src/unitxt/catalog/cards/attaq.json           | 13 ++++++------
 .../cards/safety/mlcommons_ailuminate.json    |  2 +-
 src/unitxt/processors.py                      |  2 +-
 8 files changed, 51 insertions(+), 11 deletions(-)
 create mode 100644 prepare/benchmarks/safety.py
 create mode 100644 src/unitxt/catalog/benchmarks/safety.json

diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py
new file mode 100644
index 0000000000..fbb70d35df
--- /dev/null
+++ b/prepare/benchmarks/safety.py
@@ -0,0 +1,14 @@
+from unitxt.benchmark import Benchmark
+from unitxt.catalog import add_to_catalog
+from unitxt.standard import DatasetRecipe
+
+benchmark = Benchmark(
+    subsets={
+        "attaq": DatasetRecipe(card="cards.attaq"),
+        "provoq": DatasetRecipe(card="cards.safety.provoq"),
+        "airbench": DatasetRecipe(card="cards.safety.airbench2024"),
+        "ailuminate": DatasetRecipe(card="cards.safety.mlcommons_ailuminate"),
+    }
+)
+
+add_to_catalog(benchmark, "benchmarks.safety", overwrite=True)
diff --git a/prepare/cards/attaq.py b/prepare/cards/attaq.py
index b042f10b33..8068c0f22c 100644
--- a/prepare/cards/attaq.py
+++ b/prepare/cards/attaq.py
@@ -17,8 +17,9 @@
         Shuffle(page_size=2800),
     ],
     task=Task(
-        input_fields=["input"],
-        reference_fields=["label"],
+        input_fields={"input": str},
+        reference_fields={"label": str},
+        prediction_type=str,
         metrics=["metrics.safety_metric"],
     ),
     templates=[
diff --git a/prepare/cards/safety/airbench2024.py b/prepare/cards/safety/airbench2024.py
index 2d866dca3b..52b900291e 100644
--- a/prepare/cards/safety/airbench2024.py
+++ b/prepare/cards/safety/airbench2024.py
@@ -43,6 +43,7 @@
         },
         reference_fields={},
         prediction_type=str,
+        # TODO: The current implementation uses Granite Guardian as judge. Future versions could use a custom judge and use the judge_prompt field.
         metrics=[
             "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction]",
         ],
diff --git a/prepare/cards/safety/mlcommons_ailuminate.py b/prepare/cards/safety/mlcommons_ailuminate.py
index 6e3c8a58f0..d644b0f8f5 100644
--- a/prepare/cards/safety/mlcommons_ailuminate.py
+++ b/prepare/cards/safety/mlcommons_ailuminate.py
@@ -26,7 +26,9 @@
         },
         reference_fields={},
         prediction_type=str,
-        metrics=["metrics.llm_as_judge.safety.llamaguard"],
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]"
+        ],
     ),
     templates=TemplatesDict(
         {
diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json
new file mode 100644
index 0000000000..0840e22733
--- /dev/null
+++ b/src/unitxt/catalog/benchmarks/safety.json
@@ -0,0 +1,21 @@
+{
+    "__type__": "benchmark",
+    "subsets": {
+        "attaq": {
+            "__type__": "dataset_recipe",
+            "card": "cards.attaq"
+        },
+        "provoq": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.provoq"
+        },
+        "airbench": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.airbench2024"
+        },
+        "ailuminate": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.mlcommons_ailuminate"
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/attaq.json b/src/unitxt/catalog/cards/attaq.json
index 2c58f37b9a..6bc49ad878 100644
--- a/src/unitxt/catalog/cards/attaq.json
+++ b/src/unitxt/catalog/cards/attaq.json
@@ -18,12 +18,13 @@
     ],
     "task": {
         "__type__": "task",
-        "input_fields": [
-            "input"
-        ],
-        "reference_fields": [
-            "label"
-        ],
+        "input_fields": {
+            "input": "str"
+        },
+        "reference_fields": {
+            "label": "str"
+        },
+        "prediction_type": "str",
         "metrics": [
             "metrics.safety_metric"
         ]
diff --git a/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json b/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
index 89542cec61..33aa3262ff 100644
--- a/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
+++ b/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
@@ -22,7 +22,7 @@
         "reference_fields": {},
         "prediction_type": "str",
         "metrics": [
-            "metrics.llm_as_judge.safety.llamaguard"
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]"
         ]
     },
     "templates": {
diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py
index 6f13e10a33..a351999a75 100644
--- a/src/unitxt/processors.py
+++ b/src/unitxt/processors.py
@@ -326,7 +326,7 @@ def process_value(self, text: Any) -> Any:
         try:
             return float(match.group(1)) * 0.25 - 0.25
         except:
-            return np.NaN
+            return np.nan
 
 
 class ExtractMtBenchLabelJudgment(FieldOperator):

From ffa431538ffacdef5152599e946de35d8067a3de Mon Sep 17 00:00:00 2001
From: Jonathan Bnayahu
Date: Mon, 15 Sep 2025 16:24:53 +0300
Subject: [PATCH 2/5] GraniteGuardian-based metric for attaq

Signed-off-by: Jonathan Bnayahu
---
 prepare/benchmarks/safety.py                  | 25 +++++++++--
 prepare/cards/safety/attaq_gg.py              | 27 ++++++++++++
 src/unitxt/catalog/benchmarks/safety.json     | 19 +++++++--
 src/unitxt/catalog/cards/safety/attaq_gg.json | 42 +++++++++++++++++++
 4 files changed, 105 insertions(+), 8 deletions(-)
 create mode 100644 prepare/cards/safety/attaq_gg.py
 create mode 100644 src/unitxt/catalog/cards/safety/attaq_gg.json

diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py
index fbb70d35df..0abd288a0c 100644
--- a/prepare/benchmarks/safety.py
+++ b/prepare/benchmarks/safety.py
@@ -4,10 +4,27 @@
 
 benchmark = Benchmark(
     subsets={
-        "attaq": DatasetRecipe(card="cards.attaq"),
-        "provoq": DatasetRecipe(card="cards.safety.provoq"),
-        "airbench": DatasetRecipe(card="cards.safety.airbench2024"),
-        "ailuminate": DatasetRecipe(card="cards.safety.mlcommons_ailuminate"),
+        "attaq": DatasetRecipe(
+            card="cards.safety.attaq_gg",
+            template_card_index="default",
+            max_test_instances=500,
+        ),
+        "provoq": DatasetRecipe(
+            card="cards.safety.provoq",
+            template_card_index="default",
+            group_by=["group"],
+            max_test_instances=500,
+        ),
+        "airbench": DatasetRecipe(
+            card="cards.safety.airbench2024",
+            template_card_index="default",
+            max_test_instances=500,
+        ),
+        "ailuminate": DatasetRecipe(
+            card="cards.safety.mlcommons_ailuminate",
+            template_card_index="default",
+            max_test_instances=500,
+        ),
     }
 )
 
diff --git a/prepare/cards/safety/attaq_gg.py b/prepare/cards/safety/attaq_gg.py
new file mode 100644
index 0000000000..fb38f21fb2
--- /dev/null
+++ b/prepare/cards/safety/attaq_gg.py
@@ -0,0 +1,27 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
+from unitxt.operators import Shuffle
+from unitxt.splitters import RenameSplits
+from unitxt.test_utils.card import test_card
+
+card = TaskCard(
+    loader=LoadHF(path="ibm/AttaQ"),
+    preprocess_steps=[
+        RenameSplits(mapper={"train": "test"}),
+        Shuffle(page_size=2800),
+    ],
+    task=Task(
+        input_fields={"input": str},
+        reference_fields={"label": str},
+        prediction_type=str,
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]",
+        ],
+    ),
+    templates=TemplatesDict(
+        {"default": InputOutputTemplate(input_format="{input}", output_format="")}
+    ),
+)
+
+test_card(card, strict=False, demos_taken_from="test", num_demos=0)
+add_to_catalog(card, "cards.safety.attaq_gg", overwrite=True)
diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json
index 0840e22733..cf76e52cc1 100644
--- a/src/unitxt/catalog/benchmarks/safety.json
+++ b/src/unitxt/catalog/benchmarks/safety.json
@@ -3,19 +3,30 @@
     "subsets": {
         "attaq": {
             "__type__": "dataset_recipe",
-            "card": "cards.attaq"
+            "card": "cards.safety.attaq_gg",
+            "template_card_index": "default",
+            "max_test_instances": 500
         },
         "provoq": {
             "__type__": "dataset_recipe",
-            "card": "cards.safety.provoq"
+            "card": "cards.safety.provoq",
+            "template_card_index": "default",
+            "group_by": [
+                "group"
+            ],
+            "max_test_instances": 500
         },
         "airbench": {
             "__type__": "dataset_recipe",
-            "card": "cards.safety.airbench2024"
+            "card": "cards.safety.airbench2024",
+            "template_card_index": "default",
+            "max_test_instances": 500
         },
         "ailuminate": {
             "__type__": "dataset_recipe",
-            "card": "cards.safety.mlcommons_ailuminate"
+            "card": "cards.safety.mlcommons_ailuminate",
+            "template_card_index": "default",
+            "max_test_instances": 500
         }
     }
 }
diff --git a/src/unitxt/catalog/cards/safety/attaq_gg.json b/src/unitxt/catalog/cards/safety/attaq_gg.json
new file mode 100644
index 0000000000..5e71370f1d
--- /dev/null
+++ b/src/unitxt/catalog/cards/safety/attaq_gg.json
@@ -0,0 +1,42 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "ibm/AttaQ"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "rename_splits",
+            "mapper": {
+                "train": "test"
+            }
+        },
+        {
+            "__type__": "shuffle",
+            "page_size": 2800
+        }
+    ],
+    "task": {
+        "__type__": "task",
+        "input_fields": {
+            "input": "str"
+        },
+        "reference_fields": {
"str" + }, + "prediction_type": "str", + "metrics": [ + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]" + ] + }, + "templates": { + "__type__": "templates_dict", + "items": { + "default": { + "__type__": "input_output_template", + "input_format": "{input}", + "output_format": "" + } + } + } +} From eebe8eaa7d2f4a7a134b218d652bba70049d8456 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Mon, 15 Sep 2025 19:30:54 +0300 Subject: [PATCH 3/5] Added grouping Signed-off-by: Jonathan Bnayahu --- prepare/benchmarks/safety.py | 2 ++ src/unitxt/catalog/benchmarks/safety.json | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py index 0abd288a0c..0a88778831 100644 --- a/prepare/benchmarks/safety.py +++ b/prepare/benchmarks/safety.py @@ -18,11 +18,13 @@ "airbench": DatasetRecipe( card="cards.safety.airbench2024", template_card_index="default", + group_by=["l2-name"], max_test_instances=500, ), "ailuminate": DatasetRecipe( card="cards.safety.mlcommons_ailuminate", template_card_index="default", + group_by=["hazard"], max_test_instances=500, ), } diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json index cf76e52cc1..509e8e06fe 100644 --- a/src/unitxt/catalog/benchmarks/safety.json +++ b/src/unitxt/catalog/benchmarks/safety.json @@ -20,12 +20,18 @@ "__type__": "dataset_recipe", "card": "cards.safety.airbench2024", "template_card_index": "default", + "group_by": [ + "l2-name" + ], "max_test_instances": 500 }, "ailuminate": { "__type__": "dataset_recipe", "card": "cards.safety.mlcommons_ailuminate", "template_card_index": "default", + "group_by": [ + "hazard" + ], "max_test_instances": 500 } } From 8cd969e519471782b563439bb541fa48b8422a40 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Tue, 16 Sep 2025 08:11:12 +0300 Subject: [PATCH 4/5] . 
Signed-off-by: Jonathan Bnayahu
---
 prepare/benchmarks/safety.py              | 11 +++++++----
 src/unitxt/catalog/benchmarks/safety.json |  3 +++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py
index 0a88778831..a2a8241705 100644
--- a/prepare/benchmarks/safety.py
+++ b/prepare/benchmarks/safety.py
@@ -2,30 +2,33 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.standard import DatasetRecipe
 
+MAX_TEST_INSTANCES = 500
+
 benchmark = Benchmark(
     subsets={
         "attaq": DatasetRecipe(
             card="cards.safety.attaq_gg",
             template_card_index="default",
-            max_test_instances=500,
+            group_by=["label"],
+            max_test_instances=MAX_TEST_INSTANCES,
         ),
         "provoq": DatasetRecipe(
             card="cards.safety.provoq",
             template_card_index="default",
             group_by=["group"],
-            max_test_instances=500,
+            max_test_instances=MAX_TEST_INSTANCES,
         ),
         "airbench": DatasetRecipe(
             card="cards.safety.airbench2024",
             template_card_index="default",
             group_by=["l2-name"],
-            max_test_instances=500,
+            max_test_instances=MAX_TEST_INSTANCES,
         ),
         "ailuminate": DatasetRecipe(
             card="cards.safety.mlcommons_ailuminate",
             template_card_index="default",
             group_by=["hazard"],
-            max_test_instances=500,
+            max_test_instances=MAX_TEST_INSTANCES,
         ),
     }
 )
diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json
index 509e8e06fe..df52e56a5a 100644
--- a/src/unitxt/catalog/benchmarks/safety.json
+++ b/src/unitxt/catalog/benchmarks/safety.json
@@ -5,6 +5,9 @@
             "__type__": "dataset_recipe",
             "card": "cards.safety.attaq_gg",
             "template_card_index": "default",
+            "group_by": [
+                "label"
+            ],
             "max_test_instances": 500
         },
         "provoq": {

From 006580c014a9535a04025338fbb043dc130cba1d Mon Sep 17 00:00:00 2001
From: Jonathan Bnayahu
Date: Thu, 18 Sep 2025 09:17:52 +0300
Subject: [PATCH 5/5] Switch provoq to GG metric.

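A minimal sketch of smoke-testing the new card in isolation (assumes a
configured Granite Guardian judge backend; the canned refusal predictions are
placeholders, not real model output):

    from unitxt import evaluate, load_dataset

    ds = load_dataset(
        card="cards.safety.provoq_gg",
        template_card_index="default",
        max_test_instances=8,
        split="test",
    )
    predictions = ["I cannot help with that."] * len(ds)  # placeholder refusals
    results = evaluate(predictions=predictions, data=ds)
    print(results.global_scores.summary)
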
Signed-off-by: Jonathan Bnayahu
---
 prepare/benchmarks/safety.py                   |  4 +-
 prepare/cards/safety/provoq_gg.py              | 31 +++++++++++++++
 src/unitxt/catalog/benchmarks/safety.json      | 10 ++---
 .../catalog/cards/safety/provoq_gg.json        | 39 +++++++++++++++++++
 4 files changed, 77 insertions(+), 7 deletions(-)
 create mode 100644 prepare/cards/safety/provoq_gg.py
 create mode 100644 src/unitxt/catalog/cards/safety/provoq_gg.json

diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py
index a2a8241705..4ac87b1ba1 100644
--- a/prepare/benchmarks/safety.py
+++ b/prepare/benchmarks/safety.py
@@ -2,7 +2,7 @@
 from unitxt.catalog import add_to_catalog
 from unitxt.standard import DatasetRecipe
 
-MAX_TEST_INSTANCES = 500
+MAX_TEST_INSTANCES = 1000
 
 benchmark = Benchmark(
     subsets={
@@ -13,7 +13,7 @@
             max_test_instances=MAX_TEST_INSTANCES,
         ),
         "provoq": DatasetRecipe(
-            card="cards.safety.provoq",
+            card="cards.safety.provoq_gg",
             template_card_index="default",
             group_by=["group"],
             max_test_instances=MAX_TEST_INSTANCES,
diff --git a/prepare/cards/safety/provoq_gg.py b/prepare/cards/safety/provoq_gg.py
new file mode 100644
index 0000000000..54cd4d1adb
--- /dev/null
+++ b/prepare/cards/safety/provoq_gg.py
@@ -0,0 +1,31 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
+from unitxt.test_utils.card import test_card
+
+card = TaskCard(
+    loader=LoadHF(path="IBM/ProvoQ", data_classification_policy=["public"]),
+    task=Task(
+        input_fields={"question": str, "stigma": str, "group": str},
+        reference_fields={},
+        prediction_type=str,
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]",
+        ],
+    ),
+    templates=TemplatesDict(
+        {"default": InputOutputTemplate(input_format="{question}", output_format="")}
+    ),
+    __description__="The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
+    __tags__={
+        "languages": ["english"],
+    },
+)
+
+test_card(
+    card,
+    strict=False,
+    demos_taken_from="test",
+    num_demos=0,
+)
+
+add_to_catalog(card, "cards.safety.provoq_gg", overwrite=True)
diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json
index df52e56a5a..9b5cafff2b 100644
--- a/src/unitxt/catalog/benchmarks/safety.json
+++ b/src/unitxt/catalog/benchmarks/safety.json
@@ -8,16 +8,16 @@
             "group_by": [
                 "label"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         },
         "provoq": {
             "__type__": "dataset_recipe",
-            "card": "cards.safety.provoq",
+            "card": "cards.safety.provoq_gg",
             "template_card_index": "default",
             "group_by": [
                 "group"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         },
         "airbench": {
             "__type__": "dataset_recipe",
@@ -26,7 +26,7 @@
             "group_by": [
                 "l2-name"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         },
         "ailuminate": {
             "__type__": "dataset_recipe",
@@ -35,7 +35,7 @@
             "group_by": [
                 "hazard"
             ],
-            "max_test_instances": 500
+            "max_test_instances": 1000
         }
     }
 }
diff --git a/src/unitxt/catalog/cards/safety/provoq_gg.json b/src/unitxt/catalog/cards/safety/provoq_gg.json
new file mode 100644
index 0000000000..290308b8e4
--- /dev/null
+++ b/src/unitxt/catalog/cards/safety/provoq_gg.json
@@ -0,0 +1,39 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "IBM/ProvoQ",
+        "data_classification_policy": [
+            "public"
+        ]
+    },
+    "task": {
+        "__type__": "task",
+        "input_fields": {
+            "question": "str",
+            "stigma": "str",
+            "group": "str"
+        },
+        "reference_fields": {},
+        "prediction_type": "str",
+        "metrics": [
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]"
+        ]
+    },
+    "templates": {
+        "__type__": "templates_dict",
+        "items": {
+            "default": {
+                "__type__": "input_output_template",
+                "input_format": "{question}",
+                "output_format": ""
+            }
+        }
+    },
+    "__description__": "The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
+    "__tags__": {
+        "languages": [
+            "english"
+        ]
+    }
+}
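
For context, a minimal end-to-end sketch of running the assembled benchmark
(assumes unitxt's CrossProviderInferenceEngine and a configured Granite
Guardian judge backend; the model name and provider below are placeholders):

    from unitxt import evaluate, load_dataset
    from unitxt.inference import CrossProviderInferenceEngine

    # Load the full benchmark; each subset is capped at MAX_TEST_INSTANCES
    # (1000 as of PATCH 5/5) test instances.
    dataset = load_dataset("benchmarks.safety", split="test")

    model = CrossProviderInferenceEngine(
        model="llama-3-8b-instruct", provider="watsonx"
    )
    predictions = model.infer(dataset)

    results = evaluate(predictions=predictions, data=dataset)
    print(results.subsets_scores.summary)  # per-subset (and per-group) scores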