5 changes: 1 addition & 4 deletions doc/code/orchestrators/qa_benchmark_orchestrator.ipynb
@@ -22,10 +22,7 @@
"\n",
"from pyrit.common import IN_MEMORY, initialize_pyrit\n",
"from pyrit.datasets import fetch_wmdp_dataset\n",
"from pyrit.models import (\n",
" QuestionAnsweringEntry,\n",
" QuestionChoice,\n",
")\n",
"from pyrit.models import QuestionAnsweringEntry, QuestionChoice\n",
"from pyrit.orchestrator import QuestionAnsweringBenchmarkOrchestrator\n",
"from pyrit.prompt_target import OpenAIChatTarget\n",
"from pyrit.score.self_ask_question_answer_scorer import SelfAskQuestionAnswerScorer\n",
7 changes: 4 additions & 3 deletions pyrit/datasets/adv_bench_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_adv_bench_dataset(
@@ -67,6 +66,8 @@ def fetch_adv_bench_dataset(
source=str(Path(DATASETS_PATH) / "data" / "adv_bench_dataset.json"), source_type="file", cache=cache
)

HarmCategory._initialize_aliases()

filtered = dataset["data"] # type: ignore

if main_categories or sub_categories:
@@ -87,7 +88,7 @@
data_type="text",
name="AdvBench Dataset [Extended]",
dataset_name="AdvBench Dataset",
harm_categories=item["main_categories"] + item["sub_categories"],
harm_categories=[HarmCategory.parse(cat) for cat in item["main_categories"] + item["sub_categories"]],
description="""AdvBench is a set of 520 harmful behaviors formulated as instructions. This dataset
has been extended to include harm categories for better filtering and analysis. The adversary's goal
is instead to find a single attack string that will cause the model to generate any response that
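The same two-step pattern recurs in every fetcher touched by this PR: build the alias table once, then parse each raw label into a `HarmCategory` member. A minimal sketch of that pattern; the raw labels shown are hypothetical and assumed to be among the aliases `parse` recognizes.

```python
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical raw labels of the kind stored in adv_bench_dataset.json.
raw_categories = ["Hate Speech", "Illegal Activity"]
parsed = [HarmCategory.parse(cat) for cat in raw_categories]
# e.g. [HarmCategory.HATESPEECH, HarmCategory.ILLEGAL], assuming those aliases resolve
```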
14 changes: 9 additions & 5 deletions pyrit/datasets/aya_redteaming_dataset.py
@@ -6,8 +6,7 @@
from typing import List, Literal, Optional

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_aya_redteaming_dataset(
@@ -75,19 +74,24 @@
data_home=data_home,
)

HarmCategory._initialize_aliases()

parsed_filter_categories = [HarmCategory.parse(c) for c in harm_categories] if harm_categories else None

seed_prompts = []

for example in examples:
categories = ast.literal_eval(example["harm_category"])
if harm_categories is None or any(cat in categories for cat in harm_categories):
raw_categories = ast.literal_eval(example["harm_category"])
parsed_categories = [HarmCategory.parse(c) for c in raw_categories]
if parsed_filter_categories is None or any(cat in parsed_categories for cat in parsed_filter_categories):
if harm_scope is None or example["global_or_local"] == harm_scope:
seed_prompts.append(
SeedPrompt(
value=example["prompt"],
data_type="text",
name="Aya Red-teaming Examples",
dataset_name="Aya Red-teaming Examples",
harm_categories=categories,
harm_categories=parsed_categories,
source="https://huggingface.co/datasets/CohereForAI/aya_redteaming",
)
)
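In the Aya fetcher the caller's `harm_categories` filter and the per-example labels are now both parsed before comparison, so matching happens on enum members rather than raw strings. A minimal sketch of that matching, assuming the two spellings below resolve to the same member.

```python
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical caller filter and dataset labels.
requested = [HarmCategory.parse(c) for c in ["Hate Speech"]]
example_labels = [HarmCategory.parse(c) for c in ["hatespeech", "Violence"]]

# Enum comparison tolerates alias/spelling differences a raw string comparison would miss.
keep_example = any(cat in example_labels for cat in requested)
```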
7 changes: 4 additions & 3 deletions pyrit/datasets/ccp_sensitive_prompts_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
@@ -25,14 +24,16 @@ def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
split="train",
)

HarmCategory._initialize_aliases()

return SeedPromptDataset(
prompts=[
SeedPrompt(
value=row["prompt"],
data_type="text",
name="",
dataset_name="CCP-sensitive-prompts",
harm_categories=[row["subject"]],
harm_categories=[HarmCategory.parse(row["subject"])],
description=("Prompts covering topics sensitive to the CCP."),
groups=["promptfoo"],
source="https://huggingface.co/datasets/promptfoo/CCP-sensitive-prompts",
7 changes: 4 additions & 3 deletions pyrit/datasets/darkbench_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_darkbench_dataset() -> SeedPromptDataset:
@@ -22,13 +21,15 @@ def fetch_darkbench_dataset() -> SeedPromptDataset:
"""
data = load_dataset("apart/darkbench", "default", split="train", data_files="darkbench.tsv")

HarmCategory._initialize_aliases()

seed_prompts = [
SeedPrompt(
value=item["Example"],
data_type="text",
name="",
dataset_name="DarkBench",
harm_categories=[item["Deceptive Pattern"]],
harm_categories=[HarmCategory.parse(item["Deceptive Pattern"])],
description=(
"DarkBench is a comprehensive benchmark designed to detect dark design patterns in large language"
" models (LLMs). Dark patterns are manipulative techniques that influence user behavior, often against"
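A usage sketch of the updated fetcher; it assumes `fetch_darkbench_dataset` is re-exported from `pyrit.datasets` and that the returned `SeedPromptDataset` exposes its prompts via `.prompts`.

```python
from pyrit.datasets import fetch_darkbench_dataset
from pyrit.models import HarmCategory

dataset = fetch_darkbench_dataset()

# After this change harm_categories holds HarmCategory members, not raw strings.
first = dataset.prompts[0]
assert all(isinstance(cat, HarmCategory) for cat in first.harm_categories)
print(first.harm_categories)
```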
7 changes: 4 additions & 3 deletions pyrit/datasets/decoding_trust_stereotypes_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_decoding_trust_stereotypes_dataset(
@@ -51,6 +50,8 @@ def fetch_decoding_trust_stereotypes_dataset(
Ritik Dutta, Rylan Schaeffer, Sang T. Truong, Simran Arora, Mantas Mazeika, Dan Hendrycks, Zinan Lin,
Yu Cheng, Sanmi Koyejo, Dawn Song, Bo Li. is available at https://arxiv.org/abs//2306.11698
"""
HarmCategory._initialize_aliases()

if system_prompt_type not in ["benign", "untargeted", "targeted"]:
raise ValueError("Invalid system_prompt_type. Expected 'benign', 'untargeted', or 'targeted'.")
# Determine the file type from the source URL
@@ -105,7 +106,7 @@ def fetch_decoding_trust_stereotypes_dataset(
data_type="text",
name="DecodingTrust Stereotypes Examples",
dataset_name="DecodingTrust Stereotypes Examples",
harm_categories=["stereotypes"],
harm_categories=[HarmCategory.parse("stereotypes")],
description="""A dataset of Decoding Trust 'Stereotypes' example prompts across many different topics
and target groups. These prompts attempt to convince the target to agree with a harmful
stereotype.""",
7 changes: 4 additions & 3 deletions pyrit/datasets/equitymedqa_dataset.py
@@ -5,8 +5,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset

# These are the available subsets of the EquityMedQA dataset
DATA_SUBSETS: list[str] = [
@@ -66,6 +65,8 @@ def fetch_equitymedqa_dataset_unique_values(
"""
prompts: list[str] = []

HarmCategory._initialize_aliases()

if subset_name == "all": # get all subsets
targets: list[str] = DATA_SUBSETS

@@ -89,7 +90,7 @@
name="katielink/EquityMedQA",
dataset_name="katielink/EquityMedQA",
description="This dataset contains prompts used to assess medical biases in AI systems",
harm_categories=["health_bias"],
harm_categories=[HarmCategory.parse("health_bias")],
source="https://huggingface.co/datasets/katielink/EquityMedQA",
)
for prompt in prompts
7 changes: 4 additions & 3 deletions pyrit/datasets/forbidden_questions_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
@@ -20,6 +19,8 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
"""
data = load_dataset("TrustAIRLab/forbidden_question_set", "default")

HarmCategory._initialize_aliases()

authors = ["Xinyue Shen", "Zeyuan Chen", "Michael Backes", "Yun Shen", "Yang Zhang"]
seed_prompts = [
SeedPrompt(
@@ -28,7 +29,7 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
name="TrustAIRLab/forbidden_question_set",
dataset_name="TrustAIRLab/forbidden_question_set",
authors=authors,
harm_categories=item["content_policy_name"],
harm_categories=[HarmCategory.parse(item["content_policy_name"])],
source="https://huggingface.co/datasets/TrustAIRLab/forbidden_question_set",
description="""This is the Forbidden Question Set dataset proposed in the ACM CCS 2024 paper
"Do Anything Now'': Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models.
11 changes: 10 additions & 1 deletion pyrit/datasets/harmbench_dataset.py
@@ -6,6 +6,7 @@

from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt


Expand Down Expand Up @@ -41,6 +42,9 @@ def fetch_harmbench_dataset(
valid_types = ", ".join(FILE_TYPE_HANDLERS.keys())
raise ValueError(f"Invalid file_type. Expected one of: {valid_types}.")

# Initialize aliases for associated harm categories
HarmCategory._initialize_aliases()

# Required keys to validate each example
required_keys = {"Behavior", "SemanticCategory"}

@@ -62,13 +66,18 @@ def fetch_harmbench_dataset(
prompts.append(example["Behavior"])
semantic_categories.add(example["SemanticCategory"])

# Parse the collected semantic categories into HarmCategory enums
parsed_semantic_categories = [
HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER for cat in semantic_categories
]

seed_prompts = [
SeedPrompt(
value=example,
data_type="text",
name="HarmBench Examples",
dataset_name="HarmBench Examples",
harm_categories=list(semantic_categories),
harm_categories=parsed_semantic_categories, # type: ignore
description="A dataset of HarmBench examples containing various categories such as chemical,"
"biological, illegal activities, etc.",
)
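HarmBench collects `SemanticCategory` values into a set before parsing, and the guard above sends anything that is not a string to `HarmCategory.OTHER`. A sketch of that normalization with hypothetical values.

```python
from pyrit.models.harm_category import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical SemanticCategory values; None stands in for a malformed row.
semantic_categories = {"chemical_biological", None}

parsed = [
    HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER
    for cat in semantic_categories
]
```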
11 changes: 7 additions & 4 deletions pyrit/datasets/librAI_do_not_answer_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
@@ -27,14 +26,18 @@ def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
"""
# Load dataset from Hugging Face
data = load_dataset("LibrAI/do-not-answer", split="train")

HarmCategory._initialize_aliases()
seed_prompts = [
SeedPrompt(
value=entry["question"],
data_type="text",
name="",
dataset_name="LibrAI/Do-Not-Answer",
harm_categories=[entry["risk_area"], entry["types_of_harm"], entry["specific_harms"]],
harm_categories=[
HarmCategory.parse(entry.get("risk_area", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("types_of_harm", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("specific_harms", "")) or HarmCategory.OTHER,
],
description=(
f"This is a prompt from the 'Do Not Answer' dataset under the risk area: {entry['risk_area']}, "
f"harm type: {entry['types_of_harm']}, and specific harm: {entry['specific_harms']}."
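The `or HarmCategory.OTHER` fallback above only does anything if `HarmCategory.parse` returns a falsy value (such as `None`) for an empty or unrecognized label; that behavior is an assumption here, not something shown in this diff. A sketch under that assumption.

```python
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical row with one empty field.
entry = {"risk_area": "", "types_of_harm": "Discrimination"}

categories = [
    # Assumes parse returns None (falsy) for empty/unknown labels rather than raising.
    HarmCategory.parse(entry.get("risk_area", "")) or HarmCategory.OTHER,
    HarmCategory.parse(entry.get("types_of_harm", "")) or HarmCategory.OTHER,
]
```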
33 changes: 17 additions & 16 deletions pyrit/datasets/mlcommons_ailuminate_demo_dataset.py
@@ -6,25 +6,26 @@

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt

# https://github.com/mlcommons/ailuminate#hazard-categories-and-abbreviations
_hazard_categories_map = {
"vcr": "Violent Crimes",
"src": "Sex-Related Crimes",
"cse": "Child Sexual Exploitation",
"ssh": "Suicide & Self-Harm",
"iwp": "Indiscriminate Weapons (CBRNE)",
"ipv": "Intellectual Property Violations",
"dfm": "Defamation",
"ncr": "Non-Violent Crimes",
"hte": "Hate",
"prv": "Privacy Violations",
"spc_ele": "Specialized Advice; elections",
"spc_fin": "Specialized Advice; financial",
"spc_hlt": "Specialized Advice; health",
"sxc_prn": "Sexual Content; pornographic",
"spc_lgl": "Specialized Advice; legal",
"vcr": HarmCategory.VIOLENT_CONTENT,
"src": HarmCategory.SEXUAL_CONTENT,
"cse": HarmCategory.SEXUAL_CONTENT,
"ssh": HarmCategory.SELF_HARM,
"iwp": HarmCategory.CBRN,
"ipv": HarmCategory.PROPRIETARY_INFO,
"dfm": HarmCategory.REPUTATIONAL_DAMAGE,
"ncr": HarmCategory.ILLEGAL,
"hte": HarmCategory.HATESPEECH,
"prv": HarmCategory.PPI,
"spc_ele": HarmCategory.CAMPAIGNING,
"spc_fin": HarmCategory.FINANCIAL_ADVICE,
"spc_hlt": HarmCategory.PUBLIC_HEALTH,
"sxc_prn": HarmCategory.SEXUAL_CONTENT,
"spc_lgl": HarmCategory.LEGAL_ADVICE,
}


@@ -58,7 +59,7 @@ def fetch_mlcommons_ailuminate_demo_dataset(
name="",
dataset_name="AILuminate v1.0 DEMO Prompt Set",
# Map the hazard code abbreviation to the corresponding HarmCategory
harm_categories=[_hazard_categories_map[example["hazard"]], example["hazard"]],
harm_categories=[_hazard_categories_map.get(example.get("hazard", "").lower(), HarmCategory.OTHER)],
description=(
"This dataset contains the DEMO prompt library of the AILuminate 1.0 prompt dataset, created by"
" MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that"
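A small sketch of the abbreviation lookup introduced above, using a trimmed copy of `_hazard_categories_map`; lower-casing plus `.get(..., OTHER)` keeps unknown or missing hazard codes from raising a `KeyError`.

```python
from pyrit.models.harm_category import HarmCategory

# Trimmed copy of the module-level map, for illustration only.
hazard_map = {
    "ssh": HarmCategory.SELF_HARM,
    "hte": HarmCategory.HATESPEECH,
}

example = {"hazard": "SSH"}  # hypothetical row with an upper-case code
category = hazard_map.get(example.get("hazard", "").lower(), HarmCategory.OTHER)
# category == HarmCategory.SELF_HARM; an unrecognized code maps to OTHER
```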
5 changes: 2 additions & 3 deletions pyrit/datasets/multilingual_vulnerability_dataset.py
@@ -3,8 +3,7 @@

import pandas as pd

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
@@ -24,7 +23,7 @@ def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
data_type="text",
name=str(row["id"]),
dataset_name="Multilingual-Vulnerability",
harm_categories=[row["type"]],
harm_categories=[HarmCategory.parse(row["type"])],
description="Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
"Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
"Paper: https://arxiv.org/pdf/2503.13081",