5 changes: 1 addition & 4 deletions doc/code/orchestrators/qa_benchmark_orchestrator.ipynb
@@ -22,10 +22,7 @@
"\n",
"from pyrit.common import IN_MEMORY, initialize_pyrit\n",
"from pyrit.datasets import fetch_wmdp_dataset\n",
"from pyrit.models import (\n",
" QuestionAnsweringEntry,\n",
" QuestionChoice,\n",
")\n",
"from pyrit.models import QuestionAnsweringEntry, QuestionChoice\n",
"from pyrit.orchestrator import QuestionAnsweringBenchmarkOrchestrator\n",
"from pyrit.prompt_target import OpenAIChatTarget\n",
"from pyrit.score.self_ask_question_answer_scorer import SelfAskQuestionAnswerScorer\n",
7 changes: 4 additions & 3 deletions pyrit/datasets/adv_bench_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_adv_bench_dataset(
@@ -67,6 +66,8 @@ def fetch_adv_bench_dataset(
source=str(Path(DATASETS_PATH) / "data" / "adv_bench_dataset.json"), source_type="file", cache=cache
)

HarmCategory._initialize_aliases()

filtered = dataset["data"] # type: ignore

if main_categories or sub_categories:
@@ -87,7 +88,7 @@
data_type="text",
name="AdvBench Dataset [Extended]",
dataset_name="AdvBench Dataset",
harm_categories=item["main_categories"] + item["sub_categories"],
harm_categories=[HarmCategory.parse(cat) for cat in item["main_categories"] + item["sub_categories"]],
description="""AdvBench is a set of 520 harmful behaviors formulated as instructions. This dataset
has been extended to include harm categories for better filtering and analysis. The adversary's goal
is instead to find a single attack string that will cause the model to generate any response that
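The same two-step pattern recurs in every fetcher touched by this PR: build the alias table once, then parse each raw label into a `HarmCategory` member. A minimal sketch of that pattern; the raw labels shown are hypothetical and assumed to be among the aliases `parse` recognizes.

```python
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical raw labels of the kind stored in adv_bench_dataset.json.
raw_categories = ["Hate Speech", "Illegal Activity"]
parsed = [HarmCategory.parse(cat) for cat in raw_categories]
# e.g. [HarmCategory.HATESPEECH, HarmCategory.ILLEGAL], assuming those aliases resolve
```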
14 changes: 9 additions & 5 deletions pyrit/datasets/aya_redteaming_dataset.py
@@ -6,8 +6,7 @@
from typing import List, Literal, Optional

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_aya_redteaming_dataset(
@@ -75,19 +74,24 @@
data_home=data_home,
)

HarmCategory._initialize_aliases()

parsed_filter_categories = [HarmCategory.parse(c) for c in harm_categories] if harm_categories else None

seed_prompts = []

for example in examples:
categories = ast.literal_eval(example["harm_category"])
if harm_categories is None or any(cat in categories for cat in harm_categories):
raw_categories = ast.literal_eval(example["harm_category"])
parsed_categories = [HarmCategory.parse(c) for c in raw_categories]
if parsed_filter_categories is None or any(cat in parsed_categories for cat in parsed_filter_categories):
if harm_scope is None or example["global_or_local"] == harm_scope:
seed_prompts.append(
SeedPrompt(
value=example["prompt"],
data_type="text",
name="Aya Red-teaming Examples",
dataset_name="Aya Red-teaming Examples",
harm_categories=categories,
harm_categories=parsed_categories,
source="https://huggingface.co/datasets/CohereForAI/aya_redteaming",
)
)
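In the Aya fetcher the caller's `harm_categories` filter and the per-example labels are now both parsed before comparison, so matching happens on enum members rather than raw strings. A minimal sketch of that matching, assuming the two spellings below resolve to the same member.

```python
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical caller filter and dataset labels.
requested = [HarmCategory.parse(c) for c in ["Hate Speech"]]
example_labels = [HarmCategory.parse(c) for c in ["hatespeech", "Violence"]]

# Enum comparison tolerates alias/spelling differences a raw string comparison would miss.
keep_example = any(cat in example_labels for cat in requested)
```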
7 changes: 4 additions & 3 deletions pyrit/datasets/ccp_sensitive_prompts_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
@@ -25,14 +24,16 @@ def fetch_ccp_sensitive_prompts_dataset() -> SeedPromptDataset:
split="train",
)

HarmCategory._initialize_aliases()

return SeedPromptDataset(
prompts=[
SeedPrompt(
value=row["prompt"],
data_type="text",
name="",
dataset_name="CCP-sensitive-prompts",
harm_categories=[row["subject"]],
harm_categories=[HarmCategory.parse(row["subject"])],
description=("Prompts covering topics sensitive to the CCP."),
groups=["promptfoo"],
source="https://huggingface.co/datasets/promptfoo/CCP-sensitive-prompts",
7 changes: 4 additions & 3 deletions pyrit/datasets/darkbench_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_darkbench_dataset() -> SeedPromptDataset:
@@ -22,13 +21,15 @@ def fetch_darkbench_dataset() -> SeedPromptDataset:
"""
data = load_dataset("apart/darkbench", "default", split="train", data_files="darkbench.tsv")

HarmCategory._initialize_aliases()

seed_prompts = [
SeedPrompt(
value=item["Example"],
data_type="text",
name="",
dataset_name="DarkBench",
harm_categories=[item["Deceptive Pattern"]],
harm_categories=[HarmCategory.parse(item["Deceptive Pattern"])],
description=(
"DarkBench is a comprehensive benchmark designed to detect dark design patterns in large language"
" models (LLMs). Dark patterns are manipulative techniques that influence user behavior, often against"
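A usage sketch of the updated fetcher; it assumes `fetch_darkbench_dataset` is re-exported from `pyrit.datasets` and that the returned `SeedPromptDataset` exposes its prompts via `.prompts`.

```python
from pyrit.datasets import fetch_darkbench_dataset
from pyrit.models import HarmCategory

dataset = fetch_darkbench_dataset()

# After this change harm_categories holds HarmCategory members, not raw strings.
first = dataset.prompts[0]
assert all(isinstance(cat, HarmCategory) for cat in first.harm_categories)
print(first.harm_categories)
```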
7 changes: 4 additions & 3 deletions pyrit/datasets/decoding_trust_stereotypes_dataset.py
@@ -6,8 +6,7 @@

from pyrit.common.path import DATASETS_PATH
from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_decoding_trust_stereotypes_dataset(
@@ -51,6 +50,8 @@ def fetch_decoding_trust_stereotypes_dataset(
Ritik Dutta, Rylan Schaeffer, Sang T. Truong, Simran Arora, Mantas Mazeika, Dan Hendrycks, Zinan Lin,
Yu Cheng, Sanmi Koyejo, Dawn Song, Bo Li. is available at https://arxiv.org/abs//2306.11698
"""
HarmCategory._initialize_aliases()

if system_prompt_type not in ["benign", "untargeted", "targeted"]:
raise ValueError("Invalid system_prompt_type. Expected 'benign', 'untargeted', or 'targeted'.")
# Determine the file type from the source URL
@@ -105,7 +106,7 @@ def fetch_decoding_trust_stereotypes_dataset(
data_type="text",
name="DecodingTrust Stereotypes Examples",
dataset_name="DecodingTrust Stereotypes Examples",
harm_categories=["stereotypes"],
harm_categories=[HarmCategory.parse("stereotypes")],
description="""A dataset of Decoding Trust 'Stereotypes' example prompts across many different topics
and target groups. These prompts attempt to convince the target to agree with a harmful
stereotype.""",
7 changes: 4 additions & 3 deletions pyrit/datasets/equitymedqa_dataset.py
@@ -5,8 +5,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset

# These are the available subsets of the EquityMedQA dataset
DATA_SUBSETS: list[str] = [
@@ -66,6 +65,8 @@ def fetch_equitymedqa_dataset_unique_values(
"""
prompts: list[str] = []

HarmCategory._initialize_aliases()

if subset_name == "all": # get all subsets
targets: list[str] = DATA_SUBSETS

@@ -89,7 +90,7 @@
name="katielink/EquityMedQA",
dataset_name="katielink/EquityMedQA",
description="This dataset contains prompts used to assess medical biases in AI systems",
harm_categories=["health_bias"],
harm_categories=[HarmCategory.parse("health_bias")],
source="https://huggingface.co/datasets/katielink/EquityMedQA",
)
for prompt in prompts
7 changes: 4 additions & 3 deletions pyrit/datasets/forbidden_questions_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
@@ -20,6 +19,8 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
"""
data = load_dataset("TrustAIRLab/forbidden_question_set", "default")

HarmCategory._initialize_aliases()

authors = ["Xinyue Shen", "Zeyuan Chen", "Michael Backes", "Yun Shen", "Yang Zhang"]
seed_prompts = [
SeedPrompt(
@@ -28,7 +29,7 @@ def fetch_forbidden_questions_dataset() -> SeedPromptDataset:
name="TrustAIRLab/forbidden_question_set",
dataset_name="TrustAIRLab/forbidden_question_set",
authors=authors,
harm_categories=item["content_policy_name"],
harm_categories=[HarmCategory.parse(item["content_policy_name"])],
source="https://huggingface.co/datasets/TrustAIRLab/forbidden_question_set",
description="""This is the Forbidden Question Set dataset proposed in the ACM CCS 2024 paper
"Do Anything Now'': Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models.
11 changes: 10 additions & 1 deletion pyrit/datasets/harmbench_dataset.py
@@ -6,6 +6,7 @@

from pyrit.datasets.dataset_helper import FILE_TYPE_HANDLERS, fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt


Expand Down Expand Up @@ -41,6 +42,9 @@ def fetch_harmbench_dataset(
valid_types = ", ".join(FILE_TYPE_HANDLERS.keys())
raise ValueError(f"Invalid file_type. Expected one of: {valid_types}.")

# Initialize aliases for associated harm categories
HarmCategory._initialize_aliases()

# Required keys to validate each example
required_keys = {"Behavior", "SemanticCategory"}

@@ -62,13 +66,18 @@ def fetch_harmbench_dataset(
prompts.append(example["Behavior"])
semantic_categories.add(example["SemanticCategory"])

# Parse the collected semantic categories into HarmCategory enums
parsed_semantic_categories = [
HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER for cat in semantic_categories
]

seed_prompts = [
SeedPrompt(
value=example,
data_type="text",
name="HarmBench Examples",
dataset_name="HarmBench Examples",
harm_categories=list(semantic_categories),
harm_categories=parsed_semantic_categories, # type: ignore
description="A dataset of HarmBench examples containing various categories such as chemical,"
"biological, illegal activities, etc.",
)
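HarmBench collects `SemanticCategory` values into a set before parsing, and the guard above sends anything that is not a string to `HarmCategory.OTHER`. A sketch of that normalization with hypothetical values.

```python
from pyrit.models.harm_category import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical SemanticCategory values; None stands in for a malformed row.
semantic_categories = {"chemical_biological", None}

parsed = [
    HarmCategory.parse(cat) if isinstance(cat, str) else HarmCategory.OTHER
    for cat in semantic_categories
]
```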
11 changes: 7 additions & 4 deletions pyrit/datasets/librAI_do_not_answer_dataset.py
@@ -3,8 +3,7 @@

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
@@ -27,14 +26,18 @@ def fetch_librAI_do_not_answer_dataset() -> SeedPromptDataset:
"""
# Load dataset from Hugging Face
data = load_dataset("LibrAI/do-not-answer", split="train")

HarmCategory._initialize_aliases()
seed_prompts = [
SeedPrompt(
value=entry["question"],
data_type="text",
name="",
dataset_name="LibrAI/Do-Not-Answer",
harm_categories=[entry["risk_area"], entry["types_of_harm"], entry["specific_harms"]],
harm_categories=[
HarmCategory.parse(entry.get("risk_area", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("types_of_harm", "")) or HarmCategory.OTHER,
HarmCategory.parse(entry.get("specific_harms", "")) or HarmCategory.OTHER,
],
description=(
f"This is a prompt from the 'Do Not Answer' dataset under the risk area: {entry['risk_area']}, "
f"harm type: {entry['types_of_harm']}, and specific harm: {entry['specific_harms']}."
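The `or HarmCategory.OTHER` fallback above only does anything if `HarmCategory.parse` returns a falsy value (such as `None`) for an empty or unrecognized label; that behavior is an assumption here, not something shown in this diff. A sketch under that assumption.

```python
from pyrit.models import HarmCategory

HarmCategory._initialize_aliases()

# Hypothetical row with one empty field.
entry = {"risk_area": "", "types_of_harm": "Discrimination"}

categories = [
    # Assumes parse returns None (falsy) for empty/unknown labels rather than raising.
    HarmCategory.parse(entry.get("risk_area", "")) or HarmCategory.OTHER,
    HarmCategory.parse(entry.get("types_of_harm", "")) or HarmCategory.OTHER,
]
```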
33 changes: 17 additions & 16 deletions pyrit/datasets/mlcommons_ailuminate_demo_dataset.py
@@ -6,25 +6,26 @@

from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.models import SeedPromptDataset
from pyrit.models.harm_category import HarmCategory
from pyrit.models.seed_prompt import SeedPrompt

# https://github.com/mlcommons/ailuminate#hazard-categories-and-abbreviations
_hazard_categories_map = {
"vcr": "Violent Crimes",
"src": "Sex-Related Crimes",
"cse": "Child Sexual Exploitation",
"ssh": "Suicide & Self-Harm",
"iwp": "Indiscriminate Weapons (CBRNE)",
"ipv": "Intellectual Property Violations",
"dfm": "Defamation",
"ncr": "Non-Violent Crimes",
"hte": "Hate",
"prv": "Privacy Violations",
"spc_ele": "Specialized Advice; elections",
"spc_fin": "Specialized Advice; financial",
"spc_hlt": "Specialized Advice; health",
"sxc_prn": "Sexual Content; pornographic",
"spc_lgl": "Specialized Advice; legal",
"vcr": HarmCategory.VIOLENT_CONTENT,
"src": HarmCategory.SEXUAL_CONTENT,
"cse": HarmCategory.SEXUAL_CONTENT,
"ssh": HarmCategory.SELF_HARM,
"iwp": HarmCategory.CBRN,
"ipv": HarmCategory.PROPRIETARY_INFO,
"dfm": HarmCategory.REPUTATIONAL_DAMAGE,
"ncr": HarmCategory.ILLEGAL,
"hte": HarmCategory.HATESPEECH,
"prv": HarmCategory.PPI,
"spc_ele": HarmCategory.CAMPAIGNING,
"spc_fin": HarmCategory.FINANCIAL_ADVICE,
"spc_hlt": HarmCategory.PUBLIC_HEALTH,
"sxc_prn": HarmCategory.SEXUAL_CONTENT,
"spc_lgl": HarmCategory.LEGAL_ADVICE,
}


@@ -58,7 +59,7 @@ def fetch_mlcommons_ailuminate_demo_dataset(
name="",
dataset_name="AILuminate v1.0 DEMO Prompt Set",
# Map the hazard code abbreviation to the corresponding HarmCategory
harm_categories=[_hazard_categories_map[example["hazard"]], example["hazard"]],
harm_categories=[_hazard_categories_map.get(example.get("hazard", "").lower(), HarmCategory.OTHER)],
description=(
"This dataset contains the DEMO prompt library of the AILuminate 1.0 prompt dataset, created by"
" MLCommons AI Risk & Reliability working group. It contains 1,200 human-generated prompts that"
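A small sketch of the abbreviation lookup introduced above, using a trimmed copy of `_hazard_categories_map`; lower-casing plus `.get(..., OTHER)` keeps unknown or missing hazard codes from raising a `KeyError`.

```python
from pyrit.models.harm_category import HarmCategory

# Trimmed copy of the module-level map, for illustration only.
hazard_map = {
    "ssh": HarmCategory.SELF_HARM,
    "hte": HarmCategory.HATESPEECH,
}

example = {"hazard": "SSH"}  # hypothetical row with an upper-case code
category = hazard_map.get(example.get("hazard", "").lower(), HarmCategory.OTHER)
# category == HarmCategory.SELF_HARM; an unrecognized code maps to OTHER
```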
5 changes: 2 additions & 3 deletions pyrit/datasets/multilingual_vulnerability_dataset.py
@@ -3,8 +3,7 @@

import pandas as pd

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.models import HarmCategory, SeedPrompt, SeedPromptDataset


def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
@@ -24,7 +23,7 @@ def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
data_type="text",
name=str(row["id"]),
dataset_name="Multilingual-Vulnerability",
harm_categories=[row["type"]],
harm_categories=[HarmCategory.parse(row["type"])],
description="Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
"Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
"Paper: https://arxiv.org/pdf/2503.13081",