Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: add DarkBench dataset #821

Merged
merged 4 commits into from
Mar 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ API Reference
fetch_adv_bench_dataset
fetch_aya_redteaming_dataset
fetch_babelscape_alert_dataset
fetch_darkbench_dataset
fetch_decoding_trust_stereotypes_dataset
fetch_examples
fetch_forbidden_questions_dataset
Expand Down
2 changes: 2 additions & 0 deletions pyrit/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
from pyrit.datasets.darkbench_dataset import fetch_darkbench_dataset
from pyrit.datasets.decoding_trust_stereotypes_dataset import fetch_decoding_trust_stereotypes_dataset
from pyrit.datasets.dataset_helper import fetch_examples
from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset
Expand All @@ -24,6 +25,7 @@
"fetch_adv_bench_dataset",
"fetch_aya_redteaming_dataset",
"fetch_babelscape_alert_dataset",
"fetch_darkbench_dataset",
"fetch_decoding_trust_stereotypes_dataset",
"fetch_examples",
"fetch_forbidden_questions_dataset",
Expand Down
3 changes: 2 additions & 1 deletion pyrit/datasets/aya_redteaming_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import ast
from pathlib import Path
from typing import List, Literal, Optional

Expand Down Expand Up @@ -77,7 +78,7 @@ def fetch_aya_redteaming_dataset(
seed_prompts = []

for example in examples:
categories = eval(example["harm_category"])
categories = ast.literal_eval(example["harm_category"])
if harm_categories is None or any(cat in categories for cat in harm_categories):
if harm_scope is None or example["global_or_local"] == harm_scope:
seed_prompts.append(
Expand Down
37 changes: 37 additions & 0 deletions pyrit/datasets/darkbench_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt


def fetch_darkbench_dataset() -> SeedPromptDataset:
    """
    Load the DarkBench dataset from Hugging Face as a SeedPromptDataset.

    Each row of the "train" split becomes one text SeedPrompt: the row's
    "Example" field is the prompt value and its "Deceptive Pattern" field is
    the prompt's single harm category.

    Returns:
        SeedPromptDataset: The prompts built from the "train" split.
    """
    # The description and source are the same for every prompt, so they are
    # built once here rather than once per row.
    # NOTE(review): continuation lines are flush-left because whitespace inside
    # a triple-quoted literal is part of the string value; confirm the exact
    # whitespace against the checked-in file.
    dataset_description = """The DarkBench dataset focuses on dark patterns and is available on Hugging Face,
created by anonymous152311 (https://huggingface.co/anonymous152311). The dataset includes
660 examples, each labeled with a 'Deceptive Pattern' category. These categories indicate
different types of deceptive strategies used in the data, such as:
Anthropomorphization, Brand bias, Harmful generation, Sneaking, Sycophancy, or User retention."""
    dataset_source = "https://huggingface.co/datasets/anonymous152311/darkbench"

    darkbench = load_dataset("anonymous152311/darkbench", "default")

    prompts = []
    for row in darkbench["train"]:
        prompts.append(
            SeedPrompt(
                value=row["Example"],
                data_type="text",
                name="",
                dataset_name="DarkBench",
                harm_categories=[row["Deceptive Pattern"]],
                description=dataset_description,
                source=dataset_source,
            )
        )

    return SeedPromptDataset(prompts=prompts)
2 changes: 2 additions & 0 deletions tests/integration/datasets/test_fetch_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
fetch_adv_bench_dataset,
fetch_aya_redteaming_dataset,
fetch_babelscape_alert_dataset,
fetch_darkbench_dataset,
fetch_decoding_trust_stereotypes_dataset,
fetch_forbidden_questions_dataset,
fetch_harmbench_dataset,
Expand All @@ -29,6 +30,7 @@
(fetch_adv_bench_dataset, True),
(fetch_aya_redteaming_dataset, True),
(fetch_babelscape_alert_dataset, True),
(fetch_darkbench_dataset, True),
(fetch_decoding_trust_stereotypes_dataset, True),
(fetch_forbidden_questions_dataset, True),
(fetch_harmbench_dataset, True),
Expand Down