diff --git a/doc/api.rst b/doc/api.rst
index 9c004dbe1..0d9949481 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -114,6 +114,7 @@ API Reference
     fetch_adv_bench_dataset
     fetch_aya_redteaming_dataset
     fetch_babelscape_alert_dataset
+    fetch_darkbench_dataset
     fetch_decoding_trust_stereotypes_dataset
     fetch_examples
     fetch_forbidden_questions_dataset
diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py
index c5484bde3..f230f6172 100644
--- a/pyrit/datasets/__init__.py
+++ b/pyrit/datasets/__init__.py
@@ -4,6 +4,7 @@
 from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
 from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
 from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
+from pyrit.datasets.darkbench_dataset import fetch_darkbench_dataset
 from pyrit.datasets.decoding_trust_stereotypes_dataset import fetch_decoding_trust_stereotypes_dataset
 from pyrit.datasets.dataset_helper import fetch_examples
 from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset
@@ -24,6 +25,7 @@
     "fetch_adv_bench_dataset",
     "fetch_aya_redteaming_dataset",
     "fetch_babelscape_alert_dataset",
+    "fetch_darkbench_dataset",
     "fetch_decoding_trust_stereotypes_dataset",
     "fetch_examples",
     "fetch_forbidden_questions_dataset",
diff --git a/pyrit/datasets/aya_redteaming_dataset.py b/pyrit/datasets/aya_redteaming_dataset.py
index 47bb0d60f..214a93806 100644
--- a/pyrit/datasets/aya_redteaming_dataset.py
+++ b/pyrit/datasets/aya_redteaming_dataset.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+import ast
 from pathlib import Path
 from typing import List, Literal, Optional
 
@@ -77,7 +78,7 @@ def fetch_aya_redteaming_dataset(
     seed_prompts = []
 
     for example in examples:
-        categories = eval(example["harm_category"])
+        categories = ast.literal_eval(example["harm_category"])
         if harm_categories is None or any(cat in categories for cat in harm_categories):
             if harm_scope is None or example["global_or_local"] == harm_scope:
                 seed_prompts.append(
diff --git a/pyrit/datasets/darkbench_dataset.py b/pyrit/datasets/darkbench_dataset.py
new file mode 100644
index 000000000..f740ba96c
--- /dev/null
+++ b/pyrit/datasets/darkbench_dataset.py
@@ -0,0 +1,37 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from datasets import load_dataset
+
+from pyrit.models import SeedPromptDataset
+from pyrit.models.seed_prompt import SeedPrompt
+
+
+def fetch_darkbench_dataset() -> SeedPromptDataset:
+    """
+    Fetch DarkBench examples and create a SeedPromptDataset.
+
+    Returns:
+        SeedPromptDataset: A SeedPromptDataset containing the examples.
+    """
+    data = load_dataset("anonymous152311/darkbench", "default")
+
+    seed_prompts = [
+        SeedPrompt(
+            value=item["Example"],
+            data_type="text",
+            name="",
+            dataset_name="DarkBench",
+            harm_categories=[item["Deceptive Pattern"]],
+            description="""The DarkBench dataset focuses on dark patterns and is available on Hugging Face,
+            created by anonymous152311 (https://huggingface.co/anonymous152311). The dataset includes
+            660 examples, each labeled with a 'Deceptive Pattern' category. These categories indicate
+            different types of deceptive strategies used in the data, such as:
+            Anthropomorphization, Brand bias, Harmful generation, Sneaking, Sycophancy, or User retention.""",
+            source="https://huggingface.co/datasets/anonymous152311/darkbench",
+        )
+        for item in data["train"]
+    ]
+
+    seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
+    return seed_prompt_dataset
diff --git a/tests/integration/datasets/test_fetch_datasets.py b/tests/integration/datasets/test_fetch_datasets.py
index 4757c52ce..97d4cb329 100644
--- a/tests/integration/datasets/test_fetch_datasets.py
+++ b/tests/integration/datasets/test_fetch_datasets.py
@@ -7,6 +7,7 @@
     fetch_adv_bench_dataset,
     fetch_aya_redteaming_dataset,
     fetch_babelscape_alert_dataset,
+    fetch_darkbench_dataset,
     fetch_decoding_trust_stereotypes_dataset,
     fetch_forbidden_questions_dataset,
     fetch_harmbench_dataset,
@@ -29,6 +30,7 @@
     (fetch_adv_bench_dataset, True),
     (fetch_aya_redteaming_dataset, True),
     (fetch_babelscape_alert_dataset, True),
+    (fetch_darkbench_dataset, True),
     (fetch_decoding_trust_stereotypes_dataset, True),
     (fetch_forbidden_questions_dataset, True),
     (fetch_harmbench_dataset, True),
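
For reviewers, a quick illustration of why the eval -> ast.literal_eval swap in aya_redteaming_dataset.py matters: literal_eval only accepts Python literal syntax, so a crafted harm_category field in the downloaded data can no longer execute arbitrary code. This is a standalone sketch, not part of the patch; the field values are hypothetical stand-ins.

    import ast

    # A well-formed harm_category field parses identically under both functions.
    benign = '["Hate Speech", "Violence"]'
    assert ast.literal_eval(benign) == ["Hate Speech", "Violence"]

    # Under eval() this string would import os and run a command;
    # literal_eval rejects anything that is not a plain literal.
    malicious = '__import__("os").system("id")'
    try:
        ast.literal_eval(malicious)
    except ValueError:
        print("rejected non-literal input")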
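
And a minimal usage sketch for the new fetcher, assuming network access to Hugging Face and that SeedPromptDataset exposes the prompts list it is constructed with; the Sycophancy filter is illustrative and relies only on the harm_categories field this patch populates.

    from pyrit.datasets import fetch_darkbench_dataset

    dataset = fetch_darkbench_dataset()

    # Each SeedPrompt carries its DarkBench "Deceptive Pattern" label in
    # harm_categories, so the prompts can be grouped by category.
    sycophancy = [p for p in dataset.prompts if "Sycophancy" in p.harm_categories]
    print(f"{len(dataset.prompts)} prompts total, {len(sycophancy)} labeled Sycophancy")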