36 changes: 36 additions & 0 deletions prepare/benchmarks/safety.py
@@ -0,0 +1,36 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

MAX_TEST_INSTANCES = 1000

benchmark = Benchmark(
    subsets={
        "attaq": DatasetRecipe(
            card="cards.safety.attaq_gg",
            template_card_index="default",
            group_by=["label"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
        "provoq": DatasetRecipe(
            card="cards.safety.provoq_gg",
            template_card_index="default",
            group_by=["group"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
        "airbench": DatasetRecipe(
            card="cards.safety.airbench2024",
            template_card_index="default",
            group_by=["l2-name"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
        "ailuminate": DatasetRecipe(
            card="cards.safety.mlcommons_ailuminate",
            template_card_index="default",
            group_by=["hazard"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
    }
)

add_to_catalog(benchmark, "benchmarks.safety", overwrite=True)
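
A minimal usage sketch, not part of this PR: assuming the new entry loads by its catalog name like other unitxt benchmarks, an end-to-end run might look roughly like the following. The model name and provider are placeholders, and the exact load_dataset/evaluate signatures and result accessors should be checked against the installed unitxt version.

from unitxt import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine

# Load the benchmark's test split; each subset is capped by max_test_instances above.
dataset = load_dataset("benchmarks.safety", split="test")

# Any unitxt inference engine should work here; CrossProviderInferenceEngine routes
# requests to a hosted provider (the model and provider values are placeholders).
model = CrossProviderInferenceEngine(model="llama-3-3-70b-instruct", provider="watsonx")
predictions = model.infer(dataset)

# Scores come from the Granite Guardian harm metrics configured in each card,
# aggregated per subset and per group_by field.
results = evaluate(predictions=predictions, data=dataset)
print(results.subsets_scores.summary)  # assumed accessor; may differ by unitxt version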
5 changes: 3 additions & 2 deletions prepare/cards/attaq.py
@@ -17,8 +17,9 @@
         Shuffle(page_size=2800),
     ],
     task=Task(
-        input_fields=["input"],
-        reference_fields=["label"],
+        input_fields={"input": str},
+        reference_fields={"label": str},
+        prediction_type=str,
         metrics=["metrics.safety_metric"],
     ),
     templates=[
1 change: 1 addition & 0 deletions prepare/cards/safety/airbench2024.py
@@ -43,6 +43,7 @@
         },
         reference_fields={},
         prediction_type=str,
+        # TODO: The current implementation uses Granite Guardian as the judge. Future versions could use a custom judge via the judge_prompt field.
         metrics=[
             "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction]",
         ],
27 changes: 27 additions & 0 deletions prepare/cards/safety/attaq_gg.py
@@ -0,0 +1,27 @@
from unitxt import add_to_catalog
from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
from unitxt.operators import Shuffle
from unitxt.splitters import RenameSplits
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(path="ibm/AttaQ"),
    preprocess_steps=[
        RenameSplits(mapper={"train": "test"}),
        Shuffle(page_size=2800),
    ],
    task=Task(
        input_fields={"input": str},
        reference_fields={"label": str},
        prediction_type=str,
        metrics=[
            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]",
        ],
    ),
    templates=TemplatesDict(
        {"default": InputOutputTemplate(input_format="{input}", output_format="")}
    ),
)

test_card(card, strict=False, demos_taken_from="test", num_demos=0)
add_to_catalog(card, "cards.safety.attaq_gg", overwrite=True)
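
For context, not part of the diff: the bracketed metric string reuses the catalog Granite Guardian harm metric and overrides its arguments so the card's "input" field is sent to the judge as the user message and the model output as the assistant message. A quick standalone sanity check of the card could look like the sketch below; the load_dataset keyword arguments mirror DatasetRecipe and are assumptions to verify against the installed unitxt version.

from unitxt import load_dataset

# Build a small slice of the AttaQ card with its default template
# (the loader_limit and split values here are illustrative only).
dataset = load_dataset(
    card="cards.safety.attaq_gg",
    template_card_index="default",
    loader_limit=100,
    split="test",
)
print(dataset[0]["source"])  # the rendered prompt that would be sent to the model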
4 changes: 3 additions & 1 deletion prepare/cards/safety/mlcommons_ailuminate.py
@@ -26,7 +26,9 @@
         },
         reference_fields={},
         prediction_type=str,
-        metrics=["metrics.llm_as_judge.safety.llamaguard"],
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]"
+        ],
     ),
     templates=TemplatesDict(
         {
31 changes: 31 additions & 0 deletions prepare/cards/safety/provoq_gg.py
@@ -0,0 +1,31 @@
from unitxt import add_to_catalog
from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
from unitxt.test_utils.card import test_card

card = TaskCard(
    loader=LoadHF(path="IBM/ProvoQ", data_classification_policy=["public"]),
    task=Task(
        input_fields={"question": str, "stigma": str, "group": str},
        reference_fields={},
        prediction_type=str,
        metrics=[
            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]",
        ],
    ),
    templates=TemplatesDict(
        {"default": InputOutputTemplate(input_format="{question}", output_format="")}
    ),
    __description__="The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
    __tags__={
        "languages": ["english"],
    },
)

test_card(
    card,
    strict=False,
    demos_taken_from="test",
    num_demos=0,
)

add_to_catalog(card, "cards.safety.provoq_gg", overwrite=True)
41 changes: 41 additions & 0 deletions src/unitxt/catalog/benchmarks/safety.json
@@ -0,0 +1,41 @@
{
    "__type__": "benchmark",
    "subsets": {
        "attaq": {
            "__type__": "dataset_recipe",
            "card": "cards.safety.attaq_gg",
            "template_card_index": "default",
            "group_by": [
                "label"
            ],
            "max_test_instances": 1000
        },
        "provoq": {
            "__type__": "dataset_recipe",
            "card": "cards.safety.provoq_gg",
            "template_card_index": "default",
            "group_by": [
                "group"
            ],
            "max_test_instances": 1000
        },
        "airbench": {
            "__type__": "dataset_recipe",
            "card": "cards.safety.airbench2024",
            "template_card_index": "default",
            "group_by": [
                "l2-name"
            ],
            "max_test_instances": 1000
        },
        "ailuminate": {
            "__type__": "dataset_recipe",
            "card": "cards.safety.mlcommons_ailuminate",
            "template_card_index": "default",
            "group_by": [
                "hazard"
            ],
            "max_test_instances": 1000
        }
    }
}
13 changes: 7 additions & 6 deletions src/unitxt/catalog/cards/attaq.json
@@ -18,12 +18,13 @@
     ],
     "task": {
         "__type__": "task",
-        "input_fields": [
-            "input"
-        ],
-        "reference_fields": [
-            "label"
-        ],
+        "input_fields": {
+            "input": "str"
+        },
+        "reference_fields": {
+            "label": "str"
+        },
+        "prediction_type": "str",
         "metrics": [
             "metrics.safety_metric"
         ]
42 changes: 42 additions & 0 deletions src/unitxt/catalog/cards/safety/attaq_gg.json
@@ -0,0 +1,42 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_hf",
        "path": "ibm/AttaQ"
    },
    "preprocess_steps": [
        {
            "__type__": "rename_splits",
            "mapper": {
                "train": "test"
            }
        },
        {
            "__type__": "shuffle",
            "page_size": 2800
        }
    ],
    "task": {
        "__type__": "task",
        "input_fields": {
            "input": "str"
        },
        "reference_fields": {
            "label": "str"
        },
        "prediction_type": "str",
        "metrics": [
            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]"
        ]
    },
    "templates": {
        "__type__": "templates_dict",
        "items": {
            "default": {
                "__type__": "input_output_template",
                "input_format": "{input}",
                "output_format": ""
            }
        }
    }
}
2 changes: 1 addition & 1 deletion src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
@@ -22,7 +22,7 @@
"reference_fields": {},
"prediction_type": "str",
"metrics": [
"metrics.llm_as_judge.safety.llamaguard"
"metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]"
]
},
"templates": {
39 changes: 39 additions & 0 deletions src/unitxt/catalog/cards/safety/provoq_gg.json
@@ -0,0 +1,39 @@
{
    "__type__": "task_card",
    "loader": {
        "__type__": "load_hf",
        "path": "IBM/ProvoQ",
        "data_classification_policy": [
            "public"
        ]
    },
    "task": {
        "__type__": "task",
        "input_fields": {
            "question": "str",
            "stigma": "str",
            "group": "str"
        },
        "reference_fields": {},
        "prediction_type": "str",
        "metrics": [
            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]"
        ]
    },
    "templates": {
        "__type__": "templates_dict",
        "items": {
            "default": {
                "__type__": "input_output_template",
                "input_format": "{question}",
                "output_format": ""
            }
        }
    },
    "__description__": "The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
    "__tags__": {
        "languages": [
            "english"
        ]
    }
}
2 changes: 1 addition & 1 deletion src/unitxt/processors.py
@@ -326,7 +326,7 @@ def process_value(self, text: Any) -> Any:
         try:
             return float(match.group(1)) * 0.25 - 0.25
         except:
-            return np.NaN
+            return np.nan


class ExtractMtBenchLabelJudgment(FieldOperator):