_SafetyEvaluation randomization seed, concurrent tasks async (Azure#41033)

slister1001 · web-flow · commit 595c58a46160 · 2025-05-12T13:41:47.000-07:00
* add randomization seed

* pass random seed to simulation

* update defect rate calculation

* add concurrent tasks

* fix param name
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
@@ -161,6 +161,8 @@ async def _simulate(
         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Dict[str, str]:
         """
         Generates synthetic conversations based on provided parameters.
@@ -245,6 +247,8 @@ async def callback(
                 conversation_turns=conversation_turns,
                 text=source_text,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -258,6 +262,8 @@ async def callback(
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
             jailbreak_outputs = simulator_outputs["jailbreak"]
             simulator_outputs = simulator_outputs["regular"]
@@ -275,6 +281,7 @@ async def callback(
                 num_queries=max_simulation_results,
                 target=callback,
                 text=source_text if source_text else "",
+                concurrent_async_tasks=concurrent_async_tasks,
             )
 
         ## Run AdversarialSimulator
@@ -290,6 +297,8 @@ async def callback(
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         ## If no outputs are generated, raise an exception
@@ -677,7 +686,9 @@ def _calculate_defect_rate(self, evaluation_result_dict) -> EvaluationResult:
             violence_defects = []
             sexual_defects = []
             self_harm_defects = []
-            for i, row in enumerate(zip(evaluation_result_dict[jailbreak_cols[0]]['rows'], evaluation_result_dict[regular_cols[0]]['rows'])):
+            jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+            regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+            for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
                 hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
                 violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
                 sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
@@ -716,8 +727,10 @@ async def __call__(
             data_path: Optional[Union[str, os.PathLike]] = None,
             jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
             output_path: Optional[Union[str, os.PathLike]] = None,
-            data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None
-        ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:        
+            data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
+            randomization_seed: Optional[int] = None,
+            concurrent_async_tasks: Optional[int] = 5,
+        ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
         '''
         Evaluates the target function based on the provided parameters.
 
@@ -744,12 +757,18 @@ async def __call__(
         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
         :type data_path: Optional[Union[str, os.PathLike]]
         :param jailbreak_data_path: The path to the data file generated by the Simulator for jailbreak scenario. If None, the DirectAttackSimulator will be run.
         :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
         :param output_path: The path to write the evaluation results to if set.
         :type output_path: Optional[Union[str, os.PathLike]]
+        :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+        :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]]
+        :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+        :type randomization_seed: Optional[int]
+        :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+        :type concurrent_async_tasks: Optional[int]
         '''
-        ## Log inputs 
-        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+        ## Log inputs
+        self.logger.info(f"User inputs: evaluators={evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario}, conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
 
         ## Validate arguments
         self._validate_inputs(
@@ -779,6 +797,9 @@ async def __call__(
                 tasks=tasks,
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+                # Forward both new options; omitting one makes the callee fall back to its own default.
+                randomization_seed=randomization_seed,
+                concurrent_async_tasks=concurrent_async_tasks,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_safety_evaluation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_safety_evaluation.py
@@ -270,6 +270,28 @@ async def test_simulate_no_results(self, mock_call, mock_init, safety_eval, mock
             )
         assert "outputs generated by the simulator" in str(exc_info.value)
 
+    @pytest.mark.asyncio
+    @patch("azure.ai.evaluation.simulator.AdversarialSimulator.__init__", return_value=None)
+    @patch("azure.ai.evaluation.simulator.AdversarialSimulator.__call__", new_callable=AsyncMock)
+    @patch("pathlib.Path.open", new_callable=MagicMock)
+    async def test_simulate_passes_randomization_seed(self, mock_open, mock_call, mock_init, safety_eval, mock_target):
+        """Verify that ``_simulate`` hands ``randomization_seed`` to the simulator call as a keyword."""
+        # The patched ``Path.open`` yields a stub handle when used as a context manager.
+        mock_open.return_value.__enter__.return_value = MagicMock()
+        # The patched simulator call resolves to a single conversation record.
+        mock_call.return_value = JsonLineList([{"messages": []}])
+        expected_seed = 42
+
+        await safety_eval._simulate(
+            target=mock_target,
+            adversarial_scenario=AdversarialScenario.ADVERSARIAL_QA,
+            randomization_seed=expected_seed,
+        )
+
+        # Exactly one simulator invocation is expected, and it must carry the seed.
+        mock_call.assert_called_once()
+        assert mock_call.call_args.kwargs.get("randomization_seed") == expected_seed
+
     def test_is_async_function(self, safety_eval, mock_target, mock_async_target):
         # Test that sync function returns False
         assert not safety_eval._is_async_function(mock_target)