Skip to content

Commit 7678885

Browse files
gkumbhat and evaline-ju authored
🐛 Fix llama guard adapter breaking because of metadata for different choice responses (#43)
* 🐛 Fix llama guard adapter breaking because of metadata for different choice responses

  Signed-off-by: Gaurav-Kumbhat <[email protected]>

* Update tests/generative_detectors/test_llama_guard.py

  Co-authored-by: Evaline Ju <[email protected]>
  Signed-off-by: Gaurav Kumbhat <[email protected]>

---------

Signed-off-by: Gaurav-Kumbhat <[email protected]>
Signed-off-by: Gaurav Kumbhat <[email protected]>
Co-authored-by: Evaline Ju <[email protected]>
1 parent f6e5d5a commit 7678885

File tree

2 files changed

+47
-2
lines changed

2 files changed

+47
-2
lines changed

tests/generative_detectors/test_llama_guard.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,52 @@ def test_post_process_content_works_for_safe(llama_guard_detection):
231231
assert len(new_response.choices) == 1
232232
assert detection_type == "risk"
233233
# post_process_completion_results function returns array of metadata per choice
234-
assert metadata == []
234+
assert metadata == [{}]
235+
236+
237+
def test_post_process_content_splits_safe_and_unsafe_categories(llama_guard_detection):
238+
safe_message = "safe"
239+
unsafe_message = "\n\nunsafe\nS2,S3"
240+
response = ChatCompletionResponse(
241+
model="foo",
242+
usage=UsageInfo(prompt_tokens=1, total_tokens=1),
243+
choices=[
244+
ChatCompletionResponseChoice(
245+
index=0,
246+
message=ChatMessage(
247+
content=safe_message,
248+
role=" assistant",
249+
),
250+
),
251+
ChatCompletionResponseChoice(
252+
index=1,
253+
message=ChatMessage(
254+
content=unsafe_message,
255+
role=" assistant",
256+
),
257+
),
258+
],
259+
)
260+
261+
expected_metadata = [{}, {"categories": ["Non-Violent Crimes.", "Sex Crimes."]}]
262+
263+
safe_score = 0.6
264+
unsafe_score = 0.99
265+
llama_guard_detection_instance = asyncio.run(llama_guard_detection)
266+
# NOTE: we are testing private function here
267+
(new_response, scores, detection_type, metadata,) = asyncio.run(
268+
llama_guard_detection_instance.post_process_completion_results(
269+
response, [safe_score, unsafe_score], "risk"
270+
)
271+
)
272+
273+
assert isinstance(new_response, ChatCompletionResponse)
274+
assert new_response.choices[1].message.content == "unsafe"
275+
assert scores[1] == unsafe_score
276+
assert len(new_response.choices) == 2
277+
assert detection_type == "risk"
278+
# post_process_completion_results function returns array of metadata per choice
279+
assert metadata == expected_metadata
235280

236281

237282
#### Content detection tests

vllm_detector_adapter/generative_detectors/llama_guard.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,11 @@ async def post_process_completion_results(self, response, scores, detection_type
102102
logger.warning(
103103
f"Category {category} not found in risk bank for model {self.__class__.__name__}"
104104
)
105-
metadata_per_choice.append(metadata)
106105
else:
107106
# "safe" case
108107
new_choices.append(choice)
109108
new_scores.append(scores[i])
109+
metadata_per_choice.append(metadata)
110110

111111
response.choices = new_choices
112112
return response, new_scores, detection_type, metadata_per_choice

0 commit comments

Comments (0)