
Commit 38df1e9

Allow parse_response to accept token IDs (#41849)
* Allow tokenizer.parse_response() to accept IDs/arrays directly
1 parent 5462376 commit 38df1e9

File tree

2 files changed: +59 -6 lines changed


src/transformers/tokenization_utils_base.py

Lines changed: 25 additions & 6 deletions
@@ -18,6 +18,8 @@
 of output with special method for the Fast tokenizers)
 """

+from __future__ import annotations
+
 import copy
 import json
 import os
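
The new `from __future__ import annotations` import is what allows the quoted forward references in the hunks below (`"torch.device"`, `"BatchEncoding"`, `"torch.Tensor"`) to be unquoted: under PEP 563, annotations are stored as strings and never evaluated at definition time. A minimal standalone sketch of that behavior (the function name `to_device` is illustrative, not from the diff):

    # With PEP 563 semantics, this runs even though neither `torch` nor
    # `BatchEncoding` is imported or defined anywhere in the module.
    from __future__ import annotations

    def to_device(x, device: torch.device) -> BatchEncoding:
        # The annotations above are kept as plain strings; they would only
        # be evaluated by an explicit typing.get_type_hints() call.
        return x

    print(to_device.__annotations__)  # {'device': 'torch.device', 'return': 'BatchEncoding'}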
@@ -783,7 +785,7 @@ def as_tensor(value, dtype=None):

         return self

-    def to(self, device: Union[str, "torch.device"], *, non_blocking: bool = False) -> "BatchEncoding":
+    def to(self, device: Union[str, torch.device], *, non_blocking: bool = False) -> BatchEncoding:
         """
         Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only).
@@ -1858,7 +1860,11 @@ def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional

         return chat_template

-    def parse_response(self, response: str, schema: Optional[Union[list, dict]] = None):
+    def parse_response(
+        self,
+        response: str | list[str | int | list[int]] | np.ndarray | torch.Tensor,
+        schema: list | dict | None = None,
+    ):
         """
         Converts an output string created by generating text from a model into a parsed message dictionary.
         This method is intended for use with chat models, and will read the tokenizer's `response_schema` attribute to
@@ -1869,16 +1875,29 @@ def parse_response(self, response: str, schema: Optional[Union[list, dict]] = None):

         Args:
             response (`str`):
-                The output string generated by the model. This should be the decoded string, not raw tokens.
+                The output string generated by the model. This can be either a decoded string or list of strings,
+                or token IDs as a list/array.
             schema (`Union[list, dict]`, *optional*):
                 A response schema that indicates the expected output format and how parsing should be performed.
                 If not provided, the tokenizer's `response_schema` attribute will be used.
         """
+        batched = (
+            (isinstance(response, list) and not isinstance(response[0], int))
+            or getattr(response, "ndim", 0) > 1  # For torch/numpy tensors
+        )
+
         if schema is None:
             if getattr(self, "response_schema", None) is None:
                 raise AttributeError("This tokenizer does not have a `response_schema` for parsing chat responses!")
             schema = self.response_schema
-        return recursive_parse(response, schema)
+        if batched:
+            if not (isinstance(response, list) and isinstance(response[0], str)):
+                response = self.batch_decode(response)
+            return [recursive_parse(single_response, schema) for single_response in response]
+        else:
+            if not isinstance(response, str):
+                response = self.decode(response)
+            return recursive_parse(response, schema)

     @classmethod
     def from_pretrained(
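
Taken together, the changes above mean `parse_response` now accepts generation output in any of the common shapes, decoding token IDs itself when needed. A hedged usage sketch (assumes a tokenizer whose `response_schema` has been set; `my_schema` is a hypothetical placeholder for a real schema such as the Cohere-style one used in the tests below):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    tokenizer.response_schema = my_schema  # hypothetical: any valid response schema

    text = "<|START_ACTION|>...<|END_ACTION|>"  # stand-in for decoded model output
    ids = tokenizer(text).input_ids             # flat list of token IDs

    tokenizer.parse_response(text)           # str -> single parsed message dict
    tokenizer.parse_response(ids)            # list[int] -> decoded first, then parsed
    tokenizer.parse_response([ids, ids])     # list of ID lists -> list of parsed dicts
    tokenizer.parse_response([text, text])   # list of strings -> list of parsed dicts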
@@ -3863,7 +3882,7 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:

     def batch_decode(
         self,
-        sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
+        sequences: Union[list[int], list[list[int]], np.ndarray, torch.Tensor],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: Optional[bool] = None,
         **kwargs,
@@ -3897,7 +3916,7 @@ def decode(

     def decode(
         self,
-        token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"],
+        token_ids: Union[int, list[int], np.ndarray, torch.Tensor],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: Optional[bool] = None,
         **kwargs,

tests/utils/test_chat_parsing_utils.py

Lines changed: 34 additions & 0 deletions
@@ -200,6 +200,40 @@ def test_tokenizer_method(self):
         tokenizer_parsed_chat = tokenizer.parse_response(model_out)
         self.assertEqual(tokenizer_parsed_chat, parsed_chat)

+    def test_batched_inputs(self):
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
+        model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>'
+        tokenizer.response_schema = cohere_schema
+        parsed_chat = tokenizer.parse_response(model_out)
+        self.assertEqual(tokenizer.parse_response([model_out]), [parsed_chat])
+        self.assertEqual(tokenizer.parse_response([model_out] * 2), [parsed_chat] * 2)
+
+    def test_token_id_inputs(self):
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # Need an actual tokenizer to encode
+        model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>'
+        tokenizer.response_schema = cohere_schema
+        parsed_chat = tokenizer.parse_response(model_out)
+        tokenized_out = tokenizer(model_out).input_ids
+        self.assertEqual(tokenizer.parse_response(tokenized_out), parsed_chat)
+        self.assertEqual(tokenizer.parse_response([tokenized_out]), [parsed_chat])
+        self.assertEqual(tokenizer.parse_response([tokenized_out] * 2), [parsed_chat] * 2)
+
+    def test_numpy_inputs(self):
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # Need an actual tokenizer to encode
+        model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>'
+        tokenizer.response_schema = cohere_schema
+        parsed_chat = tokenizer.parse_response(model_out)
+        tokenized_out = tokenizer(model_out, return_tensors="np").input_ids
+        self.assertEqual(tokenizer.parse_response(tokenized_out), [parsed_chat])
+
+    def test_tensor_inputs(self):
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # Need an actual tokenizer to encode
+        model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>'
+        tokenizer.response_schema = cohere_schema
+        parsed_chat = tokenizer.parse_response(model_out)
+        tokenized_out = tokenizer(model_out, return_tensors="pt").input_ids
+        self.assertEqual(tokenizer.parse_response(tokenized_out), [parsed_chat])
+
     def test_cohere_template(self):
         model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>'
         parsed_chat = recursive_parse(model_out, cohere_schema)
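
One subtlety the numpy/torch tests pin down: `return_tensors="np"` / `"pt"` produce 2-D arrays of shape `(1, seq_len)`, so the new `ndim > 1` check classifies them as batches and `parse_response` returns a one-element list. A minimal standalone sketch of that heuristic (mirroring the `batched` expression in the first file's diff; `is_batched` is an illustrative name, not part of the API):

    import numpy as np

    def is_batched(response):
        # A list whose first element is not an int, or a >1-D array/tensor,
        # is treated as a batch of sequences.
        return (isinstance(response, list) and not isinstance(response[0], int)) or getattr(
            response, "ndim", 0
        ) > 1

    assert not is_batched([1, 2, 3])            # flat ID list: one sequence
    assert is_batched([[1, 2], [3, 4]])         # list of ID lists: batch
    assert is_batched(["a", "b"])               # list of strings: batch
    assert is_batched(np.array([[1, 2, 3]]))    # shape (1, 3), as return_tensors="np" yields
    assert not is_batched(np.array([1, 2, 3]))  # 1-D array: one sequence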
