Skip to content

Commit f629248

Browse files
committed
✅ Based on the PR comments: changed the test case to check for an exact expected token count instead of only checking that the count is non-zero; added return_attention_mask=True to the tokenizer call in the run_tokenizer method.
Signed-off-by: m-misiura <[email protected]>
1 parent 261e1a3 commit f629248

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

caikit_nlp/modules/text_generation/text_generation_local.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,7 +592,7 @@ def run_tokenizer(
592592
The token count
593593
"""
594594
error.type_check("<NLP48137045E>", str, text=text)
595-
tokenized_output = self.model.tokenizer(text)
595+
tokenized_output = self.model.tokenizer(text, return_attention_mask=True)
596596
return TokenizationResults(
597597
token_count=len(tokenized_output["input_ids"]),
598598
)

tests/modules/text_generation/test_text_generation_local.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,10 @@ def test_run_tokenizer_edge_cases(disable_wip, set_cpu_device):
228228
short_text = "This is a test sentence."
229229
short_result = model.run_tokenizer(short_text)
230230
assert isinstance(short_result, TokenizationResults)
231-
assert short_result.token_count > 0
231+
assert short_result.token_count == len(model.model.tokenizer.encode(short_text))
232232

233233
# Edge case: Long input
234234
long_text = "This is a test sentence. " * 1000
235235
long_result = model.run_tokenizer(long_text)
236236
assert isinstance(long_result, TokenizationResults)
237-
assert long_result.token_count > 0
237+
assert long_result.token_count == len(model.model.tokenizer.encode(long_text))

0 commit comments

Comments (0)