diff --git a/tests/models/video_llama_3/test_modeling_video_llama_3.py b/tests/models/video_llama_3/test_modeling_video_llama_3.py
index 5ad0cf9d7d4c..0a6ba10898d8 100644
--- a/tests/models/video_llama_3/test_modeling_video_llama_3.py
+++ b/tests/models/video_llama_3/test_modeling_video_llama_3.py
@@ -36,6 +36,7 @@
     is_torch_available,
 )
 from transformers.testing_utils import (
+    Expectations,
     backend_empty_cache,
     require_flash_attn,
     require_torch,
@@ -831,7 +832,14 @@ def test_small_model_integration_test(self):
         torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)
 
         output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
-        EXPECTED_DECODED_TEXT = "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress"
+        # fmt: off
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
+                ("xpu", None): "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
+            }
+        ).get_expectation()
+        # fmt: on
 
         self.assertEqual(
             self.processor.decode(output[0], skip_special_tokens=True),
@@ -874,11 +882,21 @@ def test_small_model_integration_test_batch_wo_image(self):
 
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
+        # fmt: off
+        EXPECTED_DECODED_TEXT = Expectations(
+            {
+                ("cuda", None): [
+                    "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
+                    "user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
+                ],
+                ("xpu", None): [
+                    "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant night scene in a bustling Japanese city. A woman in a striking red dress",
+                    "user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
+                ],
+            }
+        ).get_expectation()
+        # fmt: on
 
-        EXPECTED_DECODED_TEXT = [
-            "user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress",
-            "user\nWhat is relativity?\nassistant\nRelativity is a scientific theory that describes the relationship between space and time. It was first proposed by",
-        ]  # fmt: skip
         self.assertEqual(
             self.processor.batch_decode(output, skip_special_tokens=True),
             EXPECTED_DECODED_TEXT,