
Commit 08b55f1

Enable exporting Voxtral on MPS (#142)
* Enable exporting Voxtral on MPS
* Take care of example input dtype
* Fix warning
* Reformat

---------

Co-authored-by: Mengwei Liu <[email protected]>
Co-authored-by: Mengwei Liu <[email protected]>
1 parent 1a582c6 commit 08b55f1


3 files changed: +30 −9 lines changed


optimum/executorch/attentions/custom_kv_cache.py

Lines changed: 15 additions & 3 deletions
@@ -51,7 +51,11 @@ def __init__(
             batch_size=max_batch_size, num_heads=num_heads, head_dim=head_dim, dtype=dtype, device=device
         )
 
-        assert device is None or device == "cpu", "Device must be None or 'cpu'"
+        assert device is None or device in [
+            "cpu",
+            "cuda",
+            "mps",
+        ], "Device must be None or one of 'cpu', 'cuda' or 'mps'."
 
         # Create a list of CustomKVCache instances derived from each layer of the original Transformers cache, one per layer.
         self.kv_cache = torch.nn.ModuleList()
@@ -63,6 +67,8 @@ def __init__(
                 head_dim=layer.head_dim,
                 dtype=dtype,
             )
+            layer_cache.k_cache = layer_cache.k_cache.to(device)
+            layer_cache.v_cache = layer_cache.v_cache.to(device)
             self.kv_cache.append(layer_cache)
 
     def update(
@@ -160,7 +166,7 @@ def from_legacy_cache(
         elif dtype is None and hasattr(legacy_cache.k_cache, "dtype"):
             dtype = legacy_cache.k_cache.dtype
 
-        assert device is None or device == "cpu"
+        # assert device is None or device == "cpu"
         assert dtype is None or dtype == torch.float32
 
         # Use the legacy cache's max_seq_len if max_cache_len is not specified
@@ -206,7 +212,11 @@ def __init__(
             batch_size=max_batch_size, num_heads=num_heads, head_dim=head_dim, dtype=dtype, device=device
         )
 
-        assert device is None or device == "cpu", "Device must be None or 'cpu'"
+        assert device is None or device in [
+            "cpu",
+            "cuda",
+            "mps",
+        ], "Device must be None or one of 'cpu', 'cuda' or 'mps'."
 
         self.cache_position = None
         # Create a list of cache instances, one per layer.
@@ -230,6 +240,8 @@ def __init__(
                 head_dim=layer.head_dim,
                 dtype=dtype,
             )
+            layer_cache.k_cache = layer_cache.k_cache.to(device)
+            layer_cache.v_cache = layer_cache.v_cache.to(device)
             self.kv_cache.append(layer_cache)
 
     def update(
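
The relocation pattern the hunks above apply per layer can be illustrated with a small standalone sketch. TinyKVCache below is a hypothetical stand-in, not the real CustomKVCache; the only assumption carried over is that each layer cache exposes k_cache/v_cache buffers that start out on CPU and are moved to the requested device after construction.

import torch


class TinyKVCache(torch.nn.Module):
    """Stand-in for a per-layer KV cache whose buffers are allocated on CPU."""

    def __init__(self, max_seq_len: int, num_heads: int, head_dim: int, dtype=torch.float32):
        super().__init__()
        self.register_buffer("k_cache", torch.zeros(1, max_seq_len, num_heads, head_dim, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(1, max_seq_len, num_heads, head_dim, dtype=dtype))


device = "mps" if torch.backends.mps.is_available() else "cpu"
layer_cache = TinyKVCache(max_seq_len=128, num_heads=8, head_dim=64)
# Same pattern as the diff: relocate the pre-allocated buffers onto the target device.
layer_cache.k_cache = layer_cache.k_cache.to(device)
layer_cache.v_cache = layer_cache.v_cache.to(device)
print(layer_cache.k_cache.device)  # mps:0 on Apple silicon, cpu otherwise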

optimum/exporters/executorch/integrations.py

Lines changed: 12 additions & 4 deletions
@@ -65,7 +65,7 @@ def prepare_export_inputs(self):
             raise ValueError(
                 f"Unable to obtain sample audio encoder inputs for export for {model_id} - the processor did not return formatted inputs with the 'pixel_values' key: {processed_inputs}"
             )
-        export_inputs = processed_inputs["pixel_values"]
+        export_inputs = processed_inputs["pixel_values"].to(dtype=self.model.dtype)
 
         # 2. Get export dynamic shapes
         dynamic_shapes = None  # No batching for now.
@@ -126,7 +126,7 @@ def prepare_export_inputs(self):
             raise ValueError(
                 f"Unable to obtain sample audio encoder inputs for export for {model_id} - the processor did not return formatted inputs with the 'input_features' key: {processed_inputs}"
             )
-        export_inputs = processed_inputs["input_features"]
+        export_inputs = processed_inputs["input_features"].to(dtype=self.model.dtype)
         # Make sure the export inputs has a batch size > 1 so that it doesn't 0/1 specialize.
         if export_inputs.shape[0] == 1:
             export_inputs = export_inputs.repeat(2, 1, 1)
@@ -242,7 +242,9 @@ def _prepare_decoder_only_export_inputs(self, max_seq_len: int):
 
         # Prepare inputs with dynamic shapes
         seq_length = 3
-        example_inputs_embeds = torch.zeros((1, seq_length, self.config.text_config.hidden_size), dtype=torch.float)
+        example_inputs_embeds = torch.zeros(
+            (1, seq_length, self.config.text_config.hidden_size), dtype=self.model.dtype
+        )
         example_cache_position = torch.arange(seq_length, dtype=torch.long)
 
         seq_len_dim = torch.export.Dim("seq_length_dim", max=max_seq_len)
@@ -311,6 +313,9 @@ def export(
         logging.info(
             f"Exporting decoder using inputs_embeds({inputs_embeds.shape}), cache_position({cache_position.shape})={cache_position}, dynamic_shapes={dynamic_shapes}"
         )
+        # Move inputs to the same device as the model
+        inputs_embeds = inputs_embeds.to(self.model.device)
+        cache_position = cache_position.to(self.model.device)
         exported_program = exportable_module.export(
             inputs_embeds=inputs_embeds,
             cache_position=cache_position,
@@ -341,7 +346,8 @@ def export(
         logging.info(
             f"Exporting token embeddings using input_ids({input_ids.shape}), dynamic_shapes={dynamic_shapes}"
         )
-
+        # Move inputs to the same device as the model
+        input_ids = input_ids.to(self.model.device)
         token_embedding_exported_program = torch.export.export(
             self.model.get_input_embeddings(),
             args=(input_ids,),
@@ -369,6 +375,8 @@ def export(
             f"Exporting {self.modality} encoder using input_features({input_features.shape}), dynamic_shapes={dynamic_shapes}"
         )
 
+        # Move inputs to the same device as the model
+        input_features = input_features.to(self.model.device)
         encoder_exported_program = torch.export.export(
             encoder,
             args=(),
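
All of the dtype and device tweaks above follow the same rule: example inputs handed to torch.export should match the dtype and device of the model being traced. Below is a minimal sketch of that rule with a hypothetical align_example_input helper; the helper is not part of optimum.executorch, and the toy Linear model only stands in for the Voxtral encoder/decoder.

import torch


def align_example_input(example: torch.Tensor, model: torch.nn.Module) -> torch.Tensor:
    # Match the example tensor to the dtype/device of the model's parameters,
    # mirroring the .to(dtype=self.model.dtype) / .to(self.model.device) calls above.
    param = next(model.parameters())
    return example.to(dtype=param.dtype, device=param.device)


model = torch.nn.Linear(16, 16)
example = align_example_input(torch.zeros(2, 16, dtype=torch.float64), model)
exported_program = torch.export.export(model, args=(example,))
print(exported_program.graph_signature)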

optimum/exporters/executorch/tasks/multimodal_text_to_text.py

Lines changed: 3 additions & 2 deletions
@@ -115,7 +115,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
         MultiModalTextToTextExportableModule:
             An instance of `MultiModalTextToTextExportableModule` for exporting and lowering to ExecuTorch.
     """
-    device = "cpu"
+    device = kwargs.get("device", "cpu")
     batch_size = 1
     dtype = kwargs.get("dtype", "float32")
     use_custom_sdpa = kwargs.get("use_custom_sdpa", False)
@@ -166,7 +166,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
     eager_model = AutoModelForPreTraining.from_pretrained(
         model_name_or_path,
         device_map=device,
-        torch_dtype=dtype,
+        dtype=dtype,
         config=config,
         attn_implementation=attn_implementation,
     )
@@ -177,6 +177,7 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
         cache_config={
             "batch_size": batch_size,
             "max_cache_len": max_length,
+            "device": device,
         },
     )
     decoder_name, audio_encoder_name, vision_encoder_name = _validate_multimodal_components(eager_model)
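
With device now read from kwargs, an MPS export could presumably be requested as below. This is a hedged sketch, not a confirmed invocation: the checkpoint id and the reliance on defaults for the remaining kwargs are assumptions, and only the device and dtype keys come from this diff (the import path is inferred from the file name above).

from optimum.exporters.executorch.tasks.multimodal_text_to_text import (
    load_multimodal_text_to_text_model,
)

# "device" is picked up via kwargs.get("device", "cpu") and forwarded to
# device_map= and cache_config; "dtype" is forwarded to from_pretrained(dtype=...).
module = load_multimodal_text_to_text_model(
    "mistralai/Voxtral-Mini-3B-2507",  # assumed Voxtral checkpoint id
    device="mps",
    dtype="bfloat16",
)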
