Commit cf7a643

make mla weight contiguous
Signed-off-by: Xinyu Chen <[email protected]>
1 parent e38c8e9 commit cf7a643

File tree

1 file changed: +7 -0 lines changed

vllm_gaudi/attention/backends/hpu_attn.py

Lines changed: 7 additions & 0 deletions
@@ -352,6 +352,13 @@ def _forward_decode( # type: ignore
         result = self._v_up_proj(output)
         return result
 
+    # NOTE(Xinyu): Make the loaded weight contiguous to avoid the transpose
+    # during each graph execution
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        super().process_weights_after_loading(act_dtype)
+        self.W_UV = self.W_UV.contiguous()
+        self.W_UK_T = self.W_UK_T.contiguous()
+
     # NOTE(Chendi): PR25184 using output buffer as default, which can't be used in HPU Graph,
     # so we override and always return a new tensor
     def _v_up_proj(self, x):
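
For context on why this helps: a weight like W_UK_T is typically produced as a transposed view of the loaded tensor, so its memory layout is non-contiguous, and (per the NOTE above) a backend that captures execution graphs then replays a transpose/re-layout on every graph run. Below is a minimal standalone PyTorch sketch of the idea; the tensor names and shapes are illustrative stand-ins, not the actual MLA weights or vLLM code:

import torch

# Illustrative shapes only; the real MLA weights live on the attention backend.
W_UK = torch.randn(512, 128)
W_UK_T = W_UK.t()  # transposing returns a view with a non-contiguous layout
assert not W_UK_T.is_contiguous()

x = torch.randn(4, 512)

# Each use of the non-contiguous view can force the backend to re-layout
# (effectively transpose) the weight; inside a captured graph that work may
# be replayed on every execution.
y_before = x @ W_UK_T

# The commit's approach: materialize the contiguous layout once, right after
# the weights are loaded, so later executions reuse the contiguous buffer.
W_UK_T = W_UK_T.contiguous()  # one-time copy
assert W_UK_T.is_contiguous()
y_after = x @ W_UK_T

# The result is numerically identical; only the memory layout changed.
torch.testing.assert_close(y_before, y_after)

In effect, the one-time copy in process_weights_after_loading trades a single materialization at load time for contiguous reads on every subsequent decode step.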
