1 parent e38c8e9 commit cf7a643
vllm_gaudi/attention/backends/hpu_attn.py
@@ -352,6 +352,13 @@ def _forward_decode( # type: ignore
         result = self._v_up_proj(output)
         return result
 
+    # NOTE(Xinyu): Make the loaded weight contiguous to avoid the transpose
+    # during each graph execution
+    def process_weights_after_loading(self, act_dtype: torch.dtype):
+        super().process_weights_after_loading(act_dtype)
+        self.W_UV = self.W_UV.contiguous()
+        self.W_UK_T = self.W_UK_T.contiguous()
+
     # NOTE(Chendi): PR25184 using output buffer as default, which can't be used in HPU Graph,
     # so we override and always return a new tensor
     def _v_up_proj(self, x):
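
The NOTE(Xinyu) hunk leans on a general PyTorch fact: transposing a tensor returns a strided view, not a new memory layout, so any consumer that needs a dense operand must re-materialize the transpose on every call unless it is done once up front. A minimal sketch of that behavior, with illustrative shapes rather than the module's actual weights:

import torch

# A transpose returns a view with swapped strides; no data moves,
# so the result reports as non-contiguous.
w = torch.randn(128, 64)
w_t = w.t()
assert not w_t.is_contiguous()

# contiguous() copies the data into the transposed layout once, at
# load time, instead of on every downstream use.
w_t = w_t.contiguous()
assert w_t.is_contiguous()

# Matmuls can now consume w_t directly; a backend that needs dense
# operands no longer re-lays-out the view on each graph execution.
x = torch.randn(4, 128)
y = x @ w_t  # (4, 128) @ (128, 64) -> (4, 64)
print(y.shape)  # torch.Size([4, 64])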
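The NOTE(Chendi) hunk is truncated before the body of the overridden _v_up_proj, so the following is only a hypothetical contrast between the two styles it names, using a plain torch.matmul in place of the module's real up-projection:

import torch

def project_into_buffer(x, w, out):
    # Upstream default per the NOTE: write into a preallocated output
    # buffer. The NOTE states this pattern can't be used in an HPU Graph.
    torch.matmul(x, w, out=out)
    return out

def project_fresh(x, w):
    # The override's stated behavior: always allocate and return a new
    # tensor on every call.
    return torch.matmul(x, w)

x, w = torch.randn(4, 128), torch.randn(128, 64)
buf = torch.empty(4, 64)
assert torch.allclose(project_into_buffer(x, w, buf), project_fresh(x, w))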