Commit 8d92864

attafosu and Copilot authored
multimodal support for unified attn (#423)
- Enables multimodal support for unified attention

Signed-off-by: attafosu <[email protected]>
Signed-off-by: Thomas Atta-Fosu <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 8ef73e6 commit 8d92864

File tree

3 files changed: +132 -31 lines changed

- tests/full_tests/ci_gsm8k_tests.sh
- vllm_gaudi/ops/hpu_rotary_embedding.py
- vllm_gaudi/v1/worker/hpu_model_runner.py

tests/full_tests/ci_gsm8k_tests.sh

Lines changed: 9 additions & 0 deletions
@@ -218,6 +218,14 @@ run_qwen2_5_vl_test() {
     echo "✅ Test with multimodal-support with qwen2.5-vl-7b passed."
 }
 
+# Multimodal-support + unified attention with qwen2.5-vl
+run_qwen2_5_vl_unified_attn_test() {
+    echo "➡️ Testing Qwen2.5-VL-7B with unified attention..."
+    VLLM_SKIP_WARMUP=true VLLM_UNIFIED_ATTN=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \
+        python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/qwen2.5-vl-7b.yaml"
+    echo "✅ Test multimodal-support + unified attention with qwen2.5-vl-7b passed."
+}
+
 # Spec decode with ngram
 run_spec_decode_ngram_test() {
     echo "➡️ Testing Spec-decode with ngram..."

@@ -292,6 +300,7 @@ launch_all_tests() {
     run_gsm8k_deepseek_test
     run_gsm8k_qwen3_30b_test
     run_qwen2_5_vl_test
+    run_qwen2_5_vl_unified_attn_test
     run_spec_decode_ngram_test
     run_spec_decode_eagle3_test
     run_spec_decode_eagle3_num_spec_2_test

vllm_gaudi/ops/hpu_rotary_embedding.py

Lines changed: 6 additions & 0 deletions
@@ -657,6 +657,12 @@ def forward_oot(
     ) -> tuple[torch.Tensor, torch.Tensor]:
         from habana_frameworks.torch.hpex.kernels import (RotaryPosEmbeddingMode, apply_rotary_pos_emb)
 
+        # NOTE (attafosu): positions is expected to be a 2D tensor [3, seq_len],
+        # but the unified_attention API sends it as 3D [3, seq_len, 1],
+        # so we flatten it to 2D here.
+        if positions.ndim == 3:
+            assert positions.shape[-1] == 1, "Expected last dimension to be 1 for 3d positions"
+            positions = positions.squeeze(-1)
         num_tokens = positions.shape[-1]
         cos_sin = self.cos_sin_cache[positions]
         cos, sin = cos_sin.chunk(2, dim=-1)
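
For reference, a minimal standalone sketch (shapes invented for this note) of what the new guard does: a 3D M-RoPE position tensor of shape [3, seq_len, 1], as handed over by the unified-attention path, is squeezed back to the 2D [3, seq_len] layout that the rotary code indexes the cos/sin cache with.

import torch

# Hypothetical positions as sent by the unified-attention path: [3, seq_len, 1].
positions = torch.arange(3 * 7).reshape(3, 7, 1)

if positions.ndim == 3:
    assert positions.shape[-1] == 1, "Expected last dimension to be 1 for 3d positions"
    positions = positions.squeeze(-1)  # back to the expected 2D layout [3, seq_len]

num_tokens = positions.shape[-1]
print(positions.shape, num_tokens)  # torch.Size([3, 7]) 7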

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 117 additions & 31 deletions
@@ -1440,6 +1440,41 @@ def _gather_mm_embeddings(
 
         return mm_embeds, is_mm_embed
 
+    def _get_model_mm_inputs(
+        self,
+        token_ids: torch.Tensor,
+        total_num_scheduled_tokens: Optional[int],
+        scheduler_output: "SchedulerOutput",
+        req_ids: list[str],
+    ) -> tuple[torch.Tensor | None, dict[str, Any] | None]:
+        inputs_embeds = None
+        model_mm_kwargs = None
+        if self.supports_mm_inputs:
+            # Run the multimodal encoder if any.
+            with self.profiler.record_event('internal', 'prepare_input_encoders'):
+                self._execute_mm_encoder(scheduler_output, req_ids)
+
+            mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output,
+                                                                req_ids,
+                                                                total_num_scheduled_tokens=total_num_scheduled_tokens)
+            # TODO: Only get embeddings for valid token_ids. Ignore token_ids[<pad_idxs>] # noqa
+            # This may require moving multimodal input preps into _prepare_inputs, # noqa
+            # to avoid padding issues.
+            htorch.core.mark_step()
+            inputs_embeds = self.model.embed_input_ids(
+                token_ids,
+                multimodal_embeddings=mm_embeds,
+                is_multimodal=is_mm_embed,
+            )
+
+            model_mm_kwargs = self._extract_mm_kwargs(scheduler_output)
+            model_mm_kwargs = MultiModalKwargs.as_kwargs(
+                model_mm_kwargs,
+                device=self.device,
+            )
+
+        return inputs_embeds, model_mm_kwargs
+
     def get_model(self) -> torch.nn.Module:
         if isinstance(self.model, HpuModelAdapter):
             return self.model.model
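
The important contract of the new helper is the returned (inputs_embeds, model_mm_kwargs) pair: text token ids are embedded, and the slots flagged by is_mm_embed are filled with the encoder's multimodal embeddings. The toy sketch below approximates that merge with plain tensors; the internals of embed_input_ids, the mask layout, and all shapes here are illustrative assumptions, not the vLLM implementation.

import torch

hidden, vocab = 8, 32
embed_table = torch.randn(vocab, hidden)             # stand-in for the text embedding table

token_ids = torch.tensor([[1, 2, 3, 4, 5, 6]])       # [1, seq_len], batch-first
is_mm_embed = torch.tensor([[False, True, True, False, False, False]])  # image placeholder slots
mm_embeds = torch.randn(2, hidden)                    # one row per placeholder, from the mm encoder

inputs_embeds = embed_table[token_ids]                # [1, seq_len, hidden] text embeddings
inputs_embeds[is_mm_embed] = mm_embeds                # overwrite placeholder slots with image features

print(inputs_embeds.shape)                            # torch.Size([1, 6, 8])
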
@@ -1644,6 +1679,54 @@ def _align_and_pad_mrope_positions(self, req_ids: list[str], context_lens: list[
             dst_start += target_len
         return mrope_position_tensor
 
+    # modified from: vllm/v1/worker/gpu_model_runner.py:_calc_mrope_positions
+    def get_unified_mrope_position_ids(self, req_ids: list[str], num_computed_tokens: torch.Tensor,
+                                       num_scheduled_tokens: torch.Tensor, target_len: int,
+                                       padding_gen: int) -> torch.Tensor:
+        out_shape = (3, target_len)
+        mrope_position_tensor = torch.full(out_shape, padding_gen, dtype=torch.int32, device='cpu')
+        mrope_pos_ptr = 0
+        for index, req_id in enumerate(req_ids):
+            req = self.requests[req_id]
+            assert req.mrope_positions is not None
+
+            context_len = num_computed_tokens[index]
+            query_len = num_scheduled_tokens[index]
+            num_prompt_tokens = len(
+                req.prompt_token_ids)  # The gpu runner uses either prompt_token_ids or prompt_embeds # noqa: E501
+
+            if context_len + query_len > num_prompt_tokens:
+                prompt_part_len = max(0, num_prompt_tokens - context_len)
+                completion_part_len = max(0, query_len - prompt_part_len)
+            else:
+                prompt_part_len = query_len
+                completion_part_len = 0
+
+            assert query_len == prompt_part_len + completion_part_len
+            if prompt_part_len > 0:
+                # prompt's mrope_positions are pre-computed
+                dst_start = mrope_pos_ptr
+                dst_end = mrope_pos_ptr + prompt_part_len
+                src_start = context_len
+                src_end = context_len + prompt_part_len
+                mrope_position_tensor[:, dst_start:dst_end].copy_(req.mrope_positions[:, src_start:src_end],
+                                                                  non_blocking=True)
+
+                mrope_pos_ptr += prompt_part_len
+            if completion_part_len > 0:
+                # compute completion's mrope_positions on-the-fly
+                dst_start = mrope_pos_ptr
+                dst_end = mrope_pos_ptr + completion_part_len
+                pos_for_mrope = MRotaryEmbedding.get_next_input_positions(
+                    mrope_position_delta=req.mrope_position_delta,
+                    context_len=context_len + prompt_part_len,
+                    seq_len=context_len + prompt_part_len + completion_part_len,
+                )
+                mrope_position_tensor[:, dst_start:dst_end] = torch.tensor(pos_for_mrope, dtype=torch.int32)
+                mrope_pos_ptr += completion_part_len
+
+        return mrope_position_tensor.to('hpu', non_blocking=True)
+
     def _skip_bucketing(self, seq_lens, num_blocks):
         return (len(seq_lens), 0, 0)
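
The prompt/completion split above decides which M-RoPE positions can simply be copied from the request's precomputed mrope_positions and which must be generated for fresh decode tokens. With invented numbers (not taken from this PR), the same arithmetic looks like this:

# Illustrative values only: a 10-token prompt, 8 tokens already computed,
# 5 tokens scheduled in this step.
num_prompt_tokens = 10
context_len = 8
query_len = 5

if context_len + query_len > num_prompt_tokens:
    prompt_part_len = max(0, num_prompt_tokens - context_len)   # 2 positions copied from the prompt
    completion_part_len = max(0, query_len - prompt_part_len)   # 3 positions computed on the fly
else:
    prompt_part_len = query_len
    completion_part_len = 0

assert query_len == prompt_part_len + completion_part_len
print(prompt_part_len, completion_part_len)  # 2 3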

@@ -2857,10 +2940,19 @@ def prepare_unified_batch(self, scheduler_output):
         self._prepare_input_ids(scheduler_output)
         input_ids_hpu = self.input_ids_hpu
 
-        return create_unified_batch(self.input_batch.req_ids, all_token_ids, num_computed_tokens, num_scheduled_tokens,
-                                    num_prompt_tokens, block_table, self.block_size, self.dtype,
-                                    self.unified_attn_persistent_ctx, self.unified_bucketing_fn, self.get_dp_padding,
-                                    input_ids_hpu, num_decodes)
+        batch = create_unified_batch(self.input_batch.req_ids, all_token_ids, num_computed_tokens, num_scheduled_tokens,
+                                     num_prompt_tokens, block_table, self.block_size, self.dtype,
+                                     self.unified_attn_persistent_ctx, self.unified_bucketing_fn, self.get_dp_padding,
+                                     input_ids_hpu, num_decodes)
+        if self.uses_mrope:
+            batch.token_positions = self.get_unified_mrope_position_ids(
+                self.input_batch.req_ids,
+                num_computed_tokens,
+                num_scheduled_tokens,
+                target_len=batch.token_ids.size(0),
+                padding_gen=-1,
+            )
+        return batch
 
     @torch.inference_mode()
     def unified_execute_model(self,
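
A toy illustration (values invented for this note) of the buffer that prepare_unified_batch now attaches as batch.token_positions: a [3, target_len] tensor pre-filled with padding_gen, where only the columns that belong to scheduled tokens receive real positions. In the real tensor the three rows carry the temporal/height/width components and generally differ; they are kept identical here for brevity.

import torch

target_len = 8     # bucketed/padded token count, e.g. batch.token_ids.size(0)
padding_gen = -1   # padding value for unused slots
real_tokens = 5    # tokens actually scheduled across all requests

token_positions = torch.full((3, target_len), padding_gen, dtype=torch.int32)
# Pretend the per-request loop filled the first `real_tokens` columns.
token_positions[:, :real_tokens] = torch.arange(real_tokens, dtype=torch.int32)

print(token_positions)
# tensor([[ 0,  1,  2,  3,  4, -1, -1, -1],
#         [ 0,  1,  2,  3,  4, -1, -1, -1],
#         [ 0,  1,  2,  3,  4, -1, -1, -1]], dtype=torch.int32)
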
@@ -2871,6 +2963,18 @@
         with self.profiler.record_event('internal', 'prepare_unified_batch'):
             batch = self.prepare_unified_batch(scheduler_output)
         htorch.core.mark_step()
+
+        # Prepare multimodal inputs if any
+        inputs_embeds, model_mm_kwargs = self._get_model_mm_inputs(
+            batch.token_ids.unsqueeze(
+                0  # NOTE(attafosu): unsqueeze at dim 0 so the input token shape matches the batch-first format expected by the model.embed_input_ids() call in _get_model_mm_inputs and downstream model components.  # noqa E501
+            ),
+            batch.token_ids.shape[0],
+            scheduler_output,
+            self.input_batch.req_ids,
+        )
+        htorch.core.mark_step()
+
         if self.is_driver_worker:
             unified_attn_cfg = self._get_unified_config(batch.attn_metadata, batch.logits_indices)
             (phase, qlen, num_shared_blocks, num_unique_blocks, num_logits) = unified_attn_cfg
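
The unsqueeze(0) above is purely a reshape: it turns the unified batch's flat token-id vector into the batch-first layout that embed_input_ids expects. A trivial sketch with invented values:

import torch

token_ids = torch.tensor([101, 7592, 2088, 0, 0])  # flat [num_tokens] layout used by the unified batch
batched = token_ids.unsqueeze(0)                    # [1, num_tokens], batch-first

print(token_ids.shape, batched.shape)               # torch.Size([5]) torch.Size([1, 5])
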
@@ -2888,6 +2992,8 @@
             kv_caches=self.kv_caches,
             lora_logits_mask=None,
             lora_mask=None,
+            inputs_embeds=inputs_embeds,
+            model_mm_kwargs=model_mm_kwargs,
             warmup_mode=warmup_mode)
         selected_req_ids = [batch.req_ids_cpu[idx] for idx in batch.logits_groups_cpu.tolist()]
         htorch.core.mark_step()

@@ -3127,33 +3233,13 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOu
         for idx, (req_id, prompt_len, token_ids, position_ids, attn_metadata, logits_indices,
                   logits_requests) in enumerate(zip(*shallow_tuple(prefill_data))):
 
-            inputs_embeds = None
-            model_mm_kwargs = None
-            if self.supports_mm_inputs:
-                # Run the multimodal encoder if any.
-                with self.profiler.record_event('internal', 'prepare_input_encoders'):
-                    self._execute_mm_encoder(scheduler_output, req_id)
-                htorch.core.mark_step()
-
-                mm_embeds, is_mm_embed = self._gather_mm_embeddings(scheduler_output,
-                                                                    req_id,
-                                                                    total_num_scheduled_tokens=token_ids.shape[-1])
-                htorch.core.mark_step()
-
-                # TODO: Only get embeddings for valid token_ids. Ignore token_ids[<pad_idxs>] # noqa E501
-                # This may require moving multimodal input preps into _prepare_inputs, # noqa E501
-                # to avoid padding issues.
-                inputs_embeds = self.model.embed_input_ids(
-                    token_ids,
-                    multimodal_embeddings=mm_embeds,
-                    is_multimodal=is_mm_embed,
-                )
-
-                model_mm_kwargs = self._extract_mm_kwargs(scheduler_output)
-                model_mm_kwargs = MultiModalKwargs.as_kwargs(
-                    model_mm_kwargs,
-                    device=self.device,
-                )
+            # Prepare multimodal inputs if any
+            inputs_embeds, model_mm_kwargs = self._get_model_mm_inputs(
+                token_ids,
+                token_ids.shape[-1],
+                scheduler_output,
+                req_id,
+            )
 
             lora_mask, lora_logits_mask = self._configure_lora(token_ids, self.requests, req_id, True)
 