
Commit 4817c71

Authored by yannicks1, nikolaospapandreou, tdoublep, joerunde, and sducouedic
Paged attention/ new fms API (#82)
* [Continuous batching] FMS model wrapper (#18)
  * fms wrapper dummy for continuous batching implementation, gating via env var VLLM_SPYRE_USE_CB
  * implementing fms wrapper with correct KV cache management
  * disable prints by default
  * code refactoring fms wrapper
  * fix default path not using CB / fms wrapper
  * correct print when TESTING_CB
  * remove self.past_key_value_states when KV cache is managed by FMS wrapper
  * read out only active pages of KV cache (covers the case where current batch size < max batch size)
  * uniquely distinguishing prefills and decodes
  * reading kv cache dimension from model config
  * cosmetics and comments
  * support for gpt big code models
  * bugfix hard-coded test mask
  * change KV cache type for prefill
  * update tkv in fms wrapper
  * moving fms wrapper to own class
  * reset tkv for new prompt
  * ignoring test_spyre_tensor_parallel.py, since FMS wrapper does not support it
  * removing VLLM_SPYRE_USE_CB, since FMS wrapper is now used by default
  * typing fms wrapper class
* moving model loading into FMS wrapper (#35)
* bugfix idx kv cache update (#40)
* FMS Wrapper for static batching (#39)
  * introducing pseudo fms wrapper for static batching
  * small bug fix
  * bugfix idx kv cache update
* [Continuous Batching] Introducing new env variables (#67)
  * introducing env variables for AIU Spyre KV cache dimensions
  * removing prints
* [Continuous batching] Initial cb test (#52)
  * initial cb test
  * make tkv, active_pages optional in SpyreCausalLM class for the V0 tests
  * format
  * remove manual testing and fix formatting
  * remove tkv2fms
  * remove unnecessary class variables
  * tidy up class variables
  * simplify code: req_ids2idx and active_pages will be reset in prepare input anyway
  * renaming variable
  * removing batch padding in prefill stage
  * indices always a list of Trues since there are no padded or removed sequences
  * fix active/free page handling
  * avoiding unnecessary tensor construction
  * fix sorting inconsistency between token/position_ids and masks
  * refactoring to not require req_ids2idx
  * removing unused class variables, simplifying code
  * use VLLM_SPYRE_MAX_BATCH_SIZE to control (decoding) batch size on AIU Spyre
  * removing unnecessary helper functions for schedule and add_request
  * removing unused argument
* re-enabling TP tests
* addressing feedback: renaming and removing unused code
* removing unnecessary getter function and other feedback
* integrating new FMS API on branch 'paged_attn_mock'
* torch dynamo: mark dynamic/static shapes
* bugfix key_value_states name
* making block_table and slot_mapping args, not class vars
* formatting after browser merge
* nicer handling of arguments for continuous vs static batching
* Implement warmup for continuous batching (#83)
  * fmt
  * freeing block directly and small things
* initialize tkv
* Return empty ModelRunnerOutput if no work
* update mask for decode
* Fix copy/paste error
* adaptive logging (thx joerunde)
* remove warmup shapes for continuous batching
* assuring prefill lengths are multiples of block size 64 in example script
* revert change to warmup shape
* 🎨 fmt
* Added call to update_lazyhandle
* Right padding of prompts (#95)
  * right padding initial implementation
  * fix right padding: remove the right-padded logits before sampling
  * fix typing
* [CB] Fix Tensor Parallelism Error (#103)
  * divide tensor third dimension by number of TP
  * use existing method from vllm to get 'num_kv_heads' (works also for TP > 1)
* support granite-3.2-8b-instruct (#106)
* comments
* adapt to change of arguments in fms
* fix mypy issue
* revising continuous batching scheduler
* [V1] Decoupling static and continuous batching (#116)
  * decoupling static and continuous batching scheduler
  * fix dynamo cache for continuous batching
  * removing warmup shape dependency for continuous batching
* addressing review cosmetics
* fix/refactor: remove last_running and total_running (#112)
* fix comment kv cache tensor initialization

Signed-off-by: Yannick Schnider <[email protected]>
Signed-off-by: Nikolaos Papandreou <[email protected]>
Signed-off-by: Thomas Parnell <[email protected]>
Signed-off-by: Joe Runde <[email protected]>
Signed-off-by: Sophie du Couédic <[email protected]>
Signed-off-by: Travis Johnson <[email protected]>
Co-authored-by: Nikolaos Papandreou <[email protected]>
Co-authored-by: Thomas Parnell <[email protected]>
Co-authored-by: Joe Runde <[email protected]>
Co-authored-by: Sophie du Couédic <[email protected]>
Co-authored-by: Travis Johnson <[email protected]>
1 parent b440505 commit 4817c71
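Taken together, the changes below replace the old tkv/active_pages interface with the paged-attention API (current_tkv_mask, left_padded_prompt_mask, block_table, slot_mapping) and gate it behind VLLM_SPYRE_USE_CB. A minimal usage sketch, assuming a Spyre-enabled vLLM build and the FMS paged_attn_mock branch; the model id and sizes are illustrative, not taken from this commit:

import os

# Env vars used by this commit (values illustrative):
os.environ["VLLM_SPYRE_USE_CB"] = "1"               # enable continuous batching
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "eager"   # as in the example script
os.environ["VLLM_SPYRE_MAX_BATCH_SIZE"] = "2"       # max decode batch size
os.environ["VLLM_SPYRE_MAX_CONTEXT_LENGTH"] = "2048"

from vllm import LLM, SamplingParams

llm = LLM(model="ibm-granite/granite-3.2-8b-instruct")  # assumed model id
out = llm.generate(["Hello, Spyre!"], SamplingParams(max_tokens=65))
print(out[0].outputs[0].text)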

File tree

6 files changed: +572 −269 lines changed

examples/offline_inference_spyre_cb_test.py

Lines changed: 5 additions & 6 deletions

@@ -3,15 +3,14 @@
 
 from vllm import LLM, SamplingParams
 
-max_tokens1 = 10
-max_tokens2 = 5
+# RUN with fms branch: https://github.com/foundation-model-stack/
+# foundation-model-stack/tree/paged_attn_mock
+
+max_tokens1 = 65
+max_tokens2 = 67
 max_tokens3 = 7
-max_tokens = max([max_tokens1, max_tokens2, max_tokens3])
 max_num_seqs = 2  # defines max batch size
 
-os.environ["VLLM_SPYRE_WARMUP_PROMPT_LENS"] = '64'
-os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = str(max_tokens)
-
 # defining here to be able to run/debug directly from VSC (not via terminal)
 os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
 os.environ['VLLM_SPYRE_USE_CB'] = '1'
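The new header comment pins the example to the FMS paged_attn_mock branch, and the commit message notes that prefill lengths must be multiples of the 64-token block size. A small sketch of that alignment rule; pad_to_block is a hypothetical helper, not part of this diff:

BLOCK_SIZE = 64  # KV-cache block size used throughout this commit

def pad_to_block(prompt_len: int) -> int:
    """Round a prompt length up to the next multiple of BLOCK_SIZE."""
    return ((prompt_len + BLOCK_SIZE - 1) // BLOCK_SIZE) * BLOCK_SIZE

assert pad_to_block(64) == 64    # already block-aligned
assert pad_to_block(65) == 128   # padded up to the next block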

vllm_spyre/model_executor/model_loader/spyre.py

Lines changed: 98 additions & 85 deletions

@@ -1,6 +1,6 @@
 """Utilities for selecting and loading Spyre models."""
 import os
-from typing import Optional
+from typing import Any, Optional
 
 import torch
 import torch._inductor.config
@@ -54,49 +54,68 @@ def __init__(
         # False for finished or padded sequences
         self.indices = None
 
+        # number of right pads (relevant for continuous batching only)
+        self.n_pads_right = 0
+
         # FMS Model
-        fms_model = ContinuousBatchingFmsModel if envs_spyre.VLLM_SPYRE_USE_CB\
-            else StaticBatchingFmsModel
-        self.model = fms_model(
-            model_config,
-            parallel_config,
-            max_prompt_length,
-            max_decode_length,
-        )
+        if envs_spyre.VLLM_SPYRE_USE_CB:
+            self.model = ContinuousBatchingFmsModel(model_config,
+                                                    parallel_config)
+        else:
+            self.model = StaticBatchingFmsModel(
+                model_config,
+                parallel_config,
+                max_prompt_length,
+                max_decode_length,
+            )
 
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         masks: torch.Tensor,
         is_prompt: bool,
-        tkv: Optional[int] = None,
-        active_pages: Optional[list[int]] = None,
+        current_tkv_mask: Optional[torch.Tensor] = None,
+        left_padded_prompt_mask: Optional[torch.Tensor] = None,
+        block_table: Optional[torch.Tensor] = None,
+        slot_mapping: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
         if is_prompt and not envs_spyre.VLLM_SPYRE_USE_CB:
-            self.model.past_key_value_states = None
+            self.model.past_key_value_states = None  # type: ignore
 
-        extra_kwargs = {}
+        extra_kwargs: dict[str, Any] = {}
         if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn_decoder":
             # Bug in 2.3.1 fixed in 2.4.1 for SDPA flash
             # cpu impl when padding too much
             extra_kwargs["attn_algorithm"] = "math"
 
-        # normal prefil or decoding step
+        if envs_spyre.VLLM_SPYRE_USE_CB:
+            extra_kwargs["current_tkv_mask"] = current_tkv_mask
+            extra_kwargs["left_padded_prompt_mask"] = left_padded_prompt_mask
+            extra_kwargs["block_table"] = block_table
+            extra_kwargs["slot_mapping"] = slot_mapping
+
+        # normal prefill or decoding step
         logits = self.model(
             input_ids,
             position_ids=positions,
             mask=masks,
             use_cache=True,
-            only_last_token=True,
-            tkv=tkv,
-            active_pages=active_pages,
+            only_last_token=not envs_spyre.VLLM_SPYRE_USE_CB,
            **extra_kwargs,
         )
 
-        # removing finished or padded sequences
-        logits = logits[self.indices]
+        if envs_spyre.VLLM_SPYRE_USE_CB:
+            if is_prompt and self.n_pads_right > 0:
+                # get last token before the right padding
+                logits = logits[self.indices, -self.n_pads_right - 1, :]
+            else:
+                # just take last token if no right padding
+                logits = logits[self.indices, -1, :]
+        else:
+            # removing finished or padded sequences
+            logits = logits[self.indices]
 
         return logits
 
@@ -151,11 +170,6 @@ def load_weights(
         **kwargs,
     ) -> None:
 
-        if self.dtype is not model_config.dtype:
-            logger.info(
-                "Ignoring user-provided dtype=%s and using dtype=%s instead.",
-                model_config.dtype, self.dtype)
-
         if model_config.quantization == "gptq":
             if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn_decoder":
                 from fms_mo.aiu_addons.gptq import (  # noqa: F401
@@ -173,13 +187,17 @@ def load_weights(
                 "group_size": quant_cfg['group_size'],
                 "desc_act": quant_cfg['desc_act'],
             }
-            data_type = None
+            self.dtype = None
             model_source = "hf_gptq_aiu"
         else:
             linear_config = {"linear_type": "torch_linear"}
-            data_type = self.dtype
             model_source = "hf"
 
+        if self.dtype is not model_config.dtype:
+            logger.info(
+                "Ignoring user-provided dtype=%s and using dtype=%s instead.",
+                model_config.dtype, self.dtype)
+
         is_local = os.path.isdir(model_config.model)
         model_path = model_config.model
         # Get location of model from HF cache.
@@ -197,7 +215,7 @@ def load_weights(
             variant=model_config.model,
             model_path=model_path,
             source=model_source,
-            data_type=data_type,
+            data_type=self.dtype,
             distributed_strategy=distributed_strategy,
             group=dist.group.WORLD,
             fused_weights=fused_weights,
@@ -245,39 +263,51 @@ def load_weights(
 
 class ContinuousBatchingFmsModel(FmsModelBase):
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        max_prompt_length: int,
-        max_decode_length: int,
-    ) -> None:
-        super().__init__(model_config, parallel_config, max_prompt_length,
-                         max_decode_length)
+    def __init__(self, model_config: ModelConfig,
+                 parallel_config: ParallelConfig) -> None:
 
-        # physical KV cache on AIU Spyre
+        BLOCK_SIZE = 64
         max_batch = envs_spyre.VLLM_SPYRE_MAX_BATCH_SIZE
         max_model_len = envs_spyre.VLLM_SPYRE_MAX_CONTEXT_LENGTH
 
-        if self.config.model_type == 'llama':
+        # edge case: prompt fills model length: can produce 1 token with prefill
+        max_prompt_length = max_model_len
+        # edge case: prompt will be padded to first block:
+        # can produce 1 token with prefill plus rest of model length
+        max_decode_length = max_model_len - BLOCK_SIZE + 1
+        super().__init__(model_config, parallel_config, max_prompt_length,
+                         max_decode_length)
+
+        # physical KV cache on AIU Spyre: will eventually not live in this class
+        num_kv_heads = model_config.get_num_kv_heads(parallel_config)
+
+        if self.config.model_type in {'llama', 'granite'}:
             num_layers = self.config.num_hidden_layers
-            num_kv_heads = self.config.num_key_value_heads
             head_dim = self.config.hidden_size // \
                 self.config.num_attention_heads
         elif self.config.model_type == 'gpt_bigcode':
             num_layers = self.config.n_layer
-            num_kv_heads = 1 if self.config.multi_query else self.config.n_head
             head_dim = self.config.n_embd // self.config.n_head
         else:
-            print(f"[SpyreCausalLM] model type {self.config.model_type} "
-                  f"not supported in ContinuousBatchingFmsModel")
-
-        # (layers)x(k,v)x[max_batch, num_kv_heads, max_model_len, head_dim]
-        self.fms_kv_cache: list[tuple[torch.Tensor, torch.Tensor]] = [
-            (torch.empty((max_batch, num_kv_heads, max_model_len, head_dim)),
-             torch.empty((max_batch, num_kv_heads, max_model_len, head_dim)))
-            for i in range(num_layers)
-        ]
+            raise NotImplementedError(
+                f"[SpyreCausalLM] model type {self.config.model_type} "
+                f"not supported in ContinuousBatchingFmsModel")
+
+        num_blocks = max_batch * max_model_len // BLOCK_SIZE  # 64
+
+        # List[layers] of Tuple[k,v] of
+        # Tensor[num_blocks, BLOCK_SIZE, num_kv_heads, head_dim]
+        self.past_key_value_states = [(torch.zeros(num_blocks,
+                                                   BLOCK_SIZE,
+                                                   num_kv_heads,
+                                                   head_dim,
+                                                   dtype=self.dtype),
+                                       torch.zeros(num_blocks,
+                                                   BLOCK_SIZE,
+                                                   num_kv_heads,
+                                                   head_dim,
+                                                   dtype=self.dtype))
+                                      for _ in range(num_layers)]
 
     def forward(
         self,
@@ -286,50 +316,36 @@ def forward(
         mask: torch.Tensor,
         use_cache: bool,
         only_last_token: bool,
-        tkv: int,
-        active_pages: list[int],
+        current_tkv_mask: torch.Tensor,
+        left_padded_prompt_mask: torch.Tensor,
+        block_table: torch.Tensor,
+        slot_mapping: torch.Tensor,
         **extra_kwargs,
     ) -> torch.Tensor:
 
-        # read-out (dynamic) kv_cache for decoding steps only,
-        # for prefills kv_cache = None
-        if tkv == 0:  # prefil
-            kv_cache = None
-            tkv = input_ids.shape[1]
-        else:  # decode
-            kv_cache = []
-            active_pages_mask = torch.zeros(self.fms_kv_cache[0][0].shape[0],
-                                            dtype=torch.bool)
-            active_pages_mask[active_pages] = True
-            for layer in range(len(self.fms_kv_cache)):
-                kv_cache.append(
-                    (self.fms_kv_cache[layer][0][active_pages_mask, :, :tkv -
-                                                 1, :],
-                     self.fms_kv_cache[layer][1][active_pages_mask, :, :tkv -
-                                                 1, :]))
+        # mark dynamic: Not sure if that's correct/needed here,
+        # copied from fms branch paged_atten_mock
+        if self.past_key_value_states is not None:
+            for layer in self.past_key_value_states:
+                if isinstance(layer, tuple):
+                    for tensor in layer:
+                        torch._dynamo.mark_dynamic(tensor, 2)
 
         output = self.model(
             input_ids,
             position_ids=position_ids,
             mask=mask,
-            past_key_value_states=kv_cache,
+            past_key_value_states=self.past_key_value_states,
             use_cache=use_cache,
             only_last_token=only_last_token,
+            current_tkv_mask=current_tkv_mask,
+            left_padded_prompt_mask=left_padded_prompt_mask,
+            block_table=block_table,
+            slot_mapping=slot_mapping,
             **extra_kwargs,
         )
-        logits, key_value_states = output
-
-        # updating (physical) KV cache: self.fms_kv_cache
-        for idx, page in enumerate(sorted(active_pages)):
-            for layer in range(len(self.fms_kv_cache)):
-                # inserting partial KV cache at correct location
-                # (page, tkv) in the KV cache of the whole batch
-                self.fms_kv_cache[layer][0][
-                    page, :, :tkv, :] = key_value_states[layer][0][
-                        idx, :, :, :]  # [1, 8, L, 128]
-                self.fms_kv_cache[layer][1][
-                    page, :, :tkv, :] = key_value_states[layer][1][
-                        idx, :, :, :]  # [1, 8, L, 128]
+
+        logits, self.past_key_value_states = output
 
         return logits
 
@@ -356,8 +372,6 @@ def forward(
         mask: torch.Tensor,
         use_cache: bool,
         only_last_token: bool,
-        tkv: int,
-        active_pages: list[int],
         **extra_kwargs,
     ) -> torch.Tensor:
 
@@ -371,7 +385,6 @@ def forward(
             **extra_kwargs,
         )
 
-        logits, past_key_value_states = output
-        self.past_key_value_states = past_key_value_states
+        logits, self.past_key_value_states = output
 
         return logits
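The rewritten ContinuousBatchingFmsModel allocates one (K, V) tensor pair per layer with shape [num_blocks, BLOCK_SIZE, num_kv_heads, head_dim] and hands FMS a block_table plus slot_mapping instead of slicing per-page caches itself. A self-contained sketch of that layout with toy sizes; the slot formula is the usual paged-attention convention and is an assumption here, not code from this commit:

import torch

BLOCK_SIZE = 64
max_batch, max_model_len = 2, 256            # toy sizes, not from the commit
num_kv_heads, head_dim, num_layers = 8, 128, 2

# Same formula as __init__ above: one block pool shared by all sequences.
num_blocks = max_batch * max_model_len // BLOCK_SIZE   # 8

# One (K, V) pair per layer, paged by block rather than by sequence.
past_key_value_states = [
    (torch.zeros(num_blocks, BLOCK_SIZE, num_kv_heads, head_dim),
     torch.zeros(num_blocks, BLOCK_SIZE, num_kv_heads, head_dim))
    for _ in range(num_layers)
]

# Toy block table: sequence 0 owns blocks [0, 1], sequence 1 owns [2, 3].
block_table = torch.tensor([[0, 1], [2, 3]])

# Assumed slot convention (standard paged attention): the token at position
# p of sequence s lives in block_table[s, p // BLOCK_SIZE] at in-block
# offset p % BLOCK_SIZE.
s, p = 1, 70
slot = int(block_table[s, p // BLOCK_SIZE]) * BLOCK_SIZE + p % BLOCK_SIZE
print(slot)  # 198 -> block 3, offset 6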

vllm_spyre/platform.py

Lines changed: 23 additions & 20 deletions

@@ -50,6 +50,11 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if scheduler_config.is_multi_step:
             raise NotImplementedError
 
+        # continuous batching related checks
+        if envs_spyre.VLLM_SPYRE_USE_CB and not envs.VLLM_USE_V1:
+            raise NotImplementedError(
+                "Continuous batching is only implemented for vLLM V1")
+
         # Near future TODO: vLLM will have an api to check whether v0 or v1 is
         # used that isn't just checking the environment variable
 
@@ -69,21 +74,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "vllm_spyre.v1.core.scheduler.ContinuousBatchingSpyreScheduler"
             else:
                 scheduler_config.scheduler_cls = \
-                    "vllm_spyre.v1.core.scheduler.SpyreScheduler"
+                    "vllm_spyre.v1.core.scheduler.StaticBatchingSpyreScheduler"
         else:
             scheduler_config.scheduler_cls = \
                 "vllm_spyre.core.scheduler.SpyreScheduler"
 
-        # Override --max-num-seqs to the biggest warmup batch size
-        # And override --max-model-len to the biggest warmup sequence
-        cls._warmup_shapes = None
-        spyre_warmup_shapes = cls.get_warmup_shapes(scheduler_config)
-        max_batch_size = 0
-        max_seq_len = 0
-        for shape in spyre_warmup_shapes:
-            max_batch_size = max(max_batch_size, shape['batch_size'])
-            max_seq_len = max(max_seq_len,
-                              shape['prompt_length'] + shape['new_tokens'])
+        if not envs_spyre.VLLM_SPYRE_USE_CB:
+            # Override --max-num-seqs to the biggest warmup batch size
+            # And override --max-model-len to the biggest warmup sequence
+            cls._warmup_shapes = None
+            spyre_warmup_shapes = cls.get_warmup_shapes(scheduler_config)
+            max_batch_size = 0
+            max_seq_len = 0
+            for shape in spyre_warmup_shapes:
+                max_batch_size = max(max_batch_size, shape['batch_size'])
+                max_seq_len = max(max_seq_len,
+                                  shape['prompt_length'] + shape['new_tokens'])
 
         if envs.VLLM_USE_V1:
             if envs_spyre.VLLM_SPYRE_USE_CB:
@@ -98,11 +104,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 # The v0 scheduler will run out of blocks if this is overridden
                 scheduler_config.max_num_seqs = max_batch_size
 
-        # continuous batching related checks
-        if envs_spyre.VLLM_SPYRE_USE_CB and not envs.VLLM_USE_V1:
-            raise NotImplementedError(
-                "Continuous batching is only implemented for vLLM V1")
-
         cache_config = vllm_config.cache_config
 
         if cache_config and model_config:
@@ -115,7 +116,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             # one single block.
             # - Set the number of blocks to the maximum number of sequences, so
             #   the scheduler always thinks there's a block available
-            model_config.max_model_len = max_seq_len
+            if not envs_spyre.VLLM_SPYRE_USE_CB:
+                model_config.max_model_len = max_seq_len
             cache_config.block_size = model_config.max_model_len
 
         if envs.VLLM_USE_V1:
@@ -166,9 +168,10 @@ def get_warmup_shapes(cls, scheduler_config) -> tuple[dict[str, int], ...]:
                 "The lists in VLLM_SPYRE_WARMUP_PROMPT_LENS and "
                 "VLLM_SPYRE_WARMUP_NEW_TOKENS must have equal length")
 
-        logger.info("VLLM_SPYRE_WARMUP_PROMPT_LENS = %s", wup_prompt_lens)
-        logger.info("VLLM_SPYRE_WARMUP_NEW_TOKENS = %s", wup_new_tokens)
-        logger.info("VLLM_SPYRE_WARMUP_BATCH_SIZES = %s", wup_batch_sizes)
+        if not envs_spyre.VLLM_SPYRE_USE_CB:
+            logger.info("VLLM_SPYRE_WARMUP_PROMPT_LENS = %s", wup_prompt_lens)
+            logger.info("VLLM_SPYRE_WARMUP_NEW_TOKENS = %s", wup_new_tokens)
+            logger.info("VLLM_SPYRE_WARMUP_BATCH_SIZES = %s", wup_batch_sizes)
 
         cls._warmup_shapes = tuple(
             sorted([{