From 84cfeeba0891c9e0cba44035176a8684373ab649 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 1 Oct 2025 10:55:08 +0000
Subject: [PATCH 01/10] consume fms dev branch

Signed-off-by: Yannick Schnider
---
 .github/workflows/test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 828d1a1b..666a448c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -248,5 +248,8 @@ jobs:
         # re-install the vllm_sypre package from source
         source .venv/bin/activate

+        # overwrite fms main with feature branch
+        # TODO remove before merging
+        uv pip install git+https://github.com/foundation-model-stack/foundation-model-stack@granite-2b-expand-q-k-128 --force-reinstall
         python3 -m pytest ${{ matrix.test_suite.flags }} \
           tests -v -m "${{ matrix.test_suite.markers }}"

From 3c2e870b7205272cf406a71f50f480c6ea3344a3 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 1 Oct 2025 16:02:38 +0200
Subject: [PATCH 02/10] adapt kv cache size

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index a94db1b6..e9c1d235 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -353,10 +353,17 @@ def __init__(
         self.kv_cache_specs['num_kv_heads'] = model_config.get_num_kv_heads(
             parallel_config)

-        if self.config.model_type in {'llama', 'granite'}:
+        if self.config.model_type in {'llama', 'granite', 'granitemoehybrid'}:
             self.kv_cache_specs['num_layers'] = self.config.num_hidden_layers
             self.kv_cache_specs['head_dim'] = self.config.hidden_size // \
                 self.config.num_attention_heads
+
+            # *** ALERT *** Granite 2b hack for AIU Compiler
+            if self.config.model_type == 'granitemoehybrid' and self.kv_cache_specs[
+                    'head_dim'] < 128:
+                self.kv_cache_specs['head_dim'] = 128 // self.kv_cache_specs[
+                    'head_dim'] * self.kv_cache_specs['head_dim']
+
         elif self.config.model_type == 'gpt_bigcode':
             self.kv_cache_specs['num_layers'] = self.config.n_layer
             self.kv_cache_specs[

From 039c59aa198362969709076fd9a428464886d884 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 1 Oct 2025 17:53:02 +0200
Subject: [PATCH 03/10] fix fmt

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index e9c1d235..7cb8995e 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -359,8 +359,8 @@ def __init__(
                 self.config.num_attention_heads

             # *** ALERT *** Granite 2b hack for AIU Compiler
-            if self.config.model_type == 'granitemoehybrid' and self.kv_cache_specs[
-                    'head_dim'] < 128:
+            if (self.config.model_type == 'granitemoehybrid'
+                    and self.kv_cache_specs['head_dim'] < 128):
                 self.kv_cache_specs['head_dim'] = 128 // self.kv_cache_specs[
                     'head_dim'] * self.kv_cache_specs['head_dim']

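Note on patches 02/03: the expansion rounds the KV-cache head_dim up to the largest multiple of the original head_dim that still fits into 128, so a head_dim of 64 is stored as 128, while sizes that do not divide 128 stay unchanged. A minimal illustrative sketch of the arithmetic, not part of the patches themselves:

    # Illustration only: what `128 // head_dim * head_dim` yields for a few
    # head sizes when head_dim < 128 (presumably 64 for the Granite 2b case
    # the commit messages refer to).
    for head_dim in (32, 64, 96, 128):
        expanded = 128 // head_dim * head_dim if head_dim < 128 else head_dim
        print(head_dim, '->', expanded)  # 32 -> 128, 64 -> 128, 96 -> 96, 128 -> 128
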
From e46ff483fc63774d63e931cca7a6339a4156fa38 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 1 Oct 2025 19:56:46 +0200
Subject: [PATCH 04/10] load granite model types correctly too

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index 7cb8995e..87da0200 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -359,8 +359,7 @@ def __init__(
                 self.config.num_attention_heads

             # *** ALERT *** Granite 2b hack for AIU Compiler
-            if (self.config.model_type == 'granitemoehybrid'
-                    and self.kv_cache_specs['head_dim'] < 128):
+            if self.kv_cache_specs['head_dim'] < 128:
                 self.kv_cache_specs['head_dim'] = 128 // self.kv_cache_specs[
                     'head_dim'] * self.kv_cache_specs['head_dim']

@@ -471,6 +470,8 @@ def set_past_key_value_states(self, num_blocks) -> None:
                     dtype=self.dtype))
                 for _ in range(self.kv_cache_specs['num_layers'])
             ]
+            print('vllm p kv', self.past_key_value_states[0][0].shape)
+            # torch.Size([128, 64, 8, 64])
         else:
             from fms_mo.aiu_addons.fp8.fp8_utils import ScaledTensor
             batch_size = max(2, self.scheduler_config.max_num_seqs)

From f03d61b1fbc8e81a8928d29aa85d217226e4064c Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 1 Oct 2025 19:58:52 +0200
Subject: [PATCH 05/10] remove print

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index 87da0200..0a001852 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -470,8 +470,6 @@ def set_past_key_value_states(self, num_blocks) -> None:
                     dtype=self.dtype))
                 for _ in range(self.kv_cache_specs['num_layers'])
             ]
-            print('vllm p kv', self.past_key_value_states[0][0].shape)
-            # torch.Size([128, 64, 8, 64])
         else:
             from fms_mo.aiu_addons.fp8.fp8_utils import ScaledTensor
             batch_size = max(2, self.scheduler_config.max_num_seqs)

From ecd957fbe057d9d87516c2623b49a18093c1dff1 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Thu, 2 Oct 2025 12:41:37 +0200
Subject: [PATCH 06/10] exclude llama from head dim expansion hack

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index 0a001852..8992a345 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -359,7 +359,8 @@ def __init__(
                 self.config.num_attention_heads

             # *** ALERT *** Granite 2b hack for AIU Compiler
-            if self.kv_cache_specs['head_dim'] < 128:
+            if ('granite' in self.config.model_type
+                    and self.kv_cache_specs['head_dim'] < 128):
                 self.kv_cache_specs['head_dim'] = 128 // self.kv_cache_specs[
                     'head_dim'] * self.kv_cache_specs['head_dim']

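Note on patch 06: the `'granite' in self.config.model_type` substring check keeps the expansion for both Granite model types touched by this series while leaving llama alone. A tiny sketch of how the guard evaluates (illustration only):

    # Illustration only: the substring guard from patch 06.
    for model_type in ('llama', 'granite', 'granitemoehybrid'):
        print(model_type, 'granite' in model_type)
    # llama False, granite True, granitemoehybrid True
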
From 8f40bf271b9f1f3cc45c9b07e025bcb18d0087d8 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Tue, 7 Oct 2025 23:06:02 +0200
Subject: [PATCH 07/10] grab new head_dim arg directly from fms

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index 8992a345..8269d517 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -355,14 +355,7 @@ def __init__(
         if self.config.model_type in {'llama', 'granite', 'granitemoehybrid'}:
             self.kv_cache_specs['num_layers'] = self.config.num_hidden_layers
-            self.kv_cache_specs['head_dim'] = self.config.hidden_size // \
-                self.config.num_attention_heads
-
-            # *** ALERT *** Granite 2b hack for AIU Compiler
-            if ('granite' in self.config.model_type
-                    and self.kv_cache_specs['head_dim'] < 128):
-                self.kv_cache_specs['head_dim'] = 128 // self.kv_cache_specs[
-                    'head_dim'] * self.kv_cache_specs['head_dim']
+            self.kv_cache_specs['head_dim'] = self.model.config.head_dim

         elif self.config.model_type == 'gpt_bigcode':
             self.kv_cache_specs['num_layers'] = self.config.n_layer

From 29dc43e8137caa6f36f9ca309523c1199b8c1b3c Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Tue, 7 Oct 2025 23:19:34 +0200
Subject: [PATCH 08/10] support llama again

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index 8269d517..7aec70a6 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -355,8 +355,11 @@ def __init__(
         if self.config.model_type in {'llama', 'granite', 'granitemoehybrid'}:
             self.kv_cache_specs['num_layers'] = self.config.num_hidden_layers
-            self.kv_cache_specs['head_dim'] = self.model.config.head_dim
-
+            if 'granite' in self.config.model_type:
+                self.kv_cache_specs['head_dim'] = self.model.config.head_dim
+            else:
+                self.kv_cache_specs['head_dim'] = self.config.hidden_size // \
+                    self.config.num_attention_heads
         elif self.config.model_type == 'gpt_bigcode':
             self.kv_cache_specs['num_layers'] = self.config.n_layer
             self.kv_cache_specs[

From 083c55ba0d38a81496e943ba03eab4f1760d7b95 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 15 Oct 2025 09:07:38 +0200
Subject: [PATCH 09/10] remove fms feature branch as merged

Signed-off-by: Yannick Schnider
---
 .github/workflows/test.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 666a448c..828d1a1b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -248,8 +248,5 @@ jobs:
         # re-install the vllm_sypre package from source
         source .venv/bin/activate

-        # overwrite fms main with feature branch
-        # TODO remove before merging
-        uv pip install git+https://github.com/foundation-model-stack/foundation-model-stack@granite-2b-expand-q-k-128 --force-reinstall
         python3 -m pytest ${{ matrix.test_suite.flags }} \
           tests -v -m "${{ matrix.test_suite.markers }}"

From 049956db895f2fda4c445e0370ff7b715a432d39 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Wed, 15 Oct 2025 14:11:07 +0200
Subject: [PATCH 10/10] establish backward compatibility, simplify

Signed-off-by: Yannick Schnider
---
 vllm_spyre/model_executor/model_loader/spyre.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
index 2e2aa7bb..c5d8b4e6 100644
--- a/vllm_spyre/model_executor/model_loader/spyre.py
+++ b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -341,11 +341,9 @@ def __init__(
         if self.config.model_type in {'llama', 'granite', 'granitemoehybrid'}:
             self.kv_cache_specs['num_layers'] = self.config.num_hidden_layers
-            if 'granite' in self.config.model_type:
-                self.kv_cache_specs['head_dim'] = self.model.config.head_dim
-            else:
-                self.kv_cache_specs['head_dim'] = self.config.hidden_size // \
-                    self.config.num_attention_heads
+            self.kv_cache_specs['head_dim'] = getattr(
+                self.model.config, "head_dim",
+                self.config.hidden_size // self.config.num_attention_heads)
         elif self.config.model_type == 'gpt_bigcode':
             self.kv_cache_specs['num_layers'] = self.config.n_layer
             self.kv_cache_specs[
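
Note on patch 10: the final state resolves head_dim with a getattr fallback, so an fms model config that exposes the head_dim argument (which patch 07 started reading directly from fms) wins, and configs without it keep the previous hidden_size // num_attention_heads derivation. A minimal standalone sketch of that behaviour, using hypothetical config objects rather than the real fms/vLLM ones:

    from types import SimpleNamespace

    def resolve_head_dim(model_config, hf_config):
        # Mirrors the getattr fallback from patch 10: prefer an explicit
        # head_dim on the (fms) model config, otherwise derive it from the
        # Hugging Face config.
        return getattr(model_config, 'head_dim',
                       hf_config.hidden_size // hf_config.num_attention_heads)

    hf = SimpleNamespace(hidden_size=2048, num_attention_heads=32)  # hypothetical values
    new_cfg = SimpleNamespace(head_dim=128)  # fms config that exposes head_dim
    old_cfg = SimpleNamespace()              # older fms config without the attribute

    print(resolve_head_dim(new_cfg, hf))  # 128
    print(resolve_head_dim(old_cfg, hf))  # 64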