
Commit 2d0293d

fix crashes with the usage of top_k (#543)
# Description

We found a couple of ways to crash the server with edge cases in top_k:

- setting `top_k > vocab_size` in the request
- mixing greedy requests and sampling requests with `top_k > 0` in the same batch

See #542 for details on the crashes.

The "fix" in this PR is simply to copy the logic from vLLM's GPU `InputBatch` for setting the value of top_k: clamp the value to `vocab_size`, and set the default top_k to `vocab_size` instead of 0 in a mixed batch.

REF: https://github.com/vllm-project/vllm/blob/fc168c33f35e0610d41206e864b6bf90fe613f19/vllm/v1/worker/gpu_input_batch.py#L353-L357

## Related Issues

FIX #542

---------

Signed-off-by: Travis Johnson <[email protected]>
1 parent 7ed0611 commit 2d0293d

File tree: 2 files changed, +37 −6 lines changed
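For context on the first failure mode, here is a minimal sketch (not taken from this PR, using a made-up vocab size) of how an oversized top_k breaks a plain `torch.topk` call; clamping to `vocab_size` is what avoids this class of crash:

```python
import torch

# Hypothetical, tiny vocab size purely for illustration; the real value
# comes from the model configuration.
vocab_size = 8
logits = torch.randn(1, vocab_size)

# k within range works fine.
values, indices = torch.topk(logits, k=vocab_size)

# k > vocab_size raises a RuntimeError, which is the kind of failure the
# commit guards against by clamping top_k to vocab_size.
try:
    torch.topk(logits, k=vocab_size + 1)
except RuntimeError as exc:
    print(f"oversized top_k fails: {exc}")
```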

tests/v1/worker/test_spyre_input_batch.py

Lines changed: 32 additions & 4 deletions
@@ -63,7 +63,7 @@ def _construct_expected_sampling_metadata(
     presence_penalties = [0.0 for _ in range(num_reqs)]
     frequency_penalties = [0.0 for _ in range(num_reqs)]
     repetition_penalties = [1.0 for _ in range(num_reqs)]
-    top_k = [0 for _ in range(num_reqs)]
+    top_k = [VOCAB_SIZE for _ in range(num_reqs)]
     top_p = [0.0 for _ in range(num_reqs)]
     temperature = [0.0 for _ in range(num_reqs)]
     allowed_token_ids_mask = torch.zeros(num_reqs,
@@ -85,7 +85,8 @@ def _construct_expected_sampling_metadata(
             req.sampling_params.frequency_penalty)
         repetition_penalties[index_in_input_batch] = (
             req.sampling_params.repetition_penalty)
-        top_k[index_in_input_batch] = req.sampling_params.top_k
+        if req.sampling_params.top_k > 0:
+            top_k[index_in_input_batch] = req.sampling_params.top_k
         top_p[index_in_input_batch] = req.sampling_params.top_p
         temperature[index_in_input_batch] = req.sampling_params.temperature
         if req.sampling_params.allowed_token_ids:
@@ -102,7 +103,7 @@ def _construct_expected_sampling_metadata(
         all_random=True,
         top_p=None if all(x == 1.0 for x in top_p) else torch.tensor(
             top_p, dtype=torch.float, device=device),
-        top_k=None if all(x == 0 for x in top_k) else torch.tensor(
+        top_k=None if all(x == VOCAB_SIZE for x in top_k) else torch.tensor(
             top_k, dtype=torch.int, device=device),
         generators={},
         max_num_logprobs=0,
@@ -133,7 +134,7 @@ def _construct_expected_sampling_metadata(

 def _create_sampling_params():
     return SamplingParams(
-        top_k=np.random.randint(1, 10),
+        top_k=np.random.randint(0, 10),
         top_p=np.random.uniform(0.0, 1.0),
         presence_penalty=np.random.uniform(-2.0, 2.0),
         repetition_penalty=np.random.uniform(0.0, 2.0),
@@ -267,3 +268,30 @@ def test_sampling_metadata_in_input_batch(batch_size: int):
         reqs, req_ids_retained, input_batch, device=torch.device(device))

     compare_results(sampling_metadata, expected_sampling_metadata)
+
+
+@pytest.mark.cpu
+@pytest.mark.worker
+def test_sampling_metadata_topk_edges():
+    device = torch.device('cpu')
+    input_batch: SamplingInputBatch = SamplingInputBatch(
+        max_num_reqs=2,
+        max_model_len=1024,
+        device=device,
+        pin_memory=is_pin_memory_available(),
+        vocab_size=VOCAB_SIZE,
+    )
+
+    # top_k should be clamped to VOCAB_SIZE
+    req = _construct_cached_request_state(0)
+    req.sampling_params = SamplingParams(temperature=1.0, top_k=VOCAB_SIZE + 1)
+    input_batch.add_request(req, 0)
+
+    # in a batch with both greedy and sampling, default top_k should be
+    # VOCAB_SIZE
+    req = _construct_cached_request_state(1)
+    req.sampling_params = SamplingParams(temperature=0)
+    input_batch.add_request(req, 1)
+
+    assert input_batch.top_k[0] == VOCAB_SIZE
+    assert input_batch.top_k[1] == VOCAB_SIZE
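Why `VOCAB_SIZE` is the new "disabled" value in the expected metadata above: a top-k mask with k equal to the vocabulary size keeps every logit, so greedy rows in a mixed batch are left untouched. A rough, self-contained sketch of that masking idea (an illustration only, not the sampler code vLLM actually runs):

```python
import torch


def apply_top_k(logits: torch.Tensor, top_k: torch.Tensor) -> torch.Tensor:
    """Keep the k largest logits per row; k == vocab_size keeps everything."""
    vocab_size = logits.shape[-1]
    k = top_k.clamp(max=vocab_size)
    sorted_logits, _ = logits.sort(dim=-1, descending=True)
    # Per-row threshold: the value of the k-th largest logit.
    thresholds = sorted_logits.gather(-1, (k - 1).unsqueeze(-1))
    return logits.masked_fill(logits < thresholds, float("-inf"))


vocab_size = 8
logits = torch.randn(2, vocab_size)
# Row 0 samples with top_k=2; row 1 is "greedy" and uses the vocab_size sentinel.
top_k = torch.tensor([2, vocab_size])
masked = apply_top_k(logits, top_k)
assert int(torch.isinf(masked[0]).sum()) == vocab_size - 2  # only 2 logits survive
assert not torch.isinf(masked[1]).any()  # greedy row is untouched
```

With that convention the batch needs no special case for greedy requests: storing `vocab_size` makes the top-k step a no-op, which is exactly what the worker-side change below does.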

vllm_spyre/v1/worker/spyre_input_batch.py

Lines changed: 5 additions & 2 deletions
@@ -425,9 +425,12 @@ def add_request(
         self.top_p_cpu[req_index] = sampling_params.top_p
         if sampling_params.top_p < 1:
             self.top_p_reqs.add(req_id)
-        self.top_k_cpu[req_index] = sampling_params.top_k
-        if sampling_params.top_k > 0:
+        top_k = sampling_params.top_k
+        if 0 < top_k < self.vocab_size:
             self.top_k_reqs.add(req_id)
+        else:
+            top_k = self.vocab_size
+        self.top_k_cpu[req_index] = top_k
         self.frequency_penalties_cpu[
             req_index] = sampling_params.frequency_penalty
         if sampling_params.frequency_penalty != 0.0:
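Pulled out of `add_request`, the new rule amounts to: a value strictly between 0 and `vocab_size` is a genuine top-k request; anything else (0, a negative sentinel, or an oversized value) is stored as `vocab_size` so top-k becomes a no-op for that request. A standalone sketch of that rule, with a hypothetical vocab size for illustration:

```python
def clamp_top_k(requested_top_k: int, vocab_size: int) -> tuple[int, bool]:
    """Return (stored_top_k, is_top_k_request) following the rule above."""
    if 0 < requested_top_k < vocab_size:
        return requested_top_k, True
    return vocab_size, False


VOCAB = 32_000  # hypothetical vocab size, for illustration only
assert clamp_top_k(50, VOCAB) == (50, True)             # regular sampling request
assert clamp_top_k(0, VOCAB) == (VOCAB, False)          # greedy / top_k disabled
assert clamp_top_k(VOCAB + 1, VOCAB) == (VOCAB, False)  # oversized: clamped
```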
