From 9196b30c8863bdfccc854541b3dba587244b7b0b Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Mon, 18 Aug 2025 13:07:57 -0600
Subject: [PATCH] :fire: remove long context test with bad config

Signed-off-by: Joe Runde
---
 tests/e2e/test_spyre_cb.py | 52 --------------------------------------
 1 file changed, 52 deletions(-)

diff --git a/tests/e2e/test_spyre_cb.py b/tests/e2e/test_spyre_cb.py
index 812201a3a..634b2c2d3 100644
--- a/tests/e2e/test_spyre_cb.py
+++ b/tests/e2e/test_spyre_cb.py
@@ -109,58 +109,6 @@ def test_api_cb_generates_correct_max_tokens(
     assert response.usage.completion_tokens == max_tokens
 
 
-@pytest.mark.cb
-@pytest.mark.spyre
-@pytest.mark.xfail  # TODO: remove once a spyre-base image supports this
-@pytest.mark.parametrize("model", get_spyre_model_list())
-def test_continuous_batching_with_long_contexts(model, monkeypatch):
-    """Tests that continuous batching generates the same outputs on the spyre
-    cards as it does on cpu, when the max context length is set to 4k.
-    This ensures that the compiler is generating the correct programs for long
-    context cases, but we test here with small prompts for speed.
-
-    Importantly, we're generating the cpu results to compare against using vllm
-    as well, instead of using transformers directly. This ensures that the model
-    code is all the same, and the only difference is the torch compilation
-    backend.
-    """
-    max_model_len = 4096
-    max_num_seqs = 4
-    prompts = get_chicken_soup_prompts(4)
-
-    sampling_params = SamplingParams(max_tokens=20,
-                                     temperature=0,
-                                     ignore_eos=True,
-                                     logprobs=0)
-
-    vllm_cpu_results = generate_spyre_vllm_output(
-        model=model,
-        prompts=prompts,
-        max_model_len=max_model_len,
-        sampling_params=sampling_params,
-        tensor_parallel_size=1,
-        backend="eager",
-        max_num_seqs=max_num_seqs,
-        use_cb=True,
-        monkeypatch=monkeypatch)
-
-    vllm_spyre_results = generate_spyre_vllm_output(
-        model=model,
-        prompts=prompts,
-        max_model_len=max_model_len,
-        sampling_params=sampling_params,
-        tensor_parallel_size=1,
-        backend="sendnn",
-        max_num_seqs=max_num_seqs,
-        use_cb=True,
-        monkeypatch=monkeypatch)
-
-    for i in range(len(vllm_cpu_results)):
-        # As long as no sequences have top candidate tokens with very close
-        # logprobs, the generated text should be identical.
-        assert vllm_cpu_results[i]["text"] == vllm_spyre_results[i]["text"]
-
-
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize(
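
For reference, the backend-parity pattern the deleted test exercised can be kept as a standalone sketch outside this file. The version below is not part of the patch: it reuses the helpers the deleted code called (generate_spyre_vllm_output, get_chicken_soup_prompts, get_spyre_model_list, and vLLM's SamplingParams), but the spyre_util import path, the test name, and the 2048-token max_model_len (a stand-in for a context length the spyre-base image presumably does support, unlike the 4k config the subject line calls bad) are all assumptions.

    import pytest
    from vllm import SamplingParams

    # Assumed import path for the test helpers used by tests/e2e/test_spyre_cb.py
    from spyre_util import (generate_spyre_vllm_output,
                            get_chicken_soup_prompts, get_spyre_model_list)


    @pytest.mark.cb
    @pytest.mark.spyre
    @pytest.mark.parametrize("model", get_spyre_model_list())
    def test_cb_cpu_vs_spyre_parity(model, monkeypatch):
        """Sketch: compare greedy continuous-batching outputs between the
        CPU eager backend and the spyre sendnn backend, at an assumed
        supported context length rather than the removed 4k config."""
        sampling_params = SamplingParams(max_tokens=20,
                                         temperature=0,
                                         ignore_eos=True,
                                         logprobs=0)
        kwargs = dict(model=model,
                      prompts=get_chicken_soup_prompts(4),
                      max_model_len=2048,  # assumption: supported, unlike 4096
                      sampling_params=sampling_params,
                      tensor_parallel_size=1,
                      max_num_seqs=4,
                      use_cb=True,
                      monkeypatch=monkeypatch)
        cpu_results = generate_spyre_vllm_output(backend="eager", **kwargs)
        spyre_results = generate_spyre_vllm_output(backend="sendnn", **kwargs)
        # With temperature=0, outputs should match exactly unless two top
        # candidate tokens have nearly identical logprobs.
        for cpu_result, spyre_result in zip(cpu_results, spyre_results):
            assert cpu_result["text"] == spyre_result["text"]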