From 9196b30c8863bdfccc854541b3dba587244b7b0b Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Mon, 18 Aug 2025 13:07:57 -0600
Subject: [PATCH] :fire: remove long context test with bad config

Signed-off-by: Joe Runde
---
 tests/e2e/test_spyre_cb.py | 52 --------------------------------------
 1 file changed, 52 deletions(-)

diff --git a/tests/e2e/test_spyre_cb.py b/tests/e2e/test_spyre_cb.py
index 812201a3a..634b2c2d3 100644
--- a/tests/e2e/test_spyre_cb.py
+++ b/tests/e2e/test_spyre_cb.py
@@ -109,58 +109,6 @@ def test_api_cb_generates_correct_max_tokens(
     assert response.usage.completion_tokens == max_tokens
 
 
-@pytest.mark.cb
-@pytest.mark.spyre
-@pytest.mark.xfail  # TODO: remove once a spyre-base image supports this
-@pytest.mark.parametrize("model", get_spyre_model_list())
-def test_continuous_batching_with_long_contexts(model, monkeypatch):
-    """Tests that continuous batching generates the same outputs on the spyre
-    cards as it does on cpu, when the max context length is set to 4k.
-    This ensures that the compiler is generating the correct programs for long
-    context cases, but we test here with small prompts for speed.
-
-    Importantly, we're generating the cpu results to compare against using vllm
-    as well, instead of using transformers directly. This ensures that the model
-    code is all the same, and the only difference is the torch compilation
-    backend.
-    """
-    max_model_len = 4096
-    max_num_seqs = 4
-    prompts = get_chicken_soup_prompts(4)
-
-    sampling_params = SamplingParams(max_tokens=20,
-                                     temperature=0,
-                                     ignore_eos=True,
-                                     logprobs=0)
-
-    vllm_cpu_results = generate_spyre_vllm_output(
-        model=model,
-        prompts=prompts,
-        max_model_len=max_model_len,
-        sampling_params=sampling_params,
-        tensor_parallel_size=1,
-        backend="eager",
-        max_num_seqs=max_num_seqs,
-        use_cb=True,
-        monkeypatch=monkeypatch)
-
-    vllm_spyre_results = generate_spyre_vllm_output(
-        model=model,
-        prompts=prompts,
-        max_model_len=max_model_len,
-        sampling_params=sampling_params,
-        tensor_parallel_size=1,
-        backend="sendnn",
-        max_num_seqs=max_num_seqs,
-        use_cb=True,
-        monkeypatch=monkeypatch)
-
-    for i in range(len(vllm_cpu_results)):
-        # As long as no sequences have top candidate tokens with very close
-        # logprobs, the generated text should be identical.
-        assert vllm_cpu_results[i]["text"] == vllm_spyre_results[i]["text"]
-
-
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize(
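
For reference, the backend-parity pattern the deleted test exercised can be kept as a standalone sketch outside this file. The version below is not part of the patch: it reuses the helpers the deleted code called (generate_spyre_vllm_output, get_chicken_soup_prompts, get_spyre_model_list, and vLLM's SamplingParams), but the spyre_util import path, the test name, and the 2048-token max_model_len (a stand-in for a context length the spyre-base image presumably does support, unlike the 4k config the subject line calls bad) are all assumptions.

    import pytest
    from vllm import SamplingParams

    # Assumed import path for the test helpers used by tests/e2e/test_spyre_cb.py
    from spyre_util import (generate_spyre_vllm_output,
                            get_chicken_soup_prompts, get_spyre_model_list)


    @pytest.mark.cb
    @pytest.mark.spyre
    @pytest.mark.parametrize("model", get_spyre_model_list())
    def test_cb_cpu_vs_spyre_parity(model, monkeypatch):
        """Sketch: compare greedy continuous-batching outputs between the
        CPU eager backend and the spyre sendnn backend, at an assumed
        supported context length rather than the removed 4k config."""
        sampling_params = SamplingParams(max_tokens=20,
                                         temperature=0,
                                         ignore_eos=True,
                                         logprobs=0)
        kwargs = dict(model=model,
                      prompts=get_chicken_soup_prompts(4),
                      max_model_len=2048,  # assumption: supported, unlike 4096
                      sampling_params=sampling_params,
                      tensor_parallel_size=1,
                      max_num_seqs=4,
                      use_cb=True,
                      monkeypatch=monkeypatch)
        cpu_results = generate_spyre_vllm_output(backend="eager", **kwargs)
        spyre_results = generate_spyre_vllm_output(backend="sendnn", **kwargs)
        # With temperature=0, outputs should match exactly unless two top
        # candidate tokens have nearly identical logprobs.
        for cpu_result, spyre_result in zip(cpu_results, spyre_results):
            assert cpu_result["text"] == spyre_result["text"]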