Merged
Changes from 11 commits
83 changes: 83 additions & 0 deletions examples/offline_inference_spyre_cb_test.py
@@ -0,0 +1,83 @@
import os
import time

from vllm import LLM, SamplingParams

max_tokens1 = 10
max_tokens2 = 5
max_tokens3 = 7
max_tokens = max([max_tokens1, max_tokens2, max_tokens3])
max_num_seqs = 2 # defines max batch size

os.environ["VLLM_SPYRE_WARMUP_PROMPT_LENS"] = '64'
os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = str(max_tokens)

# defined here to be able to run/debug directly from VS Code (not via terminal)
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
os.environ['VLLM_USE_V1'] = '1'

os.environ['VLLM_SPYRE_MAX_CONTEXT_LENGTH'] = '2048'
os.environ['VLLM_SPYRE_MAX_BATCH_SIZE'] = str(max_num_seqs)

# Sample prompts.
template = (
"Below is an instruction that describes a task. Write a response that "
"appropriately completes the request. Be polite in your response to the "
"user.\n\n### Instruction:\n{}\n\n### Response:")

prompt1 = template.format(
"Provide a list of instructions for preparing chicken soup for a family "
"of four.")

prompt2 = template.format("Provide instructions for preparing chicken soup.")

prompt3 = template.format(
"Provide a list of instructions for preparing chicken soup for a family.")

prompts = [
prompt1,
prompt2,
prompt3,
]

# Create a sampling params object.
sampling_params1 = SamplingParams(max_tokens=max_tokens1,
temperature=0.0,
ignore_eos=True)

sampling_params2 = SamplingParams(max_tokens=max_tokens2,
temperature=0.0,
ignore_eos=True)

sampling_params3 = SamplingParams(max_tokens=max_tokens3,
temperature=0.0,
ignore_eos=True)

sampling_params = [
sampling_params1,
sampling_params2,
sampling_params3,
]

# Create an LLM.
llm = LLM(model="/models/llama-194m",
tokenizer="/models/llama-194m",
max_model_len=2048,
block_size=2048)

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
print("=============== GENERATE")
t0 = time.time()
outputs = llm.generate(prompts, sampling_params)
print("Time elaspsed for %d tokens is %.2f sec" %
(len(outputs[0].outputs[0].token_ids), time.time() - t0))
print("===============")
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("===============")
for output in outputs:
print(output.outputs[0])
1 change: 0 additions & 1 deletion tests/test_spyre_tensor_parallel.py
@@ -46,7 +46,6 @@ def test_output(
test using 'pytest --capture=no tests/spyre/test_spyre_tensore_parallel.py'
After debugging, DISABLE_ASSERTS should be reset to 'False'.
'''

max_new_tokens = max([t[1] for t in warmup_shapes])

vllm_sampling_params = SamplingParams(
15 changes: 15 additions & 0 deletions vllm_spyre/envs.py
@@ -6,6 +6,9 @@
VLLM_SPYRE_WARMUP_PROMPT_LENS: Optional[List[int]] = None
VLLM_SPYRE_WARMUP_NEW_TOKENS: Optional[List[int]] = None
VLLM_SPYRE_WARMUP_BATCH_SIZES: Optional[List[int]] = None
VLLM_SPYRE_USE_CB: bool = False
VLLM_SPYRE_MAX_BATCH_SIZE: int = 0
VLLM_SPYRE_MAX_CONTEXT_LENGTH: int = 0
Comment on lines +10 to +11
Member:
Why do we need these environment variables? Can't we use max-num-seqs and max-model-len directly?

Collaborator Author:
I believe it was agreed on in the meeting with the compiler team that these should be env variables. They will be used on their end too...

Collaborator:
🤔 🤔 🤔
But the compiler shouldn't be looking at vLLM-specific environment variables, right? That seems like coupling in the wrong way since vllm is a consumer of the compiler, not the other way around. What I would naively expect is that if the compiler requires some env vars to be set, then we would take care of setting them in the plugin code based on vLLM's configuration.

Also, IIUC these values are all currently derivable from the provided warmup shapes, right? So requiring users to configure them here is confusing, and can lead to broken configurations like

VLLM_SPYRE_WARMUP_BATCH_SIZES=1,2,3
VLLM_SPYRE_MAX_BATCH_SIZE=2

Collaborator:
Ah, after looking at the scheduler I see that it looks like we're no longer using the static warmup shapes for scheduling with continuous batching. Are those now going to be a relic of the past?

That would be super nice, though I would still say we should be using vllm's existing --max-model-len and --max-num-seqs to keep a single source of configuration for these values
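
For illustration, a minimal sketch of that idea (the helper name is hypothetical; it assumes the plugin has access to a VllmConfig exposing scheduler_config.max_num_seqs and model_config.max_model_len, and it reuses the env var names added in this PR rather than any confirmed compiler interface):

```python
import os

from vllm.config import VllmConfig  # assumed import path


def propagate_spyre_env_vars(vllm_config: VllmConfig) -> None:
    """Hypothetical helper: derive the compiler-facing env vars from
    vLLM's own --max-num-seqs / --max-model-len instead of requiring
    users to set them separately."""
    os.environ["VLLM_SPYRE_MAX_BATCH_SIZE"] = str(
        vllm_config.scheduler_config.max_num_seqs)
    os.environ["VLLM_SPYRE_MAX_CONTEXT_LENGTH"] = str(
        vllm_config.model_config.max_model_len)
```

Something like this could be called from check_and_update_config, so --max-num-seqs and --max-model-len stay the single source of configuration.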

Collaborator Author:
Yes, warmup shapes will be a relic of the past as we move towards supporting dynamic dimensions. AFAIK, the way of communication between the compiler and vllm is not yet fully determined, and it was decided in one of the meetings with the compiler team that (for the time being) there will be two env variables used for sharing information between vllm and the compiler. I do agree that they eventually will be set by the compiler, but as we emulate on CPU here (hence no AIU Spyre compiler involved), we simply set them ourselves.

Collaborator Author:
Would it be okay to address the proper args handling in another PR? To me it is not straightforward to see why we have/need two calls to check_and_update_config in platform.py and why scheduler_config.max_num_seqs varies between the two. Also, this is not specific to this branch (it happens on main too). Of course, if anyone has an immediate solution, I am happy to include it here :)

Member (@tdoublep, Apr 9, 2025):
Yes, we can address it as a follow-up, fine with me.

> Are those now going to be a relic of the past?

And to address @joerunde's question here: yes, the warmup shapes will be a relic of the past. Things start to become much more similar to how it works on GPU.

Collaborator:
> yes, the warmup shapes will be a relic of the past

nice!

Collaborator:
I have found the issue as to why there are two calls to check_and_update_config in platform.py - will update shortly!

Collaborator:
Check out #114


environment_variables: Dict[str, Callable[[], Any]] = {
# Defines the prompt lengths the Spyre accelerator should be prepared
@@ -40,6 +43,18 @@
# - "eager": Skip compile entirely (for debug and testing
"VLLM_SPYRE_DYNAMO_BACKEND":
lambda: os.getenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn_decoder"),

# If set, use the V1 continuous batching implementation
"VLLM_SPYRE_USE_CB":
lambda: bool(int(os.getenv("VLLM_SPYRE_USE_CB", "0"))),

# Maximal supported batch size
"VLLM_SPYRE_MAX_BATCH_SIZE":
lambda: int(os.getenv("VLLM_SPYRE_MAX_BATCH_SIZE", "0")),

# Maximal supported context length
"VLLM_SPYRE_MAX_CONTEXT_LENGTH":
lambda: int(os.getenv("VLLM_SPYRE_MAX_CONTEXT_LENGTH", "0")),
}
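
For context, a hedged sketch of how plugin code might read these values (assuming this module follows vLLM's usual envs pattern, where module attributes are resolved through the environment_variables dict via a __getattr__ hook):

```python
import vllm_spyre.envs as envs

# Hypothetical consumer: pick the execution path based on the new flags.
if envs.VLLM_SPYRE_USE_CB:
    # 0 is the documented default for both values, i.e. "not set".
    max_batch = envs.VLLM_SPYRE_MAX_BATCH_SIZE
    max_context = envs.VLLM_SPYRE_MAX_CONTEXT_LENGTH
    print(f"continuous batching: batch={max_batch}, context={max_context}")
else:
    print("static warmup-shape batching")
```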

