
Commit 1f24b4e

feat(scaffolding): add streaming scaffolding_llm.generate_async support
Signed-off-by: Zhenhuan Chen <[email protected]>
1 parent ddfe4fc commit 1f24b4e

File tree

13 files changed: +334, -199 lines changed
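
The commit threads a streaming flag from the controllers down to GenerationTask and turns ScaffoldingLlm.generate_async into an async iterator. A minimal usage sketch, assembled from the run_basic_generation.py and token_budget_majority_vote.py diffs below (the model path is a placeholder, and worker.shutdown() is assumed to mirror the examples' cleanup):

import asyncio

from tensorrt_llm.scaffolding import (NativeGenerationController,
                                      ScaffoldingLlm, TRTLLMWorker)

worker = TRTLLMWorker.init_with_new_llm(
    "/path/to/model",  # placeholder model directory
    max_batch_size=32,
    max_num_tokens=4096,
)
controller = NativeGenerationController(sampling_params={"temperature": 0.9},
                                        streaming=True)
llm = ScaffoldingLlm(
    controller, {NativeGenerationController.WorkerTag.GENERATION: worker})


async def consume(prompt: str):
    # generate_async now yields intermediate results; each result.output is
    # itself async-iterable and produces partial outputs as tokens arrive.
    async for result in llm.generate_async(prompt):
        async for output in result.output:
            print(output.outputs[0].text)


# llm.loop runs on a background thread, so submit the coroutine thread-safely.
asyncio.run_coroutine_threadsafe(consume("Hello, world"), llm.loop).result()
llm.shutdown()
worker.shutdown()  # assumed to exist, mirroring the examples' shutdown prints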

examples/scaffolding/contrib/AsyncGeneration/stream_generation_controller.py

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@
 from typing import List
 
 from tensorrt_llm.scaffolding import Controller, GenerationTask, Task
-from tensorrt_llm.scaffolding.contrib import StreamGenerationTask
+from tensorrt_llm.scaffolding.contrib.AsyncGeneration import \
+    StreamGenerationTask
 
 
 class NativeStreamGenerationController(Controller):

examples/scaffolding/contrib/Dynasor/scaffolding_dynasor_run.py

Lines changed: 27 additions & 7 deletions
@@ -1,8 +1,9 @@
 import argparse
+import asyncio
 
 from tensorrt_llm.scaffolding import (MajorityVoteController, ScaffoldingLlm,
                                       TRTLLMWorker)
-from tensorrt_llm.scaffolding.contrib import DynasorGenerationController
+from tensorrt_llm.scaffolding.contrib.Dynasor import DynasorGenerationController
 
 
 def parse_arguments():
@@ -16,13 +17,16 @@ def parse_arguments():
     parser.add_argument("--max_num_tokens", type=int, default=7000)
     parser.add_argument("--majority_vote", action='store_true')
     parser.add_argument('--sample_num', type=int, default=3)
+    parser.add_argument('--streaming', action='store_true')
     args = parser.parse_args()
     return args
 
 
 def test(prompts, proposer_worker, args):
     dynasor_generation_controller = DynasorGenerationController(
-        generation_dir=args.model_dir, max_tokens=args.max_num_tokens)
+        generation_dir=args.model_dir,
+        max_tokens=args.max_num_tokens,
+        streaming=args.streaming)
 
     # If majority voting is requested, wrap the controller in MajorityVoteController
     if args.majority_vote:
@@ -47,9 +51,25 @@ def test(prompts, proposer_worker, args):
         },
     )
 
-    results = llm.generate(prompts)
-    for result in results:
-        print(result.output.output_str)
+    if args.streaming:
+
+        async def task(prompt: str):
+            i = 0
+            async for result in llm.generate_async(prompt):
+                i += 1
+                print(">>>", i, result)
+                async for output in result.output:
+                    print(i, len(output.outputs[0].text))
+            print(f">>> final output {len(output.outputs[0].text)}\n",
+                  output.outputs[0].text)
+            # print(f">>> final result.output {len(result.output.outputs[0].text)} {result.output}\n", result.output.outputs[0].text)
+
+        asyncio.run_coroutine_threadsafe(task(prompts[0]), llm.loop).result()
+    else:
+        results = llm.generate(prompts)
+        for result in results:
+            print(result.output.outputs[0].text)
+
     print(f"main shutting down...")
     llm.shutdown()
     print(f"worker shutting down...")
@@ -62,8 +82,8 @@ def main():
 
     prompts = [
         "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\r\n\r\n",
-        "There exist real numbers $x$ and $y$, both greater than 1, such that $\\log_x\\left(y^x\\right)=\\log_y\\left(x^{4y}\\right)=10$. Find $xy$.",
-        "Find the largest possible real part of \\[(75+117i)z+\\frac{96+144i}{z}\\]where $z$ is a complex number with $|z|=4$.",
+        # "There exist real numbers $x$ and $y$, both greater than 1, such that $\\log_x\\left(y^x\\right)=\\log_y\\left(x^{4y}\\right)=10$. Find $xy$.",
+        # "Find the largest possible real part of \\[(75+117i)z+\\frac{96+144i}{z}\\]where $z$ is a complex number with $|z|=4$.",
     ]
 
     llm_worker = TRTLLMWorker.init_with_new_llm(
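
Note the dispatch pattern in the streaming branch above: ScaffoldingLlm keeps its own asyncio event loop (exposed as llm.loop, evidently running on a background thread), so synchronous code hands the coroutine over with asyncio.run_coroutine_threadsafe and blocks on the returned concurrent.futures.Future. A stripped-down sketch of just that handoff, assuming llm is already constructed as in the example:

import asyncio


async def consume(prompt: str):
    async for result in llm.generate_async(prompt):
        async for output in result.output:
            print(len(output.outputs[0].text))  # partial text grows over time


# Schedule the coroutine on the scaffolding instance's background loop and
# block the calling thread until streaming finishes.
future = asyncio.run_coroutine_threadsafe(consume("2 + 2 = ?"), llm.loop)
future.result()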

examples/scaffolding/run_basic_generation.py

Lines changed: 15 additions & 7 deletions
@@ -28,7 +28,7 @@ def test_sync(prompts, proposer_worker):
     )
     results = llm.generate(prompts)
     for result in results:
-        print(result.output.output_str)
+        print(result.output.outputs[0].text)
     print(f'main shutting down...')
     llm.shutdown()
     print(f'worker shutting down...')
@@ -40,16 +40,24 @@ def test_async(prompt, proposer_worker):
 
     async def test_async_func(prompt, proposer_worker):
         prototype_controller = NativeGenerationController(
-            sampling_params={"temperature": 0.9})
+            sampling_params={"temperature": 0.9}, streaming=True)
         llm = ScaffoldingLlm(
             prototype_controller,
             {NativeGenerationController.WorkerTag.GENERATION: proposer_worker},
         )
-
-        future = llm.generate_async(prompt)
-
-        result = await future.aresult()
-        print(result.output.output_str)
+        i = 0
+
+        async for result in llm.generate_async(prompt):
+            i += 1
+            print(">>>", i, result)
+            async for output in result.output:
+                print(len(output.outputs[0].text))
+            # print(result.output,
+            #       end='\n' if result.finished else '\r',
+            #       flush=True)
+
+        # result = await future.aresult()
+        # print(result.output.output_str)
 
         print(f'main shutting down...')
         llm.shutdown()

examples/scaffolding/token_budget_majority_vote.py

Lines changed: 7 additions & 6 deletions
@@ -77,16 +77,17 @@ def main():
     args = parse_arguments()
     workers = {}
 
-    llm_worker = TRTLLMWorker.init_with_new_llm(args.model_dir,
-                                                backend="pytorch",
-                                                max_batch_size=32,
-                                                max_num_tokens=4096,
-                                                temperature=0.9)
+    llm_worker = TRTLLMWorker.init_with_new_llm(
+        args.model_dir,
+        max_batch_size=32,
+        max_num_tokens=4096,
+    )
 
     prototype_generation_controller = NativeGenerationController(
-        custom_sampling_params={
+        sampling_params={
             "max_tokens": 4096,
             "top_p": 0.9,
+            "temperature": 0.9,
         })
     workers[NativeGenerationController.WorkerTag.GENERATION] = llm_worker
tensorrt_llm/scaffolding/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from .benchmark import ScaffoldingBenchRequest, async_scaffolding_benchmark
 from .controller import (BestOfNController, Controller, MajorityVoteController,
                          NativeGenerationController, NativeRewardController,
-                         ParallelProcess, ScaffoldingOutput)
+                         ParallelProcess)
 from .math_utils import (extract_answer_from_boxed, extract_answer_with_regex,
                          get_digit_majority_vote_result)
 from .scaffolding_llm import ScaffoldingLlm

tensorrt_llm/scaffolding/contrib/AsyncGeneration/stream_generation.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ async def get_step_or_more_tokens(task: StreamGenerationTask):
     if task.request_handle._done:
         task.end_flag = True
 
-    sampling_params = worker.combine_sampling_params_with_generation_task(task)
+    sampling_params = worker.convert_task_params(task)
     if task.request_handle is None:
         task.request_handle = worker.llm.generate_async(
             task.input_str, sampling_params=sampling_params, streaming=True)

tensorrt_llm/scaffolding/contrib/Dynasor/dynasor_controller.py

Lines changed: 21 additions & 5 deletions
@@ -15,11 +15,14 @@ class WorkerTag(Enum):
 
     # Certainty_threshold and chunk_size controls the compute saving level
     # Decreasing the certainty_threshold and chunk_size will save tokens but may risk at compromising accuracy.
-    def __init__(self,
-                 generation_dir,
-                 max_tokens=8192,
-                 certainty_threshold=3,
-                 chunk_size=64):
+    def __init__(
+        self,
+        generation_dir,
+        max_tokens=8192,
+        certainty_threshold=3,
+        chunk_size=64,
+        streaming=False,
+    ):
         """
         Initializes the controller with parameters controlling token limits and certainty thresholds.
 
@@ -46,6 +49,7 @@ def __init__(self,
             trust_remote_code=False,
             use_fast=True,
         )
+        self.streaming = streaming
 
     def process(self, tasks: List[GenerationTask], **kwargs):
         """
@@ -70,12 +74,14 @@ def process(self, tasks: List[GenerationTask], **kwargs):
         proposer_task.temperature = 0.6
         proposer_task.top_p = 0.95
         proposer_task.worker_tag = self.WorkerTag.GENERATION
+        proposer_task.streaming = self.streaming
 
         probe_task = GenerationTask()
         probe_task.max_tokens = 20
         probe_task.temperature = 0.6
         probe_task.top_p = 0.95
         probe_task.worker_tag = self.WorkerTag.GENERATION
+        probe_task.streaming = self.streaming
 
         probe_answers = []
         probe_responses = []
@@ -96,9 +102,13 @@ def process(self, tasks: List[GenerationTask], **kwargs):
             probe_task.input_str = current_prompt + self.probe_suffix
 
             # For the probe task, append the suffix to force a chain-of-thought leading to an answer.
+            print("[DynasorGenerationController] probe_task")
             yield [probe_task]
 
             # Retrieve the output from the probe task.
+            # if probe_task.streaming:
+            #     print("[DynasorGenerationController] wait result for probe_task")
+            #     probe_task.result.result()
             probe_text = probe_task.output_str
 
             # Extract the potential answer from the probe response.
@@ -120,6 +130,7 @@ def process(self, tasks: List[GenerationTask], **kwargs):
                     probe_answers[-self.certainty_threshold:])
                     == self.certainty_threshold
                     and sum(probe_certain_count) == self.certainty_threshold):
+                tasks[0].result = probe_task.result
                 # If the current prompt indicates the chain-of-thought phase has ended, use one type of suffix.
                 if "</think>" in current_prompt:
                     tasks[0].output_str = (current_prompt + self.answer_suffix +
@@ -133,13 +144,18 @@ def process(self, tasks: List[GenerationTask], **kwargs):
                 return
 
             # if not confident, do another round of generation
+            print("[DynasorGenerationController] proposer_task")
            yield [proposer_task]
 
             # Append the newly generated text from the proposer to the current prompt for the next iteration.
+            # if proposer_task.streaming:
+            #     print("[DynasorGenerationController] wait result for proposer_task")
+            #     proposer_task.result.result()
             current_prompt += proposer_task.output_str
 
         # If the maximum token limit is reached without satisfying the certainty condition,
         # output the accumulated prompt as the final output.
+        tasks[0].result = proposer_task.result
         tasks[0].output_str = current_prompt
         return
tensorrt_llm/scaffolding/contrib/__init__.py

Lines changed: 0 additions & 21 deletions

@@ -1,21 +0,0 @@
-from tensorrt_llm.scaffolding import *  # noqa
-
-from .AsyncGeneration import StreamGenerationTask, stream_generation_handler
-from .Dynasor import DynasorGenerationController
-from .mcp import (ChatTask, MCPCallTask, MCPController, MCPListTask, MCPWorker,
-                  chat_handler)
-
-__all__ = [
-    # AsyncGeneration
-    "stream_generation_handler",
-    "StreamGenerationTask",
-    # Dynasor
-    "DynasorGenerationController",
-    #mcp
-    "MCPController",
-    "MCPWorker",
-    "MCPCallTask",
-    "MCPListTask",
-    "ChatTask",
-    "chat_handler"
-]

tensorrt_llm/scaffolding/controller.py

Lines changed: 8 additions & 11 deletions
@@ -6,18 +6,12 @@
 import torch
 from torch.nn import functional as F
 
+from tensorrt_llm.executor.result import GenerationResult
 from tensorrt_llm.logger import logger
 from tensorrt_llm.scaffolding.math_utils import get_digit_majority_vote_result
-from tensorrt_llm.scaffolding.task import (GenerationTask, ScaffoldingOutput,
-                                           Task)
+from tensorrt_llm.scaffolding.task import GenerationTask, Task
 
-
-class ScaffoldingOutput:
-
-    def __init__(self):
-        self.output_str = None
-        # reserved for customized controller
-        self.customized_output = None
+# from .result import ScaffoldingOutput
 
 
 class Controller(ABC):
@@ -28,11 +22,12 @@ def __init__(self):
     def clone(self):
         return copy.deepcopy(self)
 
-    def generate(self, prompt: str, **kwargs) -> ScaffoldingOutput:
+    def generate(self, prompt: str, **kwargs) -> GenerationResult:
         task = GenerationTask.create_from_prompt(prompt)
 
         yield from self.process([task], **kwargs)
 
+        # print("[Controller.generate] task.output in generate", task.result)
         return task.create_scaffolding_output()
 
     def process(self, tasks: List[Task], **kwargs):
@@ -57,7 +52,7 @@ class NativeGenerationController(Controller):
     class WorkerTag(Enum):
         GENERATION = "generation"
 
-    def __init__(self, sampling_params: dict = None):
+    def __init__(self, sampling_params: dict = None, streaming: bool = False):
         super().__init__()
         if sampling_params is None:
             sampling_params = {}
@@ -67,13 +62,15 @@ def __init__(self, sampling_params: dict = None):
                     f"{key} is not a supported field for GenerationTask")
                 sampling_params.pop(key)
         self.sampling_params = sampling_params
+        self.streaming = streaming
 
     def process(self, tasks: List[Task], **kwargs):
         for task in tasks:
             task.worker_tag = self.WorkerTag.GENERATION
             for key, value in self.sampling_params.items():
                 if getattr(task, key) is None:
                     setattr(task, key, value)
+            task.streaming = self.streaming
 
         yield tasks
 