fix: generation functions for torch.compile llm combination (#85)

johnrachwan123 · web-flow · commit 3aebee757a0e · 2025-05-01T22:20:54.000+02:00
* fix: accept either args or kwargs as input to the LLM and stop once the LLM has generated the EOS token

* fix: always perform quantization on the CPU

* fix: mypy errors

* fix: handle review comments

* fix: mypy error
diff --git a/src/pruna/algorithms/compilation/torch_compile.py b/src/pruna/algorithms/compilation/torch_compile.py
@@ -343,6 +343,7 @@ def causal_lm_logic(model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         compile_mode=smash_config["mode"],
         compile_fullgraph=smash_config["fullgraph"],
         batch_size=smash_config["batch_size"],
+        device=smash_config.device,
     )
     # If we are using max-autotune-no-cudagraphs, we need to handle the cudagraphs manually.
     if smash_config["mode"] == "max-autotune-no-cudagraphs":
diff --git a/src/pruna/algorithms/compilation/utils.py b/src/pruna/algorithms/compilation/utils.py
@@ -19,6 +19,8 @@
 from torch.nn.attention import SDPBackend, sdpa_kernel
 from transformers.cache_utils import StaticCache
 
+from pruna.logging.logger import pruna_logger
+
 
 class TransformersGenerator:
     """
@@ -43,6 +45,8 @@ class TransformersGenerator:
         Whether to compile the full computation graph or use partial graph compilation.
     batch_size : int, default=1
         The batch size to use for text generation.
+    device : str, default='cuda'
+        The device to use for text generation.
     """
 
     def __init__(
@@ -54,6 +58,7 @@ def __init__(
         compile_mode: str = "reduce-overhead",
         compile_fullgraph: bool = True,
         batch_size: int = 1,
+        device: str = "cuda",
     ):
         """
         Initialize the TransformersGenerator.
@@ -87,21 +92,23 @@ def __init__(
             torch._dynamo.config.inline_inbuilt_nn_modules = False  # torch 2.5.0 fix
 
         self.model = model
-        self.device = model.device
+        self.device = device
         self.temperature = temperature
         self.top_k = top_k
         self.use_cache = True
         self.compile_mode = compile_mode
         self.compile_fullgraph = compile_fullgraph
         self.batch_size = batch_size
+        self.cache_batch_size = batch_size
         self.cache_size = max_kv_cache_size
+        self.eos_token_id = getattr(model.config, "eos_token_id", None)
+        if self.eos_token_id is None:
+            pruna_logger.warning("Warning: eos_token_id is None. This may affect generation stopping criteria.")
 
         self.setup_cache()
 
         self.decode_one_token = torch.compile(  # type: ignore
-            self.decode_one_token,
-            mode=self.compile_mode,
-            fullgraph=self.compile_fullgraph
+            self.decode_one_token, mode=self.compile_mode, fullgraph=self.compile_fullgraph
         )
 
         self.init()
@@ -110,6 +117,7 @@ def __init__(
         # Cuda Graph section
         self.static_input = torch.zeros((1, 1), device=self.device, dtype=torch.int32)
         self.static_output = torch.zeros((1, 1), device=self.device, dtype=torch.int32)
+        self.original_gen_next_token = self.gen_next_token
         self.cuda_graph = None
         self.do_capture_graph = False
         ############################
@@ -198,10 +206,7 @@ def logits_to_probs(self, logits: torch.Tensor, temperature: float = 1.0, top_k:
         return probs
 
     def sample(
-        self,
-        logits: torch.Tensor,
-        temperature: float = 1.0,
-        top_k: int | None = None
+        self, logits: torch.Tensor, temperature: float = 1.0, top_k: int | None = None
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Sample one token from the model.
@@ -286,18 +291,38 @@ def setup(self, inputs: torch.Tensor, max_new_tokens: int):
         None
             This method initializes internal state for generation but does not return a value.
         """
+        new_batch_size = inputs.shape[0]
+
+        # Check if batch size changed compared to the cache configuration
+        if new_batch_size != self.cache_batch_size:
+            pruna_logger.info(
+                f"Batch size changed from {self.cache_batch_size} to {new_batch_size}. Re-initializing StaticCache."
+            )
+            self.batch_size = new_batch_size
+            self.cache_batch_size = new_batch_size
+            self.setup_cache()
+
+            # If CUDA graph was used, it's now invalid
+            if hasattr(self, "cuda_graph") and self.cuda_graph is not None:
+                pruna_logger.warning("CUDA graph is invalidated due to batch size change. Disabling CUDA graph usage.")
+                self.cuda_graph = None
+                self.gen_next_token = self.original_gen_next_token
+                self.do_capture_graph = False
+
+        # Reset cache contents (does not change shape)
         self.reset_cache()
+
         self.inputs = inputs
         self.batch_size, self.seq_length = self.inputs.shape
         self.cache_position = torch.arange(self.seq_length, device=self.device)
-        # initialize the generated ids with zeros of the shape (batch_size, seq_length + max_new_tokens + 1)
+        # initialize the generated ids with zeros
         self.generated_ids = torch.zeros(
             self.batch_size,
             self.seq_length + max_new_tokens + 1,
             dtype=torch.int,
             device=self.device,
         )
-        # copy the input ids to the generated ids at the cache position.
+        # copy the input ids to the generated ids
         self.generated_ids[:, self.cache_position] = self.inputs.to(torch.int)
 
     def prefill(self) -> torch.Tensor:
@@ -340,11 +365,11 @@ def gen_next_token(self, current_token: torch.Tensor) -> torch.Tensor:
             The next token generated by the model.
         """
         next_token = self.decode_one_token(
-                current_token.clone(),
-                cache_position=self.cache_position + 1,
-                past_key_values=self.past_key_values,
-                temperature=self.temperature,
-                top_k=self.top_k,
+            current_token.clone(),
+            cache_position=self.cache_position + 1,
+            past_key_values=self.past_key_values,
+            temperature=self.temperature,
+            top_k=self.top_k,
         )
         self.cache_position += 1
         self.generated_ids[:, self.cache_position] = next_token.int()
@@ -354,7 +379,7 @@ def enable_cuda_graph(
         self,
         iters: int = 2,
         prompt_tokenized: list[int] = [596, 8830, 315, 6913, 19476, 11, 1778, 439, 279, 12939],
-        max_kv_cache_size: int = 1024
+        max_kv_cache_size: int = 1024,
     ) -> None:
         """
         Enable the CUDA graph and capture the graph on random prompt.
@@ -375,17 +400,15 @@ def enable_cuda_graph(
             but does not return any value.
         """
         _ = self.generate(
-            torch.tensor(prompt_tokenized, device=self.model.device).unsqueeze(0),
-            max_new_tokens=max_kv_cache_size
+            torch.tensor(prompt_tokenized, device=self.model.device).unsqueeze(0), max_new_tokens=max_kv_cache_size
         )
         for _ in range(iters):
             # need to reset the graph before capturing it at each iteration
             # to avoid block/thread errors.
             self.do_capture_graph = True
             self.gen_next_token = self.gen_next_token_withgraph  # type: ignore
             _ = self.generate(
-                torch.tensor(prompt_tokenized, device=self.model.device).unsqueeze(0),
-                max_new_tokens=max_kv_cache_size
+                torch.tensor(prompt_tokenized, device=self.model.device).unsqueeze(0), max_new_tokens=max_kv_cache_size
             )
 
     def gen_next_token_withgraph(self, current_token: torch.Tensor) -> torch.Tensor:
@@ -426,54 +449,100 @@ def gen_next_token_withgraph(self, current_token: torch.Tensor) -> torch.Tensor:
         return next_token
 
     def next_token_iterator(
-        self,
-        current_token: torch.Tensor,
-        max_new_tokens: int,
-        cleanup: bool = True
+        self, current_token: torch.Tensor, max_new_tokens: int, cleanup: bool = True
     ) -> torch.Tensor:
         """
-        Generate the next token.
+        Generate the next token, stopping at max_new_tokens or EOS for each sequence in the batch.
 
         Parameters
         ----------
         current_token : torch.Tensor
-            The current token.
+            The current token tensor of shape (batch_size, 1).
         max_new_tokens : int
             The maximum number of new tokens to generate.
         cleanup : bool
-            Whether to cleanup the inputs, generated ids, and cache position.
+            Whether to cleanup the inputs, generated ids, and cache position after generation.
 
         Returns
         -------
         torch.Tensor
-            The generated tokens.
+            The generated tokens tensor of shape (batch_size, seq_length + generated_length),
+            including the input prompt and potentially EOS tokens. Sequences that finish early
+            will have EOS followed by padding (initial zeros).
         """
+        # Keep track of sequences that haven't finished yet (encountered EOS)
+        # Assumes initial state is unfinished for all sequences in the batch
+        unfinished_sequences = torch.ones(self.batch_size, dtype=torch.bool, device=self.device)
+
+        # Loop for a maximum of max_new_tokens - 1 steps (as prefill generates the first)
         for i in range(1, max_new_tokens):
-            current_token = self.gen_next_token(current_token)
-        output_tokens = self.generated_ids
+            # Generate the next token for all sequences
+            current_token = self.gen_next_token(current_token)  # Updates self.generated_ids internally
+
+            # Check if the generated token is the EOS token for any currently unfinished sequence
+            if self.eos_token_id is not None:
+                # Check which sequences produced the EOS token THIS step
+                # current_token shape is (batch_size, 1), squeeze to (batch_size,)
+                # Only consider sequences that were previously unfinished
+                finished_this_step = (current_token.squeeze(-1) == self.eos_token_id) & unfinished_sequences
+                # Update the overall tracker for unfinished sequences
+                unfinished_sequences &= ~finished_this_step
+
+            # Stop generation if all sequences in the batch have finished
+            if not unfinished_sequences.any():
+                break
+
+        # Determine the actual length generated (up to the current cache position)
+        # .item() is safe as cache_position should be a 0-dim tensor
+        final_seq_len = self.cache_position.item() + 1
+        # Clone the relevant part of generated_ids before potential cleanup
+        output_tokens = self.generated_ids[:, : int(final_seq_len)].clone()
 
         if cleanup:
+            # Delete internal state tensors, but not output_tokens which is returned
             del self.inputs, self.generated_ids, self.cache_position
             torch.cuda.empty_cache()
 
         return output_tokens
 
     @torch.inference_mode()
-    def generate(self, input_ids: torch.Tensor, max_new_tokens: int = 100) -> torch.Tensor:
+    def generate(self, *args, **kwargs) -> torch.Tensor:
         """
-        Generate the tokens.
+        Generate tokens using the model.
 
         Parameters
         ----------
-        input_ids : torch.Tensor
-            The input ids.
-        max_new_tokens : int
-            The maximum number of new tokens to generate.
+        *args : tuple
+            Variable length argument list (not used directly).
+        **kwargs : dict
+            Keyword arguments dictionary that must contain:
+            - input_ids : torch.Tensor
+                The input token ids that serve as the prompt.
+            - max_new_tokens : int
+                The maximum number of new tokens to generate.
 
         Returns
         -------
         torch.Tensor
-            The generated tokens.
+            The generated tokens, including the input prompt and potentially an EOS token.
         """
-        self.setup(inputs=input_ids, max_new_tokens=max_new_tokens)
-        return self.next_token_iterator(self.prefill(), max_new_tokens)
+        # Extract parameters from kwargs with defaults from instance variables
+        self.temperature = kwargs.pop("temperature", self.temperature)
+        self.top_k = kwargs.pop("top_k", self.top_k)
+        self.use_cache = kwargs.pop("use_cache", self.use_cache)
+
+        # Log any kwargs that are not explicitly handled
+        unhandled_kwargs = {
+            k: v
+            for k, v in kwargs.items()
+            if k not in ["input_ids", "max_new_tokens", "temperature", "top_k", "batch_size"]
+        }
+        if unhandled_kwargs:
+            pruna_logger.warning(f"Unhandled kwargs in generate method: {unhandled_kwargs}")
+
+        # Update instance variables with any provided values
+        self.setup(
+            inputs=kwargs["input_ids"] if "input_ids" in kwargs else args[0],
+            max_new_tokens=kwargs["max_new_tokens"] if "max_new_tokens" in kwargs else args[1],
+        )
+        return self.next_token_iterator(self.prefill(), kwargs["max_new_tokens"])
diff --git a/src/pruna/algorithms/quantization/hqq.py b/src/pruna/algorithms/quantization/hqq.py
@@ -24,6 +24,7 @@
 from pruna.config.smash_config import SmashConfigPrefixWrapper
 from pruna.engine.model_checks import is_causal_lm
 from pruna.engine.save import SAVE_FUNCTIONS
+from pruna.engine.utils import move_to_device, safe_memory_cleanup
 from pruna.logging.filter import SuppressOutput
 from pruna.logging.logger import pruna_logger
 
@@ -116,9 +117,10 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
 
         quant_config_hqq = imported_modules["BaseQuantizeConfig"](nbits=weight_quantization_bits, group_size=group_size)
         quant_config_hf = imported_modules["HqqConfig"](nbits=weight_quantization_bits, group_size=group_size)
-
+        move_to_device(model, "cpu")
+        safe_memory_cleanup()
         try:  # Try to quantize the model using HQQ
-            smashed_model = imported_modules["AutoHQQHFModel"].quantize_model(
+            model = imported_modules["AutoHQQHFModel"].quantize_model(
                 model,
                 quant_config=quant_config_hqq,
                 device=smash_config["device"],
@@ -131,7 +133,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
             temp_dir = tempfile.mkdtemp(dir=base_temp_dir)
             model.save_pretrained(temp_dir)
 
-            smashed_model = AutoModelForCausalLM.from_pretrained(
+            model = AutoModelForCausalLM.from_pretrained(
                 temp_dir,
                 quantization_config=quant_config_hf,
                 trust_remote_code=True,
@@ -149,7 +151,7 @@ def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
         except Exception as e:
             pruna_logger.error(f"Error: {e}")
             pass
-        return smashed_model
+        return model
 
     def import_algorithm_packages(self) -> Dict[str, Any]:
         """

Original file line number	Diff line number	Diff line change
`@@ -343,6 +343,7 @@ def causal_lm_logic(model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:`
`343`	`343`	`compile_mode=smash_config["mode"],`
`344`	`344`	`compile_fullgraph=smash_config["fullgraph"],`
`345`	`345`	`batch_size=smash_config["batch_size"],`
	`346`	`+ device=smash_config.device,`
`346`	`347`	`)`
`347`	`348`	`# If we are using max-autotune-no-cudagraphs, we need to handle the cudagraphs manually.`
`348`	`349`	`if smash_config["mode"] == "max-autotune-no-cudagraphs":`