Skip to content

Commit 0d5f576

Browse files
fix: make kv cache dynamic based on input (#87)
* fix: this makes the kv cache dynamic based on the input
* fix: add the input size as a parameter as well
* fix: recompile cuda graph in case of max-autotune-no-cudagraphs mode
* fix: remove support for max-autotune-no-cudagraphs mode
1 parent 3aebee7 commit 0d5f576

File tree

2 files changed

+12
-8
lines changed

2 files changed

+12
-8
lines changed

src/pruna/algorithms/compilation/torch_compile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ def causal_lm_logic(model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
347347
)
348348
# If we are using max-autotune-no-cudagraphs, we need to handle the cudagraphs manually.
349349
if smash_config["mode"] == "max-autotune-no-cudagraphs":
350-
gen.enable_cuda_graph(max_kv_cache_size=smash_config["seqlen_manual_cuda_graph"])
350+
pruna_logger.error("max-autotune-no-cudagraphs is not supported for causal language models.")
351351
model.generate = gen.generate
352352
return model
353353

src/pruna/algorithms/compilation/utils.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -294,20 +294,24 @@ def setup(self, inputs: torch.Tensor, max_new_tokens: int):
294294
new_batch_size = inputs.shape[0]
295295

296296
# Check if batch size changed compared to the cache configuration
297-
if new_batch_size != self.cache_batch_size:
297+
# Round up max_new_tokens to the nearest 1000 for better memory allocation
298+
rounded_cache_size = ((inputs.shape[1] + max_new_tokens + 999) // 1000) * 1000
299+
if new_batch_size != self.cache_batch_size or self.cache_size != rounded_cache_size:
298300
pruna_logger.info(
299-
f"Batch size changed from {self.cache_batch_size} to {new_batch_size}. Re-initializing StaticCache."
301+
f"Cache size changed from {self.cache_batch_size}x{self.cache_size} to "
302+
f"{new_batch_size}x{rounded_cache_size}. Re-initializing StaticCache."
300303
)
301304
self.batch_size = new_batch_size
302305
self.cache_batch_size = new_batch_size
306+
self.cache_size = rounded_cache_size
303307
self.setup_cache()
304308

305-
# If CUDA graph was used, it's now invalid
309+
# If CUDA graph was used, recompile the graph
306310
if hasattr(self, "cuda_graph") and self.cuda_graph is not None:
307-
pruna_logger.warning("CUDA graph is invalidated due to batch size change. Disabling CUDA graph usage.")
308-
self.cuda_graph = None
309-
self.gen_next_token = self.original_gen_next_token
310-
self.do_capture_graph = False
311+
pruna_logger.warning(
312+
"CUDA graph is invalidated due to batch size or cache size change. Recompiling the graph."
313+
)
314+
self.enable_cuda_graph(max_kv_cache_size=self.cache_size)
311315

312316
# Reset cache contents (does not change shape)
313317
self.reset_cache()

0 commit comments

Comments (0)