[Bugfix] Staged 2of4 example (#1238)
## Purpose ##
* Fix staged 2of4 example

## Background ##
* When #1160 landed, it introduced a bug in the recipe container which meant
that the recipe was not recompiled after `append`ing. This caused SparseGPT to
initialize twice and GPTQ to never initialize, leading to a sparsity-only
quantization config
* At some point, a change was introduced which causes previous stages to be
reconstructed after recipe recompilation. This means that, without resetting
the session between stages, previous stages will initialize twice.
* To avoid this issue, this PR calls `session.reset()` between stages (see the
sketch below)
* This change has the consequence of creating `recipe.yaml` files which do not
contain the full recipe history. However, I believe this is acceptable for the
time being, as the stage runner and this workflow will be removed in the next
release.
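
For illustration, here is a rough sketch of the per-stage flow after this change. It is simplified from `run_sequential_stages`; the helper names (`run_stage`, `recipe_stage_names`, `completed_stages`) are placeholders rather than the actual runner API, and only `active_session()` / `reset()` come from this diff.

```python
from llmcompressor.core import active_session


def run_stage(stage_name: str) -> None:
    """Placeholder for the runner's per-stage oneshot/train dispatch."""
    ...


completed_stages = []                                          # stages already applied in a previous run
recipe_stage_names = ["sparsity_stage", "quantization_stage"]  # illustrative stage names

for stage_name in recipe_stage_names:
    # skip stages which have already been applied
    if stage_name in completed_stages:
        continue

    run_stage(stage_name)

    # Fully reset the session so that recompiling the recipe for the next
    # stage does not re-initialize the previous stage's modifiers.
    active_session().reset()
```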

---------

Signed-off-by: Kyle Sayers <[email protected]>
kylesayrs authored Mar 11, 2025
1 parent c87ae55 commit 8290679
Showing 2 changed files with 7 additions and 4 deletions.
src/llmcompressor/recipe/container.py (1 addition, 1 deletion)
@@ -123,7 +123,7 @@ def _check_compile_recipe(self):
         :return: True if the recipes were compiled, False otherwise
         """
-        if self.compiled_recipe is None and self.recipes:
+        if self.recipes:
             self.compiled_recipe = Recipe.simplify_combine_recipes(self.recipes)

     def check_any_recipe_exists(self) -> bool:
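
To spell out why the old guard was the problem, here is a commented sketch of the fixed check; the surrounding class is a minimal stand-in and the import path is assumed, not taken from this diff.

```python
from llmcompressor.recipe import Recipe  # import path assumed


class RecipeContainerSketch:
    """Minimal stand-in for the recipe container, showing only the fixed check."""

    def __init__(self):
        self.recipes = []            # recipes accumulated across stages
        self.compiled_recipe = None

    def _check_compile_recipe(self):
        # Before this fix the body was guarded by `self.compiled_recipe is None`,
        # so a recipe appended after the first compile was silently ignored and
        # the compiled recipe stayed sparsity-only (GPTQ never initialized).
        # Recompiling whenever any recipes exist picks up appended stages.
        if self.recipes:
            self.compiled_recipe = Recipe.simplify_combine_recipes(self.recipes)
```
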
src/llmcompressor/transformers/finetune/runner.py (6 additions, 3 deletions)
Expand Up @@ -17,7 +17,6 @@
from llmcompressor.core import active_session
from llmcompressor.pytorch.model_load.helpers import (
get_completed_stages,
get_session_model,
save_checkpoint,
save_completed_stages,
)
@@ -183,6 +182,10 @@ def run_sequential_stages(
                     "the stage name."
                 )

+            # skip stages which have already been applied
+            if stage_name in completed_stages:
+                continue
+
             # setup checkpoint dir, TODO: this should be optional
             self._output_dir = os.path.join(
                 self.parent_output_dir, "stage_" + stage_name
@@ -222,6 +225,7 @@ def run_sequential_stages(
                     recipe_stage=stage_name,
                 )
             elif run_type is StageRunType.TRAIN:
+                self.trainer.model = model
                 self.train(checkpoint=checkpoint, stage=stage_name)

             checkpoint = None
@@ -248,11 +252,10 @@

             # setup for next stage
             session = active_session()
-            session.reset_stage()
+            session.reset()

             # synchronize and clean up memory
             self.trainer.accelerator.wait_for_everyone()
-            self.trainer.model = get_session_model()
             torch.cuda.empty_cache()
             self.trainer.accelerator.free_memory()
             self.trainer.accelerator.wait_for_everyone()
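
A note on the trainer/model handling above: because `session.reset()` wipes the whole session rather than just the finished stage, the model presumably can no longer be re-fetched with `get_session_model()` afterwards, so the runner now hands its own `model` reference to the trainer up front. A toy, self-contained sketch of that pattern, with stand-ins for the trainer and model (illustrative only, not the runner's real objects):

```python
from types import SimpleNamespace

from llmcompressor.core import active_session

# Toy stand-ins so the pattern runs outside the real runner
model = object()                        # the in-memory model being compressed
trainer = SimpleNamespace(model=None)   # stand-in for the HF trainer wrapper

# Hand the in-memory model to the trainer directly; after the full session
# reset below, the session no longer holds a model to fetch back with
# get_session_model().
trainer.model = model

# ... training for this stage would run here ...

# Setup for the next stage: reset *all* session state, not just the current
# stage, so recipe recompilation cannot re-initialize earlier stages' modifiers.
session = active_session()
session.reset()
```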
