From 865364f8a99c9d2e5004f585a8d6423caa8ac6a6 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 24 Feb 2025 15:04:49 -0500 Subject: [PATCH 01/23] [Oneshot Refactor] Main refactor (#1110) ORDER OF REVIEWS: 1. https://github.com/vllm-project/llm-compressor/pull/1108 2. https://github.com/vllm-project/llm-compressor/pull/1103 3. https://github.com/vllm-project/llm-compressor/pull/1109 4. https://github.com/vllm-project/llm-compressor/pull/1110 <- current PR SUMMARY: * Create a class to decouple the dependency on `main`. The `Oneshot` class consists of pre-processing, carrying out the oneshot logic, and post-processing * Move the oneshot class and method under `llmcompressor/entrypoints/oneshot.py` * Add a README in `/llmcompressor/entrypoints` with info on oneshot * Delete the oneshot logic from the `/finetune` directory and add a deprecation warning * Remove `apply`, used only for the stage-runner oneshot pathway, from session.py and session_functions.py * Add oneshot-only calibration dataloader logic * Add a return value of `model: PreTrainedModel` to `def oneshot` * Make the oneshot logic independent of `TrainingArguments` * Remove `overwrite_output_dir` as a oneshot input arg -> it is only used by `TrainingArguments` * Update the README on the `/finetune` path. Remove the `oneshot` and `oneshot with FSDP` sections * Update the `wrap_save_pretrained` logic to run only if the model is not already wrapped -> used by the stage runner to avoid double wrapping Entrypoints: ```python3 from llmcompressor import oneshot oneshot(**kwargs) # calls Oneshot ``` or ```python3 from llmcompressor import Oneshot oneshot = Oneshot(**kwargs) oneshot() # preprocesses, carries out the oneshot logic, and postprocesses ``` TEST PLAN: All tests and examples pass. Verified that https://github.com/vllm-project/llm-compressor/blob/main/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py works as expected. FOLLOW UPS: * Stage runner removal * Update the entrypoints folder with train, eval, predict, etc.
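For reviewers updating existing scripts, the user-facing changes reduce to the new import path, the removed `overwrite_output_dir` argument, and the returned model. A minimal migration sketch (illustrative only, not part of the patch; the model stub is borrowed from the new unit test and the recipe mirrors the FP8 example in the new entrypoints README):

```python
from transformers import AutoModelForCausalLM

# New import path (previously: from llmcompressor.transformers import oneshot)
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# `oneshot` now returns the optimized model; `overwrite_output_dir` is no longer accepted
model = oneshot(model=model, recipe=recipe, output_dir="./oneshot_model")
```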
--------- Signed-off-by: George Ohashi Signed-off-by: George --- README.md | 2 +- .../cpu_offloading_fp8.py | 2 +- .../mult_gpus_int8_device_map.py | 2 +- .../multi_gpu_int8.py | 2 +- examples/multimodal_audio/whisper_example.py | 2 +- .../multimodal_vision/idefics3_example.py | 2 +- examples/multimodal_vision/llava_example.py | 2 +- examples/multimodal_vision/mllama_example.py | 2 +- .../multimodal_vision/phi3_vision_example.py | 2 +- examples/multimodal_vision/pixtral_example.py | 2 +- .../multimodal_vision/qwen2_vl_example.py | 2 +- examples/quantization_kv_cache/README.md | 2 +- .../gemma2_fp8_kv_example.py | 2 +- .../llama3_fp8_kv_example.py | 2 +- .../phi3.5_fp8_kv_example.py | 2 +- examples/quantization_w4a16/README.md | 2 +- examples/quantization_w8a8_fp8/README.md | 2 +- .../quantization_w8a8_fp8/gemma2_example.py | 2 +- .../llama3.2_vision_example.py | 2 +- .../quantization_w8a8_fp8/llama3_example.py | 2 +- .../quantization_w8a8_fp8/llava1.5_example.py | 2 +- .../quantization_w8a8_fp8/qwen2vl_example.py | 2 +- .../quantization_w8a8_fp8/whisper_example.py | 2 +- examples/quantization_w8a8_int8/README.md | 2 +- .../quantization_w8a8_int8/gemma2_example.py | 2 +- .../quantization_w8a8_int8/llama3_example.py | 2 +- examples/quantizing_moe/README.md | 4 +- examples/quantizing_moe/deepseek_moe_w4a16.py | 2 +- .../quantizing_moe/deepseek_moe_w8a8_fp8.py | 2 +- .../quantizing_moe/deepseek_moe_w8a8_int8.py | 2 +- .../quantizing_moe/mixtral_moe_w8a8_fp8.py | 3 +- .../llama3_8b_2of4.py | 2 +- src/llmcompressor/__init__.py | 2 +- src/llmcompressor/args/README.md | 4 +- src/llmcompressor/core/__init__.py | 1 - src/llmcompressor/core/session.py | 13 - src/llmcompressor/core/session_functions.py | 57 --- src/llmcompressor/entrypoints/README.md | 85 ++++ src/llmcompressor/entrypoints/__init__.py | 2 + src/llmcompressor/entrypoints/oneshot.py | 373 ++++++++++++++++++ .../transformers/finetune/README.md | 45 --- .../finetune/data/data_helpers.py | 76 ++++ .../transformers/finetune/runner.py | 70 ++-- .../transformers/finetune/session_mixin.py | 27 +- .../transformers/finetune/text_generation.py | 26 +- .../compressed_tensors_utils.py | 5 +- .../transformers/utils/helpers.py | 9 + tests/e2e/e2e_utils.py | 2 +- .../llmcompressor/entrypoints/test_oneshot.py | 35 ++ .../compression/test_quantization.py | 9 +- .../test_finetune_no_recipe_custom_dataset.py | 1 - .../test_finetune_oneshot_with_modifier.py | 2 +- .../finetune/test_oneshot_then_finetune.py | 3 +- .../transformers/gptq/test_oneshot.py | 3 +- .../transformers/kv_cache/test_kv_cache.py | 2 +- .../obcq/test_consecutive_runs.py | 2 +- .../obcq/test_mask_structure_preservation.py | 2 +- .../transformers/obcq/test_obcq_completion.py | 3 +- .../transformers/obcq/test_obcq_sparsity.py | 15 +- .../transformers/oneshot/test_api_inputs.py | 2 +- .../test_compress_tensor_utils.py | 7 +- 61 files changed, 680 insertions(+), 270 deletions(-) create mode 100644 src/llmcompressor/entrypoints/README.md create mode 100644 src/llmcompressor/entrypoints/__init__.py create mode 100644 src/llmcompressor/entrypoints/oneshot.py create mode 100644 tests/llmcompressor/entrypoints/test_oneshot.py diff --git a/README.md b/README.md index ffc651da8..e61e2a49e 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API. 
```python from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot # Select quantization algorithm. In this case, we: # * apply SmoothQuant to make the activations easier to quantize diff --git a/examples/big_models_with_accelerate/cpu_offloading_fp8.py b/examples/big_models_with_accelerate/cpu_offloading_fp8.py index b5135af5c..248759ba4 100644 --- a/examples/big_models_with_accelerate/cpu_offloading_fp8.py +++ b/examples/big_models_with_accelerate/cpu_offloading_fp8.py @@ -1,7 +1,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" diff --git a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py index 470ed1773..6819a7e97 100644 --- a/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py +++ b/examples/big_models_with_accelerate/mult_gpus_int8_device_map.py @@ -2,9 +2,9 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" diff --git a/examples/big_models_with_accelerate/multi_gpu_int8.py b/examples/big_models_with_accelerate/multi_gpu_int8.py index 50d0bea08..0a19b92e3 100644 --- a/examples/big_models_with_accelerate/multi_gpu_int8.py +++ b/examples/big_models_with_accelerate/multi_gpu_int8.py @@ -1,8 +1,8 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct" SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic" diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py index 4f2928dca..cd2cdd373 100644 --- a/examples/multimodal_audio/whisper_example.py +++ b/examples/multimodal_audio/whisper_example.py @@ -2,8 +2,8 @@ from datasets import load_dataset from transformers import WhisperProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration # Select model and load it. diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py index b9c0e6aaa..2d2148ec5 100644 --- a/examples/multimodal_vision/idefics3_example.py +++ b/examples/multimodal_vision/idefics3_example.py @@ -4,8 +4,8 @@ from PIL import Image from transformers import AutoProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration # Load model. 
diff --git a/examples/multimodal_vision/llava_example.py b/examples/multimodal_vision/llava_example.py index fab342cb2..a84112a43 100644 --- a/examples/multimodal_vision/llava_example.py +++ b/examples/multimodal_vision/llava_example.py @@ -3,8 +3,8 @@ from PIL import Image from transformers import AutoProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration # Load model. diff --git a/examples/multimodal_vision/mllama_example.py b/examples/multimodal_vision/mllama_example.py index 6ad3638a3..1777d7689 100644 --- a/examples/multimodal_vision/mllama_example.py +++ b/examples/multimodal_vision/mllama_example.py @@ -3,8 +3,8 @@ from PIL import Image from transformers import AutoProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.tracing import TraceableMllamaForConditionalGeneration # Load model. diff --git a/examples/multimodal_vision/phi3_vision_example.py b/examples/multimodal_vision/phi3_vision_example.py index fa9e7fdb2..c90f0e240 100644 --- a/examples/multimodal_vision/phi3_vision_example.py +++ b/examples/multimodal_vision/phi3_vision_example.py @@ -5,8 +5,8 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot # Load model. model_id = "microsoft/Phi-3-vision-128k-instruct" diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index 891819bc6..b67d9af56 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -3,8 +3,8 @@ from PIL import Image from transformers import AutoProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration # Load model. diff --git a/examples/multimodal_vision/qwen2_vl_example.py b/examples/multimodal_vision/qwen2_vl_example.py index 7cd014d36..c722e39ae 100644 --- a/examples/multimodal_vision/qwen2_vl_example.py +++ b/examples/multimodal_vision/qwen2_vl_example.py @@ -6,8 +6,8 @@ from qwen_vl_utils import process_vision_info from transformers import AutoProcessor +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.tracing import TraceableQwen2VLForConditionalGeneration # Load model. diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md index 906990c21..cdf2be611 100644 --- a/examples/quantization_kv_cache/README.md +++ b/examples/quantization_kv_cache/README.md @@ -75,7 +75,7 @@ Configure and apply the FP8 quantization for weights, activations, and KV cache. 
Notice the new `kv_cache_scheme` section: ```python -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot recipe = """ quant_stage: diff --git a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py index c90c42757..4290af34f 100644 --- a/examples/quantization_kv_cache/gemma2_fp8_kv_example.py +++ b/examples/quantization_kv_cache/gemma2_fp8_kv_example.py @@ -1,7 +1,7 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot # Select model and load it. MODEL_ID = "google/gemma-2-9b-it" diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 6c08d4acc..72872b913 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -2,7 +2,7 @@ from loguru import logger from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py index 4221f290b..2f49f7380 100644 --- a/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py +++ b/examples/quantization_kv_cache/phi3.5_fp8_kv_example.py @@ -1,7 +1,7 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot # Select model and load it. # Phi-3.5 is a special case for KV cache quantization because it has diff --git a/examples/quantization_w4a16/README.md b/examples/quantization_w4a16/README.md index 718975331..7e7f482be 100644 --- a/examples/quantization_w4a16/README.md +++ b/examples/quantization_w4a16/README.md @@ -86,7 +86,7 @@ In our case, we will apply the default GPTQ recipe for `int4` (which uses static > See the `Recipes` documentation for more information on making complex recipes ```python -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier # Configure the quantization algorithm to run. diff --git a/examples/quantization_w8a8_fp8/README.md b/examples/quantization_w8a8_fp8/README.md index 091946623..2b817ba1e 100644 --- a/examples/quantization_w8a8_fp8/README.md +++ b/examples/quantization_w8a8_fp8/README.md @@ -54,7 +54,7 @@ We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which Since simple PTQ does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. 
```python -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier # Configure the simple PTQ quantization diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py index 5b1b2ae79..77664f2d5 100644 --- a/examples/quantization_w8a8_fp8/gemma2_example.py +++ b/examples/quantization_w8a8_fp8/gemma2_example.py @@ -1,7 +1,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "google/gemma-2-27b-it" diff --git a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py index d6ea7b363..c99d0bfcc 100644 --- a/examples/quantization_w8a8_fp8/llama3.2_vision_example.py +++ b/examples/quantization_w8a8_fp8/llama3.2_vision_example.py @@ -1,7 +1,7 @@ from transformers import AutoProcessor, MllamaForConditionalGeneration +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" diff --git a/examples/quantization_w8a8_fp8/llama3_example.py b/examples/quantization_w8a8_fp8/llama3_example.py index 6dc870b32..a66200239 100644 --- a/examples/quantization_w8a8_fp8/llama3_example.py +++ b/examples/quantization_w8a8_fp8/llama3_example.py @@ -1,7 +1,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantization_w8a8_fp8/llava1.5_example.py b/examples/quantization_w8a8_fp8/llava1.5_example.py index 6b3a721a1..31cb4cb94 100644 --- a/examples/quantization_w8a8_fp8/llava1.5_example.py +++ b/examples/quantization_w8a8_fp8/llava1.5_example.py @@ -1,7 +1,7 @@ from transformers import AutoProcessor, LlavaForConditionalGeneration +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "llava-hf/llava-1.5-7b-hf" diff --git a/examples/quantization_w8a8_fp8/qwen2vl_example.py b/examples/quantization_w8a8_fp8/qwen2vl_example.py index ab7e4f682..564fc6644 100644 --- a/examples/quantization_w8a8_fp8/qwen2vl_example.py +++ b/examples/quantization_w8a8_fp8/qwen2vl_example.py @@ -1,7 +1,7 @@ from transformers import AutoProcessor, Qwen2VLForConditionalGeneration +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct" diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index df18b0d11..cadcc6e8c 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -1,8 +1,8 @@ from datasets import load_dataset from transformers import AutoProcessor, WhisperForConditionalGeneration +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot MODEL_ID = "openai/whisper-large-v2" diff --git 
a/examples/quantization_w8a8_int8/README.md b/examples/quantization_w8a8_int8/README.md index a7e15c330..02aab76c7 100644 --- a/examples/quantization_w8a8_int8/README.md +++ b/examples/quantization_w8a8_int8/README.md @@ -86,7 +86,7 @@ We first select the quantization algorithm. For W8A8, we want to: > See the `Recipes` documentation for more information on recipes ```python -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/examples/quantization_w8a8_int8/gemma2_example.py b/examples/quantization_w8a8_int8/gemma2_example.py index 456c5ec45..fd3339d3c 100644 --- a/examples/quantization_w8a8_int8/gemma2_example.py +++ b/examples/quantization_w8a8_int8/gemma2_example.py @@ -1,8 +1,8 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot # 1) Select model and load it. MODEL_ID = "google/gemma-2-2b-it" diff --git a/examples/quantization_w8a8_int8/llama3_example.py b/examples/quantization_w8a8_int8/llama3_example.py index a97ed3198..5d8a605e9 100644 --- a/examples/quantization_w8a8_int8/llama3_example.py +++ b/examples/quantization_w8a8_int8/llama3_example.py @@ -1,9 +1,9 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier -from llmcompressor.transformers import oneshot # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/examples/quantizing_moe/README.md b/examples/quantizing_moe/README.md index 0bb6bf007..70243caf1 100644 --- a/examples/quantizing_moe/README.md +++ b/examples/quantizing_moe/README.md @@ -51,7 +51,7 @@ NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are igno The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`. ```python -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8" @@ -61,7 +61,7 @@ oneshot( recipe=recipe, save_compressed=True, output_dir=output_dir, - overwrite_output_dir=True, + max_seq_length=2048, num_calibration_samples=512, ) diff --git a/examples/quantizing_moe/deepseek_moe_w4a16.py b/examples/quantizing_moe/deepseek_moe_w4a16.py index 26f3c55d6..7b3a8c2c9 100644 --- a/examples/quantizing_moe/deepseek_moe_w4a16.py +++ b/examples/quantizing_moe/deepseek_moe_w4a16.py @@ -2,7 +2,7 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. 
diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py index a6e7d2c38..55ff7593c 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_fp8.py @@ -1,8 +1,8 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. # Please consider either downgrading your transformers version to a diff --git a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py index 3a5a3c8f7..05c722795 100644 --- a/examples/quantizing_moe/deepseek_moe_w8a8_int8.py +++ b/examples/quantizing_moe/deepseek_moe_w8a8_int8.py @@ -2,8 +2,8 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map # NOTE: transformers 4.49.0 has an attribute error with DeepSeek. diff --git a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py index bbbde067e..eb8ced090 100644 --- a/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py +++ b/examples/quantizing_moe/mixtral_moe_w8a8_fp8.py @@ -2,8 +2,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.helpers import calculate_offload_device_map MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1" @@ -45,7 +45,6 @@ max_seq_length=MAX_SEQ_LENGTH, num_calibration_samples=NUM_CALIBRATION_SAMPLES, save_compressed=SAVE_COMPRESSED, - overwrite_output_dir=True, output_dir=SAVE_DIR, ) diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index 85cb38867..f18585e3e 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -3,10 +3,10 @@ from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.obcq import SparseGPTModifier from llmcompressor.modifiers.pruning import ConstantPruningModifier from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot # Configuration MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" diff --git a/src/llmcompressor/__init__.py b/src/llmcompressor/__init__.py index 264d434f0..6f174a59e 100644 --- a/src/llmcompressor/__init__.py +++ b/src/llmcompressor/__init__.py @@ -36,7 +36,6 @@ from llmcompressor.core.session_functions import ( active_session, - apply, callbacks, create_session, finalize, @@ -44,3 +43,4 @@ pre_initialize_structure, reset_session, ) +from llmcompressor.entrypoints import Oneshot, oneshot diff --git a/src/llmcompressor/args/README.md b/src/llmcompressor/args/README.md index 43697b17a..4691a615c 100644 --- a/src/llmcompressor/args/README.md +++ b/src/llmcompressor/args/README.md @@ -5,7 +5,7 @@ Parsers in `llm-compressor` define the 
input arguments required for various entr Each entry point (e.g., oneshot) carries out its logic based on the provided input arguments, `model`, `recipe`, and `dataset`. ```python -from llmcompressor.transformers import oneshot +from llmcompressor import oneshot model = ... recipe = ... @@ -24,7 +24,7 @@ oneshot( These input arguments can be overloaded into the function signature and will be parsed using Hugging Face's [argument parser](https://github.com/huggingface/transformers/blob/main/src/transformers/hf_argparser.py). The parsers define the acceptable inputs; therefore any arguments to be passed in must be defined. -`llm-compressor` uses four parsers, located in `llm_compressor/arg_parser`: +`llm-compressor` uses four parsers, located in `llm_compressor/args`: * ModelArguments * DatasetArguments * RecipeArguments diff --git a/src/llmcompressor/core/__init__.py b/src/llmcompressor/core/__init__.py index 171c95395..75335164d 100644 --- a/src/llmcompressor/core/__init__.py +++ b/src/llmcompressor/core/__init__.py @@ -11,7 +11,6 @@ from llmcompressor.core.session_functions import ( LifecycleCallbacks, active_session, - apply, callbacks, create_session, finalize, diff --git a/src/llmcompressor/core/session.py b/src/llmcompressor/core/session.py index 7c489f36f..888db3f1e 100644 --- a/src/llmcompressor/core/session.py +++ b/src/llmcompressor/core/session.py @@ -200,19 +200,6 @@ def finalize(self, **kwargs) -> ModifiedState: modifier_data=mod_data, ) - def apply(self, **kwargs): - """ - Apply the recipe in one-shot manner. This will invoke the initialize - and then finalize methods for each modifier in the session's lifecycle. - This will also set the session's state to the finalized state. - - :param kwargs: additional kwargs to pass to the lifecycle's initialize and - finalize methods - """ - self.initialize(**kwargs) - - return self.finalize(**kwargs) - def event( self, event_type: EventType, diff --git a/src/llmcompressor/core/session_functions.py b/src/llmcompressor/core/session_functions.py index 9a123a030..da54872c4 100644 --- a/src/llmcompressor/core/session_functions.py +++ b/src/llmcompressor/core/session_functions.py @@ -14,7 +14,6 @@ "pre_initialize_structure", "initialize", "finalize", - "apply", "callbacks", "LifecycleCallbacks", ] @@ -143,62 +142,6 @@ def finalize(**kwargs) -> ModifiedState: return active_session().finalize(**kwargs) -def apply( - recipe: Union[str, List[str], "Recipe", List["Recipe"], None] = None, - recipe_stage: Union[str, List[str], None] = None, - recipe_args: Optional[Dict[str, Any]] = None, - model: Optional[Any] = None, - teacher_model: Optional[Any] = None, - train_data: Optional[Any] = None, - val_data: Optional[Any] = None, - test_data: Optional[Any] = None, - calib_data: Optional[Any] = None, - copy_data: bool = True, - start: Optional[float] = None, - steps_per_epoch: Optional[int] = None, - batches_per_step: Optional[int] = None, - **kwargs, -) -> ModifiedState: - """ - A method to apply the recipe in one-shot manner. This will invoke the initialize - and then finalize methods for each modifier in the active session's lifecycle. - - :param recipe: the recipe to use for the sparsification, can be a path to a - recipe file, a raw recipe string, a recipe object, or a list of recipe objects. 
- :param recipe_stage: the stage to target for the sparsification - :param recipe_args: the args to use for overriding the recipe defaults - :param model: the model to sparsify - :param teacher_model: the teacher model to use for knowledge distillation - :param train_data: the training data to use for the sparsification - :param val_data: the validation data to use for the sparsification - :param test_data: the testing data to use for the sparsification - :param calib_data: the calibration data to use for the sparsification - :param copy_data: True to copy the data, False otherwise - :param start: the start epoch to use for the sparsification - :param steps_per_epoch: the number of steps per epoch to use for the - sparsification - :param batches_per_step: the number of batches per step to use for - :param kwargs: additional kwargs to pass to the current session's apply method - :return: the modified state of the active session after applying the recipe - """ - return active_session().apply( - recipe=recipe, - recipe_stage=recipe_stage, - recipe_args=recipe_args, - model=model, - teacher_model=teacher_model, - train_data=train_data, - val_data=val_data, - test_data=test_data, - calib_data=calib_data, - copy_data=copy_data, - start=start, - steps_per_epoch=steps_per_epoch, - batches_per_step=batches_per_step, - **kwargs, - ) - - class LifecycleCallbacks: """ A class for invoking lifecycle events for the active session diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md new file mode 100644 index 000000000..85cb0ce2f --- /dev/null +++ b/src/llmcompressor/entrypoints/README.md @@ -0,0 +1,85 @@ +# LLM Compressor Entrypoints + +## Oneshot + +Model optimizations compress models while preserving accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification + +### PTQ +PTQ is performed to reduce the precision of quantizable weights (e.g., linear layers) to a lower bit-width. Supported formats are: +- W4A16 +- W8A8-INT8 +- W8A8-FP8 + +### Sparsification +Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. Supported formats include: +- 2:4-Sparsity with FP8 Weight, FP8 Input Activation + + +## Code + +Example scripts for all the above formats are located in the [examples](../../../examples/) folder. A [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] +) + +oneshot(model=model, recipe=recipe) +``` + +### Input Arguments +`oneshot` only accepts arguments defined in `src/llmcompressor/args`, which are dataclasses categorized into [`ModelArguments`](../../llmcompressor/args/model_arguments.py), [`DatasetArguments`](../../llmcompressor/args/dataset_arguments.py) and [`RecipeArguments`](../../llmcompressor/args/recipe_arguments.py). If an undefined input argument is provided, an error will be raised. 
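As a quick illustration of this grouping (a sketch, not part of the patch; the dataset stub and recipe path are placeholders), keyword arguments passed to `oneshot` are routed into the three dataclasses with Hugging Face's `HfArgumentParser`, mirroring `parse_oneshot_args` in `src/llmcompressor/entrypoints/oneshot.py`:

```python
from transformers import HfArgumentParser

from llmcompressor.args import DatasetArguments, ModelArguments, RecipeArguments

# Group a flat kwargs dict into the three argument dataclasses.
parser = HfArgumentParser((ModelArguments, DatasetArguments, RecipeArguments))
model_args, data_args, recipe_args = parser.parse_dict(
    {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # -> ModelArguments
        "dataset": "HuggingFaceH4/ultrachat_200k",  # -> DatasetArguments
        "num_calibration_samples": 512,  # -> DatasetArguments
        "recipe": "recipe.yaml",  # -> RecipeArguments
    }
)
# Passing a key that is not defined in any of the dataclasses raises an error,
# matching the behavior described above.
```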
+ +The high-level description of the argument parser is as follows: + +- `ModelArguments`: Arguments for loading and configuring a pretrained model + (e.g., `AutoModelForCausalLM`). +- `DatasetArguments`: Arguments for dataset-related configurations, such as + calibration dataloaders. +- `RecipeArguments`: Arguments for defining and configuring recipes that specify + optimization actions. + +For more information, please check the [README.md](../../llmcompressor/args/README.md) in `src/llmcompressor/args`. + + +### Lifecycle + +The oneshot calibration lifecycle consists of three steps: +1. **Preprocessing**: + - Instantiates a pretrained model and tokenizer/processor. + - Ensures input and output embedding layers are untied if they share + tensors. + - Patches the model to include additional functionality for saving with + quantization configurations. +2. **Oneshot Calibration**: + - Optimizes the model based on the recipe (instructions for optimizing the model). The + recipe defines the `Modifiers` (e.g., `GPTQModifier`, `SparseGPTModifier`) to apply, which + contain the logic for how to quantize or sparsify a model. +3. **Postprocessing**: + - Saves the model, tokenizer/processor, and configuration to the specified + `output_dir`. + +### Saving an Optimized Model + +To save an optimized model, the recommended approach is to specify `output_dir` as an input argument. For example, to save the model in the `./oneshot_model` directory, + +```python3 +oneshot( + ..., + output_dir="./oneshot_model", +) +``` + +This will automatically save the model in the SafeTensors format, along with the tokenizer/processor, recipe, and the configuration file. diff --git a/src/llmcompressor/entrypoints/__init__.py b/src/llmcompressor/entrypoints/__init__.py new file mode 100644 index 000000000..dd1d4aa83 --- /dev/null +++ b/src/llmcompressor/entrypoints/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from .oneshot import Oneshot, oneshot diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py new file mode 100644 index 000000000..9c57b423a --- /dev/null +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -0,0 +1,373 @@ +from pathlib import PosixPath +from typing import Optional, Tuple + +from loguru import logger +from torch.utils.data import DataLoader +from transformers import HfArgumentParser, PreTrainedModel + +from llmcompressor.args import DatasetArguments, ModelArguments, RecipeArguments +from llmcompressor.core.session_functions import active_session +from llmcompressor.transformers.finetune.data.data_helpers import ( + get_calibration_dataloader, +) +from llmcompressor.transformers.finetune.text_generation import ( + initialize_model_from_path, + initialize_processor_from_path, +) +from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( + modify_save_pretrained, + patch_tied_tensors_bug, +) +from llmcompressor.transformers.utils.helpers import resolve_processor_from_model_args + +__all__ = ["Oneshot", "oneshot", "parse_oneshot_args"] + + +class Oneshot: + """ + Class responsible for carrying out one-shot calibration on a pretrained model. + + This class handles the entire lifecycle of one-shot calibration, including + preprocessing (model and tokenizer/processor initialization), model optimization + (quantization or sparsification), and postprocessing (saving outputs). The + instructions for model optimization can be specified by using a recipe.
+ + - **Input Keyword Arguments:** + `kwargs` are parsed into: + - `model_args`: Arguments for loading and configuring a pretrained model + (e.g., `AutoModelForCausalLM`). + - `data_args`: Arguments for dataset-related configurations, such as + calibration dataloaders. + - `recipe_args`: Arguments for defining and configuring recipes that specify + optimization actions. + + Parsers are defined in `src/llmcompressor/args/`. + + - **Lifecycle Overview:** + The oneshot calibration lifecycle consists of three steps: + 1. **Preprocessing**: + - Instantiates a pretrained model and tokenizer/processor. + - Ensures input and output embedding layers are untied if they share + tensors. + - Patches the model to include additional functionality for saving with + quantization configurations. + 2. **Oneshot Calibration**: + - Optimizes the model using a global `CompressionSession` and applies + recipe-defined modifiers (e.g., `GPTQModifier`, `SparseGPTModifier`) + 3. **Postprocessing**: + - Saves the model, tokenizer/processor, and configuration to the specified + `output_dir`. + + - **Usage:** + ```python + oneshot = Oneshot(model=model, recipe=recipe, dataset=dataset) + oneshot() + + # Access the processed components + model = oneshot.model + processor = oneshot.processor + recipe = oneshot.recipe + ``` + + Methods: + __init__(**kwargs): + Initializes the `Oneshot` object by parsing input arguments, performing + preprocessing, and setting instance attributes. + + run(**kwargs): + Performs the one-shot calibration process by preparing a calibration + dataloader, applying recipe modifiers to the model, and executing + postprocessing steps. + + save(): + Saves the calibrated model and tokenizer/processor to the specified + `output_dir`. Supports saving in compressed formats based on model + arguments. + + apply_recipe_modifiers(calibration_dataloader, **kwargs): + Applies lifecycle actions (e.g., `initialize`, `finalize`) using modifiers + defined in the recipe. Each action is executed via the global + `CompressionSession`. + + _pre_process(): + Handles preprocessing steps, including model initialization, + tokenizer/processor setup, and resolving tied embedding issues. + + check_tied_embeddings(): + Logs a warning if `tie_word_embeddings=True`, which may interfere with + saving in the one-shot workflow. + + _post_process(): + Executes postprocessing steps such as saving the model and resetting + lifecycle actions, especially when a custom `output_dir` is specified. + """ + + def __init__( + self, + **kwargs, + ): + """ + Initializes the `Oneshot` class with provided arguments. + + Parses the input keyword arguments into `model_args`, `data_args`, and + `recipe_args`. Performs preprocessing to initialize the model and + tokenizer/processor. 
+ + :param model_args: ModelArguments parameters, responsible for controlling + model loading and saving logic + :param data_args: DatasetArguments parameters, responsible for controlling + dataset loading, preprocessing and dataloader loading + :param recipe_args: RecipeArguments parameters, responsible for containing + recipe-related parameters + :param output_dir: Path to save the output model after carrying out oneshot + + """ + + model_args, data_args, recipe_args, output_dir = parse_oneshot_args(**kwargs) + + self.model_args = model_args + self.data_args = data_args + self.recipe_args = recipe_args + self.output_dir = output_dir + + # Set instance attributes + self.model = self.model_args.model + self.processor = self.model_args.processor + self.recipe = self.recipe_args.recipe + + @classmethod + def from_args( + cls, model_args, data_args, recipe_args, output_dir, do_preprocess: bool = True + ): + """ + Used only for the stage runner to populate the args. + """ + instance = super().__new__(cls) + instance.model_args = model_args + instance.data_args = data_args + instance.recipe_args = recipe_args + instance.output_dir = output_dir + + # only run for the first oneshot call + if do_preprocess: + instance._pre_process() + + # Set instance attributes + instance.model = instance.model_args.model + instance.recipe = instance.recipe_args.recipe + instance.processor = instance.model_args.processor + + return instance + + def __call__(self): + """ + Performs one-shot calibration. + + This method prepares a calibration dataloader using dataset arguments and + applies recipe-based modifiers to optimize the model. The lifecycle actions + are executed sequentially, and the modified model is saved during + postprocessing. + + """ + # TODO: move back once stage runner is removed + # Preprocess the model and tokenizer/processor + self._pre_process() + self.model = self.model_args.model + self.recipe = self.recipe_args.recipe + self.processor = self.model_args.processor + + calibration_dataloader = get_calibration_dataloader( + self.data_args, self.processor + ) + self.apply_recipe_modifiers( + calibration_dataloader=calibration_dataloader, + ) + self._post_process() + + def save(self): + """ + Saves the model and tokenizer/processor to the output directory. + + The model is saved in a compressed format if specified in `model_args`. + The tokenizer or processor, if available, is also saved. + + Raises: + ValueError: If saving fails due to an invalid `output_dir` or other issues. + """ + self.model.save_pretrained( + self.output_dir, + save_compressed=self.model_args.save_compressed, + ) + if self.processor is not None: + self.processor.save_pretrained(self.output_dir) + + def apply_recipe_modifiers( + self, + calibration_dataloader: Optional[DataLoader], + recipe_stage: Optional[str] = None, + ): + """ + Applies recipe modifiers to the model during the lifecycle. + + The modifiers are defined in the recipe and executed via lifecycle actions + (`initialize`, `finalize`) through the global `CompressionSession`. + + + :param: calibration_dataloader: Dataloader for calibration data. + + Raises: + RuntimeError: If any modifier fails during execution. 
+ """ + + session = active_session() + + session_kwargs = dict( + model=self.model, + recipe=self.recipe, + recipe_args=self.recipe_args.recipe_args, + calib_data=calibration_dataloader, + start=-1, # oneshot-specific argument + copy_data=False, + min_tokens_per_module=getattr(self, "min_tokens_per_module", None), + recipe_stage=recipe_stage, + ) + + session.initialize(**session_kwargs) + session.finalize(**session_kwargs) + + def _pre_process(self): + """ + Prepares the model and tokenizer/processor for calibration. + + - Initializes the model if it's specified as a path or string. + - Applies patches to fix tied tensor issues and modifies `save_pretrained` + behavior. + - Initializes the processor if specified as a path or `None`. + - Sets the minimum tokens per module if `data_args` are provided. + + Raises: + FileNotFoundError: If the model or processor path is invalid. + """ + self.check_tied_embeddings() + + # Initialize model + if isinstance(self.model_args.model, (str, PosixPath)): + self.model_args.model, _ = initialize_model_from_path(self.model_args) + + patch_tied_tensors_bug(self.model_args.model) + modify_save_pretrained(self.model_args.model) + + # Initialize processor + if isinstance(self.model_args.processor, (str, type(None))): + self.model_args.processor = initialize_processor_from_path( + self.model_args, self.model_args.model + ) + # TODO: move to init once stage runner is removed + self.processor = self.model_args.processor + + # Set minimum tokens per module if data arguments are provided + if self.data_args: + self.min_tokens_per_module = self.data_args.min_tokens_per_module + + def check_tied_embeddings(self): + """ + Logs a warning if the model has tied word embeddings. + + The `tie_word_embeddings` flag may cause issues during saving in the one-shot + calibration workflow due to shared tensor addresses. + """ + if self.model_args.tie_word_embeddings: + logger.debug( + "The tie_word_embeddings flag is by default set to False. " + "This guarantees that the one-shot algorithm saves the final " + "weights without errors. Detected tie_word_embeddings=True. " + "This may cause issues with the one-shot algorithm on save." + ) + + def _post_process(self): + """ + Executes post-calibration steps. + + This method saves the model and resets lifecycle actions if the `output_dir` + is not the default directory. + + Raises: + ValueError: If saving fails due to invalid configurations. + """ + if self.output_dir is not None: + self.save() + return + + logger.warning( + "Optimized model not saved. To save, please provide", + "`output_dir` as input arg.", + "Ex. 
`oneshot(..., output_dir=...)`", + ) + + +def oneshot(**kwargs) -> PreTrainedModel: + one_shot = Oneshot(**kwargs) + one_shot() + + return one_shot.model + + +def parse_oneshot_args( + **kwargs, +) -> Tuple[ModelArguments, DatasetArguments, RecipeArguments, str]: + """ + Parses kwargs by grouping into model, data, or recipe arg groups: + * model_args in + src/llmcompressor/args/model_arguments.py + * data_args in + src/llmcompressor/args/dataset_arguments.py + * recipe_args in + src/llmcompressor/args/recipe_arguments.py + """ + output_dir = kwargs.pop("output_dir", None) + + parser = HfArgumentParser((ModelArguments, DatasetArguments, RecipeArguments)) + + if not kwargs: + + def _get_output_dir_from_argv() -> Optional[str]: + import sys + + output_dir = None + if "--output_dir" in sys.argv: + index = sys.argv.index("--output_dir") + sys.argv.pop(index) + if index < len(sys.argv): # Check if value exists after the flag + output_dir = sys.argv.pop(index) + + return output_dir + + output_dir = _get_output_dir_from_argv() or output_dir + parsed_args = parser.parse_args_into_dataclasses() + else: + parsed_args = parser.parse_dict(kwargs) + + model_args, data_args, recipe_args = parsed_args + + if recipe_args.recipe_args is not None: + if not isinstance(recipe_args.recipe_args, dict): + arg_dict = {} + for recipe_arg in recipe_args.recipe_args: + key, value = recipe_arg.split("=") + arg_dict[key] = value + recipe_args.recipe_args = arg_dict + + # raise deprecation warnings + if data_args.remove_columns is not None: + logger.warning( + "`remove_columns` argument is deprecated. When tokenizing datasets, all " + "columns which are invalid inputs to the tokenizer will be removed" + ) + + # silently assign tokenizer to processor + resolve_processor_from_model_args(model_args) + + return model_args, data_args, recipe_args, output_dir diff --git a/src/llmcompressor/transformers/finetune/README.md b/src/llmcompressor/transformers/finetune/README.md index 453fb91cb..e1312b799 100644 --- a/src/llmcompressor/transformers/finetune/README.md +++ b/src/llmcompressor/transformers/finetune/README.md @@ -80,51 +80,6 @@ Finetuning arguments are split up into 3 groups: * RecipeArguments: `src/llmcompressor/transformers/utils/arg_parser/recipe_arguments.py` -## Running One-Shot with FSDP -```bash -accelerate launch - --config_file example_fsdp_config.yaml - --no_python llmcompressor.transformers.text_generation.oneshot - --model PATH_TO_MODEL - --num_calibration_samples 512 - --dataset DATASET_NAME - --dataset_config_name OPTIONAL - --max_seq_len OPTIONAL - --concatenate_data OPTIONAL - --recipe PATH_TO_RECIPE - --output_dir PATH_TO_OUTPUT - --splits "train" - --pad_to_max_length False -``` - - -## Running One-shot from Python (without FSDP) -```python -from llmcompressor.transformers import oneshot - -model ="Xenova/llama2.c-stories15M" -dataset_name = "open_platypus" -concatenate_data = False -pad_to_max_length = False -output_dir = "./output_oneshot" -recipe = "test_oneshot_recipe.yaml" -overwrite_output_dir = True -splits = { - "calibration": "train[:20%]" -} - -oneshot( - model=model, - dataset=dataset_name, - concatenate_data=concatenate_data, - output_dir=output_dir, - recipe=recipe, - overwrite_output_dir=overwrite_output_dir, - pad_to_max_length = pad_to_max_length, - splits = splits -) -``` - ## Running Multi-Stage Recipes A recipe can be
run stage-by-stage by setting `run_stages` to `True` or calling the diff --git a/src/llmcompressor/transformers/finetune/data/data_helpers.py b/src/llmcompressor/transformers/finetune/data/data_helpers.py index 23c70e561..d8a06a7a9 100644 --- a/src/llmcompressor/transformers/finetune/data/data_helpers.py +++ b/src/llmcompressor/transformers/finetune/data/data_helpers.py @@ -1,9 +1,11 @@ import logging import os +import re from typing import Any, Callable, Dict, List, Optional import torch from datasets import Dataset, load_dataset +from loguru import logger from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from transformers.data import default_data_collator @@ -15,6 +17,7 @@ "get_raw_dataset", "make_dataset_splits", "get_custom_datasets_from_path", + "get_calibration_dataloader", ] @@ -243,3 +246,76 @@ def do_transform(candidate: str) -> bool: transform_dataset_key(dataset_key) return data_files + + +def get_calibration_dataloader( + data_args, + processor, + add_labels: bool = False, # for oneshot + do_oneshot=True, +) -> torch.utils.data.DataLoader: + """ + Loads and tokenizes the calibration dataset specified in data_args and + returns a dataloader of calibration data + + :param data_args: dataset arguments controlling which calibration dataset + is loaded and how it is preprocessed + :param processor: processor or tokenizer to use for dataset tokenization + :param add_labels: if True, add labels column to dataset splits + :return: dataloader of calibration data, or None if no dataset is provided + """ + if data_args.dataset is None: + logger.info( + "Running oneshot without calibration data. This is expected for " + "weight-only and dynamic quantization" + ) + return + + splits = data_args.splits + tokenized_datasets = {} + + def _get_split_name(inp_str): + # strip out split name, for ex train[60%:] -> train + match = re.match(r"(\w*)\[.*\]", inp_str) + if match is not None: + return match.group(1) + return inp_str + + if splits is None: + splits = {"all": None} + elif isinstance(splits, str): + splits = {_get_split_name(splits): splits} + elif isinstance(splits, List): + splits = {_get_split_name(s): s for s in splits} + + # default to custom dataset if dataset provided isn't a string + registry_id = data_args.dataset if isinstance(data_args.dataset, str) else "custom" + for split_name, split_str in splits.items(): + dataset = data_args.dataset + if hasattr(dataset, "column_names") and "input_ids" in dataset.column_names: + # dataset is already tokenized + tokenized_datasets[split_name] = dataset + else: + # dataset needs to be tokenized + from llmcompressor.transformers.finetune.data.base import ( + TextGenerationDataset, + ) + + dataset_manager = TextGenerationDataset.load_from_registry( + registry_id, + data_args=data_args, + split=split_str, + processor=processor, + ) + tokenized_datasets[split_name] = dataset_manager(add_labels=add_labels) + + datasets = make_dataset_splits( + tokenized_datasets, + do_oneshot=do_oneshot, + ) + + calibration_dataset = datasets.get("calibration") + + return format_calibration_data( + tokenized_dataset=calibration_dataset, + num_calibration_samples=data_args.num_calibration_samples, + do_shuffle=data_args.shuffle_calibration_samples, + collate_fn=data_args.data_collator, + ) diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index 769b84248..ab7542229 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -19,7 +19,6 @@ get_session_model, save_completed_stages, ) -from llmcompressor.pytorch.utils import tensors_to_device from llmcompressor.recipe import Recipe, StageRunType from
llmcompressor.transformers.finetune.data import TextGenerationDataset from llmcompressor.transformers.finetune.data.data_helpers import ( @@ -27,7 +26,7 @@ make_dataset_splits, ) from llmcompressor.typing import Processor -from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_model_and_recipe +from llmcompressor.utils.fsdp.helpers import save_model_and_recipe class StageRunner: @@ -136,37 +135,6 @@ def get_dataset_split(self, split_name: str) -> Dataset: """ return self.datasets.get(split_name, None) - def one_shot(self, stage: Optional[str] = None): - """ - Run oneshot calibration on the active model - - :param stage: which stage of the recipe to run, or None to run whole recipe - """ - logger.info("*** One Shot ***") - - calib_data = None - if self.get_dataset_split("calibration") is not None: - calib_data = format_calibration_data( - tokenized_dataset=self.get_dataset_split("calibration"), - num_calibration_samples=self._data_args.num_calibration_samples, - do_shuffle=self._data_args.shuffle_calibration_samples, - collate_fn=self._data_args.data_collator, - accelerator=self.trainer.accelerator, - ) - - # if we don't run a forward pass after initializing the FSDP model for the - # first time, calls to summon_full_params will fail ¯\_(ツ)_/¯ - if is_fsdp_model(self.trainer.model): - dummy_inp = dict(next(iter(calib_data))) - model_device = next(self.trainer.model.parameters()).device - dummy_inp = tensors_to_device(dummy_inp, model_device) - with torch.no_grad(): - self.trainer.model(**dummy_inp) - - self.trainer.accelerator.wait_for_everyone() - - self.trainer.one_shot(calibration_data=calib_data, stage=stage) - def train(self, checkpoint: str, stage: Optional[str] = None): """ Run trainer's training loop on train_dataset, saving the resulting model to @@ -218,13 +186,13 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): :param checkpoint: optional checkpoint to pick up a stage from """ - recipe_obj = Recipe.create_instance(self._recipe_args.recipe) with self.trainer.accelerator.main_process_first(): checkpoint_dir = self._model_args.model completed_stages = get_completed_stages(checkpoint_dir) self.trainer.accelerator.wait_for_everyone() + do_preprocess = True for stage in recipe_obj.stages: # validate stage @@ -256,15 +224,39 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): # run stage if run_type is StageRunType.ONESHOT: - self.one_shot(stage=stage_name) + from llmcompressor import Oneshot + + model = get_session_model() + self._model_args.model = model + + oneshot = Oneshot.from_args( + model_args=self._model_args, + data_args=self._data_args, + recipe_args=self._recipe_args, + output_dir=self._training_args.output_dir, + do_preprocess=do_preprocess, + ) + + calib_data = format_calibration_data( + tokenized_dataset=self.get_dataset_split("calibration"), + num_calibration_samples=self._data_args.num_calibration_samples, + do_shuffle=self._data_args.shuffle_calibration_samples, + collate_fn=self._data_args.data_collator, + accelerator=self.trainer.accelerator, + ) + + if do_preprocess: + do_preprocess = False + oneshot.apply_recipe_modifiers( + calibration_dataloader=calib_data, + recipe_stage=stage_name, + ) elif run_type is StageRunType.TRAIN: self.train(checkpoint=checkpoint, stage=stage_name) + checkpoint = None - if ( - self._training_args.output_dir - != TrainingArguments.__dataclass_fields__["output_dir"].default - ): + if self._training_args.output_dir: save_model_and_recipe( model=self.trainer.model, save_path=self._output_dir, 
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index dc78385e9..dd5a25ff8 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -7,13 +7,12 @@ import torch from loguru import logger from torch.nn import Module -from torch.utils.data import DataLoader, IterableDataset +from torch.utils.data import IterableDataset from transformers.trainer_callback import TrainerState from transformers.trainer_utils import get_last_checkpoint from llmcompressor.core import ( active_session, - apply, callbacks, create_session, finalize, @@ -443,30 +442,6 @@ def predict(self, *args, **kwargs): return output - def one_shot( - self, calibration_data: Optional[DataLoader] = None, stage: Optional[str] = None - ): - """ - Run oneshot calibration on the active model - :param stage: which stage of the recipe to run, or None to run whole recipe - :param calib_data: dataloader of calibration data - """ - apply( - recipe=self.recipe, - recipe_stage=stage, - recipe_args=self.recipe_args, - model=self.model, - calib_data=calibration_data, - start=-1, - copy_data=False, - accelerator=self.accelerator, - min_tokens_per_module=self.min_tokens_per_module, - ) - - # log model sparsity - # self.maybe_log_model_sparsification() - self.accelerator.wait_for_everyone() - def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False): """ Override of the save_model function and expects it to exist in the parent. diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index d79d8cbbe..b17fc5591 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -22,6 +22,7 @@ from pathlib import PosixPath from typing import Optional +from compressed_tensors.utils.helpers import deprecated from loguru import logger from transformers import ( AutoConfig, @@ -83,19 +84,16 @@ def eval(**kwargs): main(model_args, data_args, recipe_args, training_args) -def oneshot(**kwargs): - """ - CLI entrypoint for running oneshot calibration - """ - # TODO: Get rid of training args when Oneshot refactor comes in - model_args, data_args, recipe_args, training_args = parse_args(**kwargs) - training_args.do_oneshot = True - - main(model_args, data_args, recipe_args, training_args) - +@deprecated( + message=( + "`from llmcompressor.transformers import oneshot` is deprecated, " + "please use `from llmcompressor import oneshot`." 
+ ) +) +def oneshot(**kwargs) -> None: + from llmcompressor import oneshot -# alias -one_shot = oneshot + oneshot(**kwargs) def apply(**kwargs): @@ -442,10 +440,6 @@ def main( checkpoint = last_checkpoint stage_runner.train(checkpoint) - # One Shot - if training_args.do_oneshot: - stage_runner.one_shot() - # Evaluation if training_args.do_eval: stage_runner.evaluate() diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index ec9951f6a..f64857fec 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -209,8 +209,9 @@ def skip(*args, **kwargs): save_pretrained_wrapper._overriden = True return save_pretrained_wrapper - # wrap save_pretrained - model.save_pretrained = save_pretrained_compressed(model.save_pretrained) + # wrap save_pretrained if not already + if not getattr(model.save_pretrained, "_overriden", False): + model.save_pretrained = save_pretrained_compressed(model.save_pretrained) # HACK: Override the dtype_byte_size function in transformers to support float8 types diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index 349ff8f09..0d9fd4f7d 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -157,3 +157,12 @@ def recipe_from_huggingface_model_id( recipe = None return recipe + + +def resolve_processor_from_model_args(model_args: "ModelArguments"): + # silently assign tokenizer to processor + if model_args.tokenizer: + if model_args.processor: + raise ValueError("Cannot use both a tokenizer and processor") + model_args.processor = model_args.tokenizer + model_args.tokenizer = None diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py index 41e2434ab..be1a6d324 100644 --- a/tests/e2e/e2e_utils.py +++ b/tests/e2e/e2e_utils.py @@ -2,8 +2,8 @@ from loguru import logger from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier -from llmcompressor.transformers import oneshot from tests.test_timer.timer_utils import log_time from tests.testing_utils import preprocess_tokenize_dataset diff --git a/tests/llmcompressor/entrypoints/test_oneshot.py b/tests/llmcompressor/entrypoints/test_oneshot.py new file mode 100644 index 000000000..4a7f2a5a7 --- /dev/null +++ b/tests/llmcompressor/entrypoints/test_oneshot.py @@ -0,0 +1,35 @@ +from transformers import AutoModelForCausalLM + +from llmcompressor import Oneshot +from llmcompressor.entrypoints.oneshot import parse_oneshot_args + + +def test_oneshot_from_args(): + # Select model and load it. 
+ stub = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + model = AutoModelForCausalLM.from_pretrained(stub) + dataset = "HuggingFaceH4/ultrachat_200k" + + NUM_CALIBRATION_SAMPLES = 512 + MAX_SEQUENCE_LENGTH = 2048 + + recipe = "foo_recipe" + + output_dir = "bar_output_dir" + + model_args, data_args, recipe_args, output_dir = parse_oneshot_args( + model=model, + dataset=dataset, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + output_dir=output_dir, + ) + + oneshot = Oneshot.from_args(model_args, data_args, recipe_args, output_dir) + assert oneshot.model == model + assert oneshot.model_args is model_args + assert oneshot.data_args is data_args + assert oneshot.recipe_args is recipe_args + assert oneshot.model_args is model_args + assert oneshot.output_dir is output_dir diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index 0d34d1ca0..e68d8b42a 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -10,9 +10,9 @@ from torch.utils.data import DataLoader from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator +from llmcompressor import oneshot from llmcompressor.args import DatasetArguments from llmcompressor.pytorch.utils import tensors_to_device -from llmcompressor.transformers import oneshot from llmcompressor.transformers.finetune.data import TextGenerationDataset from tests.testing_utils import parse_params, requires_gpu @@ -59,10 +59,9 @@ def _run_oneshot(model, recipe, dataset, output_dir): max_seq_length = 512 pad_to_max_length = False - oneshot( + model = oneshot( model=model, dataset=dataset, - overwrite_output_dir=True, output_dir=output_dir, max_seq_length=max_seq_length, num_calibration_samples=num_calibration_samples, @@ -72,9 +71,7 @@ def _run_oneshot(model, recipe, dataset, output_dir): splits={"calibration": "train_gen[:5%]"}, save_compressed=False, ) - from llmcompressor.pytorch.model_load.helpers import get_session_model - - return get_session_model() + return model def _get_quant_info(self, model): quant_info_weights = {} diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py index e7c8e7b9a..f8f8d9827 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py @@ -37,7 +37,6 @@ def preprocessing_func(example): model=self.model, dataset=self.file_extension, output_dir=self.output, - overwrite_output_dir=True, recipe=None, num_train_epochs=self.num_train_epochs, concatenate_data=concatenate_data, diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py b/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py index ec517e2d6..91d1e8587 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_oneshot_with_modifier.py @@ -21,8 +21,8 @@ def setUp(self): self.output = Path("./finetune_output") def test_oneshot_with_modifier_object(self): + from llmcompressor import oneshot from llmcompressor.modifiers.obcq.base import SparseGPTModifier - from llmcompressor.transformers import oneshot recipe_str = [ 
SparseGPTModifier(sparsity=0.5, targets=[r"re:model.layers.\d+$"]) diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py index 76ea21706..d4d65469d 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py @@ -6,9 +6,10 @@ from transformers import AutoModelForCausalLM from transformers.utils.quantization_config import CompressedTensorsConfig +from llmcompressor import oneshot from llmcompressor.core import create_session from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import oneshot, train +from llmcompressor.transformers import train @pytest.mark.unit diff --git a/tests/llmcompressor/transformers/gptq/test_oneshot.py b/tests/llmcompressor/transformers/gptq/test_oneshot.py index c391890b2..6b1622260 100644 --- a/tests/llmcompressor/transformers/gptq/test_oneshot.py +++ b/tests/llmcompressor/transformers/gptq/test_oneshot.py @@ -69,12 +69,11 @@ def setUp(self): self.device = "cuda:0" if torch.cuda.is_available() else "cpu" def test_oneshot_application(self): - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot oneshot( model=self.model, dataset=self.dataset, - overwrite_output_dir=True, output_dir=self.output, recipe=self.recipe, oneshot_device=self.device, diff --git a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py index f98a06a91..dafedbfa3 100644 --- a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py +++ b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py @@ -8,8 +8,8 @@ from datasets import load_dataset from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from llmcompressor import oneshot from llmcompressor.core import reset_session -from llmcompressor.transformers import oneshot NUM_CALIBRATION_SAMPLES = 16 MAX_SEQUENCE_LENGTH = 512 diff --git a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py index 16c9003be..b1331afc0 100644 --- a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py +++ b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py @@ -26,10 +26,10 @@ def _test_consecutive_runs( ): import math + from llmcompressor import oneshot from llmcompressor.core import active_session from llmcompressor.pytorch.model_load.helpers import initialize_recipe from llmcompressor.pytorch.utils.helpers import tensor_sparsity - from llmcompressor.transformers import oneshot from llmcompressor.utils.pytorch import qat_active # test recipe with 50% sparsity, quantization and smoothquant diff --git a/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py b/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py index 5095fe827..10cde875a 100644 --- a/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py +++ b/tests/llmcompressor/transformers/obcq/test_mask_structure_preservation.py @@ -47,9 +47,9 @@ def test_mask_structure_preserved(self): import torch + from llmcompressor import oneshot from llmcompressor.pytorch.model_load.helpers import get_session_model from llmcompressor.pytorch.utils.helpers import tensor_sparsity - from llmcompressor.transformers import oneshot from llmcompressor.utils.pytorch import qat_active tolerance = 1e-3 diff 
--git a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py index e4974a956..1016cf422 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py @@ -47,9 +47,9 @@ def labeled_dataloader(self, dataset_name, model_name): def _test_oneshot_completion(self, model_name: str = None): import torch + from llmcompressor import oneshot from llmcompressor.pytorch.model_load.helpers import get_session_model from llmcompressor.pytorch.utils import tensors_to_device - from llmcompressor.transformers import oneshot oneshot( model=self.model, @@ -62,7 +62,6 @@ def _test_oneshot_completion(self, model_name: str = None): output_dir=self.output, clear_sparse_session=False, precision="bfloat16", - bf16=True, ) first_tiny_model = get_session_model() diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py index 0ef7f872d..7be914149 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py @@ -26,11 +26,10 @@ def setUp(self): self.output = "./oneshot_output" def test_sparsities(self): - from llmcompressor.pytorch.model_load.helpers import get_session_model + from llmcompressor import oneshot from llmcompressor.pytorch.utils.helpers import tensor_sparsity - from llmcompressor.transformers import oneshot - oneshot( + model = oneshot( model=self.model, dataset=self.dataset, oneshot_device=self.device, @@ -42,8 +41,6 @@ def test_sparsities(self): output_dir=self.output, ) - model = get_session_model() - layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4) layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) @@ -78,11 +75,10 @@ def setUp(self): ) def test_sparsities_gpu(self): - from llmcompressor.pytorch.model_load.helpers import get_session_model + from llmcompressor import oneshot from llmcompressor.pytorch.utils.helpers import tensor_sparsity - from llmcompressor.transformers import oneshot - oneshot( + model = oneshot( model=self.model, dataset=self.dataset, oneshot_device=self.device, @@ -93,11 +89,8 @@ def test_sparsities_gpu(self): clear_sparse_session=False, output_dir=self.output, precision="bfloat16", - bf16=True, ) - model = get_session_model() - layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4) layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) diff --git a/tests/llmcompressor/transformers/oneshot/test_api_inputs.py b/tests/llmcompressor/transformers/oneshot/test_api_inputs.py index e14479bcc..152051608 100644 --- a/tests/llmcompressor/transformers/oneshot/test_api_inputs.py +++ b/tests/llmcompressor/transformers/oneshot/test_api_inputs.py @@ -48,7 +48,7 @@ def wrapped_preprocess_func(sample): self.tokenizer = None def test_one_shot_inputs(self): - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot oneshot( model=self.model, diff --git a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py index eeb6e95ae..d8eced287 100644 --- a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py 
+++ b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py @@ -19,9 +19,9 @@ from transformers import AutoConfig, AutoModelForCausalLM from transformers.utils.quantization_config import CompressedTensorsConfig +from llmcompressor import oneshot from llmcompressor.core import reset_session from llmcompressor.pytorch.utils.helpers import tensor_sparsity -from llmcompressor.transformers import oneshot from llmcompressor.transformers.compression.sparsity_config import ( SparsityConfigMetadata, ) @@ -167,8 +167,6 @@ def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed): ], ) def test_quant_model_reload(format, dtype, tmp_path): - from llmcompressor.pytorch.model_load.helpers import get_session_model - recipe_str = ( "tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml" ) @@ -182,7 +180,7 @@ def test_quant_model_reload(format, dtype, tmp_path): splits = {"calibration": "train[:10%]"} # create a quantized model - oneshot( + model = oneshot( model=model_path, dataset=dataset, num_calibration_samples=num_calibration_samples, @@ -195,7 +193,6 @@ def test_quant_model_reload(format, dtype, tmp_path): ) # Fetch the oneshot model - model = get_session_model() og_state_dict = model.state_dict() save_path_compressed = tmp_path / "compressed" From ffd3ef96c5d19da347c58e8cb589037dc80320ff Mon Sep 17 00:00:00 2001 From: George Date: Mon, 24 Feb 2025 17:06:06 -0500 Subject: [PATCH 02/23] [StageRunner Removal] Remove Evaluate / validate pathway (#1145) SUMMARY: * Remove eval pathway from `main` * Remove eval pathway from `StageRunner` * Remove eval pathway from `SessionMixin` * Remove logic to make eval dataset split TEST PLAN: * Pass existing tests * Delete tests involving eval --------- Signed-off-by: George Ohashi --- src/llmcompressor/args/dataset_arguments.py | 9 +-------- .../transformers/finetune/__init__.py | 2 +- .../finetune/data/data_helpers.py | 13 +++---------- .../transformers/finetune/runner.py | 16 ++-------------- .../transformers/finetune/session_mixin.py | 18 ------------------ .../transformers/finetune/text_generation.py | 15 ++++----------- .../finetune/data/test_dataset_helpers.py | 19 ++++--------------- .../finetune/test_session_mixin.py | 2 -- 8 files changed, 15 insertions(+), 79 deletions(-) diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 50d3277f4..957fe636d 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -70,7 +70,7 @@ class CustomDatasetArguments(DVCDatasetArguments): class DatasetArguments(CustomDatasetArguments): """ Arguments pertaining to what data we are going to input our model for - calibration, training or eval + calibration, training Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command line @@ -150,13 +150,6 @@ class DatasetArguments(CustomDatasetArguments): "of training examples to this value if set." }, ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": "For debugging purposes or quicker training, truncate the number " - "of evaluation examples to this value if set."
- }, - ) max_predict_samples: Optional[int] = field( default=None, metadata={ diff --git a/src/llmcompressor/transformers/finetune/__init__.py b/src/llmcompressor/transformers/finetune/__init__.py index 6c75b902b..e583526d6 100644 --- a/src/llmcompressor/transformers/finetune/__init__.py +++ b/src/llmcompressor/transformers/finetune/__init__.py @@ -2,4 +2,4 @@ from .data import TextGenerationDataset from .session_mixin import SessionManagerMixIn -from .text_generation import apply, compress, eval, oneshot, train +from .text_generation import apply, compress, oneshot, train diff --git a/src/llmcompressor/transformers/finetune/data/data_helpers.py b/src/llmcompressor/transformers/finetune/data/data_helpers.py index d8a06a7a9..37e9d0489 100644 --- a/src/llmcompressor/transformers/finetune/data/data_helpers.py +++ b/src/llmcompressor/transformers/finetune/data/data_helpers.py @@ -97,17 +97,15 @@ def get_raw_dataset( def make_dataset_splits( tokenized_datasets: Dict[str, Any], do_train: bool = False, - do_eval: bool = False, do_predict: bool = False, do_oneshot: bool = False, ) -> Dict[str, Dataset]: """ Restructures the datasets dictionary based on what tasks will be run - (train, eval, predict) + (train, predict) :param tokenized_datasets: dictionary of processed datasets :param do_train: Whether to store the train dataset - :param do_eval: Whether to store the validation dataset :param do_predict: Whether to store the test dataset :param do_oneshot: Whether to store the calibration dataset :return: Datasets to be used by the requested tasks @@ -119,16 +117,12 @@ def make_dataset_splits( if isinstance(tokenized_datasets, Dataset): tokenized_datasets = {"train": tokenized_datasets} - train_split = eval_split = predict_split = calib_split = None + train_split = predict_split = calib_split = None if do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") train_split = tokenized_datasets["train"] - if do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - eval_split = tokenized_datasets["validation"] if do_predict: if "test" not in tokenized_datasets: raise ValueError("--do_predict requires a test dataset") @@ -142,7 +136,6 @@ def make_dataset_splits( split_datasets = { "train": train_split, - "validation": eval_split, "test": predict_split, "calibration": calib_split, } @@ -222,7 +215,7 @@ def transform_dataset_keys(data_files: Dict[str, Any]): Transform dict keys to `train`, `val` or `test` for the given input dict if matches exist with the existing keys. Note that there can only be one matching file name. - Ex. Folder(train_eval.json) -> Folder(train.json) + Ex. Folder(train_foo.json) -> Folder(train.json) Folder(train1.json, train2.json) -> Same :param data_files: The dict where keys will be transformed diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index ab7542229..19fc05694 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -31,14 +31,14 @@ class StageRunner: """ - Launcher class for train, eval and one_shot flows. Manages data splits for each + Launcher class for train, and one_shot flows. Manages data splits for each flow and configurations. 
In the future this class will also handle alternating between the different flows LifeCycle - populate_datasets() - set_trainer() - - train() / evaluate() / predict() + - train() / predict() :param model_args: Arguments pertaining to model/config/processor :param data_args: Arguments pertaining to what data to use for different flows @@ -121,7 +121,6 @@ def _get_split_name(inp_str): self.datasets = make_dataset_splits( tokenized_datasets, do_train=self._training_args.do_train, - do_eval=self._training_args.do_eval, do_predict=self._training_args.do_predict, do_oneshot=self._training_args.do_oneshot, ) @@ -156,17 +155,6 @@ def train(self, checkpoint: str, stage: Optional[str] = None): # this includes saving the state, optimizer and scheduler self.trainer.save_model(output_dir=self._output_dir) - def evaluate(self): - """ - Run trainer's evaluation loop on eval_dataset, logging the desired metrics - """ - logger.info("*** Evaluate ***") - metrics = self.trainer.evaluate(self.get_dataset_split("validation")) - - metrics["eval_samples"] = len(self.get_dataset_split("validation")) - self.trainer.log_metrics("eval", metrics) - self.trainer.save_metrics("eval", metrics) - def predict(self): """ Run trainer's prediction loop on predict_dataset, logging the desired metrics diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index dd5a25ff8..0ab314c00 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -44,7 +44,6 @@ TRAINER_STATE_NAME = "trainer_state.json" METADATA_ARGS = [ "per_device_train_batch_size", - "per_device_eval_batch_size", "max_seq_length", "save_safetensors", "fp16", @@ -409,23 +408,6 @@ def train(self, *args, stage: Optional[str] = None, **kwargs): return output - def evaluate(self, *args, **kwargs): - """ - Run a sparsification evaluation cycle. - Runs initialize_structure for the sparse session before calling - super().evaluate() and finalization of the session after. - - :param args: positional args to pass to super().evaluate() - :param kwargs: keyword args to pass to super().evaluate() - :return: the output from super.evaluate() - """ - self.initialize_structure() - - output = super().evaluate(*args, **kwargs) - self.finalize_session() - - return output - def predict(self, *args, **kwargs): """ Run a sparsification prediction cycle. 
diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index b17fc5591..7bd40229f 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -92,13 +92,12 @@ def eval(**kwargs): ) def oneshot(**kwargs) -> None: from llmcompressor import oneshot - oneshot(**kwargs) def apply(**kwargs): """ - CLI entrypoint for any of training, eval, predict or oneshot + CLI entrypoint for any of training, predict or oneshot """ report_to = kwargs.get("report_to", None) model_args, data_args, recipe_args, training_args = parse_args(**kwargs) @@ -323,12 +322,12 @@ def main( - Trainer() - SessionMixIn() - HFTransformersTrainer() - - StageRunner.train() and/or evaluate() and/or predict() and/or oneshot() + - StageRunner.train() and/or predict() and/or oneshot() :param model_args: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from :param data_args: Arguments pertaining to what data we are going to input our model - for training and eval + for training :param training_args: Arguments pertaining to training loop configuration """ @@ -358,7 +357,7 @@ def main( f"distributed training: {bool(training_args.local_rank != -1)}, " f"16-bits training: {training_args.fp16}" ) - logger.info(f"Training/evaluation parameters {training_args}") + logger.info(f"Training parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None @@ -397,7 +396,6 @@ def main( add_labels = training_args.do_train or training_args.run_stages stage_runner.populate_datasets(processor=processor, add_labels=add_labels) train_dataset = stage_runner.get_dataset_split("train") - eval_dataset = stage_runner.get_dataset_split("validation") calib_dataset = stage_runner.get_dataset_split("calibration") trainer = Trainer( @@ -409,7 +407,6 @@ def main( model_args=model_args, data_args=data_args, train_dataset=train_dataset or calib_dataset, - eval_dataset=eval_dataset, processing_class=processor, data_collator=data_args.data_collator, ) @@ -440,10 +437,6 @@ def main( checkpoint = last_checkpoint stage_runner.train(checkpoint) - # Evaluation - if training_args.do_eval: - stage_runner.evaluate() - # Prediction if training_args.do_predict: stage_runner.predict() diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py index 7eb74f9f9..319ad765c 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py @@ -15,18 +15,12 @@ def test_combined_datasets(): raw_wikitext2 = get_raw_dataset(data_args) datasets = {"all": raw_wikitext2} - split_datasets = make_dataset_splits( - datasets, do_train=True, do_eval=True, do_predict=True - ) + split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=True) assert split_datasets.get("train") is not None - assert split_datasets.get("validation") is not None assert split_datasets.get("test") is not None - split_datasets = make_dataset_splits( - datasets, do_train=True, do_eval=False, do_predict=True - ) + split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=True) assert split_datasets.get("train") is not None - assert split_datasets.get("validation") is None assert split_datasets.get("test") is not None @@ -41,15 +35,10 @@ def test_separate_datasets(): raw_wikitext2 = 
get_raw_dataset(data_args, split=split_str) datasets[split_name] = raw_wikitext2 - split_datasets = make_dataset_splits( - datasets, do_train=True, do_eval=True, do_predict=False - ) + split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=False) assert split_datasets.get("train") is not None - assert split_datasets.get("validation") is not None assert split_datasets.get("test") is None with pytest.raises(ValueError): # fails due to no test split specified - split_datasets = make_dataset_splits( - datasets, do_train=True, do_eval=True, do_predict=True - ) + split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=True) diff --git a/tests/llmcompressor/transformers/finetune/test_session_mixin.py b/tests/llmcompressor/transformers/finetune/test_session_mixin.py index f10c0ed51..4fa981de9 100644 --- a/tests/llmcompressor/transformers/finetune/test_session_mixin.py +++ b/tests/llmcompressor/transformers/finetune/test_session_mixin.py @@ -49,13 +49,11 @@ def mixin_trainer(): model = AutoModelForCausalLM.from_pretrained(model_state_path) recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml" train_dataset = "open-platypus" - eval_dataset = "open-platypus" return MixInTest( model=model, recipe=recipe, train_dataset=train_dataset, - eval_dataset=eval_dataset, ) From b06c4cf49642196f18b89b1e0c780d558507c46a Mon Sep 17 00:00:00 2001 From: George Date: Mon, 24 Feb 2025 19:48:21 -0500 Subject: [PATCH 03/23] [StageRemoval] Remove Predict pathway (#1146) SUMMARY: * Remove predict pathway from `main` * Remove predict pathway from `StageRunner` * Remove logic to make predict dataset split * Remove `max_predict_samples` from `DatasetArguments` * Remove any docs/comment that has `predict` inside * Rename `predicted_ids` to `output_ids` TEST PLAN: * Pass existing tests * Delete tests involving predict --------- Signed-off-by: George --- .../quantization_w8a8_fp8/whisper_example.py | 4 +- src/llmcompressor/args/dataset_arguments.py | 9 ---- src/llmcompressor/args/training_arguments.py | 4 +- .../finetune/data/data_helpers.py | 13 ++---- .../transformers/finetune/runner.py | 15 +------ .../transformers/finetune/session_mixin.py | 41 ------------------- .../transformers/finetune/text_generation.py | 10 ++--- .../finetune/data/test_dataset_helpers.py | 14 +++---- 8 files changed, 18 insertions(+), 92 deletions(-) diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py index cadcc6e8c..5efd08a57 100644 --- a/examples/quantization_w8a8_fp8/whisper_example.py +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -35,8 +35,8 @@ sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" ).input_features input_features = input_features.to(model.device) -predicted_ids = model.generate(input_features, language="en", forced_decoder_ids=None) -print(processor.batch_decode(predicted_ids, skip_special_tokens=False)[0]) +output_ids = model.generate(input_features, language="en", forced_decoder_ids=None) +print(processor.batch_decode(output_ids, skip_special_tokens=False)[0]) # Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel print("==========================================") diff --git a/src/llmcompressor/args/dataset_arguments.py b/src/llmcompressor/args/dataset_arguments.py index 957fe636d..31f7c73bb 100644 --- a/src/llmcompressor/args/dataset_arguments.py +++ b/src/llmcompressor/args/dataset_arguments.py @@ -150,15 +150,6 @@ class DatasetArguments(CustomDatasetArguments): "of training examples to this value if set." }, ) - max_predict_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of " - "prediction examples to this value if set." - ), - }, - ) min_tokens_per_module: Optional[float] = field( default=None, metadata={ diff --git a/src/llmcompressor/args/training_arguments.py b/src/llmcompressor/args/training_arguments.py index ec87024d4..7149f3d47 100644 --- a/src/llmcompressor/args/training_arguments.py +++ b/src/llmcompressor/args/training_arguments.py @@ -26,8 +26,8 @@ class TrainingArguments(HFTrainingArgs): output_dir: str = field( default="./output", metadata={ - "help": "The output directory where the model predictions and " - "checkpoints will be written." + "help": "The output directory where the model safetensors, " + "recipe, config, and optionally checkpoints will be written." }, ) diff --git a/src/llmcompressor/transformers/finetune/data/data_helpers.py b/src/llmcompressor/transformers/finetune/data/data_helpers.py index 37e9d0489..cf9b81f69 100644 --- a/src/llmcompressor/transformers/finetune/data/data_helpers.py +++ b/src/llmcompressor/transformers/finetune/data/data_helpers.py @@ -97,17 +97,15 @@ def get_raw_dataset( def make_dataset_splits( tokenized_datasets: Dict[str, Any], do_train: bool = False, - do_predict: bool = False, do_oneshot: bool = False, ) -> Dict[str, Dataset]: """ Restructures the datasets dictionary based on what tasks will be run - (train, predict) + train :param tokenized_datasets: dictionary of processed datasets - :param do_train: Whether to store the train dataset - :param do_predict: Whether to store the test dataset :param do_oneshot: Whether to store the calibration dataset + :return: Datasets to be used by the requested tasks """ @@ -117,16 +115,12 @@ def make_dataset_splits( if isinstance(tokenized_datasets, Dataset): tokenized_datasets = {"train": tokenized_datasets} - train_split = predict_split = calib_split = None + train_split = calib_split = None if do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") train_split = tokenized_datasets["train"] - if do_predict: - if "test" not in tokenized_datasets: - raise ValueError("--do_predict requires a test dataset") - predict_split = tokenized_datasets["test"] if do_oneshot: calib_split = tokenized_datasets.get("calibration") if calib_split is None: @@ -136,7 +130,6 @@ def make_dataset_splits( split_datasets = { "train": train_split, - "test": predict_split, "calibration": calib_split, } return split_datasets diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index 19fc05694..37f7fbb12 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -38,7 +38,7 @@ class StageRunner: LifeCycle - populate_datasets() - set_trainer() - - train() / predict() + - train() :param model_args: Arguments pertaining to model/config/processor :param data_args: Arguments pertaining to what 
data to use for different flows @@ -121,7 +121,6 @@ def _get_split_name(inp_str): self.datasets = make_dataset_splits( tokenized_datasets, do_train=self._training_args.do_train, - do_predict=self._training_args.do_predict, do_oneshot=self._training_args.do_oneshot, ) @@ -155,18 +154,6 @@ def train(self, checkpoint: str, stage: Optional[str] = None): # this includes saving the state, optimizer and scheduler self.trainer.save_model(output_dir=self._output_dir) - def predict(self): - """ - Run trainer's prediction loop on predict_dataset, logging the desired metrics - """ - logger.info("*** Predict ***") - results = self.trainer.predict(self.dataset["test"]) - metrics = results.metrics - - metrics["predict_samples"] = len(self.dataset["test"]) - self.trainer.log_metrics("predict", metrics) - self.trainer.save_metrics("predict", metrics) - def run_sequential_stages(self, checkpoint: Optional[str] = None): """ Run the recipe stage by stage, allowing for alternating between one-shot and diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index 0ab314c00..dcf1dacb7 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -344,31 +344,6 @@ def compute_loss( return loss - def prediction_step( - self, - model: Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Wraps the prediction step from the original trainer to remove any input entry - that should not be passed to the model. - This situation may arise when distillation is used and the teacher model - contains more inputs than the student model. - """ - self._check_super_defined("prediction_step") - - inputs = {k: inputs[k] for k in inputs if k in self._model_signature_columns} - - model_outputs = super().prediction_step( - model=model, - inputs=inputs, - prediction_loss_only=prediction_loss_only, - ignore_keys=ignore_keys, - ) - return model_outputs - def train(self, *args, stage: Optional[str] = None, **kwargs): """ Run a sparsification training cycle. Runs initialization for the sparse session @@ -408,22 +383,6 @@ def train(self, *args, stage: Optional[str] = None, **kwargs): return output - def predict(self, *args, **kwargs): - """ - Run a sparsification prediction cycle. - Runs initialize_structure for the sparse session before calling - super().predict() and finalization of the session after. - - :param args: positional args to pass to super().predict() - :param kwargs: keyword args to pass to super().predict() - :return: the output from super.predict() - """ - self.initialize_structure() - output = super().predict(*args, **kwargs) - self.finalize_session() - - return output - def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False): """ Override of the save_model function and expects it to exist in the parent. 
diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index 7bd40229f..9cb733c30 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -92,12 +92,13 @@ def eval(**kwargs): ) def oneshot(**kwargs) -> None: from llmcompressor import oneshot + oneshot(**kwargs) def apply(**kwargs): """ - CLI entrypoint for any of training, predict or oneshot + CLI entrypoint for any of training, oneshot """ report_to = kwargs.get("report_to", None) model_args, data_args, recipe_args, training_args = parse_args(**kwargs) @@ -322,7 +323,8 @@ def main( - Trainer() - SessionMixIn() - HFTransformersTrainer() - - StageRunner.train() and/or predict() and/or oneshot() + - StageRunner.train() and/or oneshot() + :param model_args: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from @@ -437,10 +439,6 @@ def main( checkpoint = last_checkpoint stage_runner.train(checkpoint) - # Prediction - if training_args.do_predict: - stage_runner.predict() - # save if model was provided as a string or custom output_dir was set if isinstance(model_args.model, str) or ( diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py index 319ad765c..7b475fdb5 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py @@ -14,14 +14,11 @@ def test_combined_datasets(): ) raw_wikitext2 = get_raw_dataset(data_args) datasets = {"all": raw_wikitext2} - - split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=True) + split_datasets = make_dataset_splits(datasets, do_train=True) assert split_datasets.get("train") is not None - assert split_datasets.get("test") is not None - split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=True) + split_datasets = make_dataset_splits(datasets, do_train=True) assert split_datasets.get("train") is not None - assert split_datasets.get("test") is not None @pytest.mark.unit @@ -35,10 +32,11 @@ def test_separate_datasets(): raw_wikitext2 = get_raw_dataset(data_args, split=split_str) datasets[split_name] = raw_wikitext2 - split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=False) + split_datasets = make_dataset_splits(datasets, do_train=True) assert split_datasets.get("train") is not None - assert split_datasets.get("test") is None with pytest.raises(ValueError): # fails due to no test split specified - split_datasets = make_dataset_splits(datasets, do_train=True, do_predict=True) + + datasets.pop("train") + split_datasets = make_dataset_splits(datasets, do_train=True) From f326cd4209af2cdcb865c2006144d4fd5823ff04 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 24 Feb 2025 20:57:09 -0500 Subject: [PATCH 04/23] Fix 2of4 Apply Example (#1181) Summary - Fix typo - Fix info log at the end of the script to no longer be cut off --- .../llama7b_sparse_w4a16.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 884952f5e..2cca58815 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -54,8 +54,8 @@ 
warmup_ratio=warmup_ratio, ) logger.info( - "Note: llcompressor does not currently support running ", - "compressed models in the marlin-24 format. The model ", - "produced from this example can be run on vLLM with ", - "dtype=torch.float16", + "llmcompressor does not currently support running compressed models in the marlin24 format." # noqa +) +logger.info( + "The model produced from this example can be run on vLLM with dtype=torch.float16" ) From 2fe002718085b910ed6247e35754a5442f572068 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 24 Feb 2025 22:06:00 -0500 Subject: [PATCH 05/23] Fix Sparse2of4 Example (#1182) Summary - Should be saving compressed for all cases --- examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py index f18585e3e..ad878a3ce 100644 --- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py +++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py @@ -116,5 +116,5 @@ def get_recipe(fp8_enabled): print("==========================================\n") # Save compressed model and tokenizer -model.save_pretrained(save_dir, save_compressed=args.fp8) +model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) From d810e4abc006ad49e4dc655924b2d655b6c579a1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 24 Feb 2025 23:26:58 -0500 Subject: [PATCH 06/23] Add qwen moe w4a16 example (#1186) Made this moe for easy testing with 60 experts in vLLM https://huggingface.co/nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 --------- Signed-off-by: mgoin Co-authored-by: Dipika Sikka --- examples/quantizing_moe/qwen_moe_w4a16.py | 92 +++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 examples/quantizing_moe/qwen_moe_w4a16.py diff --git a/examples/quantizing_moe/qwen_moe_w4a16.py b/examples/quantizing_moe/qwen_moe_w4a16.py new file mode 100644 index 000000000..df98d0513 --- /dev/null +++ b/examples/quantizing_moe/qwen_moe_w4a16.py @@ -0,0 +1,92 @@ +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.transformers import oneshot +from llmcompressor.transformers.compression.helpers import calculate_offload_device_map + +# select a Mixture of Experts model for quantization +MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat" + +# adjust based off number of desired GPUs +# if not enough memory is available, some layers will automatically be offloaded to cpu +device_map = calculate_offload_device_map( + MODEL_ID, + reserve_for_hessians=True, + num_gpus=2, + torch_dtype=torch.bfloat16, + trust_remote_code=True, +) + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. 
+def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + +# define a llmcompressor recipe for W416 quantization with a group size of 128 +# since the MoE gate layers are sensitive to quantization, we add them to the ignore +# list so they remain at full precision +recipe = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head", "re:.*mlp.gate$", "re:.*mlp.shared_expert_gate$"], +) + +SAVE_DIR = MODEL_ID.split("/")[1] + "-quantized.w4a16" + + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + save_compressed=True, + trust_remote_code_model=True, + output_dir=SAVE_DIR, +) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda") +output = model.generate(input_ids, max_new_tokens=20) +print(tokenizer.decode(output[0])) +print("==========================================") From 6e101b287b906268147e2e03a3eeec0be46d232b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 25 Feb 2025 10:46:25 -0500 Subject: [PATCH 07/23] [Callbacks] Consolidate Saving Methods (#1168) ## Purpose ## * Simplify all methods of saving into one point, namely the wrapped `save_pretrained` function * Precursor to #1160 * Needed for having a single point for saving on top of existing recipes ## Background ## All the things needed to be done during saving 1. Save the model weights, potentially compressed 2. Save the processor 3. Update the recipe checkpoint 4. Copy any necessary python files from the model cache 5. Only save on the main process After these changes, (1, 2, 3, 4) will be done within the `save_pretrained` function, and (5) will be the responsibility of the caller. 
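As a rough illustration of that split of responsibilities (a sketch only, not code from this PR; the helper name `save_if_main_process` is made up):

```python3
# Sketch only: the intended division of work after this PR.
# Assumes `model`, `processor`, and an `accelerate` Accelerator are already set up.
from llmcompressor.pytorch.model_load.helpers import save_checkpoint


def save_if_main_process(accelerator, model, processor, save_path: str):
    # (5) stays with the caller: only the main process writes to disk
    if accelerator.is_main_process:
        # (1)-(4) are handled inside save_checkpoint -> wrapped save_pretrained
        save_checkpoint(
            save_path=save_path,
            model=model,
            processor=processor,
            save_safetensors=True,
            save_compressed=True,
        )
    accelerator.wait_for_everyone()
```

The call sites updated in this PR (stage runner, trainer mixin, end of `main`) follow this same guard-then-save pattern.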
(3) will be implemented by #1160 so as not to conflict with existing logic in pre_init All of the places where a model is saved are * If an output dir is specified, at the end of the main function * Between stages of the stage runner * Between epochs of the HF Trainer * By the user after oneshot/training completes After these changes, all of these will be replaced by a single `save_checkpoint` function which calls `save_pretrained` to do all the necessary things ## Changes ## * Remove `save_model_and_recipe` * Saving recipes is now done by `save_pretrained` function * Implement `save_checkpoint` * Single entrypoint for saving a model and its processor * Performs actions (1, 2, 4) * Replace all locations where a model is saved with `save_checkpoint` * All applicable callers with only saving on the main process (5) * Remove support for `modify_fsdp_model_save_pretrained` and `unwrap_and_export_model`, to be added back in a future release --------- Signed-off-by: Kyle Sayers Co-authored-by: Dipika Sikka --- .../pytorch/model_load/helpers.py | 43 ++------ .../transformers/finetune/runner.py | 14 ++- .../transformers/finetune/session_mixin.py | 48 ++------- .../transformers/finetune/text_generation.py | 20 ++-- .../compressed_tensors_utils.py | 101 ++++-------------- src/llmcompressor/utils/fsdp/helpers.py | 32 ------ 6 files changed, 64 insertions(+), 194 deletions(-) diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index 5ddc7ebd5..e2e1a91b7 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -6,6 +6,7 @@ from loguru import logger from safetensors import safe_open from torch.nn import Module +from transformers import PreTrainedModel from llmcompressor.core import active_session, create_session, pre_initialize_structure from llmcompressor.typing import Processor @@ -14,20 +15,19 @@ __all__ = [ "initialize_recipe", - "save_model_and_recipe", "copy_python_files_from_model_cache", "fallback_to_cpu", "parse_dtype", "get_session_model", "get_completed_stages", "save_completed_stages", + "save_checkpoint", ] def initialize_recipe(model: Module, recipe_path: str): """ Initializes a recipe that has been previously applied to the model - :param model: PyTorch model to apply structure to :param recipe_path: path to recipe to apply to the model """ @@ -49,43 +49,22 @@ def initialize_recipe(model: Module, recipe_path: str): logger.info(f"Applied {msg} to the model") -def save_model_and_recipe( - model: Module, +def save_checkpoint( save_path: str, - processor: Optional[Processor] = None, - save_safetensors: bool = False, - save_compressed: bool = False, + model: PreTrainedModel, + processor: Processor, + save_safetensors: bool = True, + save_compressed: bool = True, ): - """ - Save a model, processor and the currently loaded recipe to file - - :param model: pytorch model to save - :param save_path: path to save output to - :param processor: model processor or tokenizer to save - :param save_safetensors: whether to save as safetensors or pickle (bin) - :param save_compressed: whether to compress sparse weights on disk - """ - # avoid circular import - from llmcompressor.transformers.utils.helpers import RECIPE_FILE_NAME - + # saving the model also saves the recipe model.save_pretrained( - save_path, save_compressed=save_compressed, safe_serialization=save_safetensors + save_path, + save_safetensors=save_safetensors, + save_compressed=save_compressed, ) - if processor is not None: 
processor.save_pretrained(save_path) - logger.info("Saving output to {}".format(os.path.abspath(save_path))) - - recipe_path = os.path.join(save_path, RECIPE_FILE_NAME) - session = active_session() - recipe_yaml_str = session.get_serialized_recipe() - with open(recipe_path, "w") as fp: - fp.write(recipe_yaml_str) - - # copy python files from cache dir to save_path if any - copy_python_files_from_model_cache(model, save_path) - def fallback_to_cpu(device: str) -> str: """ diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index 37f7fbb12..dd45b7daf 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -17,6 +17,7 @@ from llmcompressor.pytorch.model_load.helpers import ( get_completed_stages, get_session_model, + save_checkpoint, save_completed_stages, ) from llmcompressor.recipe import Recipe, StageRunType @@ -26,7 +27,6 @@ make_dataset_splits, ) from llmcompressor.typing import Processor -from llmcompressor.utils.fsdp.helpers import save_model_and_recipe class StageRunner: @@ -231,14 +231,20 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): checkpoint = None - if self._training_args.output_dir: - save_model_and_recipe( - model=self.trainer.model, + # save model between stages + if ( + self._training_args.output_dir + != TrainingArguments.__dataclass_fields__["output_dir"].default + and self.trainer.accelerator.is_main_process + ): + save_checkpoint( save_path=self._output_dir, + model=self.trainer.model, processor=self.processor, save_safetensors=self._training_args.save_safetensors, save_compressed=self._model_args.save_compressed, ) + self.trainer.accelerator.wait_for_everyone() # save stage to checkpoint dir if self.trainer.accelerator.is_main_process: diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index dcf1dacb7..c6e35c2fc 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -23,15 +23,13 @@ from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import ( KDModelWrapper, ) -from llmcompressor.pytorch.model_load.helpers import get_session_model +from llmcompressor.pytorch.model_load.helpers import get_session_model, save_checkpoint from llmcompressor.pytorch.utils import ModuleSparsificationInfo -from llmcompressor.transformers import RECIPE_FILE_NAME from llmcompressor.transformers.finetune.callbacks import ( DisableHalfPrecisionCallback, TrainingLoopCallbacks, ) from llmcompressor.utils.fsdp.context import summon_full_params_context -from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_pretrained_fsdp from llmcompressor.utils.pytorch import qat_active if TYPE_CHECKING: @@ -64,8 +62,8 @@ class SessionManagerMixIn: def __init__( self, recipe: str, + data_args: "DatasetArguments", model_args: "ModelArguments", - data_args: Optional["DatasetArguments"] = None, teacher: Optional[Union[Module, str]] = None, recipe_args: Optional[Union[Dict[str, Any], str]] = None, **kwargs, @@ -183,7 +181,6 @@ def initialize_structure(self, stage: Optional[str] = None): """ Initialize any recipe structural changes such as quantization on the model, return immediately if session has already been initialized - :param stage: Optional stage of recipe to run, or None to run all stages """ session = active_session() @@ -399,44 +396,19 @@ def save_model(self, output_dir: str, 
_internal_call=False, _is_oneshot=False): # knowledge distillation requires making wrappers transparent during if isinstance(self.model, KDModelWrapper): - self.model.prepare_for_save() + self.model.prepare_for_save() # TODO: move to finalize - if not is_fsdp_model(self.model): - self.model.save_pretrained( + # save checkpoint + self.save_state() + if self.accelerator.is_main_process: + processor = getattr(self, "processing_class", self.tokenizer) + save_checkpoint( output_dir, - save_compressed=self.model_args.save_compressed, - safe_serialization=self.args.save_safetensors, - ) - else: # FSDP model - save_pretrained_fsdp( model=self.model, - accelerator=self.accelerator, - output_dir=output_dir, + processor=processor, + save_safetensors=self.args.save_safetensors, save_compressed=self.model_args.save_compressed, - save_safetensors=self.metadata.get("save_safetensors", False), - ) - - self.save_state() - processor = getattr(self, "processing_class", self.tokenizer) - if processor is not None: - processor.save_pretrained(output_dir) - - if not self.recipe: - return - - if self.accelerator.is_main_process: - # save recipe, will contain modifiers from the model's original recipe as - # well as those added from self.recipe - recipe_path = os.path.join(output_dir, RECIPE_FILE_NAME) - session = active_session() - recipe_yaml_str = session.get_serialized_recipe() - with open(recipe_path, "w") as fp: - fp.write(recipe_yaml_str) - - logger.info( - f"Saved LLM Compressor recipe with model state to {recipe_path}" ) - self.accelerator.wait_for_everyone() if isinstance(self.model, KDModelWrapper): diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index 9cb733c30..c1be354db 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -46,12 +46,12 @@ get_session_model, initialize_recipe, parse_dtype, + save_checkpoint, ) from llmcompressor.recipe import Recipe, StageRunType from llmcompressor.transformers.finetune.runner import StageRunner from llmcompressor.transformers.finetune.trainer import Trainer from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( - modify_fsdp_model_save_pretrained, modify_save_pretrained, patch_tied_tensors_bug, ) @@ -415,7 +415,10 @@ def main( # wrap model.save_pretrained if is_fsdp_model(model): - modify_fsdp_model_save_pretrained(trainer, processor) + raise NotImplementedError( + "FSDP models are not supported in the current release but will be " + "suported in future releases of LLM Compressor" + ) else: modify_save_pretrained(model) @@ -440,16 +443,19 @@ def main( stage_runner.train(checkpoint) # save if model was provided as a string or custom output_dir was set - if isinstance(model_args.model, str) or ( training_args.output_dir != TrainingArguments.__dataclass_fields__["output_dir"].default + and trainer.accelerator.is_main_process ): - model.save_pretrained( - training_args.output_dir, save_compressed=model_args.save_compressed + save_checkpoint( + save_path=training_args.output_dir, + model=model, + processor=processor, + save_safetensors=True, + save_compressed=model_args.save_compressed, ) - if processor is not None: - processor.save_pretrained(training_args.output_dir) + trainer.accelerator.wait_for_everyone() # Clean up the CompressionSession before exit if requested if recipe_args.clear_sparse_session: diff --git 
a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index f64857fec..b3ac28383 100644 --- a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -16,6 +16,7 @@ ) from loguru import logger from safetensors.torch import storage_ptr +from transformers import PreTrainedModel from llmcompressor.core import active_session from llmcompressor.pytorch.model_load.helpers import copy_python_files_from_model_cache @@ -26,81 +27,11 @@ SparsityConfigMetadata, ) from llmcompressor.transformers.utils import RECIPE_FILE_NAME -from llmcompressor.typing import Processor -from llmcompressor.utils.fsdp.helpers import ( - find_and_move_state_dicts_to_cpu, - unwrap_and_export_model, -) -__all__ = ["modify_save_pretrained", "modify_fsdp_model_save_pretrained"] +__all__ = ["modify_save_pretrained"] -def modify_fsdp_model_save_pretrained(trainer, processor: Processor): - """ - Overrides a PreTrainedModel's save_pretrained() method with a wrapped version that - supports compression for fsdp model - """ - - def save_pretrained_compressed(save_pretrained_method): - if getattr(save_pretrained_method, "_overridden", False): - # `model.save_pretrained` has already been replaced, return. - return save_pretrained_method - - # Keep a weak reference to the model class and unbound save_pretrained - # method so we can call the original - original_save_pretrained = save_pretrained_method.__func__ - del save_pretrained_method - - @wraps(original_save_pretrained) - def save_pretrained_wrapper( - save_directory: str, - **kwargs, - ): - """ - Wrapper around PreTrainedModel.save_pretrained(), adds functionality for - saving models in a compressed format on disk. The compression format is - saved to the model's config file - - :param save_directory: output directory to save model to - :param sparsity_config: optional sparsity config to compress model with, - if no config is provided it will be inferred from the model - :param quantization_format: optional compression format for quantized - models. 
If none is provided it will be inferred from the model - :param save_compressed: whether or not to compress the model on disk - :param skip_compression_stats: whether to skip the calculation of - compression statistics (such as global sparsity and sparsity structure) when - saving a model in dense format - :param kwargs: additional kwargs to pass on to model.save_pretrained - """ - try: - trainer.save_model(output_dir=save_directory, _is_oneshot=True) - except AssertionError: - # fallback to this in the case of quantization - unwrap_and_export_model( - model=trainer.model, - accelerator=trainer.accelerator, - output_dir=save_directory, - processor=processor, - ) - # only allow the main process move the state - # dicts to cpu - if trainer.accelerator.is_main_process: - # assuming quantization is the last step - # we no longer need the original model - # and can safely delete it to save memory - del trainer.model - find_and_move_state_dicts_to_cpu(save_directory) - - save_pretrained_wrapper._overriden = True - return save_pretrained_wrapper - - # wrap save_pretrained - trainer.model.save_pretrained = save_pretrained_compressed( - trainer.model.save_pretrained - ) - - -def modify_save_pretrained(model: torch.nn.Module): +def modify_save_pretrained(model: PreTrainedModel): """ Overrides a PreTrainedModel's save_pretrained() method with a wrapped version that supports compression @@ -124,6 +55,7 @@ def save_pretrained_wrapper( sparsity_config: Optional[SparsityCompressionConfig] = None, quantization_format: Optional[str] = None, save_compressed: bool = True, + safe_serialization: bool = True, skip_compression_stats: bool = False, disable_sparse_compression: bool = False, **kwargs, @@ -189,19 +121,16 @@ def skip(*args, **kwargs): # make sure we're on the main process when saving if state_dict is not None and len(state_dict) > 0: compressed_state_dict = compressor.compress(model, state_dict) - - kwargs["safe_serialization"] = kwargs.get("safe_serialization", True) original_save_pretrained.__get__(model, model_class)( - save_directory, state_dict=compressed_state_dict, **kwargs + save_directory, + state_dict=compressed_state_dict, + safe_serialization=safe_serialization, + **kwargs, ) compressor.update_config(save_directory) - recipe_path = os.path.join(save_directory, RECIPE_FILE_NAME) - session = active_session() - - if (recipe_yaml_str := session.get_serialized_recipe()) is not None: - with open(recipe_path, "w") as fp: - fp.write(recipe_yaml_str) + # TODO: update existing recipe + update_and_save_recipe(model.name_or_path, save_directory) # copy python files from cache dir to save_path if any copy_python_files_from_model_cache(model, save_directory) @@ -321,3 +250,13 @@ def get_model_compressor( sparsity_config=sparsity_config, quantization_format=quantization_format, ) + + +def update_and_save_recipe(model_path: str, save_directory: str): + # TODO: update existing recipe + recipe_path = os.path.join(save_directory, RECIPE_FILE_NAME) + session = active_session() + + if (recipe_yaml_str := session.get_serialized_recipe()) is not None: + with open(recipe_path, "w") as fp: + fp.write(recipe_yaml_str) diff --git a/src/llmcompressor/utils/fsdp/helpers.py b/src/llmcompressor/utils/fsdp/helpers.py index 3a3248fa5..53fc04ca8 100644 --- a/src/llmcompressor/utils/fsdp/helpers.py +++ b/src/llmcompressor/utils/fsdp/helpers.py @@ -17,15 +17,11 @@ from torch.nn import Module from llmcompressor.core.state import State -from llmcompressor.pytorch.model_load.helpers import save_model_and_recipe -from 
llmcompressor.typing import Processor -from llmcompressor.utils.pytorch import set_layer __all__ = [ "is_fsdp_model", "maybe_get_wrapped", "set_wrapped_model", - "unwrap_and_export_model", "save_pretrained_fsdp", "get_fsdp_parent", "find_and_move_state_dicts_to_cpu", @@ -72,34 +68,6 @@ def set_wrapped_model(state: State, wrapped_model: Module): state.model = wrapped_model -def unwrap_and_export_model(model, accelerator, output_dir: str, processor: Processor): - """ - Recursively unwraps an FSDP model, then saves the unwrapped model and the - currently active recipe to disk - - :param model: model to unwrap - :param accelerator: Accelerator instance used to perform unwrapping - :param output_dir: where to save output model - :param processor: processor used by the model - """ - full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - with FullyShardedDataParallel.state_dict_type( - model, - StateDictType.FULL_STATE_DICT, - full_state_dict_config, - ): - unwrapped_model = accelerator.unwrap_model(model) - for name, module in unwrapped_model.named_modules(): - if isinstance(module, FullyShardedDataParallel): - set_layer(name, accelerator.unwrap_model(module), unwrapped_model) - - save_model_and_recipe( - model=unwrapped_model, - save_path=output_dir, - processor=processor, - ) - - def find_and_move_state_dicts_to_cpu(output_dir: str): """ Looks for state dicts in the output directory and overwrites them From d3d2d1d84fe6434781989cdf8a6a1389c2addd32 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 25 Feb 2025 12:46:48 -0600 Subject: [PATCH 08/23] lmeval tests multimodal (#1150) SUMMARY: In order to add multimodal tests to lm_eval, I had to expand the data model and move some hard-coded logic into conditionals. This includes a folder re-org, moving the lm-eval tests out of `tests/e2e` and into their own `tests/lmeval` folder (they are integration+regression tests). We'll have to make sure the QA team is aware of the re-org once this lands, as they might have to change the GitHub CI/CD accordingly.
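For reference, the expanded data model is the new `LmEvalConfig` pydantic model in `tests/lmeval/test_lmeval.py`: eval settings now sit under an `lmeval:` key in each YAML config and fall back to these defaults. A condensed sketch, pulled from the diff below (the example config dict at the end is only illustrative):
```python3
from pydantic import BaseModel


class LmEvalConfig(BaseModel):
    model: str = "hf"  # "hf-multimodal" for the vision-language configs
    model_args: dict = {"add_bos_token": True, "dtype": "bfloat16"}
    task: str = "gsm8k"
    num_fewshot: int = 5
    limit: int = 1000
    metrics: dict  # expected values, compared to lm_eval results with rtol=0.05
    batch_size: int = 100


# each test builds the config from the "lmeval" section of its YAML file, e.g.
eval_config = {"lmeval": {"metrics": {"exact_match,strict-match": 0.75}}}
lmeval = LmEvalConfig(**eval_config.get("lmeval", {}))
```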
This adds roughly 35 minutes of testing to the weekly tests TEST PLAN: No new source code --------- Signed-off-by: Brian Dellabetta --- .../transformers/tracing/__init__.py | 6 +- .../transformers/tracing/debug.py | 4 + tests/e2e/e2e_utils.py | 34 +++++-- .../fp8_dynamic_per_token.yaml | 8 -- ...int8_channel_weight_dynamic_per_token.yaml | 2 +- .../recipe_w4a16_actorder_weight.yaml | 2 +- tests/e2e/vLLM/test_vllm.py | 2 + tests/{integration => lmeval}/__init__.py | 0 .../lmeval/configs/fp8_dynamic_per_token.yaml | 7 ++ .../configs}/fp8_static_per_tensor.yaml | 9 +- .../configs}/int8_w8a8_dynamic_per_token.yaml | 9 +- .../configs/vl_fp8_dynamic_per_token.yaml | 16 +++ .../vl_int8_w8a8_dynamic_per_token.yaml | 19 ++++ .../configs/vl_w4a16_actorder_weight.yaml | 19 ++++ .../configs}/w4a16_actorder_group.yaml | 11 +-- .../configs}/w4a16_actorder_weight.yaml | 11 +-- .../configs}/w4a16_grouped_quant.yaml | 11 +-- tests/{e2e/vLLM => lmeval}/test_lmeval.py | 54 +++++----- tests/testing_utils.py | 98 +++++++++---------- 19 files changed, 200 insertions(+), 122 deletions(-) delete mode 100644 tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml rename tests/{integration => lmeval}/__init__.py (100%) create mode 100644 tests/lmeval/configs/fp8_dynamic_per_token.yaml rename tests/{e2e/vLLM/lm_eval_configs => lmeval/configs}/fp8_static_per_tensor.yaml (56%) rename tests/{e2e/vLLM/lm_eval_configs => lmeval/configs}/int8_w8a8_dynamic_per_token.yaml (69%) create mode 100644 tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml create mode 100644 tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml create mode 100644 tests/lmeval/configs/vl_w4a16_actorder_weight.yaml rename tests/{e2e/vLLM/lm_eval_configs => lmeval/configs}/w4a16_actorder_group.yaml (59%) rename tests/{e2e/vLLM/lm_eval_configs => lmeval/configs}/w4a16_actorder_weight.yaml (59%) rename tests/{e2e/vLLM/lm_eval_configs => lmeval/configs}/w4a16_grouped_quant.yaml (53%) rename tests/{e2e/vLLM => lmeval}/test_lmeval.py (76%) diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py index d5643efb8..29a976663 100644 --- a/src/llmcompressor/transformers/tracing/__init__.py +++ b/src/llmcompressor/transformers/tracing/__init__.py @@ -8,16 +8,18 @@ Qwen2VLForConditionalGeneration as TraceableQwen2VLForConditionalGeneration, ) from .idefics3 import ( - Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration + Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration, ) from .whisper import ( - WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration + WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration, ) from .qwen2_5_vl import ( Qwen2_5_VLForConditionalGeneration as TraceableQwen2_5_VLForConditionalGeneration ) +from .debug import get_model_class __all__ = [ + "get_model_class", "TraceableLlavaForConditionalGeneration", "TraceableMllamaForConditionalGeneration", "TraceableQwen2VLForConditionalGeneration", diff --git a/src/llmcompressor/transformers/tracing/debug.py b/src/llmcompressor/transformers/tracing/debug.py index 3b31366b1..ccce917a7 100644 --- a/src/llmcompressor/transformers/tracing/debug.py +++ b/src/llmcompressor/transformers/tracing/debug.py @@ -12,6 +12,10 @@ from llmcompressor.transformers import TextGenerationDataset from llmcompressor.args import DatasetArguments +__all__ = [ + "get_model_class" +] + def parse_args(): parser = argparse.ArgumentParser(description="Trace a model into 
subgraphs") diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py index be1a6d324..30c02a1ec 100644 --- a/tests/e2e/e2e_utils.py +++ b/tests/e2e/e2e_utils.py @@ -1,23 +1,27 @@ +import torch from datasets import load_dataset from loguru import logger -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoProcessor from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier +from llmcompressor.transformers.tracing import get_model_class from tests.test_timer.timer_utils import log_time -from tests.testing_utils import preprocess_tokenize_dataset +from tests.testing_utils import process_dataset @log_time -def _load_model_and_tokenizer( +def _load_model_and_processor( model: str, + model_class: str, device: str, ): - loaded_model = AutoModelForCausalLM.from_pretrained( + pretrained_model_class = get_model_class(model_class) + loaded_model = pretrained_model_class.from_pretrained( model, device_map=device, torch_dtype="auto" ) - tokenizer = AutoTokenizer.from_pretrained(model) - return loaded_model, tokenizer + processor = AutoProcessor.from_pretrained(model) + return loaded_model, processor @log_time @@ -30,6 +34,7 @@ def _run_oneshot(device: str, **oneshot_kwargs): def run_oneshot_for_e2e_testing( model: str, + model_class: str, device: str, num_calibration_samples: int, max_seq_length: int, @@ -43,16 +48,27 @@ def run_oneshot_for_e2e_testing( # Load model. oneshot_kwargs = {} - loaded_model, tokenizer = _load_model_and_tokenizer(model=model, device=device) + loaded_model, processor = _load_model_and_processor( + model=model, model_class=model_class, device=device + ) if dataset_id: ds = load_dataset(dataset_id, name=dataset_config, split=dataset_split) ds = ds.shuffle(seed=42).select(range(num_calibration_samples)) - ds = preprocess_tokenize_dataset(ds, tokenizer, max_seq_length) + ds = process_dataset(ds, processor, max_seq_length) oneshot_kwargs["dataset"] = ds oneshot_kwargs["max_seq_length"] = max_seq_length oneshot_kwargs["num_calibration_samples"] = num_calibration_samples + # Define a data collator for multimodal inputs. 
+ if "flickr30k" in dataset_id: + + def data_collator(batch): + assert len(batch) == 1 + return {key: torch.tensor(value) for key, value in batch[0].items()} + + oneshot_kwargs["data_collator"] = data_collator + oneshot_kwargs["model"] = loaded_model if recipe: oneshot_kwargs["recipe"] = recipe @@ -72,4 +88,4 @@ def run_oneshot_for_e2e_testing( logger.info("ONESHOT KWARGS", oneshot_kwargs) _run_oneshot(device=device, **oneshot_kwargs) - return oneshot_kwargs["model"], tokenizer + return oneshot_kwargs["model"], processor diff --git a/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml b/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml deleted file mode 100644 index fc610bae9..000000000 --- a/tests/e2e/vLLM/lm_eval_configs/fp8_dynamic_per_token.yaml +++ /dev/null @@ -1,8 +0,0 @@ -cadence: "weekly" -model: meta-llama/Meta-Llama-3-8B-Instruct -scheme: FP8_DYNAMIC -num_fewshot: 5 -limit: 1000 -task: "gsm8k" -exact_match,flexible-extract: 0.75 -exact_match,strict-match: 0.75 diff --git a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml index 367437e5a..c3ecdea86 100644 --- a/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml +++ b/tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml @@ -3,7 +3,7 @@ quant_stage: SmoothQuantModifier: smoothing_strength: 0.8 GPTQModifier: - ignore: [lm_head] + ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"] config_groups: group_0: weights: {num_bits: 8, type: int, symmetric: true, strategy: channel} diff --git a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml index 0c8476883..4efa211a2 100644 --- a/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml +++ b/tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml @@ -1,7 +1,7 @@ quant_stage: quant_modifiers: GPTQModifier: - ignore: ["lm_head"] + ignore: ["lm_head", "re:vision_tower.*", "re:multi_modal_projector.*", "re:visual.*", "re:vision_model.*"] config_groups: group_0: weights: diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index 6632ca633..63e844423 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -71,6 +71,7 @@ def set_up(self): pytest.skip("Skipping test; cadence mismatch") self.model = eval_config["model"] + self.model_class = eval_config.get("model_class", "AutoModelForCausalLM") self.scheme = eval_config.get("scheme") self.dataset_id = eval_config.get("dataset_id") self.dataset_config = eval_config.get("dataset_config") @@ -104,6 +105,7 @@ def test_vllm(self): self.save_dir = self.model.split("/")[1] + f"-{self.scheme}" oneshot_model, tokenizer = run_oneshot_for_e2e_testing( model=self.model, + model_class=self.model_class, device=self.device, num_calibration_samples=self.num_calibration_samples, max_seq_length=self.max_seq_length, diff --git a/tests/integration/__init__.py b/tests/lmeval/__init__.py similarity index 100% rename from tests/integration/__init__.py rename to tests/lmeval/__init__.py diff --git a/tests/lmeval/configs/fp8_dynamic_per_token.yaml b/tests/lmeval/configs/fp8_dynamic_per_token.yaml new file mode 100644 index 000000000..b89bb4552 --- /dev/null +++ b/tests/lmeval/configs/fp8_dynamic_per_token.yaml @@ -0,0 +1,7 @@ +cadence: "weekly" +model: meta-llama/Meta-Llama-3-8B-Instruct +scheme: 
FP8_DYNAMIC +lmeval: + metrics: + exact_match,flexible-extract: 0.75 + exact_match,strict-match: 0.75 diff --git a/tests/e2e/vLLM/lm_eval_configs/fp8_static_per_tensor.yaml b/tests/lmeval/configs/fp8_static_per_tensor.yaml similarity index 56% rename from tests/e2e/vLLM/lm_eval_configs/fp8_static_per_tensor.yaml rename to tests/lmeval/configs/fp8_static_per_tensor.yaml index 0b6d42a46..e4d31cef2 100644 --- a/tests/e2e/vLLM/lm_eval_configs/fp8_static_per_tensor.yaml +++ b/tests/lmeval/configs/fp8_static_per_tensor.yaml @@ -1,10 +1,9 @@ cadence: "weekly" model: meta-llama/Meta-Llama-3-8B-Instruct scheme: FP8 -num_fewshot: 5 -limit: 1000 -task: "gsm8k" dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft -exact_match,flexible-extract: 0.75 -exact_match,strict-match: 0.75 +lmeval: + metrics: + exact_match,flexible-extract: 0.75 + exact_match,strict-match: 0.75 diff --git a/tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml similarity index 69% rename from tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml rename to tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml index 446ca1e7f..3e6c364e0 100644 --- a/tests/e2e/vLLM/lm_eval_configs/int8_w8a8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/int8_w8a8_dynamic_per_token.yaml @@ -2,10 +2,9 @@ cadence: "weekly" model: meta-llama/Meta-Llama-3-8B-Instruct scheme: INT8_dyn_per_token recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml -num_fewshot: 5 -limit: 1000 -task: "gsm8k" dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft -exact_match,flexible-extract: 0.77 -exact_match,strict-match: 0.76 +lmeval: + metrics: + exact_match,flexible-extract: 0.77 + exact_match,strict-match: 0.76 \ No newline at end of file diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml new file mode 100644 index 000000000..3ae64f093 --- /dev/null +++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml @@ -0,0 +1,16 @@ +cadence: weekly +model: Qwen/Qwen2-VL-2B-Instruct +model_class: TraceableQwen2VLForConditionalGeneration +scheme: FP8_DYNAMIC +lmeval: + model: "hf-multimodal" + model_args: + dtype: bfloat16 + add_bos_token: True + convert_img_format: True + task: mmmu_val_economics + num_fewshot: 0 + limit: 1000 + batch_size: 8 + metrics: + acc,none: 0.333 diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml new file mode 100644 index 000000000..22b5d8419 --- /dev/null +++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml @@ -0,0 +1,19 @@ +cadence: "weekly" +model: llava-hf/llava-1.5-7b-hf +model_class: TraceableLlavaForConditionalGeneration +scheme: INT8_dyn_per_token +recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml +dataset_id: lmms-lab/flickr30k +dataset_split: "test[:512]" +lmeval: + model: "hf-multimodal" + model_args: + dtype: bfloat16 + add_bos_token: True + convert_img_format: True + task: mmmu_val_economics + num_fewshot: 0 + limit: 1000 + metrics: + acc,none: 0.233 + batch_size: 8 \ No newline at end of file diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml new file mode 100644 index 000000000..b7fa161c8 --- /dev/null +++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml @@ -0,0 +1,19 @@ +cadence: "weekly" +model: Qwen/Qwen2-VL-2B-Instruct +model_class: 
TraceableQwen2VLForConditionalGeneration +recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml +dataset_id: lmms-lab/flickr30k +dataset_split: "test[:512]" +scheme: W4A16_actorder_group +lmeval: + model: "hf-multimodal" + model_args: + dtype: bfloat16 + add_bos_token: True + convert_img_format: True + task: mmmu_val_economics + num_fewshot: 0 + limit: 1000 + metrics: + acc,none: 0.4 + batch_size: 4 \ No newline at end of file diff --git a/tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_group.yaml b/tests/lmeval/configs/w4a16_actorder_group.yaml similarity index 59% rename from tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_group.yaml rename to tests/lmeval/configs/w4a16_actorder_group.yaml index c599b740e..9ff9ebacc 100644 --- a/tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_group.yaml +++ b/tests/lmeval/configs/w4a16_actorder_group.yaml @@ -1,11 +1,10 @@ cadence: "weekly" model: meta-llama/Meta-Llama-3-8B-Instruct +scheme: W4A16_actorder_group recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml -num_fewshot: 5 -limit: 1000 -task: "gsm8k" dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft -exact_match,flexible-extract: 0.72 -exact_match,strict-match: 0.72 -scheme: W4A16_actorder_group \ No newline at end of file +lmeval: + metrics: + exact_match,flexible-extract: 0.72 + exact_match,strict-match: 0.72 diff --git a/tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_weight.yaml b/tests/lmeval/configs/w4a16_actorder_weight.yaml similarity index 59% rename from tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_weight.yaml rename to tests/lmeval/configs/w4a16_actorder_weight.yaml index 7297ea4e5..612274218 100644 --- a/tests/e2e/vLLM/lm_eval_configs/w4a16_actorder_weight.yaml +++ b/tests/lmeval/configs/w4a16_actorder_weight.yaml @@ -1,11 +1,10 @@ cadence: "weekly" model: meta-llama/Meta-Llama-3-8B-Instruct +scheme: W4A16_actorder_group recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml -num_fewshot: 5 -limit: 1000 -task: "gsm8k" dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft -exact_match,flexible-extract: 0.72 -exact_match,strict-match: 0.72 -scheme: W4A16_actorder_weight \ No newline at end of file +lmeval: + metrics: + exact_match,flexible-extract: 0.72 + exact_match,strict-match: 0.72 diff --git a/tests/e2e/vLLM/lm_eval_configs/w4a16_grouped_quant.yaml b/tests/lmeval/configs/w4a16_grouped_quant.yaml similarity index 53% rename from tests/e2e/vLLM/lm_eval_configs/w4a16_grouped_quant.yaml rename to tests/lmeval/configs/w4a16_grouped_quant.yaml index a4c7b6244..45728a5b6 100644 --- a/tests/e2e/vLLM/lm_eval_configs/w4a16_grouped_quant.yaml +++ b/tests/lmeval/configs/w4a16_grouped_quant.yaml @@ -1,11 +1,10 @@ cadence: "weekly" model: meta-llama/Meta-Llama-3-8B-Instruct -num_fewshot: 5 -limit: 1000 -task: "gsm8k" -exact_match,flexible-extract: 0.72 -exact_match,strict-match: 0.72 scheme: W4A16 dataset_id: HuggingFaceH4/ultrachat_200k dataset_split: train_sft -quant_type: "GPTQ" \ No newline at end of file +quant_type: "GPTQ" +lmeval: + metrics: + exact_match,flexible-extract: 0.72 + exact_match,strict-match: 0.72 diff --git a/tests/e2e/vLLM/test_lmeval.py b/tests/lmeval/test_lmeval.py similarity index 76% rename from tests/e2e/vLLM/test_lmeval.py rename to tests/lmeval/test_lmeval.py index 4e11123a5..e5b9efcef 100644 --- a/tests/e2e/vLLM/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -6,11 +6,23 @@ import pytest import yaml from loguru import logger +from pydantic import BaseModel from llmcompressor.core 
import active_session from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing from tests.examples.utils import requires_gpu_count + +class LmEvalConfig(BaseModel): + model: str = "hf" + model_args: dict = {"add_bos_token": True, "dtype": "bfloat16"} + task: str = "gsm8k" + num_fewshot: int = 5 + limit: int = 1000 + metrics: dict + batch_size: int = 100 + + try: import lm_eval @@ -51,6 +63,8 @@ def set_up(self): pytest.skip("Skipping test; cadence mismatch") self.model = eval_config["model"] + self.model_class = eval_config.get("model_class", "AutoModelForCausalLM") + self.lmeval = LmEvalConfig(**eval_config.get("lmeval", {})) self.scheme = eval_config.get("scheme") self.dataset_id = eval_config.get("dataset_id") self.dataset_config = eval_config.get("dataset_config") @@ -58,11 +72,6 @@ def set_up(self): self.recipe = eval_config.get("recipe") self.quant_type = eval_config.get("quant_type") self.save_dir = eval_config.get("save_dir") - self.task = eval_config.get("task") - self.num_fewshot = eval_config.get("num_fewshot") - self.limit = eval_config.get("limit") - self.exact_flex = eval_config.get("exact_match,flexible-extract") - self.exact_strict = eval_config.get("exact_match,strict-match") logger.info("========== RUNNING ==============") logger.info(self.scheme) @@ -76,8 +85,9 @@ def test_lm_eval(self): self.set_up() if not self.save_dir: self.save_dir = self.model.split("/")[1] + f"-{self.scheme}" - oneshot_model, tokenizer = run_oneshot_for_e2e_testing( + oneshot_model, processor = run_oneshot_for_e2e_testing( model=self.model, + model_class=self.model_class, device=self.device, num_calibration_samples=self.num_calibration_samples, max_seq_length=self.max_seq_length, @@ -91,7 +101,7 @@ def test_lm_eval(self): logger.info("================= SAVING TO DISK ======================") oneshot_model.save_pretrained(self.save_dir) - tokenizer.save_pretrained(self.save_dir) + processor.save_pretrained(self.save_dir) recipe_path = os.path.join(self.save_dir, "recipe.yaml") # Use the session to fetch the recipe; @@ -104,26 +114,26 @@ def test_lm_eval(self): logger.info("================= Running LM Eval ======================") - model_args = f"pretrained={self.save_dir},add_bos_token=True" + model_args = {"pretrained": self.save_dir} + model_args.update(self.lmeval.model_args) results = lm_eval.simple_evaluate( - model="hf", + model=self.lmeval.model, model_args=model_args, - tasks=[self.task], - num_fewshot=self.num_fewshot, - limit=self.limit, + tasks=[self.lmeval.task], + num_fewshot=self.lmeval.num_fewshot, + limit=self.lmeval.limit, device="cuda:0", - batch_size=100, + batch_size=self.lmeval.batch_size, ) - metrics = results["results"][self.task] - exact_match_strict = metrics.get("exact_match,strict-match") - exact_match_flex = metrics.get("exact_match,flexible-extract") - logger.info("Exact Match, Strict") - logger.info(exact_match_strict) - logger.info("Exact Match, Flex") - logger.info(exact_match_flex) - assert numpy.isclose(exact_match_strict, self.exact_strict, rtol=0.05) - assert numpy.isclose(exact_match_flex, self.exact_flex, rtol=0.05) + metrics = results["results"][self.lmeval.task] + for metric, expected_val in self.lmeval.metrics.items(): + actual_val = metrics.get(metric) + logger.info( + f"Comparing {metric}: Expected {expected_val}, Got {actual_val}" + ) + assert numpy.isclose(expected_val, actual_val, rtol=0.05) + self.tear_down() def tear_down(self): diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 257506784..07bb58b99 100644 --- 
a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -9,7 +9,7 @@ import yaml from datasets import Dataset -from transformers import PreTrainedTokenizer +from transformers import ProcessorMixin from tests.data import CustomTestConfig, TestConfig @@ -125,8 +125,8 @@ def run_cli_command(cmd: List[str], cwd: Optional[Union[str, Path]] = None): return run(cmd, stdout=PIPE, stderr=STDOUT, check=False, encoding="utf-8", cwd=cwd) -def preprocess_tokenize_dataset( - ds: Dataset, tokenizer: PreTrainedTokenizer, max_seq_length: int +def process_dataset( + ds: Dataset, processor: ProcessorMixin, max_seq_length: int ) -> Dataset: """ Helper function to preprocess and tokenize a dataset according to presets @@ -138,11 +138,8 @@ def preprocess_tokenize_dataset( ds_name = ds.info.dataset_name.lower() if ds_name == "gsm8k": - def preprocess(example): - return example - - def tokenize(sample): - return tokenizer( + def process(sample): + return processor( sample["question"], padding=False, max_length=max_seq_length, @@ -152,17 +149,12 @@ def tokenize(sample): elif ds_name == "ultrachat_200k": - def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], + def process(sample): + return processor( + processor.apply_chat_template( + sample["messages"], tokenize=False, - ) - } - - def tokenize(sample): - return tokenizer( - sample["text"], + ), padding=False, max_length=max_seq_length, truncation=True, @@ -171,17 +163,12 @@ def tokenize(sample): elif ds_name == "llm_compression_calibration": - def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["text"], + def process(sample): + return processor( + processor.apply_chat_template( + sample["text"], tokenize=False, - ) - } - - def tokenize(sample): - return tokenizer( - sample["text"], + ), padding=False, max_length=max_seq_length, truncation=True, @@ -190,17 +177,12 @@ def tokenize(sample): elif ds_name == "open-platypus": # use the output rather than the instruction - def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["output"], + def process(sample): + return processor( + processor.apply_chat_template( + sample["output"], tokenize=False, - ) - } - - def tokenize(sample): - return tokenizer( - sample["text"], + ), padding=False, max_length=max_seq_length, truncation=True, @@ -209,32 +191,46 @@ def tokenize(sample): elif ds_name == "slimorca-deduped-cleaned-corrected": # find the first element corresponding to a message from a human - def preprocess(example): + def process(sample): conversation_idx = 0 - for idx, conversation in enumerate(example["conversations"]): + for idx, conversation in enumerate(sample["conversations"]): if conversation["from"] == "human": conversation_idx = idx break - return { - "text": tokenizer.apply_chat_template( - example["conversations"][conversation_idx]["value"], + return processor( + processor.apply_chat_template( + sample["conversations"][conversation_idx]["value"], tokenize=False, - ) - } - - def tokenize(sample): - return tokenizer( - sample["text"], + ), padding=False, max_length=max_seq_length, truncation=True, add_special_tokens=False, ) + elif ds_name == "flickr30k": + + def process(sample): + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What does the image show?"}, + ], + } + ] + return { + "text": processor.apply_chat_template( + messages, + add_generation_prompt=True, + ), + "images": sample["image"], + } + else: raise NotImplementedError(f"Cannot 
preprocess dataset {ds.info.dataset_name}") - ds = ds.map(preprocess) - ds = ds.map(tokenize, remove_columns=ds.column_names) + ds = ds.map(process, remove_columns=ds.column_names) return ds From 77e4f4c9f9f171c16594f24ece6af18a9d6766c8 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 25 Feb 2025 16:40:23 -0500 Subject: [PATCH 09/23] [Dataset Performance] Add num workers on dataset processing - labels, tokenization (#1189) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SUMMARY: * Add `preprocessing_num_workers` to run dataset processing in parallel for 2:4 example. Before: Tokenizing: 371.12 examples/s, Adding labels: 1890.18 examples/s, Tokenizing: 333.39 examples/s ```bash Tokenizing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12802/12802 [00:34<00:00, 371.12 examples/s] Adding labels: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12802/12802 [00:06<00:00, 1890.18 examples/s] Tokenizing: 9%|█████████▌ | 22077/256032 [00:59<11:41, 333.39 examples/s ``` After (num_proc=8): Tokenizing: 2703.93 examples/s, Adding labels: 5524.98 examples/s, Tokenizing: 2925.98 examples/s ```bash Tokenizing (num_proc=8): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12802/12802 [00:04<00:00, 2703.93 examples/s] Adding labels (num_proc=8): 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 12802/12802 [00:02<00:00, 5524.98 examples/s] Tokenizing (num_proc=8): 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 256032/256032 [01:27<00:00, 2925.98 examples/s] ``` TEST PLAN: * Pass existing tests Co-authored-by: Dipika Sikka --- examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py index 2cca58815..d617cff12 100644 --- a/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py +++ b/examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py @@ -33,6 +33,7 @@ bf16 = False # using full precision for training lr_scheduler_type = "cosine" warmup_ratio = 0.1 +preprocessing_num_workers = 8 # this will run the recipe stage by stage: # oneshot sparsification -> finetuning -> oneshot quantization @@ -52,6 +53,7 @@ learning_rate=learning_rate, lr_scheduler_type=lr_scheduler_type, warmup_ratio=warmup_ratio, + preprocessing_num_workers=preprocessing_num_workers, ) logger.info( "llmcompressor does not currently support running compressed models in the marlin24 format." 
# noqa From 5df3a2877c0ebfce09f9fe8e6b1f4c2d6f7edbff Mon Sep 17 00:00:00 2001 From: Eldar Kurtic Date: Wed, 26 Feb 2025 14:18:37 +0100 Subject: [PATCH 10/23] Fix a minor typo (#1191) --- src/llmcompressor/modifiers/quantization/quantization/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/quantization/base.py b/src/llmcompressor/modifiers/quantization/quantization/base.py index 8af5ee41a..3a8946aef 100644 --- a/src/llmcompressor/modifiers/quantization/quantization/base.py +++ b/src/llmcompressor/modifiers/quantization/quantization/base.py @@ -372,5 +372,5 @@ def _check_token_distribution( f"received less than {int(threshold * 100)}% " "of calibration batch tokens " f"({token_count}/{total_token_count} tokens). " - "This could result may harm the quantization quality." + "This could harm the quantization quality." ) From a88b72b02add4c2eac9a63ee9e7b932fea08f7cd Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 26 Feb 2025 15:36:59 -0500 Subject: [PATCH 11/23] [Callbacks] Remove pre_initialize_structure (#1160) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose ## * Remove pre_initialize_structure to simplify the codebase * Fix recipe appending when appending a recipe to a model which already has a recipe * Remove misleading logging messages ``` 2025-02-17T17:48:38.477750-0500 | _check_create_state | INFO - State created for compression lifecycle 2025-02-17T17:48:38.478670-0500 | pre_initialize_structure | INFO - Compression lifecycle structure pre-initialized for 0 modifiers 2025-02-17T17:48:38.478836-0500 | pre_initialize_structure | INFO - Compression lifecycle structure pre-initialized for 0 modifiers ``` ## Prerequisites ## * #1168 ## Follow-ups ## * Remove double initialization ## Changes ## The preinitialization step used to fulfill a few purposes: * Construct the lifecycle state * This is now done by the dataclass directly ```python3 - state: Optional[State] = None + state: Optional[State] = field(default_factory=State) ``` * Populate state with model and recipe * This is now done (and has always been done) by `initialize` * Some functions such as Trainer.init_model attempt to access the model through the session before `initialize` is called. In these cases, we can pass the model directly ```python3 trainer = Trainer( - model_init=get_session_model, + model_init=lambda: model, ``` * Prepend recipes to the recipe.yaml if the model has already been compressed once * Move this logic from preinitialization to the save_pretrained function * Consolidate all save pathways to use the same wrapped method ```python3 def save_pretrained_wrapper(...): update_and_save_recipe(model.name_or_path, save_directory) ``` * Provide a way for modifiers to influence the model after they have already been applied * This can still be enacted via recipe validation, but likely no longer has a use case and shouldn't be done automatically; at most the LLM Compressor should warn if the recipe configuration is invalid / requires modification * Create quantization modifier on GPTQ * This is now done within the `on_initialize` function * In the future, this should be done by a high-level recipe validation step ```python3 def on_initialize(...) - self.on_initialize_structure(state, **kwargs) + self._maybe_build_quant_modifier(state.model) ``` * Remove `EventType.order()` method which is unused * Extend the `Recipe.simplify_recipe` class method to support strings ## Lifecycle ## 1.
`create_session()` (doesn't do much and can be hidden behind `initialize`) 2. `initialize(model=..., recipe=...)` 1. Maybe `start` modifiers 3. `LifecycleCallback.event(...)` 1. Maybe `start/end` modifiers 4. `finalize()` ## Regression Evaluation ## Main ``` vllm (pretrained=/home/kyle/llm-compressor/Meta-Llama-3-8B-Instruct-W4A16-G128,dtype=bfloat16,add_bos_token=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1 | Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| |----------|------:|------|-----:|------|---|-----:|---|-----:| |winogrande| 1|none | 5|acc |↑ |0.7482|± |0.0122| ``` This branch ``` vllm (pretrained=/home/kyle/llm-compressor/Meta-Llama-3-8B-Instruct-W4A16-G128,dtype=bfloat16,add_bos_token=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1 | Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| |----------|------:|------|-----:|------|---|-----:|---|-----:| |winogrande| 1|none | 5|acc |↑ |0.7482|± |0.0122| ``` --------- Signed-off-by: Kyle Sayers --- src/llmcompressor/__init__.py | 1 - src/llmcompressor/core/__init__.py | 2 - src/llmcompressor/core/events/event.py | 31 ----- src/llmcompressor/core/lifecycle.py | 95 ++++----------- src/llmcompressor/core/session.py | 39 ------- src/llmcompressor/core/session_functions.py | 13 +-- src/llmcompressor/modifiers/interface.py | 19 --- src/llmcompressor/modifiers/modifier.py | 31 ----- .../modifiers/quantization/gptq/base.py | 18 ++- src/llmcompressor/modifiers/stage.py | 24 +--- .../pytorch/model_load/helpers.py | 27 +---- src/llmcompressor/recipe/__init__.py | 5 +- src/llmcompressor/recipe/container.py | 110 +++++++++--------- src/llmcompressor/recipe/recipe.py | 20 +++- .../transformers/finetune/runner.py | 12 +- .../transformers/finetune/session_mixin.py | 20 ---- .../transformers/finetune/text_generation.py | 13 +-- .../compressed_tensors_utils.py | 23 ++-- .../transformers/utils/helpers.py | 10 +- src/llmcompressor/utils/fsdp/helpers.py | 69 +---------- .../pruning/sparsegpt/test_pytorch.py | 8 +- .../obcq/test_consecutive_runs.py | 32 +++-- tests/unit/core/events/test_event.py | 7 -- 23 files changed, 159 insertions(+), 470 deletions(-) diff --git a/src/llmcompressor/__init__.py b/src/llmcompressor/__init__.py index 6f174a59e..f979a7453 100644 --- a/src/llmcompressor/__init__.py +++ b/src/llmcompressor/__init__.py @@ -40,7 +40,6 @@ create_session, finalize, initialize, - pre_initialize_structure, reset_session, ) from llmcompressor.entrypoints import Oneshot, oneshot diff --git a/src/llmcompressor/core/__init__.py b/src/llmcompressor/core/__init__.py index 75335164d..47e710943 100644 --- a/src/llmcompressor/core/__init__.py +++ b/src/llmcompressor/core/__init__.py @@ -15,7 +15,6 @@ create_session, finalize, initialize, - pre_initialize_structure, reset_session, ) from llmcompressor.core.state import Data, Hardware, ModifiedState, State @@ -36,7 +35,6 @@ "create_session", "active_session", "reset_session", - "pre_initialize_structure", "initialize", "finalize", "apply", diff --git a/src/llmcompressor/core/events/event.py b/src/llmcompressor/core/events/event.py index 9d5d48d63..89eb780c8 100644 --- a/src/llmcompressor/core/events/event.py +++ b/src/llmcompressor/core/events/event.py @@ -27,7 +27,6 @@ class EventType(Enum): The purpose of each EventType is to trigger the corresponding modifier callback during training or post training pipelines. - :param PRE_INIT: Event type for pre-initialization. :param INITIALIZE: Event type for initialization. 
:param FINALIZE: Event type for finalization. :param BATCH_START: Event type for the start of a batch. @@ -38,7 +37,6 @@ class EventType(Enum): """ # training lifecycle - PRE_INIT = "pre_init" INITIALIZE = "initialize" FINALIZE = "finalize" @@ -51,35 +49,6 @@ class EventType(Enum): OPTIM_PRE_STEP = "optim_pre_step" OPTIM_POST_STEP = "optim_post_step" - def order(self) -> int: - """ - Returns the priority order of the current EventType. - Lower values have higher priority. - - :raises ValueError: if the event type is invalid. - :return: The order of the event type, lower has higher priority. - :rtype: int - """ - if self == EventType.PRE_INIT: - return 0 - elif self == EventType.INITIALIZE: - return 10 - elif self == EventType.FINALIZE: - return 20 - elif self == EventType.BATCH_START: - return 100 - elif self == EventType.LOSS_CALCULATED: - return 110 - elif self == EventType.OPTIM_PRE_STEP: - return 120 - elif self == EventType.OPTIM_POST_STEP: - return 130 - elif self == EventType.BATCH_END: - return 140 - else: - logger.error("Invalid event type: {}", self) - raise ValueError(f"Invalid event type {self}") - @dataclass class Event: diff --git a/src/llmcompressor/core/lifecycle.py b/src/llmcompressor/core/lifecycle.py index b76a57523..e69882800 100644 --- a/src/llmcompressor/core/lifecycle.py +++ b/src/llmcompressor/core/lifecycle.py @@ -18,7 +18,12 @@ ) from llmcompressor.core.state import State from llmcompressor.modifiers import StageModifiers -from llmcompressor.recipe import RecipeContainer +from llmcompressor.recipe import ( + RecipeArgsInput, + RecipeContainer, + RecipeInput, + RecipeStageInput, +) __all__ = ["CompressionLifecycle"] @@ -38,7 +43,7 @@ class CompressionLifecycle: :type event_lifecycle: Optional[EventLifecycle] """ - state: Optional[State] = None + state: State = field(default_factory=State) recipe_container: RecipeContainer = field(default_factory=RecipeContainer) modifiers: List[StageModifiers] = field(default_factory=list) event_lifecycle: Optional[EventLifecycle] = None @@ -62,46 +67,16 @@ def reset(self): except Exception as e: logger.warning(f"Exception during finalizing modifier: {e}") - self.state = None - self.recipe_container = RecipeContainer() - self.modifiers = [] - self.event_lifecycle = None - - self.initialized_ = False - self.finalized = False + self.__init__() logger.info("Compression lifecycle reset") - def pre_initialize_structure(self, **kwargs) -> List[Any]: - """ - Pre-initialize the structure of the compression lifecycle. 
- - :param kwargs: Additional arguments to update the state with - :return: List of data returned from pre-initialization of modifiers - :rtype: List[Any] - """ - logger.debug("Pre-initializing structure") - self._check_create_state() - extras = self.state.update(**kwargs) - extras = self.recipe_container.update(**extras) - - self._check_compile_recipe() - mod_data = [] - for mod in self.modifiers: - data = mod.pre_initialize_structure(state=self.state, **extras) - logger.debug("Pre-initialized modifier: {}", mod) - if data is not None: - mod_data.append(data) - - applied_stage_names = [mod.unique_id for mod in self.modifiers if mod.applied] - self.recipe_container.update_applied_stages(applied_stage_names) - logger.info( - "Compression lifecycle structure pre-initialized for {} modifiers", - len(self.modifiers), - ) - - return mod_data - - def initialize(self, **kwargs) -> List[Any]: + def initialize( + self, + recipe: Optional[RecipeInput] = None, + recipe_stage: Optional[RecipeStageInput] = None, + recipe_args: Optional[RecipeArgsInput] = None, + **kwargs, + ) -> List[Any]: """ Initialize the compression lifecycle. @@ -109,16 +84,18 @@ def initialize(self, **kwargs) -> List[Any]: :return: List of data returned from initialization of modifiers :rtype: List[Any] """ - logger.debug("Initializing compression lifecycle") - self._check_create_state() - extras = self.state.update(**kwargs) - extras = self.recipe_container.update(**extras) + self.state.update(**kwargs) + if self.initialized_: # TODO: do not initialize twice + return - self._check_compile_recipe() + logger.debug("Initializing compression lifecycle") + self.recipe_container.append(recipe, recipe_stage, recipe_args) + self.modifiers = self.recipe_container.get_modifiers() self._set_model_layer_prefix() + mod_data = [] for mod in self.modifiers: - data = mod.initialize(state=self.state, **extras) + data = mod.initialize(state=self.state, **kwargs) logger.debug("Initialized modifier: {}", mod) if data is not None: mod_data.append(data) @@ -185,7 +162,7 @@ def event(self, event_type: EventType, **kwargs) -> List[Any]: logger.error("Cannot invoke event after finalizing") raise ValueError("Cannot invoke event after finalizing") - if event_type in [EventType.PRE_INIT, EventType.INITIALIZE, EventType.FINALIZE]: + if event_type in [EventType.INITIALIZE, EventType.FINALIZE]: logger.error( "Cannot invoke {} event. 
Use the corresponding method instead.", event_type, @@ -223,30 +200,6 @@ def event(self, event_type: EventType, **kwargs) -> List[Any]: return mod_data - def _check_create_state(self): - if self.state is not None: - return - - logger.debug("Creating new State instance for compression lifecycle") - self.state = State() - logger.info("State created for compression lifecycle") - - def _check_compile_recipe(self): - if not self.recipe_container.check_compile_recipe(): - return - - logger.debug( - "Compiling recipe and creating modifiers for compression lifecycle" - ) - self.modifiers = self.recipe_container.compiled_recipe.create_modifier() - for mod in self.modifiers: - if mod.unique_id in self.recipe_container.applied_stages: - mod.applied = True - logger.info( - "Recipe compiled and {} modifiers created", - len(self.modifiers), - ) - def _check_setup_event_lifecycle(self, event_type: EventType): if self.event_lifecycle is not None: return diff --git a/src/llmcompressor/core/session.py b/src/llmcompressor/core/session.py index 888db3f1e..f028510bc 100644 --- a/src/llmcompressor/core/session.py +++ b/src/llmcompressor/core/session.py @@ -65,45 +65,6 @@ def state(self) -> State: """ return self._lifecycle.state - def pre_initialize_structure( - self, - model: Any, - recipe: Union[str, List[str], Recipe, List[Recipe], None] = None, - recipe_stage: Union[str, List[str], None] = None, - recipe_args: Union[Dict[str, Any], List[Dict[str, Any]], None] = None, - **kwargs, - ) -> ModifiedState: - """ - A method to pre-initialize the structure of the model for compression. - This will run the pre-initialize structure method for each modifier in the - session's lifecycle. This will also set the session's state to the - pre-initialized state. Takes care of cases when the model(s) structure - has been previously modified by a modifier. - - :param model: the model to pre-initialize the structure for - :param recipe: the recipe to use for the compression, can be a path to a - recipe file, a raw recipe string, a recipe object, or a list - of recipe objects. 
- :param recipe_stage: the stage to use for the compression - :param recipe_args: the args to use for overriding the recipe defaults - :return: A ModifiedState instance holding the modified model and modifier_data - after pre-initializing the structure - """ - mod_data = self._lifecycle.pre_initialize_structure( - model=model, - recipe=recipe, - recipe_stage=recipe_stage, - recipe_args=recipe_args, - **kwargs, - ) - - return ModifiedState( - model=self.state.model, - optimizer=None, - loss=None, - modifier_data=mod_data, - ) - def initialize( self, recipe: Union[str, List[str], "Recipe", List["Recipe"], None] = None, diff --git a/src/llmcompressor/core/session_functions.py b/src/llmcompressor/core/session_functions.py index da54872c4..4d12f22ff 100644 --- a/src/llmcompressor/core/session_functions.py +++ b/src/llmcompressor/core/session_functions.py @@ -11,7 +11,6 @@ "create_session", "active_session", "reset_session", - "pre_initialize_structure", "initialize", "finalize", "callbacks", @@ -59,16 +58,6 @@ def reset_session(): session._lifecycle.reset() -def pre_initialize_structure(**kwargs): - """ - A method to pre-initialize the structure of the model for the active session - - :param kwargs: the kwargs to pass to the active session's pre-initialize-structure - method - """ - active_session().pre_initialize_structure(**kwargs) - - def initialize( recipe: Union[str, List[str], "Recipe", List["Recipe"], None] = None, recipe_stage: Union[str, List[str], None] = None, @@ -156,7 +145,7 @@ def event(cls, event_type: EventType, **kwargs) -> ModifiedState: :param kwargs: additional kwargs to pass to the current session's event method :return: the modified state of the active session after invoking the event """ - if event_type in [EventType.PRE_INIT, EventType.INITIALIZE, EventType.FINALIZE]: + if event_type in [EventType.INITIALIZE, EventType.FINALIZE]: raise ValueError( f"Cannot invoke {event_type} event. " f"Use the corresponding method instead." 
diff --git a/src/llmcompressor/modifiers/interface.py b/src/llmcompressor/modifiers/interface.py index e3a3786b4..f1c73c54b 100644 --- a/src/llmcompressor/modifiers/interface.py +++ b/src/llmcompressor/modifiers/interface.py @@ -11,15 +11,6 @@ class ModifierInterface(ABC): Defines the contract that all modifiers must implement """ - @property - @abstractmethod - def initialized_structure(self) -> bool: - """ - :return: True if the modifier structure has been - applied to the model - """ - raise NotImplementedError() - @property @abstractmethod def initialized(self) -> bool: @@ -58,16 +49,6 @@ def calculate_end(self) -> float: """ raise NotImplementedError() - @abstractmethod - def pre_initialize_structure(self, state: State, **kwargs): - """ - Apply the modifier structure to the model - - :param state: The current state of the model - :param kwargs: Additional arguments for the modifier - """ - raise NotImplementedError() - @abstractmethod def initialize(self, state: State, **kwargs): """ diff --git a/src/llmcompressor/modifiers/modifier.py b/src/llmcompressor/modifiers/modifier.py index 65b4a4029..4092cc3de 100644 --- a/src/llmcompressor/modifiers/modifier.py +++ b/src/llmcompressor/modifiers/modifier.py @@ -29,20 +29,11 @@ class Modifier(ModifierInterface, HooksMixin): end: Optional[float] = None update: Optional[float] = None - initialized_structure_: bool = False initialized_: bool = False finalized_: bool = False started_: bool = False ended_: bool = False - @property - def initialized_structure(self) -> bool: - """ - :return: True if the modifier structure has been - applied to the model - """ - return self.initialized_structure_ - @property def initialized(self) -> bool: """ @@ -78,15 +69,6 @@ def calculate_end(self) -> float: """ return self.end if self.end is not None else -1 - def pre_initialize_structure(self, state: State, **kwargs): - """ - :param state: The current state of the model - :param kwargs: Additional arguments for initializing the structure - of the model in question - """ - self.on_initialize_structure(state, **kwargs) - self.initialized_structure_ = True - def initialize(self, state: State, **kwargs): """ Initialize the modifier for the given model and state. @@ -221,19 +203,6 @@ def should_end(self, event: Event): return self.end is not None and current >= self.end - def on_initialize_structure(self, state: State, **kwargs): - """ - on_initialize_structure is called before the model is initialized - with the modifier structure. 
- - TODO: Depreciate this function as part of the lifecycle - - :param state: The current state of the model - :param kwargs: Additional arguments for initializing the structure - of the model in question - """ - pass - @abstractmethod def on_initialize(self, state: State, **kwargs) -> bool: """ diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 65e1c90e0..525ba1301 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -62,9 +62,8 @@ class GPTQModifier(Modifier, HooksMixin): | actorder: False Lifecycle: - - on_initialize_structure - - _build_quant_modifier - on_initialize + - _build_quant_modifier - register_hook(module, compress_module, "forward") - run_sequential / run_layer_sequential / run_basic - make_empty_hessian @@ -141,16 +140,16 @@ def validate_sequential_update(cls, value: bool) -> bool: return True - def on_initialize_structure(self, state: State, **kwargs): + def _check_build_quant_modifier(self, model: torch.nn.Module): """ Check the model's quantization state matches that expected by this modifier, adding a default quantization scheme if needed - TODO: Depreciate and fold into `on_initialize` + # TODO: build modifier during recipe validation :param state: session state storing input model and calibration data """ - quantization_already_active = qat_active(state.model) + quantization_already_active = qat_active(model) if isinstance(self.quantize, bool): if not self.quantize and quantization_already_active: logger.warning( @@ -191,18 +190,15 @@ def on_initialize_structure(self, state: State, **kwargs): self._build_quant_modifier_from_dict(self.quantize) self.quantize = True - if self._quantization_modifier: - self._quantization_modifier.on_initialize_structure(state, **kwargs) - def on_initialize(self, state: State, **kwargs) -> bool: """ Initialize and run the GPTQ algorithm on the current state :param state: session state storing input model and calibration data """ - # initialize quantization modifier - if not self.initialized_structure_: - self.on_initialize_structure(state, **kwargs) + # build quantization modifier + self._check_build_quant_modifier(state.model) + if self._quantization_modifier: self._quantization_modifier.initialize(state, **kwargs) if not self.quantize: diff --git a/src/llmcompressor/modifiers/stage.py b/src/llmcompressor/modifiers/stage.py index 7e63245b6..fe773bcb5 100644 --- a/src/llmcompressor/modifiers/stage.py +++ b/src/llmcompressor/modifiers/stage.py @@ -19,7 +19,7 @@ class StageModifiers(ModifierInterface, BaseModel): :param index: The index of the stage, if applicable :param group: The group name of the stage, if applicable :param applied: Flag for indicating if this stage has has already been - applied to the model, through structure initialization or finalization + applied to the model through finalization """ modifiers: List["Modifier"] = Field(default_factory=list) @@ -27,14 +27,6 @@ class StageModifiers(ModifierInterface, BaseModel): group: Optional[str] = None applied: bool = False - @property - def initialized_structure(self) -> bool: - """ - :return: True if any of the stage modifiers have initialized structure, - False otherwise - """ - return any(mod.initialized_structure for mod in self.modifiers) - @property def initialized(self) -> bool: """ @@ -93,20 +85,6 @@ def calculate_end(self) -> float: """ return max(mod.calculate_end() for mod in self.modifiers) - def 
pre_initialize_structure(self, state: "State", **kwargs): - """ - Pre initialize the structure for all stage modifiers mark the stage applied - - :param state: The current state of the training - :param kwargs: Additional kwargs to pass to the modifier(s) - pre_initialize_structure method - """ - for modifier in self.modifiers: - modifier.pre_initialize_structure(state, **kwargs) - - self.applied = True - state.loggers.system.info(tag="stage", string="Model structure initialized") - def initialize(self, state: "State", **kwargs): """ Initialize all the stage modifiers diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index e2e1a91b7..850fba32f 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -8,13 +8,12 @@ from torch.nn import Module from transformers import PreTrainedModel -from llmcompressor.core import active_session, create_session, pre_initialize_structure +from llmcompressor.core import active_session from llmcompressor.typing import Processor COMPLETED_STAGES_FILENAME = "completed_stages.json" __all__ = [ - "initialize_recipe", "copy_python_files_from_model_cache", "fallback_to_cpu", "parse_dtype", @@ -25,30 +24,6 @@ ] -def initialize_recipe(model: Module, recipe_path: str): - """ - Initializes a recipe that has been previously applied to the model - :param model: PyTorch model to apply structure to - :param recipe_path: path to recipe to apply to the model - """ - if not active_session(): - create_session() - pre_initialize_structure(model=model, recipe=recipe_path) - - # no need to reload if no recipe was applied - if recipe_path is None: - return - - session = active_session() - num_stages = len(session.lifecycle.recipe_container.compiled_recipe.stages) - msg = ( - "an unstaged recipe" - if num_stages == 1 - else f"a staged recipe with {num_stages} stages" - ) - logger.info(f"Applied {msg} to the model") - - def save_checkpoint( save_path: str, model: PreTrainedModel, diff --git a/src/llmcompressor/recipe/__init__.py b/src/llmcompressor/recipe/__init__.py index e02a18b39..bb4df06af 100644 --- a/src/llmcompressor/recipe/__init__.py +++ b/src/llmcompressor/recipe/__init__.py @@ -9,7 +9,7 @@ RecipeMetaData, ) from .modifier import RecipeModifier -from .recipe import Recipe, RecipeTuple +from .recipe import Recipe, RecipeArgsInput, RecipeInput, RecipeStageInput, RecipeTuple from .stage import RecipeStage, StageRunType __all__ = [ @@ -26,4 +26,7 @@ "Recipe", "RecipeTuple", "StageRunType", + "RecipeInput", + "RecipeStageInput", + "RecipeArgsInput", ] diff --git a/src/llmcompressor/recipe/container.py b/src/llmcompressor/recipe/container.py index 5cae0dd2c..90c9c1dad 100644 --- a/src/llmcompressor/recipe/container.py +++ b/src/llmcompressor/recipe/container.py @@ -1,8 +1,14 @@ from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from llmcompressor.modifiers import Modifier -from llmcompressor.recipe.recipe import Recipe, RecipeTuple +from llmcompressor.recipe.recipe import ( + Recipe, + RecipeArgsInput, + RecipeInput, + RecipeStageInput, + RecipeTuple, +) __all__ = ["RecipeContainer"] @@ -22,52 +28,42 @@ class RecipeContainer: recipes: List[RecipeTuple] = field(default_factory=list) applied_stages: List[str] = field(default_factory=list) - def update( + def prepend( self, - recipe: Union[ - str, List[str], Recipe, List[Recipe], Modifier, List[Modifier], None - ] = None, - 
recipe_stage: Union[str, List[str], List[List[str]], None] = None, - recipe_args: Union[Dict[str, Any], List[Dict[str, Any]], None] = None, - **kwargs, - ) -> Dict: - """ - Update the recipes in the container. If a recipe is provided, it will - reset any existing compiled_recipe in the container. Must call - `check_compile_recipe` to re-compile the recipes into a single compiled_recipe. - If no recipe is provided, does nothing and returns the kwargs. - - Can provide multiple recipes to update the container with: - >>> container = RecipeContainer() - >>> recipe_str_1 = ''' - ... test_stage: - ... pruning_modifiers: - ... ConstantPruningModifier: - ... start: 0.0 - ... end: 2.0 - ... targets: ['re:.*weight'] - ... ''' - >>> recipe_str_2 = ''' - ... test_stage: - ... pruning_modifiers: - ... ConstantPruningModifier: - ... start: 3.0 - ... end: 4.0 - ... targets: ['re:.*weight'] - ... ''' - >>> result = container.update(recipe=[recipe_str_1, recipe_str_2]) - - :param recipe: the recipe to update the container with - :param recipe_stage: the recipe stage to update the container with - :param recipe_args: the recipe args to update the recipe with - :param kwargs: additional kwargs to return - :return: the passed in kwargs - """ - if recipe is None or isinstance(recipe, list) and len(recipe) == 0: - return kwargs + recipe: Optional[RecipeInput] = None, + recipe_stage: Optional[RecipeStageInput] = None, + recipe_args: Optional[RecipeArgsInput] = None, + ): + recipe_tuples = self._prepare_tuples(recipe, recipe_stage, recipe_args) + self.recipes = recipe_tuples + self.recipes + self._check_compile_recipe() + + def append( + self, + recipe: Optional[RecipeInput] = None, + recipe_stage: Optional[RecipeStageInput] = None, + recipe_args: Optional[RecipeArgsInput] = None, + ): + recipe_tuples = self._prepare_tuples(recipe, recipe_stage, recipe_args) + self.recipes = self.recipes + recipe_tuples + self._check_compile_recipe() - self.compiled_recipe = None + def get_modifiers(self) -> List[Modifier]: + if self.compiled_recipe is None: + return [] + return self.compiled_recipe.create_modifier() + + def _prepare_tuples( + self, + recipe: Optional[RecipeInput] = None, + recipe_stage: Optional[RecipeStageInput] = None, + recipe_args: Optional[RecipeArgsInput] = None, + ) -> List[RecipeTuple]: + if recipe is None or (isinstance(recipe, list) and len(recipe) == 0): + return [] + + # prepare recipe if isinstance(recipe, Modifier) or ( isinstance(recipe, list) and all(isinstance(mod, Modifier) for mod in recipe) @@ -77,6 +73,12 @@ def update( if not isinstance(recipe, list): recipe = [recipe] + recipe = [ + Recipe.create_instance(rec) if isinstance(rec, str) else rec + for rec in recipe + ] + + # prepare stage if recipe_stage is None: recipe_stage = [None] * len(recipe) else: @@ -85,22 +87,23 @@ def update( if not isinstance(recipe_stage[0], list): recipe_stage = [recipe_stage] * len(recipe) + # prepare args if recipe_args is None: recipe_args = [{}] * len(recipe) elif not isinstance(recipe_args, list): recipe_args = [recipe_args] * len(recipe) + # validation if len(recipe) != len(recipe_stage) or len(recipe) != len(recipe_args): raise ValueError( "recipe, recipe_stage, and recipe_args must be the same length" ) - for rec, stage, args in zip(recipe, recipe_stage, recipe_args): - if isinstance(rec, str): - rec = Recipe.create_instance(rec) - self.recipes.append(RecipeTuple(rec, stage, args)) - - return kwargs + # create tuples + return [ + RecipeTuple(rec, stage, args) + for rec, stage, args in zip(recipe, 
recipe_stage, recipe_args) + ] def update_applied_stages(self, new_stages: List[str]): """ @@ -113,7 +116,7 @@ def update_applied_stages(self, new_stages: List[str]): if stage not in self.applied_stages: self.applied_stages.append(stage) - def check_compile_recipe(self) -> bool: + def _check_compile_recipe(self): """ Check if the recipes need to be compiled into a single recipe and compile them if they do. @@ -122,9 +125,6 @@ def check_compile_recipe(self) -> bool: """ if self.compiled_recipe is None and self.recipes: self.compiled_recipe = Recipe.simplify_combine_recipes(self.recipes) - return True - - return False def check_any_recipe_exists(self) -> bool: """ diff --git a/src/llmcompressor/recipe/recipe.py b/src/llmcompressor/recipe/recipe.py index 1e9851ba8..f48c4a568 100644 --- a/src/llmcompressor/recipe/recipe.py +++ b/src/llmcompressor/recipe/recipe.py @@ -14,7 +14,13 @@ from llmcompressor.recipe.metadata import RecipeMetaData from llmcompressor.recipe.stage import RecipeStage -__all__ = ["Recipe", "RecipeTuple"] +__all__ = [ + "Recipe", + "RecipeTuple", + "RecipeInput", + "RecipeStageInput", + "RecipeArgsInput", +] class Recipe(RecipeBase): @@ -150,7 +156,7 @@ def create_instance( @staticmethod def simplify_recipe( - recipe: Union["Recipe", "RecipeTuple"], shift: Optional[int] = None + recipe: Union[str, "Recipe", "RecipeTuple"], shift: Optional[int] = None ) -> "Recipe": """ Simplify a RecipeTuple by removing stages that are not in the target_stages @@ -177,6 +183,9 @@ def simplify_recipe( defaults to None (No shift) :return: The simplified Recipe instance """ + if isinstance(recipe, str): + recipe = Recipe.create_instance(recipe) + if isinstance(recipe, Recipe): recipe.evaluate(shift=shift) return recipe @@ -212,7 +221,7 @@ def simplify_recipe( @staticmethod def simplify_combine_recipes( - recipes: List[Union["Recipe", "RecipeTuple"]], + recipes: List[Union[str, "Recipe", "RecipeTuple"]], ) -> "Recipe": """ A method to combine multiple recipes into one recipe @@ -571,6 +580,11 @@ def _get_yaml_dict(self) -> Dict[str, Any]: return yaml_recipe_dict +RecipeInput = Union[str, List[str], Recipe, List[Recipe], Modifier, List[Modifier]] +RecipeStageInput = Union[str, List[str], List[List[str]]] +RecipeArgsInput = Union[Dict[str, Any], List[Dict[str, Any]]] + + @dataclass class RecipeTuple: """ diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index dd45b7daf..1735a99b8 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -6,6 +6,7 @@ import torch from loguru import logger from torch.utils.data import Dataset +from transformers import PreTrainedModel from llmcompressor.args import ( DatasetArguments, @@ -154,7 +155,9 @@ def train(self, checkpoint: str, stage: Optional[str] = None): # this includes saving the state, optimizer and scheduler self.trainer.save_model(output_dir=self._output_dir) - def run_sequential_stages(self, checkpoint: Optional[str] = None): + def run_sequential_stages( + self, model: PreTrainedModel, checkpoint: Optional[str] = None + ): """ Run the recipe stage by stage, allowing for alternating between one-shot and finetuning flows. Optionally save the model output at the end of each stage @@ -181,12 +184,6 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): "the stage name." 
) - # just load structure if stage has already applied - if stage_name in completed_stages: - self.trainer.initialize_structure(stage=stage) - self.trainer.accelerator.wait_for_everyone() - continue - # setup checkpoint dir, TODO: this should be optional self._output_dir = os.path.join( self.parent_output_dir, "stage_" + stage_name @@ -201,7 +198,6 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None): if run_type is StageRunType.ONESHOT: from llmcompressor import Oneshot - model = get_session_model() self._model_args.model = model oneshot = Oneshot.from_args( diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index c6e35c2fc..27882d7d6 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -17,7 +17,6 @@ create_session, finalize, initialize, - pre_initialize_structure, ) from llmcompressor.metrics import LoggerManager from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import ( @@ -177,25 +176,6 @@ def initialize_session( torch.cuda.empty_cache() - def initialize_structure(self, stage: Optional[str] = None): - """ - Initialize any recipe structural changes such as quantization on the model, - return immediately if session has already been initialized - :param stage: Optional stage of recipe to run, or None to run all stages - """ - session = active_session() - if session.lifecycle.initialized_: - return False - - pre_initialize_structure( - model=self.model, - recipe=self.recipe, - recipe_stage=stage, - recipe_args=self.recipe_args, - ) - logger.info(f"Initialized LLM Compressor structure from recipe {self.recipe}") - torch.cuda.empty_cache() - def finalize_session(self): """ Wrap up training by finalizing all modifiers initialized in the current session diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index c1be354db..9a3623f60 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -40,11 +40,9 @@ RecipeArguments, TrainingArguments, ) -from llmcompressor.core import pre_initialize_structure, reset_session +from llmcompressor.core import reset_session from llmcompressor.pytorch.model_load.helpers import ( fallback_to_cpu, - get_session_model, - initialize_recipe, parse_dtype, save_checkpoint, ) @@ -383,11 +381,6 @@ def main( if isinstance(processor, str) or processor is None: processor = initialize_processor_from_path(model_args, model, teacher) - pre_initialize_structure(model=model) - - # initialize session manager - initialize_recipe(model, None) - # Load datasets stage_runner = StageRunner( model_args=model_args, @@ -401,7 +394,7 @@ def main( calib_dataset = stage_runner.get_dataset_split("calibration") trainer = Trainer( - model_init=get_session_model, + model_init=lambda: model, teacher=teacher, recipe=recipe_args.recipe, recipe_args=recipe_args.recipe_args, @@ -429,7 +422,7 @@ def main( checkpoint = None if last_checkpoint is not None: checkpoint = last_checkpoint - stage_runner.run_sequential_stages(checkpoint) + stage_runner.run_sequential_stages(model, checkpoint) # exit immediately return diff --git a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py index b3ac28383..5959f1699 100644 --- 
a/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py +++ b/src/llmcompressor/transformers/sparsification/compressed_tensors_utils.py @@ -20,6 +20,7 @@ from llmcompressor.core import active_session from llmcompressor.pytorch.model_load.helpers import copy_python_files_from_model_cache +from llmcompressor.recipe.recipe import Recipe from llmcompressor.transformers.compression.quantization_format import ( infer_quantization_format, ) @@ -27,6 +28,7 @@ SparsityConfigMetadata, ) from llmcompressor.transformers.utils import RECIPE_FILE_NAME +from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path __all__ = ["modify_save_pretrained"] @@ -129,7 +131,7 @@ def skip(*args, **kwargs): ) compressor.update_config(save_directory) - # TODO: update existing recipe + # update existing recipe update_and_save_recipe(model.name_or_path, save_directory) # copy python files from cache dir to save_path if any @@ -253,10 +255,17 @@ def get_model_compressor( def update_and_save_recipe(model_path: str, save_directory: str): - # TODO: update existing recipe - recipe_path = os.path.join(save_directory, RECIPE_FILE_NAME) - session = active_session() + recipes_to_save = [] + existing_recipe = infer_recipe_from_model_path(model_path) + if existing_recipe is not None: + recipes_to_save.append(existing_recipe) + + new_recipe = active_session().lifecycle.recipe_container.compiled_recipe + if new_recipe is not None: + recipes_to_save.append(new_recipe) - if (recipe_yaml_str := session.get_serialized_recipe()) is not None: - with open(recipe_path, "w") as fp: - fp.write(recipe_yaml_str) + recipe = Recipe.simplify_combine_recipes(recipes_to_save) + + # save recipe + recipe_path = os.path.join(save_directory, RECIPE_FILE_NAME) + recipe.yaml(recipe_path) diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py index 0d9fd4f7d..cddd45d4f 100644 --- a/src/llmcompressor/transformers/utils/helpers.py +++ b/src/llmcompressor/transformers/utils/helpers.py @@ -109,7 +109,7 @@ def infer_recipe_from_model_path(model_path: Union[str, Path]) -> Optional[str]: recipe = recipe_from_huggingface_model_id(hf_stub=model_path) if recipe is None: - logger.info("Failed to infer the recipe from the model_path") + logger.debug("Failed to infer the recipe from the model_path") return recipe @@ -140,14 +140,10 @@ def recipe_from_huggingface_model_id( return None try: - logger.info( - "Attempting to download a recipe ", - f"{hf_stub} " f"from {HUGGINGFACE_CO_URL_HOME}", - ) recipe = hf_hub_download(repo_id=hf_stub, filename=recipe_file_name) logger.info(f"Found recipe: {recipe_file_name} for model ID: {hf_stub}.") - except Exception as e: - logger.error( + except Exception as e: # TODO: narrow acceptable exceptions + logger.debug( ( f"Unable to find recipe {recipe_file_name} " f"for model ID: {hf_stub}: {e}." 
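For context on the `update_and_save_recipe` change above: the new save path combines the recipe already shipped with the loaded model and the recipe compiled in the active session before writing a single `recipe.yaml`. The sketch below illustrates that combining step in isolation, using the raw-YAML recipe form accepted by `Recipe.create_instance`; the stage contents and the save directory are illustrative assumptions, not values from this patch.

```python3
from llmcompressor.recipe import Recipe

# Recipe that was already saved alongside a previously compressed model
# (assumed contents, mirroring the pruning-recipe shape used in the tests)
existing_recipe_yaml = """
test_stage_0:
  pruning_modifiers:
    ConstantPruningModifier:
      start: 0.0
      end: 2.0
      targets: ['re:.*weight']
"""

# Recipe compiled by the currently active session (assumed contents)
new_recipe_yaml = """
test_stage_1:
  pruning_modifiers:
    ConstantPruningModifier:
      start: 3.0
      end: 4.0
      targets: ['re:.*weight']
"""

# Combine both into one recipe and serialize it, existing recipe first,
# which is the ordering used by the updated `update_and_save_recipe`
combined = Recipe.simplify_combine_recipes(
    [
        Recipe.create_instance(existing_recipe_yaml),
        Recipe.create_instance(new_recipe_yaml),
    ]
)
combined.yaml("./save_directory/recipe.yaml")  # hypothetical save location
```

This is the behavior exercised by the consecutive-runs test later in this patch, which expects both `test_stage_0` and `test_stage_1` to appear in the saved `recipe.yaml`.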
diff --git a/src/llmcompressor/utils/fsdp/helpers.py b/src/llmcompressor/utils/fsdp/helpers.py index 53fc04ca8..51c08010b 100644 --- a/src/llmcompressor/utils/fsdp/helpers.py +++ b/src/llmcompressor/utils/fsdp/helpers.py @@ -1,19 +1,11 @@ import operator -from pathlib import Path from typing import Optional -from loguru import logger - try: - from torch.distributed.fsdp import ( - FullStateDictConfig, - FullyShardedDataParallel, - StateDictType, - ) + from torch.distributed.fsdp import FullyShardedDataParallel except ImportError: FullyShardedDataParallel = None -import torch from torch.nn import Module from llmcompressor.core.state import State @@ -22,9 +14,7 @@ "is_fsdp_model", "maybe_get_wrapped", "set_wrapped_model", - "save_pretrained_fsdp", "get_fsdp_parent", - "find_and_move_state_dicts_to_cpu", ] @@ -68,63 +58,6 @@ def set_wrapped_model(state: State, wrapped_model: Module): state.model = wrapped_model -def find_and_move_state_dicts_to_cpu(output_dir: str): - """ - Looks for state dicts in the output directory and overwrites them - with cpu state dicts. - - this is needed for quantized models trained with FSDP as the state dict - contains device information, which can cause issues when loading the model - using transformers AutoModel.from_pretrained(...) if the device information - is not removed, assumes the state dicts are named pytorch_model*.bin - """ - - for model_file in Path(output_dir).rglob("pytorch_model*.bin"): - loaded_dict = torch.load(model_file) - for key, value in loaded_dict.items(): - if isinstance(value, torch.Tensor): - loaded_dict[key] = value.cpu() - - torch.save(loaded_dict, model_file) - logger.info(f"Moved state dict {model_file} to cpu") - - -def save_pretrained_fsdp( - model, - accelerator, - output_dir, - save_safetensors: bool = True, - save_compressed: bool = False, -): - full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) - """ - Gathers the full FSDP state dict of the model onto rank0 GPU, then uses it to save - the pretrained FSDP model to disk - - :param model: model to save - :param accelerator: Accelerator instance used to perform unwrapping - :param output_dir: where to save output model - :param save_safetensors: True to safe in safetensors format, otherwise .bin - :param save_compressed: whether to compress sparse weights on disk - """ - with FullyShardedDataParallel.state_dict_type( - model, StateDictType.FULL_STATE_DICT, full_state_dict_config - ): - state_dict = accelerator.get_state_dict(model, unwrap=False) - - if accelerator.is_main_process: - accelerator.unwrap_model(model).save_pretrained( - output_dir, - is_main_process=accelerator.is_main_process, - save_function=accelerator.save, - state_dict=state_dict, - save_compressed=save_compressed, - safe_serialization=save_safetensors, - ) - - accelerator.wait_for_everyone() - - def get_fsdp_parent(layer_name: str, model: Module) -> Optional[Module]: """ Gets the closest parent of layer_name that is wrapped by FSDP. 
If no FSDP wrapper diff --git a/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py b/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py index c1f0cb425..0c5ad534d 100644 --- a/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py +++ b/tests/llmcompressor/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py @@ -71,7 +71,7 @@ def test_create_default_quant_modifier(self): assert modifier._quantization_modifier is None testing_harness = LifecyleTestingHarness(model=LinearNet()) - modifier.on_initialize_structure(testing_harness.get_state()) + modifier._check_build_quant_modifier(testing_harness.get_state().model) assert modifier.quantize assert isinstance(modifier._quantization_modifier, QuantizationModifier) modifier._quantization_modifier.create_init_config() @@ -105,10 +105,6 @@ def test_set_quant_if_modifer_already_exists(self): modifier = GPTQModifier(block_size=128) assert not modifier._quantization_modifier - - modifier.on_initialize_structure(testing_harness.get_state()) - # since quantization modifier is already applied, quantization must be set in - # GPTQ assert modifier.quantize @@ -142,7 +138,7 @@ def test_set_quant_in_gptq(self): assert modifier._quantization_modifier is None testing_harness = LifecyleTestingHarness(model=LinearNet()) - modifier.on_initialize_structure(testing_harness.get_state()) + modifier._check_build_quant_modifier(testing_harness.get_state().model) assert modifier.quantize self.assertIsInstance(modifier._quantization_modifier, QuantizationModifier) diff --git a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py index b1331afc0..b2176d0fe 100644 --- a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py +++ b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py @@ -8,8 +8,8 @@ from transformers import AutoModelForCausalLM from transformers.utils.quantization_config import CompressedTensorsConfig +from llmcompressor.recipe import Recipe from llmcompressor.transformers.utils import is_model_ct_quantized_from_path -from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path from tests.testing_utils import parse_params, requires_gpu CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs" @@ -28,7 +28,6 @@ def _test_consecutive_runs( from llmcompressor import oneshot from llmcompressor.core import active_session - from llmcompressor.pytorch.model_load.helpers import initialize_recipe from llmcompressor.pytorch.utils.helpers import tensor_sparsity from llmcompressor.utils.pytorch import qat_active @@ -61,11 +60,7 @@ def _test_consecutive_runs( self.assertEqual(len(stages), 1) session.reset() - recipe = infer_recipe_from_model_path(model_path=self.output_first) - if recipe: - initialize_recipe(model=first_model, recipe_path=recipe) - - # reload saved model and up sparsity to 0.7 + # reload saved model and increase sparsity to 0.7 oneshot( model=self.output_first, dataset=self.dataset, @@ -87,11 +82,6 @@ def _test_consecutive_runs( assert math.isclose(layer_0_sparse.item(), 0.7, rel_tol=tolerance) assert qat_active(second_model) - session = active_session() - session_recipe = session.lifecycle.recipe_container.compiled_recipe - stages = [stage.group for stage in session_recipe.stages] - self.assertEqual(len(stages), 2) - recipe_path = self.output_second / "recipe.yaml" recipe_data = yaml.safe_load(recipe_path.read_text()) stage_keys = recipe_data.keys() @@ -99,6 
+89,24 @@ def _test_consecutive_runs( self.assertIn("test_stage_0", stage_keys) self.assertIn("test_stage_1", stage_keys) + # check saved modifier names are same + stage0_modifier_names = list( + list(recipe_data["test_stage_0"].values())[0].keys() + ) + exp_stage0_modifier_names = [ + mod.type + for mod in Recipe.create_instance(self.first_recipe).stages[0].modifiers + ] + stage1_modifier_names = list( + list(recipe_data["test_stage_1"].values())[0].keys() + ) + exp_stage1_modifier_names = [ + mod.type + for mod in Recipe.create_instance(self.second_recipe).stages[0].modifiers + ] + self.assertEqual(stage0_modifier_names, exp_stage0_modifier_names) + self.assertEqual(stage1_modifier_names, exp_stage1_modifier_names) + def tearDown(self): shutil.rmtree(self.output) diff --git a/tests/unit/core/events/test_event.py b/tests/unit/core/events/test_event.py index 06f78ec53..de18dcb28 100644 --- a/tests/unit/core/events/test_event.py +++ b/tests/unit/core/events/test_event.py @@ -3,13 +3,6 @@ from llmcompressor.core import Event, EventType -@pytest.mark.smoke -def test_event_type_order(): - assert EventType.PRE_INIT.order() == 0 - assert EventType.INITIALIZE.order() == 10 - assert EventType.FINALIZE.order() == 20 - - @pytest.mark.smoke def test_event_epoch_based(): event = Event(steps_per_epoch=10) From 29ddedbd92a1e0e74d70aab6841ad9ba12a57dc6 Mon Sep 17 00:00:00 2001 From: Domenic Barbuzzi Date: Wed, 26 Feb 2025 15:37:27 -0500 Subject: [PATCH 12/23] Make `transformers-tests` job conditional on files changed (#1197) SUMMARY: Make `transformers-tests` job conditional on files changed by way of using the `tj-actions/changed-files@v45` action to check which files changed (ignoring files/folders based on a separate trial: #1194). TEST PLAN: Verify this PR: 1. Correctly results in skipping the `transformers-tests` job 2. 
Allows merging with the skipped job --------- Signed-off-by: Domenic Barbuzzi --- .../workflows/test-check-transformers.yaml | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-check-transformers.yaml b/.github/workflows/test-check-transformers.yaml index f3911dfe9..f34359406 100644 --- a/.github/workflows/test-check-transformers.yaml +++ b/.github/workflows/test-check-transformers.yaml @@ -15,9 +15,41 @@ env: CLEARML_API_SECRET_KEY: ${{ secrets.CLEARML_API_SECRET_KEY }} jobs: + detect-changes: + runs-on: ubuntu-latest + + outputs: + changes-present: ${{ steps.changed-files.outputs.any_modified }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v45 + with: + files: | + ** + !examples/** + !tests/e2e/** + !tests/lmeval/** + !tests/examples/** + !**/*.md + !.github/** + .github/workflows/test-check-transformers.yaml + + - name: Log relevant output + run: | + echo "changes-present: ${{ steps.changed-files.outputs.any_modified }}" + echo "all modified files: ${{ steps.changed-files.outputs.all_modified_files }}" + shell: bash + transformers-tests: + needs: [detect-changes] runs-on: gcp-k8s-vllm-l4-solo - if: contains(github.event.pull_request.labels.*.name, 'ready') || github.event_name == 'push' + if: (contains(github.event.pull_request.labels.*.name, 'ready') || github.event_name == 'push') && needs.detect-changes.outputs.changes-present == 'true' steps: - uses: actions/setup-python@v5 with: From 06162fc5baca2351490a86d333df4e681291ddf9 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 27 Feb 2025 15:57:40 -0500 Subject: [PATCH 13/23] Update finetune tests to decrease execution time (#1208) Summary - Update such that we're using less data or fewer epochs - Decreases finetune test time from 20 minutes to about 4 --- .../finetune/data/test_dataset_helpers.py | 2 +- .../finetune/data/test_dataset_loading.py | 18 ++++++++---------- .../finetune_oneshot_configs/config.yaml | 2 +- .../finetune/test_oneshot_and_finetune.py | 5 +++-- .../finetune/test_oneshot_then_finetune.py | 19 ++++++++----------- 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py index 7b475fdb5..aa55a752c 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py @@ -23,7 +23,7 @@ def test_combined_datasets(): @pytest.mark.unit def test_separate_datasets(): - splits = {"train": "train[:10%]", "validation": "train[10%:20%]"} + splits = {"train": "train[:5%]", "validation": "train[5%:7%]"} data_args = DatasetArguments( dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" ) diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py index dcc602877..db539a74c 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py @@ -67,7 +67,7 @@ def test_no_padding_tokenization(self): op_manager = TextGenerationDataset.load_from_registry( self.data_args.dataset, data_args=self.data_args, - split="train[5%:10%]", + split="train[5%:7%]", processor=self.tiny_llama_tokenizer, ) dataset = op_manager.load_dataset() # 
load @@ -82,7 +82,7 @@ def test_no_padding_tokenization(self): ex_item = dataset[0]["text"] self.assertIn("Below is an instruction that describes a task", ex_item) - self.assertEqual(dataset.split, "train[5%:10%]") + self.assertEqual(dataset.split, "train[5%:7%]") tokenized_dataset = op_manager() self.assertIn("input_ids", tokenized_dataset.features) self.assertIn("labels", tokenized_dataset.features) @@ -107,7 +107,7 @@ def test_max_seq_len_clipped(self): op_manager = TextGenerationDataset.load_from_registry( self.data_args.dataset, data_args=self.data_args, - split="train[80%:]", + split="train[95%:]", processor=self.tiny_llama_tokenizer, ) @@ -136,7 +136,7 @@ def test_dataset_kwargs_and_percentages(self): c4_manager_a = TextGenerationDataset.load_from_registry( self.data_args.dataset, data_args=self.data_args, - split="train[5%:10%]", + split="train[5%:6%]", processor=self.tiny_llama_tokenizer, ) raw_dataset_a = c4_manager_a.load_dataset() @@ -144,7 +144,7 @@ def test_dataset_kwargs_and_percentages(self): c4_manager_b = TextGenerationDataset.load_from_registry( self.data_args.dataset, data_args=self.data_args, - split="train[5%:15%]", + split="train[6%:8%]", processor=self.tiny_llama_tokenizer, ) raw_dataset_b = c4_manager_b.load_dataset() @@ -162,7 +162,7 @@ def prepare_fixture(self, tiny_llama_tokenizer): [ ["ptb", "penn_treebank", "train[:5%]", False], ["gsm8k", "main", "train[:5%]", True], - ["ultrachat_200k", "default", "train_sft[:2%]", False], + ["ultrachat_200k", "default", "train_sft[:1%]", False], ] ) def test_datasets(self, dataset_key, dataset_config, split, do_concat): @@ -271,9 +271,7 @@ class TestSplitLoading(unittest.TestCase): def prepare_fixture(self, tiny_llama_tokenizer): self.tiny_llama_tokenizer = tiny_llama_tokenizer - @parameterized.expand( - [["train"], ["train[60%:]"], [{"train": "train[:20%]"}], [None]] - ) + @parameterized.expand([["train[95%:]"], [{"train": "train[:5%]"}]]) def test_split_loading(self, split_def): data_args = DatasetArguments( dataset="open_platypus", @@ -302,7 +300,7 @@ class TestTokenizationDataset(unittest.TestCase): def prepare_fixture(self, tiny_llama_tokenizer): self.tiny_llama_tokenizer = tiny_llama_tokenizer dataset = load_dataset("garage-bAInd/Open-Platypus")["train"] - self.num_calib_samples = 256 + self.num_calib_samples = 64 self.max_seq_len = 512 self.dataset = dataset.shuffle(seed=42).select(range(self.num_calib_samples)) diff --git a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml b/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml index 044407c5d..30b4658cb 100644 --- a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml +++ b/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml @@ -4,5 +4,5 @@ model: "Xenova/llama2.c-stories15M" dataset: wikitext dataset_config_name: "wikitext-2-raw-v1" recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" -num_train_epochs: 1 +num_train_epochs: 0.25 concat_txt: False \ No newline at end of file diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py index 870503496..d3bc611d0 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py @@ -19,9 +19,9 @@ class TestOneshotAndFinetune(unittest.TestCase): def _test_oneshot_and_finetune(self): from 
llmcompressor.transformers import apply - splits = {"train": "train[:30%]", "calibration": "train[30%:40%]"} + splits = {"train": "train[:5%]", "calibration": "train[5%:10%]"} if self.dataset == "ultrachat-200k": - splits = {"train": "train_gen[:30%]", "calibration": "train_gen[30%:40%]"} + splits = {"train": "train_gen[:5%]", "calibration": "train_gen[5%:10%]"} apply( model=self.model, @@ -30,6 +30,7 @@ def _test_oneshot_and_finetune(self): output_dir=self.output, recipe=self.recipe, num_train_epochs=self.num_train_epochs, + num_calibration_samples=64, concatenate_data=self.concat_txt, splits=splits, oneshot_device=self.device, diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py index d4d65469d..e8e0ae426 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py @@ -27,7 +27,7 @@ def test_oneshot_sparsification_then_finetune(self): concatenate_data = False num_calibration_samples = 64 output_dir = self.output / "oneshot_out" - splits = {"calibration": "train[:10%]"} + splits = {"calibration": "train[:5%]"} with create_session(): oneshot( @@ -56,8 +56,7 @@ def test_oneshot_sparsification_then_finetune(self): dataset = "open_platypus" concatenate_data = False output_dir = self.output / "finetune_out" - splits = "train[:50%]" - max_steps = 25 + splits = "train[5%:7%]" with create_session(): train( @@ -65,11 +64,10 @@ def test_oneshot_sparsification_then_finetune(self): distill_teacher=distill_teacher, dataset=dataset, output_dir=output_dir, - num_calibration_samples=num_calibration_samples, + num_train_epochs=0.05, recipe=recipe_str, concatenate_data=concatenate_data, splits=splits, - max_steps=max_steps, ) # test reloading checkpoint and final model @@ -85,11 +83,10 @@ def test_oneshot_sparsification_then_finetune(self): distill_teacher=distill_teacher, dataset=dataset, output_dir=output_dir, - num_calibration_samples=num_calibration_samples, + num_train_epochs=0.05, recipe=recipe_str, concatenate_data=concatenate_data, splits=splits, - max_steps=max_steps, resume_from_checkpoint=True, # use last checkpoint ) @@ -106,7 +103,7 @@ def test_oneshot_quantization_then_finetune(self): concatenate_data = False num_calibration_samples = 64 output_dir = self.output / "oneshot_out" - splits = {"calibration": "train[:10%]"} + splits = {"calibration": "train[:5%]"} with create_session(): oneshot( @@ -130,17 +127,17 @@ def test_oneshot_quantization_then_finetune(self): dataset = "open_platypus" concatenate_data = False output_dir = self.output / "finetune_out" - splits = {"calibration": "train[:10%]", "train": "train[:10%]"} + splits = {"calibration": "train[:5%]", "train": "train[5%:7%]"} with create_session(): train( model=model, dataset=dataset, output_dir=output_dir, - num_calibration_samples=num_calibration_samples, recipe=recipe, concatenate_data=concatenate_data, splits=splits, + num_train_epochs=0.05, ) # test reloading checkpoint and final model @@ -152,10 +149,10 @@ def test_oneshot_quantization_then_finetune(self): model=model, dataset=dataset, output_dir=output_dir, - num_calibration_samples=num_calibration_samples, recipe=recipe, concatenate_data=concatenate_data, splits=splits, + num_train_epochs=0.05, resume_from_checkpoint=True, # use last checkpoint ) From 203c9b72d302c9bedbc8b729b952237d2678ab01 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 28 Feb 2025 09:41:48 -0500 
Subject: [PATCH 14/23] Update transformers tests to speed-up execution (#1211) Summary - Dont calculate extra perplexity calculation for small models - Don't process more data than what it is actually required for calibration - Move some decompression/runcompressed test cases to nightly - From 23 minutes to about 5 --- .../compression/configs/channelwise_15m.yaml | 3 +-- .../transformers/compression/configs/fp8_15m.yaml | 3 +-- .../transformers/compression/configs/inputs_15m.yaml | 3 +-- .../compression/configs/weights_only_1.1b.yaml | 3 +-- .../compression/configs/weights_only_15m.yaml | 3 +-- .../compression/decompression_configs/w4a16.yaml | 2 +- .../decompression_configs/w8a16_dense.yaml | 2 +- .../transformers/compression/test_decompress.py | 11 ++++++----- .../transformers/compression/test_quantization.py | 6 ++++-- .../transformers/compression/test_run_compressed.py | 11 ++++++----- 10 files changed, 23 insertions(+), 24 deletions(-) diff --git a/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml b/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml index 7cf010f66..628521890 100644 --- a/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml +++ b/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml @@ -1,5 +1,4 @@ cadence: "commit" test_type: "regression" model_stub: "Xenova/llama2.c-stories15M" -new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" -ppl_threshold: 30000 \ No newline at end of file +new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml b/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml index 9b136327a..6837be14e 100644 --- a/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml +++ b/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml @@ -1,5 +1,4 @@ cadence: "commit" test_type: "regression" model_stub: "Xenova/llama2.c-stories15M" -new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" -ppl_threshold: 30000 \ No newline at end of file +new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml b/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml index 38981e2ca..ca3c1286b 100644 --- a/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml +++ b/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml @@ -1,5 +1,4 @@ cadence: "commit" test_type: "regression" model_stub: "Xenova/llama2.c-stories15M" -new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" -ppl_threshold: 30000 \ No newline at end of file +new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml b/tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml index 3c9d18f2c..50ccd0aa3 100644 --- a/tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml +++ b/tests/llmcompressor/transformers/compression/configs/weights_only_1.1b.yaml @@ -1,5 +1,4 @@ cadence: "nightly" test_type: "regression" model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" -new_recipe: 
"tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -ppl_threshold: 20 \ No newline at end of file +new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml b/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml index 564c961a0..d7aa73f58 100644 --- a/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml +++ b/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml @@ -1,5 +1,4 @@ cadence: "commit" test_type: "regression" model_stub: "Xenova/llama2.c-stories15M" -new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" -ppl_threshold: 30000 \ No newline at end of file +new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml index 144044f28..330023a80 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml @@ -1,4 +1,4 @@ -cadence: "commit" +cadence: "nightly" test_type: "regression" compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed" skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml index 95e73b148..337e6c19e 100644 --- a/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml +++ b/tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml @@ -1,4 +1,4 @@ -cadence: "commit" +cadence: "nightly" test_type: "regression" compressed_model_stub: "nm-testing/tinyllama-w8a16-dense" skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" \ No newline at end of file diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py index 616dd0dfe..b18cba80e 100644 --- a/tests/llmcompressor/transformers/compression/test_decompress.py +++ b/tests/llmcompressor/transformers/compression/test_decompress.py @@ -3,6 +3,7 @@ import tempfile import unittest +import torch from compressed_tensors import QUANTIZATION_CONFIG_NAME from compressed_tensors.compressors import ModelCompressor from compressed_tensors.quantization import QuantizationStatus @@ -113,16 +114,16 @@ def test_hf_quantizer_decompress_match_manual_decompress(self): ) inputs = inputs.to(self.decompressed_model_manual.device) - decompressed_model_manual_output = self.tokenizer.batch_decode( - self.decompressed_model_manual.generate(**inputs, max_length=50) + decompressed_model_manual_output = self.decompressed_model_manual.generate( + **inputs, max_length=50 ) - decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode( + decompressed_model_hf_quantizer_out = ( self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50) ) - assert ( - decompressed_model_hf_quantizer_out == decompressed_model_manual_output + assert torch.equal( + decompressed_model_hf_quantizer_out, decompressed_model_manual_output ) @classmethod diff --git 
a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index e68d8b42a..d7a4bdba7 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -55,7 +55,7 @@ def tearDownClass(cls): @staticmethod def _run_oneshot(model, recipe, dataset, output_dir): - num_calibration_samples = 512 + num_calibration_samples = 64 max_seq_length = 512 pad_to_max_length = False @@ -68,7 +68,7 @@ def _run_oneshot(model, recipe, dataset, output_dir): recipe=recipe, pad_to_max_length=pad_to_max_length, clear_sparse_session=False, - splits={"calibration": "train_gen[:5%]"}, + splits={"calibration": "train_gen[:1%]"}, save_compressed=False, ) return model @@ -142,6 +142,8 @@ def _get_dataloader(self, data_args, tokenizer): @torch.no_grad() def test_perplexity(self): + if self.ppl_threshold is None: + pytest.skip("Skipping perplexity calculation.") tokenizer = AutoTokenizer.from_pretrained(self.model_stub) data_args = DatasetArguments( dataset="ultrachat-200k", diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py index 616dd0dfe..b18cba80e 100644 --- a/tests/llmcompressor/transformers/compression/test_run_compressed.py +++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py @@ -3,6 +3,7 @@ import tempfile import unittest +import torch from compressed_tensors import QUANTIZATION_CONFIG_NAME from compressed_tensors.compressors import ModelCompressor from compressed_tensors.quantization import QuantizationStatus @@ -113,16 +114,16 @@ def test_hf_quantizer_decompress_match_manual_decompress(self): ) inputs = inputs.to(self.decompressed_model_manual.device) - decompressed_model_manual_output = self.tokenizer.batch_decode( - self.decompressed_model_manual.generate(**inputs, max_length=50) + decompressed_model_manual_output = self.decompressed_model_manual.generate( + **inputs, max_length=50 ) - decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode( + decompressed_model_hf_quantizer_out = ( self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50) ) - assert ( - decompressed_model_hf_quantizer_out == decompressed_model_manual_output + assert torch.equal( + decompressed_model_hf_quantizer_out, decompressed_model_manual_output ) @classmethod From 510547564b0bca646120675d07422c31e809815e Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 28 Feb 2025 13:53:55 -0800 Subject: [PATCH 15/23] Fix logging bug in oneshot.py (#1213) SUMMARY: Changed `logger.waning` to `logger.warning` in `oneshot.py` TEST PLAN: `make test` passes Signed-off-by: Aman Gupta --- src/llmcompressor/entrypoints/oneshot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 9c57b423a..1440c08ad 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -361,7 +361,7 @@ def _get_output_dir_from_argv() -> Optional[str]: # raise depreciation warnings if data_args.remove_columns is not None: - logger.waning( + logger.warning( "`remove_columns` argument is depreciated. 
When tokenizing datasets, all " "columns which are invalid inputs the tokenizer will be removed", DeprecationWarning, From 7bb517f64101aed8da8a2023adc92405c4a27313 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 3 Mar 2025 14:39:38 -0500 Subject: [PATCH 16/23] [Training] Decouple Argument parser (#1207) Order of reviews: https://github.com/vllm-project/llm-compressor/pull/1206 https://github.com/vllm-project/llm-compressor/pull/1207 <-- Here https://github.com/vllm-project/llm-compressor/pull/1209 https://github.com/vllm-project/llm-compressor/pull/1212 https://github.com/vllm-project/llm-compressor/pull/1214 SUMMARY: * Decouple arg parser to be used for both oneshot and train TEST PLAN: * Pass tests --- src/llmcompressor/args/__init__.py | 1 + src/llmcompressor/args/utils.py | 73 ++++++++++++++++++ src/llmcompressor/entrypoints/oneshot.py | 74 ++----------------- .../llmcompressor/entrypoints/test_oneshot.py | 4 +- 4 files changed, 82 insertions(+), 70 deletions(-) create mode 100644 src/llmcompressor/args/utils.py diff --git a/src/llmcompressor/args/__init__.py b/src/llmcompressor/args/__init__.py index d60435c42..26ad530b6 100644 --- a/src/llmcompressor/args/__init__.py +++ b/src/llmcompressor/args/__init__.py @@ -4,3 +4,4 @@ from .model_arguments import ModelArguments from .recipe_arguments import RecipeArguments from .training_arguments import TrainingArguments +from .utils import parse_args diff --git a/src/llmcompressor/args/utils.py b/src/llmcompressor/args/utils.py new file mode 100644 index 000000000..810d2f6ab --- /dev/null +++ b/src/llmcompressor/args/utils.py @@ -0,0 +1,73 @@ +from typing import Tuple + +from loguru import logger +from transformers import HfArgumentParser + +from llmcompressor.args import ( + DatasetArguments, + ModelArguments, + RecipeArguments, + TrainingArguments, +) +from llmcompressor.transformers.utils.helpers import resolve_processor_from_model_args + + +def parse_args( + include_training_args: bool = False, **kwargs +) -> Tuple[ModelArguments, DatasetArguments, RecipeArguments, TrainingArguments, str]: + """ + Keyword arguments passed in from `oneshot` or `train` will + separate the arguments into the following: + + * ModelArguments in + src/llmcompressor/args/model_args.py + * DatasetArguments in + src/llmcompressor/args/dataset_args.py + * RecipeArguments in + src/llmcompressor/args/recipe_args.py + * TrainingArguments in + src/llmcompressor/args/training_args.py + + ModelArguments, DatasetArguments, and RecipeArguments are used for both + `oneshot` and `train`. TrainingArguments is only used for `train`. 
+ + """ + + # pop output_dir, used as an attr in TrainingArguments, where oneshot is not used + output_dir = kwargs.pop("output_dir", None) + + parser_args = (ModelArguments, DatasetArguments, RecipeArguments) + if include_training_args: + parser_args += (TrainingArguments,) + + parser = HfArgumentParser(parser_args) + parsed_args = parser.parse_dict(kwargs) + + training_args = None + if include_training_args: + model_args, dataset_args, recipe_args, training_args = parsed_args + if output_dir is not None: + training_args.output_dir = output_dir + else: + model_args, dataset_args, recipe_args = parsed_args + + if recipe_args.recipe_args is not None: + if not isinstance(recipe_args.recipe_args, dict): + arg_dict = {} + for recipe_arg in recipe_args.recipe_args: + key, value = recipe_arg.split("=") + arg_dict[key] = value + recipe_args.recipe_args = arg_dict + + # raise depreciation warnings + if dataset_args.remove_columns is not None: + logger.warn( + "`remove_columns` argument is depreciated. When tokenizing datasets, all " + "columns which are invalid inputs the tokenizer will be removed", + DeprecationWarning, + ) + + # silently assign tokenizer to processor + resolve_processor_from_model_args(model_args) + + return model_args, dataset_args, recipe_args, training_args, output_dir diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index 1440c08ad..ecdebf46b 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -1,11 +1,11 @@ from pathlib import PosixPath -from typing import Optional, Tuple +from typing import Optional from loguru import logger from torch.utils.data import DataLoader -from transformers import HfArgumentParser, PreTrainedModel +from transformers import PreTrainedModel -from llmcompressor.args import DatasetArguments, ModelArguments, RecipeArguments +from llmcompressor.args import parse_args from llmcompressor.core.session_functions import active_session from llmcompressor.transformers.finetune.data.data_helpers import ( get_calibration_dataloader, @@ -18,9 +18,8 @@ modify_save_pretrained, patch_tied_tensors_bug, ) -from llmcompressor.transformers.utils.helpers import resolve_processor_from_model_args -__all__ = ["Oneshot", "oneshot", "parse_oneshot_args"] +__all__ = ["Oneshot", "oneshot"] class Oneshot: @@ -123,10 +122,10 @@ def __init__( """ - model_args, data_args, recipe_args, output_dir = parse_oneshot_args(**kwargs) + model_args, dataset_args, recipe_args, _, output_dir = parse_args(**kwargs) self.model_args = model_args - self.data_args = data_args + self.data_args = dataset_args self.recipe_args = recipe_args self.output_dir = output_dir @@ -310,64 +309,3 @@ def oneshot(**kwargs) -> PreTrainedModel: one_shot() return one_shot.model - - -def parse_oneshot_args( - **kwargs, -) -> Tuple[ModelArguments, DatasetArguments, RecipeArguments, str]: - """ - Parses kwargs by grouping into model, data or training arg groups: - * model_args in - src/llmcompressor/transformers/utils/arg_parser/model_args.py - * data_args in - src/llmcompressor/transformers/utils/arg_parser/data_args.py - * recipe_args in - src/llmcompressor/transformers/utils/arg_parser/recipe_args.py - * training_args in - src/llmcompressor/transformers/utils/arg_parser/training_args.py - """ - output_dir = kwargs.pop("output_dir", None) - - parser = HfArgumentParser((ModelArguments, DatasetArguments, RecipeArguments)) - - if not kwargs: - - def _get_output_dir_from_argv() -> Optional[str]: - import sys - - output_dir 
= None - if "--output_dir" in sys.argv: - index = sys.argv.index("--output_dir") - sys.argv.pop(index) - if index < len(sys.argv): # Check if value exists afer the flag - output_dir = sys.argv.pop(index) - - return output_dir - - output_dir = _get_output_dir_from_argv() or output_dir - parsed_args = parser.parse_args_into_dataclasses() - else: - parsed_args = parser.parse_dict(kwargs) - - model_args, data_args, recipe_args = parsed_args - - if recipe_args.recipe_args is not None: - if not isinstance(recipe_args.recipe_args, dict): - arg_dict = {} - for recipe_arg in recipe_args.recipe_args: - key, value = recipe_arg.split("=") - arg_dict[key] = value - recipe_args.recipe_args = arg_dict - - # raise depreciation warnings - if data_args.remove_columns is not None: - logger.warning( - "`remove_columns` argument is depreciated. When tokenizing datasets, all " - "columns which are invalid inputs the tokenizer will be removed", - DeprecationWarning, - ) - - # silently assign tokenizer to processor - resolve_processor_from_model_args(model_args) - - return model_args, data_args, recipe_args, output_dir diff --git a/tests/llmcompressor/entrypoints/test_oneshot.py b/tests/llmcompressor/entrypoints/test_oneshot.py index 4a7f2a5a7..1d00c828f 100644 --- a/tests/llmcompressor/entrypoints/test_oneshot.py +++ b/tests/llmcompressor/entrypoints/test_oneshot.py @@ -1,7 +1,7 @@ from transformers import AutoModelForCausalLM from llmcompressor import Oneshot -from llmcompressor.entrypoints.oneshot import parse_oneshot_args +from llmcompressor.args import parse_args def test_oneshot_from_args(): @@ -17,7 +17,7 @@ def test_oneshot_from_args(): output_dir = "bar_output_dir" - model_args, data_args, recipe_args, output_dir = parse_oneshot_args( + model_args, data_args, recipe_args, _, output_dir = parse_args( model=model, dataset=dataset, recipe=recipe, From 07726ef3d317107385622cfc99fb10420fa7ce9e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 5 Mar 2025 09:52:19 -0500 Subject: [PATCH 17/23] Remove MonkeyPatch for GPUs (#1227) SUMMARY: - This was added previously due to an error we were seeing with multiple GPUs (i.e having more than one gpu visible would cause an error) - Can verify this no longer happens --- .../finetune/test_finetune_no_recipe_custom_dataset.py | 7 ------- .../finetune/test_oneshot_and_finetune_with_tokenizer.py | 4 ---- 2 files changed, 11 deletions(-) diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py index f8f8d9827..37524069c 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py @@ -108,7 +108,6 @@ def create_mock_file(self, extension, content, path, filename): def tearDown(self): shutil.rmtree(self.output) - self.monkeypatch.undo() @pytest.mark.integration @@ -121,11 +120,8 @@ class TestOneshotCustomDatasetSmall(TestFinetuneNoRecipeCustomDataset): def setUp(self): import torch - self.monkeypatch = pytest.MonkeyPatch() - if torch.cuda.is_available(): self.device = "cuda:0" - self.monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0") else: self.device = "cpu" @@ -147,15 +143,12 @@ def setUp(self): import torch from transformers import AutoModelForCausalLM - self.monkeypatch = pytest.MonkeyPatch() self.device = "cuda:0" self.output = "./oneshot_output" - self.monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0") self.model = 
AutoModelForCausalLM.from_pretrained( self.model, device_map=self.device, torch_dtype=torch.bfloat16 ) - self.monkeypatch = pytest.MonkeyPatch() def test_oneshot_then_finetune_gpu(self): self._test_finetune_wout_recipe_custom_dataset() diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py index 509464a34..45b25818b 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py @@ -21,7 +21,6 @@ def setUp(self): self.output = "./finetune_output" # finetune workflows in general seem to have trouble with multi-gpus # use just one atm - self.monkeypatch = pytest.MonkeyPatch() def test_oneshot_and_finetune_with_tokenizer(self): from datasets import load_dataset @@ -29,8 +28,6 @@ def test_oneshot_and_finetune_with_tokenizer(self): from llmcompressor.transformers import compress - self.monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "0") - recipe_str = ( "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml" ) @@ -71,4 +68,3 @@ def test_oneshot_and_finetune_with_tokenizer(self): def tearDown(self): shutil.rmtree(self.output) - self.monkeypatch.undo() From 391b202fb24d929c024a830292eed90100c03cc4 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 5 Mar 2025 10:27:47 -0500 Subject: [PATCH 18/23] [Cosmetic] Rename data_args to dataset_args (#1206) Order of reviews: https://github.com/vllm-project/llm-compressor/pull/1206 <-- Here https://github.com/vllm-project/llm-compressor/pull/1207 https://github.com/vllm-project/llm-compressor/pull/1209 https://github.com/vllm-project/llm-compressor/pull/1212 https://github.com/vllm-project/llm-compressor/pull/1214 SUMMARY: Rename data_args to dataset_args TEST PLAN: Pass tests FInd `data_args` using `grep` --------- Signed-off-by: George Ohashi Co-authored-by: Dipika Sikka --- examples/trl_mixin/ex_trl_distillation.py | 8 +- src/llmcompressor/entrypoints/oneshot.py | 26 ++++--- .../transformers/finetune/data/base.py | 76 ++++++++++--------- .../transformers/finetune/data/c4.py | 14 ++-- .../finetune/data/cnn_dailymail.py | 14 ++-- .../transformers/finetune/data/custom.py | 2 +- .../finetune/data/data_helpers.py | 30 ++++---- .../finetune/data/evolcodealpaca.py | 14 ++-- .../transformers/finetune/data/flickr_30k.py | 12 +-- .../transformers/finetune/data/gsm8k.py | 14 ++-- .../finetune/data/open_platypus.py | 14 ++-- .../transformers/finetune/data/ptb.py | 14 ++-- .../finetune/data/ultrachat_200k.py | 14 ++-- .../transformers/finetune/data/wikitext.py | 14 ++-- .../transformers/finetune/runner.py | 28 +++---- .../transformers/finetune/session_mixin.py | 20 ++--- .../transformers/finetune/text_generation.py | 34 ++++----- .../transformers/tracing/debug.py | 10 +-- .../llmcompressor/entrypoints/test_oneshot.py | 6 +- .../compression/test_quantization.py | 10 +-- .../finetune/data/test_dataset_helpers.py | 10 +-- .../finetune/data/test_dataset_loading.py | 58 +++++++------- .../finetune/data/test_registry.py | 30 ++++---- .../finetune/test_session_mixin.py | 4 +- .../transformers/obcq/test_obcq_completion.py | 6 +- 25 files changed, 256 insertions(+), 226 deletions(-) diff --git a/examples/trl_mixin/ex_trl_distillation.py b/examples/trl_mixin/ex_trl_distillation.py index ebd14c5d2..4ebb53276 100644 --- a/examples/trl_mixin/ex_trl_distillation.py +++ b/examples/trl_mixin/ex_trl_distillation.py @@ 
-19,12 +19,12 @@ max_seq_length = 512 # Load gsm8k using SparseML dataset tools -data_args = DatasetArguments( +dataset_args = DatasetArguments( dataset="gsm8k", dataset_config_name="main", max_seq_length=max_seq_length ) dataset_manager = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split="train", processor=tokenizer, ) @@ -69,7 +69,7 @@ train_dataset=train_dataset, data_collator=data_collator, trl_sft_config_args=trl_sft_config_args, - data_args=data_args, + dataset_args=dataset_args, model_args=model_args, ) trainer.train() diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index ecdebf46b..ea6481043 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -35,7 +35,7 @@ class Oneshot: `kwargs` are parsed into: - `model_args`: Arguments for loading and configuring a pretrained model (e.g., `AutoModelForCausalLM`). - - `data_args`: Arguments for dataset-related configurations, such as + - `dataset_args`: Arguments for dataset-related configurations, such as calibration dataloaders. - `recipe_args`: Arguments for defining and configuring recipes that specify optimization actions. @@ -108,24 +108,23 @@ def __init__( """ Initializes the `Oneshot` class with provided arguments. - Parses the input keyword arguments into `model_args`, `data_args`, and + Parses the input keyword arguments into `model_args`, `dataset_args`, and `recipe_args`. Performs preprocessing to initialize the model and tokenizer/processor. :param model_args: ModelArguments parameters, responsible for controlling model loading and saving logic - :param data_args: DatasetArguments parameters, responsible for controlling + :param dataset_args: DatasetArguments parameters, responsible for controlling dataset loading, preprocessing and dataloader loading :param recipe_args: RecipeArguments parameters, responsible for containing recipe-related parameters :param output_dir: Path to save the output model after carrying out oneshot """ - model_args, dataset_args, recipe_args, _, output_dir = parse_args(**kwargs) self.model_args = model_args - self.data_args = dataset_args + self.dataset_args = dataset_args self.recipe_args = recipe_args self.output_dir = output_dir @@ -136,14 +135,19 @@ def __init__( @classmethod def from_args( - cls, model_args, data_args, recipe_args, output_dir, do_preprocess: bool = True + cls, + model_args, + dataset_args, + recipe_args, + output_dir, + do_preprocess: bool = True, ): """ Used only for the stage runner to populate the args. """ instance = super().__new__(cls) instance.model_args = model_args - instance.data_args = data_args + instance.dataset_args = dataset_args instance.recipe_args = recipe_args instance.output_dir = output_dir @@ -176,7 +180,7 @@ def __call__(self): self.processor = self.model_args.processor calibration_dataloader = get_calibration_dataloader( - self.data_args, self.processor + self.dataset_args, self.processor ) self.apply_recipe_modifiers( calibration_dataloader=calibration_dataloader, @@ -242,7 +246,7 @@ def _pre_process(self): - Applies patches to fix tied tensor issues and modifies `save_pretrained` behavior. - Initializes the processor if specified as a path or `None`. - - Sets the minimum tokens per module if `data_args` are provided. + - Sets the minimum tokens per module if `dataset_args` are provided. Raises: FileNotFoundError: If the model or processor path is invalid. 
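As a usage reference for the argument parsing touched in this hunk and introduced by the parser-decoupling patch above, the sketch below shows the expected call shapes of `parse_args` for the oneshot and train flows. The model stub, dataset, recipe path, and output directories are placeholders borrowed from the test configs, not values mandated by this patch.

```python3
from llmcompressor.args import parse_args

# oneshot flow: TrainingArguments are not constructed, so that slot is None
model_args, dataset_args, recipe_args, _, output_dir = parse_args(
    model="Xenova/llama2.c-stories15M",  # placeholder model stub
    dataset="open_platypus",
    recipe="tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml",
    output_dir="./oneshot_out",
)

# train flow: additionally parse TrainingArguments (e.g. num_train_epochs)
model_args, dataset_args, recipe_args, training_args, output_dir = parse_args(
    include_training_args=True,
    model="Xenova/llama2.c-stories15M",
    dataset="open_platypus",
    recipe="tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml",
    output_dir="./train_out",
    num_train_epochs=0.25,
)
```

Internally, `Oneshot.__init__` forwards its `**kwargs` to this helper, as shown in the hunk above, so the rename in this patch does not change the constructor's call shape.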
@@ -265,8 +269,8 @@ def _pre_process(self): self.processor = self.model_args.processor # Set minimum tokens per module if data arguments are provided - if self.data_args: - self.min_tokens_per_module = self.data_args.min_tokens_per_module + if self.dataset_args: + self.min_tokens_per_module = self.dataset_args.min_tokens_per_module def check_tied_embeddings(self): """ diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py index bde819e43..593682cbd 100644 --- a/src/llmcompressor/transformers/finetune/data/base.py +++ b/src/llmcompressor/transformers/finetune/data/base.py @@ -31,7 +31,7 @@ class TextGenerationDataset(RegistryMixin): 3. Tokenize dataset using model tokenizer/processor 4. Apply post processing such as grouping text and/or adding labels for finetuning - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ @@ -41,11 +41,11 @@ class TextGenerationDataset(RegistryMixin): def __init__( self, - data_args: DatasetArguments, + dataset_args: DatasetArguments, split: str, processor: Processor, ): - self.data_args = data_args + self.dataset_args = dataset_args self.split = split self.processor = processor @@ -58,23 +58,23 @@ def __init__( self.tokenizer.pad_token = self.tokenizer.eos_token # configure sequence length - max_seq_length = data_args.max_seq_length - if data_args.max_seq_length > self.tokenizer.model_max_length: + max_seq_length = dataset_args.max_seq_length + if dataset_args.max_seq_length > self.tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({max_seq_length}) is larger than " f"maximum length for model ({self.tokenizer.model_max_length}). " f"Using max_seq_length={self.tokenizer.model_max_length}." 
) self.max_seq_length = min( - data_args.max_seq_length, self.tokenizer.model_max_length + dataset_args.max_seq_length, self.tokenizer.model_max_length ) # configure padding self.padding = ( False - if self.data_args.concatenate_data + if self.dataset_args.concatenate_data else "max_length" - if self.data_args.pad_to_max_length + if self.dataset_args.pad_to_max_length else False ) @@ -83,7 +83,7 @@ def __init__( self.padding = False def __call__(self, add_labels: bool = True) -> DatasetType: - dataset = self.data_args.dataset + dataset = self.dataset_args.dataset if isinstance(dataset, str): # load dataset: load from huggingface or disk @@ -96,8 +96,8 @@ def __call__(self, add_labels: bool = True) -> DatasetType: dataset, self.preprocess, batched=False, - num_proc=self.data_args.preprocessing_num_workers, - load_from_cache_file=not self.data_args.overwrite_cache, + num_proc=self.dataset_args.preprocessing_num_workers, + load_from_cache_file=not self.dataset_args.overwrite_cache, desc="Preprocessing", ) logger.debug(f"Dataset after preprocessing: {get_columns(dataset)}") @@ -121,20 +121,20 @@ def __call__(self, add_labels: bool = True) -> DatasetType: # regardless of `batched` argument remove_columns=get_columns(dataset), # assumes that input names # and output names are disjoint - num_proc=self.data_args.preprocessing_num_workers, - load_from_cache_file=not self.data_args.overwrite_cache, + num_proc=self.dataset_args.preprocessing_num_workers, + load_from_cache_file=not self.dataset_args.overwrite_cache, desc="Tokenizing", ) logger.debug(f"Model kwargs after tokenizing: {get_columns(dataset)}") - if self.data_args.concatenate_data: + if self.dataset_args.concatenate_data: # postprocess: group text dataset = self.map( dataset, self.group_text, batched=True, - num_proc=self.data_args.preprocessing_num_workers, - load_from_cache_file=not self.data_args.overwrite_cache, + num_proc=self.dataset_args.preprocessing_num_workers, + load_from_cache_file=not self.dataset_args.overwrite_cache, desc="Concatenating data", ) logger.debug(f"Model kwargs after concatenating: {get_columns(dataset)}") @@ -145,8 +145,8 @@ def __call__(self, add_labels: bool = True) -> DatasetType: dataset, self.add_labels, batched=False, # not compatible with batching, need row lengths - num_proc=self.data_args.preprocessing_num_workers, - load_from_cache_file=not self.data_args.overwrite_cache, + num_proc=self.dataset_args.preprocessing_num_workers, + load_from_cache_file=not self.dataset_args.overwrite_cache, desc="Adding labels", ) logger.debug(f"Model kwargs after adding labels: {get_columns(dataset)}") @@ -165,27 +165,31 @@ def load_dataset(self): :param cache_dir: disk location to search for cached dataset :return: the requested dataset """ - if self.data_args.dataset_path is not None: - if self.data_args.dvc_data_repository is not None: - self.data_args.raw_kwargs["storage_options"] = { - "url": self.data_args.dvc_data_repository + if self.dataset_args.dataset_path is not None: + if self.dataset_args.dvc_data_repository is not None: + self.dataset_args.raw_kwargs["storage_options"] = { + "url": self.dataset_args.dvc_data_repository } - self.data_args.raw_kwargs["data_files"] = self.data_args.dataset_path + self.dataset_args.raw_kwargs["data_files"] = ( + self.dataset_args.dataset_path + ) else: - self.data_args.raw_kwargs["data_files"] = get_custom_datasets_from_path( - self.data_args.dataset_path, - self.data_args.dataset - if hasattr(self.data_args, "dataset") - else self.data_args.dataset_name, + 
self.dataset_args.raw_kwargs["data_files"] = ( + get_custom_datasets_from_path( + self.dataset_args.dataset_path, + self.dataset_args.dataset + if hasattr(self.dataset_args, "dataset") + else self.dataset_args.dataset_name, + ) ) - logger.debug(f"Loading dataset {self.data_args.dataset}") + logger.debug(f"Loading dataset {self.dataset_args.dataset}") return get_raw_dataset( - self.data_args, + self.dataset_args, None, split=self.split, - streaming=self.data_args.streaming, - **self.data_args.raw_kwargs, + streaming=self.dataset_args.streaming, + **self.dataset_args.raw_kwargs, ) @cached_property @@ -194,7 +198,7 @@ def preprocess(self) -> Union[Callable[[LazyRow], Any], None]: The function must return keys which correspond to processor/tokenizer kwargs, optionally including PROMPT_KEY """ - preprocessing_func = self.data_args.preprocessing_func + preprocessing_func = self.dataset_args.preprocessing_func if callable(preprocessing_func): return preprocessing_func @@ -218,9 +222,9 @@ def dataset_template(self) -> Union[Callable[[Any], Any], None]: def rename_columns(self, dataset: DatasetType) -> DatasetType: # rename columns to match processor/tokenizer kwargs column_names = get_columns(dataset) - if self.data_args.text_column in column_names and "text" not in column_names: - logger.debug(f"Renaming column `{self.data_args.text_column}` to `text`") - dataset = dataset.rename_column(self.data_args.text_column, "text") + if self.dataset_args.text_column in column_names and "text" not in column_names: + logger.debug(f"Renaming column `{self.dataset_args.text_column}` to `text`") + dataset = dataset.rename_column(self.dataset_args.text_column, "text") return dataset diff --git a/src/llmcompressor/transformers/finetune/data/c4.py b/src/llmcompressor/transformers/finetune/data/c4.py index 988a4adc3..e4fe6431c 100644 --- a/src/llmcompressor/transformers/finetune/data/c4.py +++ b/src/llmcompressor/transformers/finetune/data/c4.py @@ -13,14 +13,16 @@ class C4Dataset(TextGenerationDataset): """ Child text generation class for the C4 dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "allenai/c4" - data_args.text_column = "text" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "allenai/c4" + dataset_args.text_column = "text" - super().__init__(data_args=data_args, split=split, processor=processor) + super().__init__(dataset_args=dataset_args, split=split, processor=processor) diff --git a/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py b/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py index b005724aa..fcc67482f 100644 --- a/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py +++ b/src/llmcompressor/transformers/finetune/data/cnn_dailymail.py @@ -13,19 +13,21 @@ class CNNDailyMailDataset(TextGenerationDataset): """ Text generation class for the CNN/DailyMail dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: 
processor or tokenizer to use on dataset """ SAMPLE_TEMPLATE = "Article:\n{article}\n\n### Summarization:\n{highlights}\n" - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "cnn_dailymail" - data_args.dataset_config_name = "3.0.0" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "cnn_dailymail" + dataset_args.dataset_config_name = "3.0.0" - super().__init__(data_args=data_args, split=split, processor=processor) + super().__init__(dataset_args=dataset_args, split=split, processor=processor) def dataset_template(self, sample): return { diff --git a/src/llmcompressor/transformers/finetune/data/custom.py b/src/llmcompressor/transformers/finetune/data/custom.py index 7cff3c1d9..1239e08be 100644 --- a/src/llmcompressor/transformers/finetune/data/custom.py +++ b/src/llmcompressor/transformers/finetune/data/custom.py @@ -7,7 +7,7 @@ class CustomDataset(TextGenerationDataset): Child text generation class for custom local dataset supporting load for csv and json - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` Can also be set to None to load all the splits :param processor: processor or tokenizer to use on dataset diff --git a/src/llmcompressor/transformers/finetune/data/data_helpers.py b/src/llmcompressor/transformers/finetune/data/data_helpers.py index cf9b81f69..bd28de314 100644 --- a/src/llmcompressor/transformers/finetune/data/data_helpers.py +++ b/src/llmcompressor/transformers/finetune/data/data_helpers.py @@ -70,7 +70,7 @@ def format_calibration_data( def get_raw_dataset( - data_args, + dataset_args, cache_dir: Optional[str] = None, streaming: Optional[bool] = False, **kwargs, @@ -84,11 +84,11 @@ def get_raw_dataset( """ raw_datasets = load_dataset( - data_args.dataset, - data_args.dataset_config_name, + dataset_args.dataset, + dataset_args.dataset_config_name, cache_dir=cache_dir, streaming=streaming, - trust_remote_code=data_args.trust_remote_code_data, + trust_remote_code=dataset_args.trust_remote_code_data, **kwargs, ) return raw_datasets @@ -235,26 +235,26 @@ def do_transform(candidate: str) -> bool: def get_calibration_dataloader( - data_args, + dataset_args, processor, add_labels: bool = False, # for oneshot do_oneshot=True, ) -> torch.utils.data.DataLoader: """ - Loads datasets for each flow based on data_args, stores a Dataset for each + Loads datasets for each flow based on dataset_args, stores a Dataset for each enabled flow in self.datasets :param processor: processor or tokenizer to use for dataset tokenization :param add_labels: if True, add labels column to dataset splits """ - if data_args.dataset is None: + if dataset_args.dataset is None: logger.info( "Running oneshot without calibration data. 
This is expected for " "weight-only and dynamic quantization" ) return - splits = data_args.splits + splits = dataset_args.splits tokenized_datasets = {} def _get_split_name(inp_str): @@ -272,9 +272,11 @@ def _get_split_name(inp_str): splits = {_get_split_name(s): s for s in splits} # default to custom dataset if dataset provided isn't a string - registry_id = data_args.dataset if isinstance(data_args.dataset, str) else "custom" + registry_id = ( + dataset_args.dataset if isinstance(dataset_args.dataset, str) else "custom" + ) for split_name, split_str in splits.items(): - dataset = data_args.dataset + dataset = dataset_args.dataset if hasattr(dataset, "column_names") and "input_ids" in dataset.column_names: # dataset is already tokenized tokenized_datasets[split_name] = dataset @@ -286,7 +288,7 @@ def _get_split_name(inp_str): dataset_manager = TextGenerationDataset.load_from_registry( registry_id, - data_args=data_args, + dataset_args=dataset_args, split=split_str, processor=processor, ) @@ -301,7 +303,7 @@ def _get_split_name(inp_str): return format_calibration_data( tokenized_dataset=calibration_dataset, - num_calibration_samples=data_args.num_calibration_samples, - do_shuffle=data_args.shuffle_calibration_samples, - collate_fn=data_args.data_collator, + num_calibration_samples=dataset_args.num_calibration_samples, + do_shuffle=dataset_args.shuffle_calibration_samples, + collate_fn=dataset_args.data_collator, ) diff --git a/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py b/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py index 3de833738..8a7892c13 100644 --- a/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py +++ b/src/llmcompressor/transformers/finetune/data/evolcodealpaca.py @@ -13,7 +13,7 @@ class EvolCodeAlpacaDataset(TextGenerationDataset): """ Child text generation class for the Evol Code Alpaca dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ @@ -25,12 +25,14 @@ class EvolCodeAlpacaDataset(TextGenerationDataset): "\n\n### Response:\n" ) - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "theblackcat102/evol-codealpaca-v1" - data_args.text_column = "text" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "theblackcat102/evol-codealpaca-v1" + dataset_args.text_column = "text" - super().__init__(data_args, split=split, processor=processor) + super().__init__(dataset_args, split=split, processor=processor) def dataset_template(self, sample): prompt = self.EVOL_ALPACA_TEMPLATE.format(instruction=sample["instruction"]) diff --git a/src/llmcompressor/transformers/finetune/data/flickr_30k.py b/src/llmcompressor/transformers/finetune/data/flickr_30k.py index 6e11c3aaf..8ada07a0e 100644 --- a/src/llmcompressor/transformers/finetune/data/flickr_30k.py +++ b/src/llmcompressor/transformers/finetune/data/flickr_30k.py @@ -13,7 +13,7 @@ @TextGenerationDataset.register(name="flickr", alias="flickr30k") class Flickr30K(TextGenerationDataset): """ - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance 
`test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ @@ -31,11 +31,13 @@ class Flickr30K(TextGenerationDataset): "{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" ) - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "lmms-lab/flickr30k" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "lmms-lab/flickr30k" - super().__init__(data_args=data_args, split=split, processor=processor) + super().__init__(dataset_args=dataset_args, split=split, processor=processor) if ( self.tokenizer is not None diff --git a/src/llmcompressor/transformers/finetune/data/gsm8k.py b/src/llmcompressor/transformers/finetune/data/gsm8k.py index 4f61d1726..ae1318571 100644 --- a/src/llmcompressor/transformers/finetune/data/gsm8k.py +++ b/src/llmcompressor/transformers/finetune/data/gsm8k.py @@ -13,19 +13,21 @@ class GSM8KDataset(TextGenerationDataset): """ Child text generation class for the Grade School Math 8k dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ GSM_TEMPLATE = "Question: {question}\nAnswer:" - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "gsm8k" - data_args.text_column = "text" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "gsm8k" + dataset_args.text_column = "text" - super().__init__(data_args=data_args, split=split, processor=processor) + super().__init__(dataset_args=dataset_args, split=split, processor=processor) def dataset_template(self, sample): prompt = self.GSM_TEMPLATE.format(question=sample["question"]) diff --git a/src/llmcompressor/transformers/finetune/data/open_platypus.py b/src/llmcompressor/transformers/finetune/data/open_platypus.py index 33c9ddc86..81413e785 100644 --- a/src/llmcompressor/transformers/finetune/data/open_platypus.py +++ b/src/llmcompressor/transformers/finetune/data/open_platypus.py @@ -13,7 +13,7 @@ class OpenPlatypusDataset(TextGenerationDataset): """ Child text generation class for the Open Platypus dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ @@ -28,11 +28,13 @@ class OpenPlatypusDataset(TextGenerationDataset): "instruction}\n\n### Response:\n", } - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "garage-bAInd/Open-Platypus" - data_args.text_column = "text" - super().__init__(data_args=data_args, split=split, processor=processor) + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "garage-bAInd/Open-Platypus" + dataset_args.text_column = "text" + super().__init__(dataset_args=dataset_args, split=split, processor=processor) def dataset_template(self, sample): if "input" in 
sample and sample["input"] != "": diff --git a/src/llmcompressor/transformers/finetune/data/ptb.py b/src/llmcompressor/transformers/finetune/data/ptb.py index 7966fe4d0..8f03ad509 100644 --- a/src/llmcompressor/transformers/finetune/data/ptb.py +++ b/src/llmcompressor/transformers/finetune/data/ptb.py @@ -13,18 +13,20 @@ class PtbDataset(TextGenerationDataset): """ Child text generation class for the PTB dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "ptb_text_only" - data_args.text_column = "sentence" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "ptb_text_only" + dataset_args.text_column = "sentence" super().__init__( - data_args=data_args, + dataset_args=dataset_args, split=split, processor=processor, ) diff --git a/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py b/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py index fad57c076..296eb3db5 100644 --- a/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py +++ b/src/llmcompressor/transformers/finetune/data/ultrachat_200k.py @@ -15,7 +15,7 @@ class UltraChatDataset(TextGenerationDataset): """ Child text generation class for the Ultra Chat 200k dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ @@ -33,15 +33,17 @@ class UltraChatDataset(TextGenerationDataset): "{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" ) - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = "HuggingFaceH4/ultrachat_200k" - data_args.text_column = "messages" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "HuggingFaceH4/ultrachat_200k" + dataset_args.text_column = "messages" if split in ["train", "test"]: split += "_sft" - super().__init__(data_args=data_args, split=split, processor=processor) + super().__init__(dataset_args=dataset_args, split=split, processor=processor) if ( self.tokenizer is not None diff --git a/src/llmcompressor/transformers/finetune/data/wikitext.py b/src/llmcompressor/transformers/finetune/data/wikitext.py index 868c9d951..73142d671 100644 --- a/src/llmcompressor/transformers/finetune/data/wikitext.py +++ b/src/llmcompressor/transformers/finetune/data/wikitext.py @@ -13,18 +13,20 @@ class WikiTextDataset(TextGenerationDataset): """ Child text generation class for the Open Platypus dataset - :param data_args: configuration settings for dataset loading + :param dataset_args: configuration settings for dataset loading :param split: split from dataset to load, for instance `test` or `train[:5%]` :param processor: processor or tokenizer to use on dataset """ - def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor): - data_args = deepcopy(data_args) - data_args.dataset = 
"Salesforce/wikitext" - data_args.text_column = "text" + def __init__( + self, dataset_args: "DatasetArguments", split: str, processor: Processor + ): + dataset_args = deepcopy(dataset_args) + dataset_args.dataset = "Salesforce/wikitext" + dataset_args.text_column = "text" super().__init__( - data_args=data_args, + dataset_args=dataset_args, split=split, processor=processor, ) diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index 1735a99b8..75d963aa5 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -42,19 +42,19 @@ class StageRunner: - train() :param model_args: Arguments pertaining to model/config/processor - :param data_args: Arguments pertaining to what data to use for different flows + :param dataset_args: Arguments pertaining to what data to use for different flows :param training_args: Arguments pertaining to training loop configuration :model: unwrapped model to run flows on """ def __init__( self, - data_args: "DatasetArguments", + dataset_args: "DatasetArguments", model_args: "ModelArguments", training_args: "TrainingArguments", recipe_args: "RecipeArguments", ): - self._data_args = data_args + self._dataset_args = dataset_args self._model_args = model_args self._training_args = training_args self._recipe_args = recipe_args @@ -67,13 +67,13 @@ def __init__( def populate_datasets(self, processor: Processor, add_labels: bool = True): """ - Loads datasets for each flow based on data_args, stores a Dataset for each + Loads datasets for each flow based on dataset_args, stores a Dataset for each enabled flow in self.datasets :param processor: processor or tokenizer to use for dataset tokenization :param add_labels: if True, add labels column to dataset splits """ - if self._data_args.dataset is None: + if self._dataset_args.dataset is None: self.processor = self._model_args.processor logger.info( "Running oneshot without calibration data. 
This is expected for " @@ -81,7 +81,7 @@ def populate_datasets(self, processor: Processor, add_labels: bool = True): ) return - splits = self._data_args.splits + splits = self._dataset_args.splits tokenized_datasets = {} def _get_split_name(inp_str): @@ -100,12 +100,12 @@ def _get_split_name(inp_str): # default to custom dataset if dataset provided isn't a string registry_id = ( - self._data_args.dataset - if isinstance(self._data_args.dataset, str) + self._dataset_args.dataset + if isinstance(self._dataset_args.dataset, str) else "custom" ) for split_name, split_str in splits.items(): - dataset = self._data_args.dataset + dataset = self._dataset_args.dataset if hasattr(dataset, "column_names") and "input_ids" in dataset.column_names: # dataset is already tokenized tokenized_datasets[split_name] = dataset @@ -113,7 +113,7 @@ def _get_split_name(inp_str): # dataset needs to be tokenized dataset_manager = TextGenerationDataset.load_from_registry( registry_id, - data_args=self._data_args, + dataset_args=self._dataset_args, split=split_str, processor=processor, ) @@ -202,7 +202,7 @@ def run_sequential_stages( oneshot = Oneshot.from_args( model_args=self._model_args, - data_args=self._data_args, + dataset_args=self._dataset_args, recipe_args=self._recipe_args, output_dir=self._training_args.output_dir, do_preprocess=do_preprocess, @@ -210,9 +210,9 @@ def run_sequential_stages( calib_data = format_calibration_data( tokenized_dataset=self.get_dataset_split("calibration"), - num_calibration_samples=self._data_args.num_calibration_samples, - do_shuffle=self._data_args.shuffle_calibration_samples, - collate_fn=self._data_args.data_collator, + num_calibration_samples=self._dataset_args.num_calibration_samples, + do_shuffle=self._dataset_args.shuffle_calibration_samples, + collate_fn=self._dataset_args.data_collator, accelerator=self.trainer.accelerator, ) diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index 27882d7d6..f64916e69 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -54,14 +54,14 @@ class SessionManagerMixIn: :param recipe: path to recipe file to apply during training :param recipe_args: additional kwargs to use for evaluating recipe - :param data_args: kwargs for configuring dataset loading + :param dataset_args: kwargs for configuring dataset loading :param teacher: optional teacher model to use for distillation """ def __init__( self, recipe: str, - data_args: "DatasetArguments", + dataset_args: "DatasetArguments", model_args: "ModelArguments", teacher: Optional[Union[Module, str]] = None, recipe_args: Optional[Union[Dict[str, Any], str]] = None, @@ -77,7 +77,7 @@ def __init__( self.metadata = None if training_args is not None: - # trl_sft_trainer pathway. Both training_args and data_args + # trl_sft_trainer pathway. Both training_args and dataset_args # have `max_seq_length` which causes collision error. This is the # only shared parameter, where training arg is `TRLSFTConfig` that # inherits HuggingFace's `TrainingArguments` @@ -87,7 +87,7 @@ def __init__( training_args_dict.pop("max_seq_length") ) logger.warning( - "Detected `max_seq_length` in both data_args ", + "Detected `max_seq_length` in both dataset_args ", "and training_args. This is expected for TRL in distillation. 
", "Updating metadata to `training_args_max_seq_length`", ) @@ -95,7 +95,7 @@ def __init__( self.metadata = self._extract_metadata( metadata_args=METADATA_ARGS, training_args_dict=training_args_dict, - data_args_dict=asdict(data_args) if data_args else {}, + dataset_args_dict=asdict(dataset_args) if dataset_args else {}, ) # setup metrics and session @@ -125,8 +125,8 @@ def __init__( if self.is_fsdp_enabled: self._prepare_model_for_fsdp() - if data_args is not None: - self.min_tokens_per_module = data_args.min_tokens_per_module + if dataset_args is not None: + self.min_tokens_per_module = dataset_args.min_tokens_per_module def initialize_session( self, @@ -459,16 +459,16 @@ def _extract_metadata( self, metadata_args: List[str], training_args_dict: Dict[str, Any], - data_args_dict: Dict[str, Any], + dataset_args_dict: Dict[str, Any], ) -> Dict[str, Any]: metadata = {} - if not training_args_dict.keys().isdisjoint(data_args_dict.keys()): + if not training_args_dict.keys().isdisjoint(dataset_args_dict.keys()): raise ValueError( "Found common keys in `training_args` and `data args`. " "This is prohibitive and may lead to undesired behavior." ) - args_dict = {**training_args_dict, **data_args_dict} + args_dict = {**training_args_dict, **dataset_args_dict} for arg in metadata_args: if arg not in args_dict.keys(): diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index 9a3623f60..d03867b85 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -68,18 +68,18 @@ def train(**kwargs): """ CLI entrypoint for running training """ - model_args, data_args, recipe_args, training_args = parse_args(**kwargs) + model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs) training_args.do_train = True - main(model_args, data_args, recipe_args, training_args) + main(model_args, dataset_args, recipe_args, training_args) def eval(**kwargs): """ CLI entrypoint for running evaluation """ - model_args, data_args, recipe_args, training_args = parse_args(**kwargs) + model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs) training_args.do_eval = True - main(model_args, data_args, recipe_args, training_args) + main(model_args, dataset_args, recipe_args, training_args) @deprecated( @@ -99,13 +99,13 @@ def apply(**kwargs): CLI entrypoint for any of training, oneshot """ report_to = kwargs.get("report_to", None) - model_args, data_args, recipe_args, training_args = parse_args(**kwargs) + model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs) training_args.run_stages = True if report_to is None: # user didn't specify any reporters # get rid of the reporters inferred from hugging face training_args.report_to = [] - main(model_args, data_args, recipe_args, training_args) + main(model_args, dataset_args, recipe_args, training_args) def compress(**kwargs): @@ -117,8 +117,8 @@ def parse_args(**kwargs): Parses kwargs by grouping into model, data or training arg groups: * model_args in src/llmcompressor/transformers/utils/arg_parser/model_args.py - * data_args in - src/llmcompressor/transformers/utils/arg_parser/data_args.py + * dataset_args in + src/llmcompressor/transformers/utils/arg_parser/dataset_args.py * recipe_args in src/llmcompressor/transformers/utils/arg_parser/recipe_args.py * training_args in @@ -134,7 +134,7 @@ def parse_args(**kwargs): else: parsed_args = parser.parse_dict(kwargs) - model_args, 
data_args, recipe_args, training_args = parsed_args + model_args, dataset_args, recipe_args, training_args = parsed_args if recipe_args.recipe_args is not None: if not isinstance(recipe_args.recipe_args, dict): arg_dict = {} @@ -144,7 +144,7 @@ def parse_args(**kwargs): recipe_args.recipe_args = arg_dict # raise depreciation warnings - if data_args.remove_columns is not None: + if dataset_args.remove_columns is not None: warnings.warn( "`remove_columns` argument is depreciated. When tokenizing datasets, all " "columns which are invalid inputs the tokenizer will be removed", @@ -158,7 +158,7 @@ def parse_args(**kwargs): model_args.processor = model_args.tokenizer model_args.tokenizer = None - return model_args, data_args, recipe_args, training_args + return model_args, dataset_args, recipe_args, training_args def initialize_model_from_path( @@ -304,7 +304,7 @@ def initialize_processor_from_path( def main( model_args: ModelArguments, - data_args: DatasetArguments, + dataset_args: DatasetArguments, recipe_args: RecipeArguments, training_args: TrainingArguments, ): @@ -326,8 +326,8 @@ def main( :param model_args: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from - :param data_args: Arguments pertaining to what data we are going to input our model - for training + :param dataset_args: Arguments pertaining to what data we are going to input + our model for training :param training_args: Arguments pertaining to training loop configuration """ @@ -384,7 +384,7 @@ def main( # Load datasets stage_runner = StageRunner( model_args=model_args, - data_args=data_args, + dataset_args=dataset_args, training_args=training_args, recipe_args=recipe_args, ) @@ -400,10 +400,10 @@ def main( recipe_args=recipe_args.recipe_args, args=training_args, model_args=model_args, - data_args=data_args, + dataset_args=dataset_args, train_dataset=train_dataset or calib_dataset, processing_class=processor, - data_collator=data_args.data_collator, + data_collator=dataset_args.data_collator, ) # wrap model.save_pretrained diff --git a/src/llmcompressor/transformers/tracing/debug.py b/src/llmcompressor/transformers/tracing/debug.py index ccce917a7..2bb399b3c 100644 --- a/src/llmcompressor/transformers/tracing/debug.py +++ b/src/llmcompressor/transformers/tracing/debug.py @@ -63,11 +63,11 @@ def trace( print("Loaded model") # Prepare sample data - data_args = DatasetArguments(**get_dataset_kwargs(modality)) + dataset_args = DatasetArguments(**get_dataset_kwargs(modality)) dataset = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, - split=data_args.splits["calibration"], + dataset_args.dataset, + dataset_args=dataset_args, + split=dataset_args.splits["calibration"], processor=processor, )(add_labels=False) sample_input = next(iter(dataset)) @@ -89,7 +89,7 @@ def trace( "\nAttempting trace\n" f" model_id={model_id}\n" f" model_class={model_class.__name__}\n" - f" dataset={data_args.dataset}\n" + f" dataset={dataset_args.dataset}\n" f" split={dataset.split}\n" f" inputs={sample_input.keys()}\n" f" sequential_targets={sequential_targets}\n" diff --git a/tests/llmcompressor/entrypoints/test_oneshot.py b/tests/llmcompressor/entrypoints/test_oneshot.py index 1d00c828f..ba0cb3a3a 100644 --- a/tests/llmcompressor/entrypoints/test_oneshot.py +++ b/tests/llmcompressor/entrypoints/test_oneshot.py @@ -17,7 +17,7 @@ def test_oneshot_from_args(): output_dir = "bar_output_dir" - model_args, data_args, recipe_args, _, output_dir = parse_args( + model_args, dataset_args, 
recipe_args, _, output_dir = parse_args( model=model, dataset=dataset, recipe=recipe, @@ -26,10 +26,10 @@ def test_oneshot_from_args(): output_dir=output_dir, ) - oneshot = Oneshot.from_args(model_args, data_args, recipe_args, output_dir) + oneshot = Oneshot.from_args(model_args, dataset_args, recipe_args, output_dir) assert oneshot.model == model assert oneshot.model_args is model_args - assert oneshot.data_args is data_args + assert oneshot.dataset_args is dataset_args assert oneshot.recipe_args is recipe_args assert oneshot.model_args is model_args assert oneshot.output_dir is output_dir diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py index d7a4bdba7..8a4f46fb5 100644 --- a/tests/llmcompressor/transformers/compression/test_quantization.py +++ b/tests/llmcompressor/transformers/compression/test_quantization.py @@ -123,10 +123,10 @@ def test_quantization_reload(self): assert o_zp.dtype == n_zp.dtype assert torch.equal(o_zp, n_zp) - def _get_dataloader(self, data_args, tokenizer): + def _get_dataloader(self, dataset_args, tokenizer): dataset_manager = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split="train_gen[:5%]", processor=tokenizer, ) @@ -145,11 +145,11 @@ def test_perplexity(self): if self.ppl_threshold is None: pytest.skip("Skipping perplexity calculation.") tokenizer = AutoTokenizer.from_pretrained(self.model_stub) - data_args = DatasetArguments( + dataset_args = DatasetArguments( dataset="ultrachat-200k", max_seq_length=self.max_seq_length, ) - dataloader = self._get_dataloader(data_args, tokenizer) + dataloader = self._get_dataloader(dataset_args, tokenizer) total_ppl = 0.0 total_non_nan = 0 diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py index aa55a752c..39165ffe6 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py @@ -9,10 +9,10 @@ @pytest.mark.unit def test_combined_datasets(): - data_args = DatasetArguments( + dataset_args = DatasetArguments( dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" ) - raw_wikitext2 = get_raw_dataset(data_args) + raw_wikitext2 = get_raw_dataset(dataset_args) datasets = {"all": raw_wikitext2} split_datasets = make_dataset_splits(datasets, do_train=True) assert split_datasets.get("train") is not None @@ -23,13 +23,13 @@ def test_combined_datasets(): @pytest.mark.unit def test_separate_datasets(): - splits = {"train": "train[:5%]", "validation": "train[5%:7%]"} - data_args = DatasetArguments( + splits = {"train": "train[:5%]", "validation": "train[10%:20%]"} + dataset_args = DatasetArguments( dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" ) datasets = {} for split_name, split_str in splits.items(): - raw_wikitext2 = get_raw_dataset(data_args, split=split_str) + raw_wikitext2 = get_raw_dataset(dataset_args, split=split_str) datasets[split_name] = raw_wikitext2 split_datasets = make_dataset_splits(datasets, do_train=True) diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py index db539a74c..7198e0da3 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py +++ 
b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py @@ -21,7 +21,7 @@ @pytest.mark.unit class TestConcentrationTokenization(unittest.TestCase): def setUp(self): - self.data_args = DatasetArguments( + self.dataset_args = DatasetArguments( dataset="wikitext", dataset_config_name="wikitext-2-raw-v1", concatenate_data=True, @@ -33,8 +33,8 @@ def prepare_fixture(self, tiny_llama_tokenizer): def test_concatenation_tokenization(self): wiki_manager = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train[:5%]", processor=self.tiny_llama_tokenizer, ) @@ -54,7 +54,7 @@ def test_concatenation_tokenization(self): @pytest.mark.unit class TestNoPaddingTokenization(unittest.TestCase): def setUp(self): - self.data_args = DatasetArguments( + self.dataset_args = DatasetArguments( dataset="open_platypus", pad_to_max_length=False ) @@ -65,8 +65,8 @@ def prepare_fixture(self, tiny_llama_tokenizer): @pytest.mark.usefixtures("tiny_llama_tokenizer") def test_no_padding_tokenization(self): op_manager = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train[5%:7%]", processor=self.tiny_llama_tokenizer, ) @@ -75,7 +75,7 @@ def test_no_padding_tokenization(self): dataset, op_manager.preprocess, batched=False, - num_proc=op_manager.data_args.preprocessing_num_workers, + num_proc=op_manager.dataset_args.preprocessing_num_workers, ) dataset = op_manager.rename_columns(dataset) # rename self.assertGreater(len(dataset), 0) @@ -97,7 +97,9 @@ def test_no_padding_tokenization(self): @pytest.mark.unit class TestMaxSeqLenClipped(unittest.TestCase): def setUp(self): - self.data_args = DatasetArguments(dataset="open_platypus", max_seq_length=4096) + self.dataset_args = DatasetArguments( + dataset="open_platypus", max_seq_length=4096 + ) @pytest.fixture(autouse=True) def prepare_fixture(self, tiny_llama_tokenizer): @@ -105,8 +107,8 @@ def prepare_fixture(self, tiny_llama_tokenizer): def test_max_seq_len_clipped(self): op_manager = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train[95%:]", processor=self.tiny_llama_tokenizer, ) @@ -119,7 +121,7 @@ def test_max_seq_len_clipped(self): @pytest.mark.unit class TestDatasetKwargsAndPercent(unittest.TestCase): def setUp(self): - self.data_args = DatasetArguments( + self.dataset_args = DatasetArguments( dataset="wikitext", raw_kwargs={ "data_files": { @@ -134,16 +136,16 @@ def prepare_fixture(self, tiny_llama_tokenizer): def test_dataset_kwargs_and_percentages(self): c4_manager_a = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train[5%:6%]", processor=self.tiny_llama_tokenizer, ) raw_dataset_a = c4_manager_a.load_dataset() c4_manager_b = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train[6%:8%]", processor=self.tiny_llama_tokenizer, ) @@ -166,15 +168,15 @@ def prepare_fixture(self, tiny_llama_tokenizer): ] ) def test_datasets(self, dataset_key, dataset_config, split, do_concat): - data_args = DatasetArguments( + dataset_args = DatasetArguments( dataset=dataset_key, 
dataset_config_name=dataset_config, concatenate_data=do_concat, trust_remote_code_data=True, ) manager = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split=split, processor=self.tiny_llama_tokenizer, ) @@ -205,7 +207,7 @@ def prepare_fixture(self, tiny_llama_tokenizer): self.tiny_llama_tokenizer = tiny_llama_tokenizer def setUp(self): - self.data_args = DatasetArguments( + self.dataset_args = DatasetArguments( dataset="evolcodealpaca", dataset_config_name=None, concatenate_data=False, @@ -213,8 +215,8 @@ def setUp(self): def test_evol(self): evol_manager = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train[:2%]", processor=self.tiny_llama_tokenizer, ) @@ -234,7 +236,7 @@ def test_evol(self): @pytest.mark.unit class TestStreamLoading(unittest.TestCase): def setUp(self): - self.data_args = DatasetArguments( + self.dataset_args = DatasetArguments( dataset="wikitext", dataset_config_name="wikitext-2-raw-v1", concatenate_data=True, @@ -247,8 +249,8 @@ def prepare_fixture(self, tiny_llama_tokenizer): def test_stream_loading(self): manager = TextGenerationDataset.load_from_registry( - self.data_args.dataset, - data_args=self.data_args, + self.dataset_args.dataset, + dataset_args=self.dataset_args, split="train", processor=self.tiny_llama_tokenizer, ) @@ -273,7 +275,7 @@ def prepare_fixture(self, tiny_llama_tokenizer): @parameterized.expand([["train[95%:]"], [{"train": "train[:5%]"}]]) def test_split_loading(self, split_def): - data_args = DatasetArguments( + dataset_args = DatasetArguments( dataset="open_platypus", splits=split_def, trust_remote_code_data=True, @@ -283,7 +285,7 @@ def test_split_loading(self, split_def): recipe_args = RecipeArguments() stage_runner = StageRunner( model_args=model_args, - data_args=data_args, + dataset_args=dataset_args, training_args=training_args, recipe_args=recipe_args, ) @@ -319,7 +321,7 @@ def preprocess(sample): ) stage_runner = StageRunner( model_args=None, - data_args=DatasetArguments( + dataset_args=DatasetArguments( dataset=tokenized_dataset, shuffle_calibration_samples=False ), training_args=TrainingArguments(do_oneshot=True), @@ -337,7 +339,7 @@ def preprocess(sample): calib_dataloader = format_calibration_data( tokenized_dataset=calib_dataset, num_calibration_samples=self.num_calib_samples, - do_shuffle=stage_runner._data_args.shuffle_calibration_samples, + do_shuffle=stage_runner._dataset_args.shuffle_calibration_samples, ) self.assertEqual(len(calib_dataloader), self.num_calib_samples) dataloader_sample = next(iter(calib_dataloader))["input_ids"] diff --git a/tests/llmcompressor/transformers/finetune/data/test_registry.py b/tests/llmcompressor/transformers/finetune/data/test_registry.py index 694a9b6d3..29895b4a4 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_registry.py +++ b/tests/llmcompressor/transformers/finetune/data/test_registry.py @@ -11,49 +11,49 @@ @pytest.mark.usefixtures("tiny_llama_tokenizer") def test_c4_initializes(tiny_llama_tokenizer): - data_args = DatasetArguments(dataset="c4", concatenate_data=True) + dataset_args = DatasetArguments(dataset="c4", concatenate_data=True) c4_manager = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split=None, processor=tiny_llama_tokenizer, ) assert isinstance(c4_manager, 
TextGenerationDataset) assert isinstance(c4_manager, C4Dataset) - assert c4_manager.data_args.text_column == "text" + assert c4_manager.dataset_args.text_column == "text" assert not c4_manager.padding - assert c4_manager.max_seq_length == data_args.max_seq_length + assert c4_manager.max_seq_length == dataset_args.max_seq_length @pytest.mark.usefixtures("tiny_llama_tokenizer") def test_wikitext_initializes(tiny_llama_tokenizer): - data_args = DatasetArguments( + dataset_args = DatasetArguments( dataset="wikitext", dataset_config_name="wikitext-2-raw-v1" ) wiki_manager = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split=None, processor=tiny_llama_tokenizer, ) assert isinstance(wiki_manager, TextGenerationDataset) assert isinstance(wiki_manager, WikiTextDataset) - assert wiki_manager.data_args.text_column == "text" + assert wiki_manager.dataset_args.text_column == "text" assert wiki_manager.padding == "max_length" - assert wiki_manager.max_seq_length == data_args.max_seq_length + assert wiki_manager.max_seq_length == dataset_args.max_seq_length @pytest.mark.usefixtures("tiny_llama_tokenizer") def test_open_platypus_initializes(tiny_llama_tokenizer): - data_args = DatasetArguments(dataset="open_platypus", pad_to_max_length=False) + dataset_args = DatasetArguments(dataset="open_platypus", pad_to_max_length=False) op_manager = TextGenerationDataset.load_from_registry( - data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split=None, processor=tiny_llama_tokenizer, ) assert isinstance(op_manager, TextGenerationDataset) assert isinstance(op_manager, OpenPlatypusDataset) - assert op_manager.data_args.text_column == "text" + assert op_manager.dataset_args.text_column == "text" assert not op_manager.padding - assert op_manager.max_seq_length == data_args.max_seq_length + assert op_manager.max_seq_length == dataset_args.max_seq_length diff --git a/tests/llmcompressor/transformers/finetune/test_session_mixin.py b/tests/llmcompressor/transformers/finetune/test_session_mixin.py index 4fa981de9..65e5140bf 100644 --- a/tests/llmcompressor/transformers/finetune/test_session_mixin.py +++ b/tests/llmcompressor/transformers/finetune/test_session_mixin.py @@ -15,7 +15,7 @@ def __init__( recipe: Optional[str], recipe_args: Optional[Union[Dict[str, Any], str]] = None, model_args: Optional[Union[Dict[str, Any], str]] = None, - data_args: Optional[Union[Dict[str, Any], str]] = None, + dataset_args: Optional[Union[Dict[str, Any], str]] = None, teacher: Optional[Union[Module, str]] = None, **kwargs, ): @@ -24,7 +24,7 @@ def __init__( recipe=recipe, recipe_args=recipe_args, model_args=model_args, - data_args=data_args, + dataset_args=dataset_args, teacher=teacher, **kwargs, ) diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py index 1016cf422..5528a443e 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py @@ -26,14 +26,14 @@ def labeled_dataloader(self, dataset_name, model_name): from llmcompressor.transformers.finetune.data import TextGenerationDataset tokenizer = AutoTokenizer.from_pretrained(model_name) - data_args = DatasetArguments( + dataset_args = DatasetArguments( dataset=dataset_name, max_seq_length=512, pad_to_max_length=False, ) dataset_manager = TextGenerationDataset.load_from_registry( - 
data_args.dataset, - data_args=data_args, + dataset_args.dataset, + dataset_args=dataset_args, split="train", processor=tokenizer, ) From 8fc6012182212d9cee1f288588308569fb45e76e Mon Sep 17 00:00:00 2001 From: George Date: Wed, 5 Mar 2025 13:36:22 -0500 Subject: [PATCH 19/23] [Training] Datasets - update Module (#1209) Order of reviews: https://github.com/vllm-project/llm-compressor/pull/1206 https://github.com/vllm-project/llm-compressor/pull/1207 https://github.com/vllm-project/llm-compressor/pull/1209 <-- Here https://github.com/vllm-project/llm-compressor/pull/1212 https://github.com/vllm-project/llm-compressor/pull/1214 SUMMARY: * Move dataset logic out of transformers module `src/llmcompressor/transformers/finetune/data/data_helpers.py`, add it to `src/llmcompressor/datasets/utils.py` TEST PLAN: Pass tests --- src/llmcompressor/datasets/__init__.py | 8 + src/llmcompressor/datasets/utils.py | 191 ++++++++++++++++++ src/llmcompressor/entrypoints/oneshot.py | 4 +- .../finetune/data/data_helpers.py | 174 +--------------- .../transformers/finetune/runner.py | 9 +- .../finetune/data/test_dataset_helpers.py | 6 +- .../finetune/data/test_dataset_loading.py | 4 +- .../transformers/obcq/test_obcq_owl.py | 4 +- 8 files changed, 209 insertions(+), 191 deletions(-) create mode 100644 src/llmcompressor/datasets/__init__.py create mode 100644 src/llmcompressor/datasets/utils.py diff --git a/src/llmcompressor/datasets/__init__.py b/src/llmcompressor/datasets/__init__.py new file mode 100644 index 000000000..0b81cc724 --- /dev/null +++ b/src/llmcompressor/datasets/__init__.py @@ -0,0 +1,8 @@ +# flake8: noqa + +from .utils import ( + format_calibration_data, + get_calibration_dataloader, + get_processed_dataset, + make_dataset_splits, +) diff --git a/src/llmcompressor/datasets/utils.py b/src/llmcompressor/datasets/utils.py new file mode 100644 index 000000000..0d36cb3ac --- /dev/null +++ b/src/llmcompressor/datasets/utils.py @@ -0,0 +1,191 @@ +import re +from typing import Any, Callable, Dict, List, Optional + +import torch +from datasets import Dataset +from loguru import logger +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from transformers.data import default_data_collator + +from llmcompressor.args import DatasetArguments +from llmcompressor.transformers.finetune.data import TextGenerationDataset +from llmcompressor.typing import Processor + + +def get_processed_dataset( + dataset_args: DatasetArguments, + processor: Processor, + do_oneshot: bool = False, + do_train: bool = True, +) -> Optional[Dict[str, Dataset]]: + """ + Loads datasets for each flow based on dataset_args, stores a Dataset for each + enabled flow in datasets + :param dataset_args: DatasetArguments that contain dataset loading and + processing params + :param processor: processor or tokenizer to use for dataset tokenization + :param do_oneshot: True for oneshot pathway + :param do_train: True for train pathway + :return: A dataset corresponding to either train or calibration (oneshot) + """ + if dataset_args.dataset is None: + logger.warning( + "Running oneshot without calibration data. 
This is expected for " + "weight-only and dynamic quantization" + ) + return + + splits = dataset_args.splits + tokenized_datasets = {} + + def _get_split_name(inp_str): + # strip out split name, for ex train[60%:] -> train + match = re.match(r"(\w*)\[.*\]", inp_str) + if match is not None: + return match.group(1) + return inp_str + + if splits is None: + splits = {"all": None} + elif isinstance(splits, str): + splits = {_get_split_name(splits): splits} + elif isinstance(splits, List): + splits = {_get_split_name(s): s for s in splits} + + # default to custom dataset if dataset provided isn't a string + registry_id = ( + dataset_args.dataset if isinstance(dataset_args.dataset, str) else "custom" + ) + for split_name, split_str in splits.items(): + dataset = dataset_args.dataset + if hasattr(dataset, "column_names") and "input_ids" in dataset.column_names: + # dataset is already tokenized + tokenized_datasets[split_name] = dataset + else: + # dataset needs to be tokenized + dataset_manager = TextGenerationDataset.load_from_registry( + registry_id, + dataset_args=dataset_args, + split=split_str, + processor=processor, + ) + tokenized_datasets[split_name] = dataset_manager(add_labels=do_train) + + return make_dataset_splits( + tokenized_datasets, + do_oneshot=do_oneshot, + do_train=do_train, + ) + + +def get_calibration_dataloader( + dataset_args: DatasetArguments, + processor: Processor, +) -> torch.utils.data.DataLoader: + """ + Get the dataloader used for oneshot calibration. + :param dataset_args: DatasetArguments that contains the dataset parameters. + :param processor: Processor or the tokenizer of the model. + :return: PyTorch dataloader object that contains the calibration dataset. + """ + if dataset_args.dataset is None: + # weight-only quantization or dynamic quantization + return + + datasets = get_processed_dataset( + dataset_args=dataset_args, + processor=processor, + do_oneshot=True, + do_train=False, + ) + + calibration_dataset = datasets.get("calibration") + + return format_calibration_data( + tokenized_dataset=calibration_dataset, + num_calibration_samples=dataset_args.num_calibration_samples, + do_shuffle=dataset_args.shuffle_calibration_samples, + collate_fn=dataset_args.data_collator, + ) + + +def format_calibration_data( + tokenized_dataset: Dataset, + num_calibration_samples: Optional[int] = None, + do_shuffle: bool = True, + collate_fn: Callable = default_data_collator, +) -> List[torch.Tensor]: + """ + Creates a dataloader out of the calibration dataset split, trimming it to + the desired number of calibration samples + :param tokenized_dataset: dataset to convert to dataloader + :param num_calibration_samples: number of data samples to convert + :param do_shuffle: whether to shuffle the dataset before selecting calibration + samples, true by default + :param collate_fn: optional custom collate function, or use default + :return: list of trimmed calibration data tensors + """ + safe_calibration_samples = len(tokenized_dataset) + if num_calibration_samples is not None: + safe_calibration_samples = min(len(tokenized_dataset), num_calibration_samples) + if safe_calibration_samples != num_calibration_samples: + logger.warn( + f"Requested {num_calibration_samples} calibration samples but " + f"the provided dataset only has {safe_calibration_samples}. 
" + ) + + if do_shuffle: + tokenized_dataset = tokenized_dataset.shuffle() + tokenized_calibration = tokenized_dataset.select(range(safe_calibration_samples)) + + dataloader_params = { + "batch_size": 1, + "sampler": RandomSampler(tokenized_calibration) + if do_shuffle + else SequentialSampler(tokenized_calibration), + "collate_fn": collate_fn, + "pin_memory": True, + } + + calibration_dataloader = DataLoader(tokenized_calibration, **dataloader_params) + + return calibration_dataloader + + +def make_dataset_splits( + tokenized_datasets: Dict[str, Any], + do_oneshot: bool = True, + do_train: bool = False, +) -> Dict[str, Dataset]: + """ + Restructures the datasets dictionary based on what tasks will be run + train + :param tokenized_datasets: dictionary of processed datasets + :param do_oneshot: Whether to store the calibration dataset + :return: A dataset corresponding to either train or calibration (oneshot) + """ + + # handles case where all splits are contained in a single dataset + if "all" in tokenized_datasets and len(tokenized_datasets) == 1: + tokenized_datasets = tokenized_datasets.get("all") + if isinstance(tokenized_datasets, Dataset): + tokenized_datasets = {"train": tokenized_datasets} + + train_split = calib_split = None + + if do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_split = tokenized_datasets["train"] + if do_oneshot: + calib_split = tokenized_datasets.get("calibration") + if calib_split is None: + if "train" not in tokenized_datasets: + raise ValueError("--do_oneshot requires a calibration dataset") + calib_split = tokenized_datasets["train"] + + split_datasets = { + "train": train_split, + "calibration": calib_split, + } + return split_datasets diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index ea6481043..cfaf83f92 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -7,9 +7,7 @@ from llmcompressor.args import parse_args from llmcompressor.core.session_functions import active_session -from llmcompressor.transformers.finetune.data.data_helpers import ( - get_calibration_dataloader, -) +from llmcompressor.datasets import get_calibration_dataloader from llmcompressor.transformers.finetune.text_generation import ( initialize_model_from_path, initialize_processor_from_path, diff --git a/src/llmcompressor/transformers/finetune/data/data_helpers.py b/src/llmcompressor/transformers/finetune/data/data_helpers.py index bd28de314..ff56cfbb9 100644 --- a/src/llmcompressor/transformers/finetune/data/data_helpers.py +++ b/src/llmcompressor/transformers/finetune/data/data_helpers.py @@ -1,74 +1,18 @@ import logging import os -import re -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Dict, Optional -import torch from datasets import Dataset, load_dataset -from loguru import logger -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler -from transformers.data import default_data_collator LOGGER = logging.getLogger(__name__) LABELS_MASK_VALUE = -100 __all__ = [ - "format_calibration_data", "get_raw_dataset", - "make_dataset_splits", "get_custom_datasets_from_path", - "get_calibration_dataloader", ] -def format_calibration_data( - tokenized_dataset: Dataset, - num_calibration_samples: Optional[int] = None, - do_shuffle: bool = True, - collate_fn: Callable = default_data_collator, - accelerator: Optional[Any] = None, -) -> List[torch.Tensor]: - """ - Creates a 
dataloader out of the calibration dataset split, trimming it to - the desired number of calibration samples - - :param tokenized_dataset: dataset to convert to dataloader - :param num_calibration_samples: number of data samples to convert - :param do_shuffle: whether to shuffle the dataset before selecting calibration - samples, true by default - :param collate_fn: optional custom collate function, or use default - :param accelerator: optional accelerator for if preparing in FSDP mode - :return: list of trimmed calibration data tensors - """ - safe_calibration_samples = len(tokenized_dataset) - if num_calibration_samples is not None: - safe_calibration_samples = min(len(tokenized_dataset), num_calibration_samples) - if safe_calibration_samples != num_calibration_samples: - LOGGER.warn( - f"Requested {num_calibration_samples} calibration samples but " - f"the provided dataset only has {safe_calibration_samples}. " - ) - - if do_shuffle: - tokenized_dataset = tokenized_dataset.shuffle() - tokenized_calibration = tokenized_dataset.select(range(safe_calibration_samples)) - - dataloader_params = { - "batch_size": 1, - "sampler": RandomSampler(tokenized_calibration) - if do_shuffle - else SequentialSampler(tokenized_calibration), - "collate_fn": collate_fn, - "pin_memory": True, - } - - calib_dataloader = DataLoader(tokenized_calibration, **dataloader_params) - if accelerator: - calib_dataloader = accelerator.prepare(calib_dataloader) - - return calib_dataloader - - def get_raw_dataset( dataset_args, cache_dir: Optional[str] = None, @@ -94,47 +38,6 @@ def get_raw_dataset( return raw_datasets -def make_dataset_splits( - tokenized_datasets: Dict[str, Any], - do_train: bool = False, - do_oneshot: bool = False, -) -> Dict[str, Dataset]: - """ - Restructures the datasets dictionary based on what tasks will be run - train - - :param tokenized_datasets: dictionary of processed datasets - :param do_oneshot: Whether to store the calibration dataset - - :return: Datasets to be used by the requested tasks - """ - - # handles case where all splits are contained in a single dataset - if "all" in tokenized_datasets and len(tokenized_datasets) == 1: - tokenized_datasets = tokenized_datasets.get("all") - if isinstance(tokenized_datasets, Dataset): - tokenized_datasets = {"train": tokenized_datasets} - - train_split = calib_split = None - - if do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") - train_split = tokenized_datasets["train"] - if do_oneshot: - calib_split = tokenized_datasets.get("calibration") - if calib_split is None: - if "train" not in tokenized_datasets: - raise ValueError("--do_oneshot requires a calibration dataset") - calib_split = tokenized_datasets["train"] - - split_datasets = { - "train": train_split, - "calibration": calib_split, - } - return split_datasets - - def get_custom_datasets_from_path(path: str, ext: str = "json") -> Dict[str, str]: """ Get a dictionary of custom datasets from a directory path. 
Support HF's load_dataset @@ -232,78 +135,3 @@ def do_transform(candidate: str) -> bool: transform_dataset_key(dataset_key) return data_files - - -def get_calibration_dataloader( - dataset_args, - processor, - add_labels: bool = False, # for oneshot - do_oneshot=True, -) -> torch.utils.data.DataLoader: - """ - Loads datasets for each flow based on dataset_args, stores a Dataset for each - enabled flow in self.datasets - - :param processor: processor or tokenizer to use for dataset tokenization - :param add_labels: if True, add labels column to dataset splits - """ - if dataset_args.dataset is None: - logger.info( - "Running oneshot without calibration data. This is expected for " - "weight-only and dynamic quantization" - ) - return - - splits = dataset_args.splits - tokenized_datasets = {} - - def _get_split_name(inp_str): - # strip out split name, for ex train[60%:] -> train - match = re.match(r"(\w*)\[.*\]", inp_str) - if match is not None: - return match.group(1) - return inp_str - - if splits is None: - splits = {"all": None} - elif isinstance(splits, str): - splits = {_get_split_name(splits): splits} - elif isinstance(splits, List): - splits = {_get_split_name(s): s for s in splits} - - # default to custom dataset if dataset provided isn't a string - registry_id = ( - dataset_args.dataset if isinstance(dataset_args.dataset, str) else "custom" - ) - for split_name, split_str in splits.items(): - dataset = dataset_args.dataset - if hasattr(dataset, "column_names") and "input_ids" in dataset.column_names: - # dataset is already tokenized - tokenized_datasets[split_name] = dataset - else: - # dataset needs to be tokenized - from llmcompressor.transformers.finetune.data.base import ( - TextGenerationDataset, - ) - - dataset_manager = TextGenerationDataset.load_from_registry( - registry_id, - dataset_args=dataset_args, - split=split_str, - processor=processor, - ) - tokenized_datasets[split_name] = dataset_manager(add_labels=add_labels) - - datasets = make_dataset_splits( - tokenized_datasets, - do_oneshot=do_oneshot, - ) - - calibration_dataset = datasets.get("calibration") - - return format_calibration_data( - tokenized_dataset=calibration_dataset, - num_calibration_samples=dataset_args.num_calibration_samples, - do_shuffle=dataset_args.shuffle_calibration_samples, - collate_fn=dataset_args.data_collator, - ) diff --git a/src/llmcompressor/transformers/finetune/runner.py b/src/llmcompressor/transformers/finetune/runner.py index 75d963aa5..b45153b4f 100644 --- a/src/llmcompressor/transformers/finetune/runner.py +++ b/src/llmcompressor/transformers/finetune/runner.py @@ -23,10 +23,6 @@ ) from llmcompressor.recipe import Recipe, StageRunType from llmcompressor.transformers.finetune.data import TextGenerationDataset -from llmcompressor.transformers.finetune.data.data_helpers import ( - format_calibration_data, - make_dataset_splits, -) from llmcompressor.typing import Processor @@ -119,6 +115,8 @@ def _get_split_name(inp_str): ) tokenized_datasets[split_name] = dataset_manager(add_labels=add_labels) + from llmcompressor.datasets import make_dataset_splits + self.datasets = make_dataset_splits( tokenized_datasets, do_train=self._training_args.do_train, @@ -164,6 +162,7 @@ def run_sequential_stages( :param checkpoint: optional checkpoint to pick up a stage from """ + recipe_obj = Recipe.create_instance(self._recipe_args.recipe) with self.trainer.accelerator.main_process_first(): checkpoint_dir = self._model_args.model @@ -197,6 +196,7 @@ def run_sequential_stages( # run stage if run_type is 
StageRunType.ONESHOT: from llmcompressor import Oneshot + from llmcompressor.datasets import format_calibration_data self._model_args.model = model @@ -213,7 +213,6 @@ def run_sequential_stages( num_calibration_samples=self._dataset_args.num_calibration_samples, do_shuffle=self._dataset_args.shuffle_calibration_samples, collate_fn=self._dataset_args.data_collator, - accelerator=self.trainer.accelerator, ) if do_preprocess: diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py index 39165ffe6..a7138b186 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py @@ -1,10 +1,8 @@ import pytest from llmcompressor.args import DatasetArguments -from llmcompressor.transformers.finetune.data.data_helpers import ( - get_raw_dataset, - make_dataset_splits, -) +from llmcompressor.datasets import make_dataset_splits +from llmcompressor.transformers.finetune.data.data_helpers import get_raw_dataset @pytest.mark.unit diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py index 7198e0da3..3fc174acb 100644 --- a/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py +++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py @@ -11,10 +11,8 @@ RecipeArguments, TrainingArguments, ) +from llmcompressor.datasets import format_calibration_data from llmcompressor.transformers import TextGenerationDataset -from llmcompressor.transformers.finetune.data.data_helpers import ( - format_calibration_data, -) from llmcompressor.transformers.finetune.runner import StageRunner diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_owl.py b/tests/llmcompressor/transformers/obcq/test_obcq_owl.py index 4948c6da3..17effeb7a 100644 --- a/tests/llmcompressor/transformers/obcq/test_obcq_owl.py +++ b/tests/llmcompressor/transformers/obcq/test_obcq_owl.py @@ -3,10 +3,8 @@ from datasets import Dataset from transformers import AutoModelForCausalLM +from llmcompressor.datasets import format_calibration_data from llmcompressor.modifiers.obcq import SparseGPTModifier -from llmcompressor.transformers.finetune.data.data_helpers import ( - format_calibration_data, -) from llmcompressor.utils.pytorch.module import get_layers From 14ac2e714d68645951df2692231168f915b6b8bb Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 5 Mar 2025 10:41:13 -0800 Subject: [PATCH 20/23] [BugFix] Fix logging disabling bug and add tests (#1218) SUMMARY: Fixed logging and clear loggers enabling/disabling bug. Previously, any value on the right environment variables would disable logging. Now, we explicitly check for `true` TEST PLAN: Added unit tests for enabling logging. `make test` passes --------- Signed-off-by: Aman Gupta Co-authored-by: Dipika Sikka --- src/llmcompressor/logger.py | 8 ++++---- tests/unit/test_logger.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/llmcompressor/logger.py b/src/llmcompressor/logger.py index 332daeb3d..686da7564 100644 --- a/src/llmcompressor/logger.py +++ b/src/llmcompressor/logger.py @@ -53,9 +53,9 @@ class LoggerConfig: metrics_disabled: bool = False -def configure_logger(config: Optional[LoggerConfig] = None): +def configure_logger(config: Optional[LoggerConfig] = None) -> None: """ - Configure the metrics for LLM Compressor. 
+ Configure the logger for LLM Compressor. This function sets up the console and file logging as per the specified or default parameters. @@ -68,9 +68,9 @@ def configure_logger(config: Optional[LoggerConfig] = None): # env vars get priority if (disabled := os.getenv("LLM_COMPRESSOR_LOG_DISABLED")) is not None: - logger_config.disabled = disabled.lower() + logger_config.disabled = disabled.lower() == "true" if (clear_loggers := os.getenv("LLM_COMPRESSOR_CLEAR_LOGGERS")) is not None: - logger_config.clear_loggers = clear_loggers.lower() + logger_config.clear_loggers = clear_loggers.lower() == "true" if (console_log_level := os.getenv("LLM_COMPRESSOR_LOG_LEVEL")) is not None: logger_config.console_log_level = console_log_level.upper() if (log_file := os.getenv("LLM_COMPRESSOR_LOG_FILE")) is not None: diff --git a/tests/unit/test_logger.py b/tests/unit/test_logger.py index 0e0ac8925..1796293f7 100644 --- a/tests/unit/test_logger.py +++ b/tests/unit/test_logger.py @@ -103,3 +103,16 @@ def test_environment_variable_disable_logging(monkeypatch, capsys): captured = capsys.readouterr() assert captured.out == "" assert captured.err == "" + + +def test_environment_variable_enable_logging(monkeypatch, capsys): + # Test environment variable to enable logging + monkeypatch.setenv("LLM_COMPRESSOR_LOG_DISABLED", "false") + + configure_logger(config=LoggerConfig()) + logger.info("Info message") + logger.error("Error message") + + captured = capsys.readouterr() + assert captured.out.count("Info message") == 1 + assert captured.out.count("Error message") == 1 From 9d82f3587b47cfee759606c31d35641776a07c1a Mon Sep 17 00:00:00 2001 From: George Date: Thu, 6 Mar 2025 14:03:56 -0500 Subject: [PATCH 21/23] [Training] Unifying Preprocess + Postprocessing logic for Train/Oneshot (#1212) Order of reviews: https://github.com/vllm-project/llm-compressor/pull/1206 https://github.com/vllm-project/llm-compressor/pull/1207 https://github.com/vllm-project/llm-compressor/pull/1209 https://github.com/vllm-project/llm-compressor/pull/1212 <-- Here https://github.com/vllm-project/llm-compressor/pull/1214 SUMMARY: * Move the preprocessing and postprocessing logic out of `src/llmcompressor/transformers/finetune/text_generation.py` and into `src/llmcompressor/entrypoints/utils.py` TEST PLAN: Pass tests --- src/llmcompressor/entrypoints/__init__.py | 1 + src/llmcompressor/entrypoints/oneshot.py | 116 +------- src/llmcompressor/entrypoints/utils.py | 272 ++++++++++++++++++ .../transformers/finetune/text_generation.py | 195 ++----------- 4 files changed, 294 insertions(+), 290 deletions(-) create mode 100644 src/llmcompressor/entrypoints/utils.py diff --git a/src/llmcompressor/entrypoints/__init__.py b/src/llmcompressor/entrypoints/__init__.py index dd1d4aa83..299ab9084 100644 --- a/src/llmcompressor/entrypoints/__init__.py +++ b/src/llmcompressor/entrypoints/__init__.py @@ -1,2 +1,3 @@ # flake8: noqa from .oneshot import Oneshot, oneshot +from .utils import post_process, pre_process diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py index cfaf83f92..21e29057b 100644 --- a/src/llmcompressor/entrypoints/oneshot.py +++ b/src/llmcompressor/entrypoints/oneshot.py @@ -1,21 +1,12 @@ -from pathlib import PosixPath from typing import Optional -from loguru import logger from torch.utils.data import DataLoader from transformers import PreTrainedModel from llmcompressor.args import parse_args from llmcompressor.core.session_functions import active_session from llmcompressor.datasets import 
get_calibration_dataloader -from llmcompressor.transformers.finetune.text_generation import ( - initialize_model_from_path, - initialize_processor_from_path, -) -from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( - modify_save_pretrained, - patch_tied_tensors_bug, -) +from llmcompressor.entrypoints.utils import post_process, pre_process __all__ = ["Oneshot", "oneshot"] @@ -71,7 +62,7 @@ class Oneshot: Initializes the `Oneshot` object by parsing input arguments, performing preprocessing, and setting instance attributes. - run(**kwargs): + __call__(**kwargs): Performs the one-shot calibration process by preparing a calibration dataloader, applying recipe modifiers to the model, and executing postprocessing steps. @@ -86,17 +77,6 @@ class Oneshot: defined in the recipe. Each action is executed via the global `CompressionSession`. - _pre_process(): - Handles preprocessing steps, including model initialization, - tokenizer/processor setup, and resolving tied embedding issues. - - check_tied_embeddings(): - Logs a warning if `tie_word_embeddings=True`, which may interfere with - saving in the one-shot workflow. - - _post_process(): - Executes postprocessing steps such as saving the model and resetting - lifecycle actions, especially when a custom `output_dir` is specified. """ def __init__( @@ -151,7 +131,7 @@ def from_args( # only run for the first oneshot call if do_preprocess: - instance._pre_process() + pre_process(model_args) # Set instance attributes instance.model = instance.model_args.model @@ -172,7 +152,7 @@ def __call__(self): """ # TODO: move back once stage runner is removed # Preprocess the model and tokenizer/processor - self._pre_process() + pre_process(self.model_args) self.model = self.model_args.model self.recipe = self.recipe_args.recipe self.processor = self.model_args.processor @@ -183,24 +163,7 @@ def __call__(self): self.apply_recipe_modifiers( calibration_dataloader=calibration_dataloader, ) - self._post_process() - - def save(self): - """ - Saves the model and tokenizer/processor to the output directory. - - The model is saved in a compressed format if specified in `model_args`. - The tokenizer or processor, if available, is also saved. - - Raises: - ValueError: If saving fails due to an invalid `output_dir` or other issues. - """ - self.model.save_pretrained( - self.output_dir, - save_compressed=self.model_args.save_compressed, - ) - if self.processor is not None: - self.processor.save_pretrained(self.output_dir) + post_process(model_args=self.model_args, output_dir=self.output_dir) def apply_recipe_modifiers( self, @@ -236,75 +199,6 @@ def apply_recipe_modifiers( session.initialize(**session_kwargs) session.finalize(**session_kwargs) - def _pre_process(self): - """ - Prepares the model and tokenizer/processor for calibration. - - - Initializes the model if it's specified as a path or string. - - Applies patches to fix tied tensor issues and modifies `save_pretrained` - behavior. - - Initializes the processor if specified as a path or `None`. - - Sets the minimum tokens per module if `dataset_args` are provided. - - Raises: - FileNotFoundError: If the model or processor path is invalid. 
- """ - self.check_tied_embeddings() - - # Initialize model - if isinstance(self.model_args.model, (str, PosixPath)): - self.model_args.model, _ = initialize_model_from_path(self.model_args) - - patch_tied_tensors_bug(self.model_args.model) - modify_save_pretrained(self.model_args.model) - - # Initialize processor - if isinstance(self.model_args.processor, (str, type(None))): - self.model_args.processor = initialize_processor_from_path( - self.model_args, self.model_args.model - ) - # TODO: move to init once stage runner is removed - self.processor = self.model_args.processor - - # Set minimum tokens per module if data arguments are provided - if self.dataset_args: - self.min_tokens_per_module = self.dataset_args.min_tokens_per_module - - def check_tied_embeddings(self): - """ - Logs a warning if the model has tied word embeddings. - - The `tie_word_embeddings` flag may cause issues during saving in the one-shot - calibration workflow due to shared tensor addresses. - """ - if self.model_args.tie_word_embeddings: - logger.debug( - "The tie_word_embeddings flag is by default set to False. " - "This guarantees that the one-shot algorithm saves the final " - "weights without errors. Detected tie_word_embeddings=True. " - "This may cause issues with the one-shot algorithm on save." - ) - - def _post_process(self): - """ - Executes post-calibration steps. - - This method saves the model and resets lifecycle actions if the `output_dir` - is not the default directory. - - Raises: - ValueError: If saving fails due to invalid configurations. - """ - if self.output_dir is not None: - self.save() - return - - logger.warning( - "Optimized model not saved. To save, please provide", - "`output_dir` as input arg.", - "Ex. `oneshot(..., output_dir=...)`", - ) - def oneshot(**kwargs) -> PreTrainedModel: one_shot = Oneshot(**kwargs) diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py new file mode 100644 index 000000000..c8cc3ba07 --- /dev/null +++ b/src/llmcompressor/entrypoints/utils.py @@ -0,0 +1,272 @@ +import inspect +import os +from pathlib import PosixPath +from typing import Optional, Tuple + +from loguru import logger +from torch.nn import Module +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoProcessor, + PreTrainedModel, + set_seed, +) +from transformers.utils.quantization_config import CompressedTensorsConfig + +from llmcompressor.args import ModelArguments, TrainingArguments +from llmcompressor.pytorch.model_load.helpers import fallback_to_cpu, parse_dtype +from llmcompressor.transformers.sparsification.compressed_tensors_utils import ( + modify_save_pretrained, + patch_tied_tensors_bug, +) +from llmcompressor.transformers.utils.helpers import ( + detect_last_checkpoint, + is_model_ct_quantized_from_path, +) +from llmcompressor.typing import Processor +from llmcompressor.utils.fsdp.helpers import is_fsdp_model + + +def pre_process(model_args: "ModelArguments"): + """ + Prepares the model and tokenizer/processor for calibration. + - Initializes the model if it's specified as a path or string. + - Applies patches to fix tied tensor issues and modifies `save_pretrained` + behavior. + - Initializes the processor if specified as a path or `None`. + - Sets the minimum tokens per module if `dataset_args` are provided. + Raises: + FileNotFoundError: If the model or processor path is invalid. 
+ """ + _warn_tied_embeddings(model_args.tie_word_embeddings) + + # Initialize model + if isinstance(model_args.model, (str, PosixPath)): + model, distill_teacher = initialize_model_from_path(model_args) + if is_fsdp_model(model): + raise NotImplementedError( + "FSDP models are not supported in the current release but will be " + "suported in future releases of LLM Compressor." + ) + model_args.model = model + model_args.distill_teacher = distill_teacher + + # Initialize processor + if isinstance(model_args.processor, (str, type(None))): + model_args.processor = initialize_processor_from_path( + model_args, model_args.model + ) + + # untie tie_word_embeddings weights + patch_tied_tensors_bug(model_args.model) + + # wrap model.save_pretrained + modify_save_pretrained(model_args.model) + + +def post_process( + model_args: "ModelArguments", + output_dir: Optional[str] = None, +): + """ + Saves the model and tokenizer/processor to the output directory. + + If the `output_dir` is not the default directory, the method resets lifecycle + actions. The model is saved in a compressed format if specified in `model_args`. + Additionally, the tokenizer or processor, if available, is also saved. + + Raises: + ValueError: If saving fails due to an invalid `output_dir` or other issues. + """ + if output_dir is not None: + model_args.model.save_pretrained( + output_dir, + save_compressed=model_args.save_compressed, + ) + if model_args.processor: + model_args.processor.save_pretrained(output_dir) + return + + logger.warning( + "Optimized model is not saved. To save, please provide", + "`output_dir` as input arg.", + "Ex. `oneshot(..., output_dir=...)`", + ) + + +def _warn_tied_embeddings(tie_word_embeddings: bool = False): + """ + Logs a warning if the model has tied word embeddings. + The `tie_word_embeddings` flag may cause issues during saving in the one-shot + calibration workflow due to shared tensor addresses. + """ + if tie_word_embeddings: + logger.debug( + "The tie_word_embeddings flag is by default set to False. " + "This guarantees that the one-shot algorithm saves the final " + "weights without errors. Detected tie_word_embeddings=True. " + "This may cause issues with the one-shot algorithm on save." + ) + + +def initialize_model_from_path( + model_args: ModelArguments, + training_args: Optional[TrainingArguments] = None, +) -> Tuple[PreTrainedModel, Optional[PreTrainedModel]]: + # Load pretrained model + # The .from_pretrained methods guarantee that only one local process can + # concurrently download model & vocab. 
+ model_path = model_args.model + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + tie_word_embeddings=model_args.tie_word_embeddings, + trust_remote_code=model_args.trust_remote_code_model, + ) + + last_checkpoint = None + teacher = None + + if training_args is not None: + # Load teacher configuration if applicable + teacher_config = ( + AutoConfig.from_pretrained( + model_args.distill_teacher, + use_auth_token=True if model_args.use_auth_token else None, + tie_word_embeddings=model_args.tie_word_embeddings, + trust_remote_code=model_args.trust_remote_code_model, + ) + if model_args.distill_teacher + else None + ) + + # Detect last checkpoint + last_checkpoint = detect_last_checkpoint(training_args, model_args=model_args) + + # Set seed before initializing model + set_seed(training_args.seed) + + # Initialize teacher model if teacher path is provided + if model_args.distill_teacher is not None: + teacher_device_map = ( + None + if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" + else "auto" + ) + teacher_kwargs = { + "config": teacher_config, + "cache_dir": model_args.cache_dir, + "use_auth_token": True if model_args.use_auth_token else None, + "torch_dtype": parse_dtype(model_args.precision), + "device_map": teacher_device_map, + "trust_remote_code": model_args.trust_remote_code_model, + } + + teacher = AutoModelForCausalLM.from_pretrained( + model_args.distill_teacher, + **teacher_kwargs, + ) + if "sequence_length" in teacher_kwargs: + teacher.seqlen = teacher_kwargs["sequence_length"] + + model_path = ( + last_checkpoint or model_args.model + if hasattr(model_args, "model") + else model_args.model_name_or_path + ) + + # Fallback to CPU if GPU requested and not available + model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device) + + device_map = model_args.oneshot_device + if training_args is not None and training_args.do_train: + device_map = "auto" + + model_kwargs = { + "config": config, + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + "torch_dtype": parse_dtype(model_args.precision), + "device_map": device_map, + "trust_remote_code": model_args.trust_remote_code_model, + } + + # optimized models must be decompressed to carry out oneshot/train/etc + if is_model_ct_quantized_from_path(model_path): + model_kwargs["quantization_config"] = CompressedTensorsConfig( + run_compressed=False + ) + + model = AutoModelForCausalLM.from_pretrained( + model_path, + **model_kwargs, + ) + if "sequence_length" in model_kwargs: + model.seqlen = model_kwargs["sequence_length"] + + return model, teacher + + +def initialize_processor_from_path( + model_args: ModelArguments, + model: PreTrainedModel, + teacher: Optional[PreTrainedModel] = None, +) -> Processor: + processor_src = model_args.processor or get_processor_name_from_model( + model, teacher + ) + # The use_fast=True option is not currently supported safely in Transformers + # See: https://github.com/huggingface/transformers/pull/34836#issuecomment-2491809727 # noqa: E501 + try: + processor = AutoProcessor.from_pretrained( + processor_src, + cache_dir=model_args.cache_dir, + use_fast=True, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + 
trust_remote_code=model_args.trust_remote_code_model, + ) + except Exception: + logger.debug("Could not load fast processor, loading slow processor instead") + processor = AutoProcessor.from_pretrained( + processor_src, + cache_dir=model_args.cache_dir, + use_fast=False, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + trust_remote_code=model_args.trust_remote_code_model, + ) + + return processor + + +def get_processor_name_from_model(student: Module, teacher: Optional[Module]) -> str: + """ + Get a processor/tokenizer source used for both student and teacher, assuming + that they could be shared + + :param student: the student model + :param teacher: the teacher model + :return: the source for the processor/tokenizer shared between teacher and model + """ + if teacher is not None and teacher not in ("disable", "self"): + student_forward_params = list( + inspect.signature(student.forward).parameters.keys() + ) + teacher_forward_params = list( + inspect.signature(teacher.forward).parameters.keys() + ) + diff = [p for p in student_forward_params if p not in teacher_forward_params] + if diff: + raise RuntimeError( + "Teacher tokenizer cannot be used for student " + f"due to missing args: {diff}" + ) + src_model = teacher + else: + src_model = student + return src_model.config._name_or_path diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index d03867b85..66652b686 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -17,22 +17,12 @@ # Adapted from https://github.com/huggingface/transformers # vllm-project: no copyright -import os import warnings from pathlib import PosixPath -from typing import Optional from compressed_tensors.utils.helpers import deprecated from loguru import logger -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoProcessor, - HfArgumentParser, - PreTrainedModel, - set_seed, -) -from transformers.utils.quantization_config import CompressedTensorsConfig +from transformers import HfArgumentParser from llmcompressor.args import ( DatasetArguments, @@ -41,11 +31,7 @@ TrainingArguments, ) from llmcompressor.core import reset_session -from llmcompressor.pytorch.model_load.helpers import ( - fallback_to_cpu, - parse_dtype, - save_checkpoint, -) +from llmcompressor.pytorch.model_load.helpers import save_checkpoint from llmcompressor.recipe import Recipe, StageRunType from llmcompressor.transformers.finetune.runner import StageRunner from llmcompressor.transformers.finetune.trainer import Trainer @@ -53,14 +39,6 @@ modify_save_pretrained, patch_tied_tensors_bug, ) -from llmcompressor.transformers.sparsification.sparse_model import ( - get_processor_name_from_model, -) -from llmcompressor.transformers.utils.helpers import ( - detect_last_checkpoint, - is_model_ct_quantized_from_path, -) -from llmcompressor.typing import Processor from llmcompressor.utils.fsdp.helpers import is_fsdp_model @@ -73,15 +51,6 @@ def train(**kwargs): main(model_args, dataset_args, recipe_args, training_args) -def eval(**kwargs): - """ - CLI entrypoint for running evaluation - """ - model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs) - training_args.do_eval = True - main(model_args, dataset_args, recipe_args, training_args) - - @deprecated( message=( "`from llmcompressor.transformers import oneshot` is deprecated, " @@ -98,10 +67,14 @@ def 
apply(**kwargs): """ CLI entrypoint for any of training, oneshot """ - report_to = kwargs.get("report_to", None) - model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs) + from llmcompressor.args import parse_args + + model_args, dataset_args, recipe_args, training_args, _ = parse_args( + include_training_args=True, **kwargs + ) training_args.run_stages = True + report_to = kwargs.get("report_to", None) if report_to is None: # user didn't specify any reporters # get rid of the reporters inferred from hugging face training_args.report_to = [] @@ -123,7 +96,6 @@ def parse_args(**kwargs): src/llmcompressor/transformers/utils/arg_parser/recipe_args.py * training_args in src/llmcompressor/transformers/utils/arg_parser/training_args.py - """ parser = HfArgumentParser( (ModelArguments, DatasetArguments, RecipeArguments, TrainingArguments) @@ -161,147 +133,6 @@ def parse_args(**kwargs): return model_args, dataset_args, recipe_args, training_args -def initialize_model_from_path( - model_args: ModelArguments, - training_args: Optional[TrainingArguments] = None, -): - # Load pretrained model - # The .from_pretrained methods guarantee that only one local process can - # concurrently download model & vocab. - model_path = model_args.model - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - tie_word_embeddings=model_args.tie_word_embeddings, - trust_remote_code=model_args.trust_remote_code_model, - ) - - last_checkpoint = None - teacher = None - - if training_args is not None: - # Load teacher configuration if applicable - teacher_config = ( - AutoConfig.from_pretrained( - model_args.distill_teacher, - use_auth_token=True if model_args.use_auth_token else None, - tie_word_embeddings=model_args.tie_word_embeddings, - trust_remote_code=model_args.trust_remote_code_model, - ) - if model_args.distill_teacher - else None - ) - - # Detect last checkpoint - last_checkpoint = detect_last_checkpoint(training_args, model_args=model_args) - - # Set seed before initializing model - set_seed(training_args.seed) - - # Initialize teacher model if teacher path is provided - if model_args.distill_teacher is not None: - teacher_device_map = ( - None - if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" - else "auto" - ) - teacher_kwargs = { - "config": teacher_config, - "cache_dir": model_args.cache_dir, - "use_auth_token": True if model_args.use_auth_token else None, - "torch_dtype": parse_dtype(model_args.precision), - "device_map": teacher_device_map, - "trust_remote_code": model_args.trust_remote_code_model, - } - - teacher = AutoModelForCausalLM.from_pretrained( - model_args.distill_teacher, - **teacher_kwargs, - ) - if "sequence_length" in teacher_kwargs: - teacher.seqlen = teacher_kwargs["sequence_length"] - - model_path = ( - last_checkpoint or model_args.model - if hasattr(model_args, "model") - else model_args.model_name_or_path - ) - - # Fallback to CPU if GPU requested and not available - model_args.oneshot_device = fallback_to_cpu(model_args.oneshot_device) - - # Trainer handles device assignment for FSDP and training, don't do mapping here - # if running oneshot outside of FSDP, apply user device settings - - fsdp_enabled = os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" - - device_map = model_args.oneshot_device - if not fsdp_enabled and training_args is not None and 
training_args.do_train: - device_map = "auto" - - model_kwargs = { - "config": config, - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - "torch_dtype": parse_dtype(model_args.precision), - "device_map": device_map, - "trust_remote_code": model_args.trust_remote_code_model, - } - - # this calls from_pretrained under the hood so should be FSDP safe - - # optimized models must be decompressed to carry out oneshot/train/etc - if is_model_ct_quantized_from_path(model_path): - model_kwargs["quantization_config"] = CompressedTensorsConfig( - run_compressed=False - ) - - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ) - if "sequence_length" in model_kwargs: - model.seqlen = model_kwargs["sequence_length"] - - return model, teacher - - -def initialize_processor_from_path( - model_args: ModelArguments, - model: PreTrainedModel, - teacher: Optional[PreTrainedModel] = None, -) -> Processor: - processor_src = model_args.processor or get_processor_name_from_model( - model, teacher - ) - # The use_fast=True option is not currently supported safely in Transformers - # See: https://github.com/huggingface/transformers/pull/34836#issuecomment-2491809727 # noqa: E501 - try: - processor = AutoProcessor.from_pretrained( - processor_src, - cache_dir=model_args.cache_dir, - use_fast=True, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - trust_remote_code=model_args.trust_remote_code_model, - ) - except Exception: - logger.debug("Could not load fast processor, loading slow processor instead") - processor = AutoProcessor.from_pretrained( - processor_src, - cache_dir=model_args.cache_dir, - use_fast=False, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - trust_remote_code=model_args.trust_remote_code_model, - ) - - return processor - - def main( model_args: ModelArguments, dataset_args: DatasetArguments, @@ -326,10 +157,15 @@ def main( :param model_args: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from - :param dataset_args: Arguments pertaining to what data we are going to input - our model for training + :param dataset_args: Arguments pertaining to what data we are + going to input our model for training :param training_args: Arguments pertaining to training loop configuration """ + from llmcompressor.args import TrainingArguments + from llmcompressor.entrypoints.utils import ( + initialize_model_from_path, + initialize_processor_from_path, + ) # Temporary warning, to be removed if model_args.tie_word_embeddings is True: @@ -426,6 +262,7 @@ def main( # exit immediately return + # Training if training_args.do_train: checkpoint = None From 4607036e491cae255d4985ebd657260d2cfba286 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 6 Mar 2025 17:59:43 -0500 Subject: [PATCH 22/23] [Docs] Add info on when to use which PTQ/Sparsification (#1157) SUMMARY: Current README shows which algo we support + how to run. However, to a user it is still hard to understand when to use which. Add more info on based on the users use-case and hardware the optimization to apply. 
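As a concrete illustration of the calibrated W4A16 path covered in the README changes below, a minimal sketch based on the linked examples (the model id and calibration dataset are illustrative, not prescribed by this PR):

```python3
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# GPTQ compresses weights to 4 bits and therefore needs calibration data,
# unlike the data-free W8A8-FP8 dynamic path described in the README below.
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # illustrative model id
    dataset="open_platypus",                      # illustrative calibration set
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    output_dir="Meta-Llama-3-8B-Instruct-W4A16",
)
```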
TEST PLAN: N/A --- README.md | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e61e2a49e..3ae778835 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,32 @@ * SmoothQuant * SparseGPT +### When to Use Which Optimization + +#### PTQ +PTQ is performed to reduce the precision of quantizable weights (e.g., linear layers) to a lower bit-width. Supported formats are: + +##### [W4A16](./examples/quantization_w4a16/README.md) +- Uses GPTQ to compress weights to 4 bits. Requires calibration dataset. +- Useful speed ups in low QPS regimes with more weight compression. +- Recommended for any GPUs types. +##### [W8A8-INT8](./examples/quantization_w8a8_int8/README.md) +- Uses channel-wise quantization to compress weights to 8 bits using GPTQ, and uses dynamic per-token quantization to compress activations to 8 bits. Requires calibration dataset for weight quantization. Activation quantization is carried out during inference on vLLM. +- Useful for speed ups in high QPS regimes or offline serving on vLLM. +- Recommended for NVIDIA GPUs with compute capability <8.9 (Ampere, Turing, Volta, Pascal, or older). +##### [W8A8-FP8](./examples/quantization_w8a8_fp8/README.md) +- Uses channel-wise quantization to compress weights to 8 bits, and uses dynamic per-token quantization to compress activations to 8 bits. Does not require calibration dataset. Activation quantization is carried out during inference on vLLM. +- Useful for speed ups in high QPS regimes or offline serving on vLLM. +- Recommended for NVIDIA GPUs with compute capability >8.9 (Hopper and Ada Lovelace). + +#### Sparsification +Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. Supported formats include: + +##### [2:4-Sparsity with FP8 Weight, FP8 Input Activation](./examples/sparse_2of4_quantization_fp8/README.md) +- Uses (1) semi-structured sparsity (SparseGPT), where, for every four contiguous weights in a tensor, two are set to zero. (2) Uses channel-wise quantization to compress weights to 8 bits and dynamic per-token quantization to compress activations to 8 bits. +- Useful for better inference than W8A8-fp8, with almost no drop in its evaluation score [blog](https://neuralmagic.com/blog/24-sparse-llama-fp8-sota-performance-for-nvidia-hopper-gpus/). Note: Small models may experience accuracy drops when the remaining non-zero weights are insufficient to recapitulate the original distribution. +- Recommended for compute capability >8.9 (Hopper and Ada Lovelace). 
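By contrast, the W8A8-FP8 dynamic path described above needs no calibration set; a minimal sketch along the lines of the linked fp8 example (model id illustrative):

```python3
from transformers import AutoModelForCausalLM
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # illustrative
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# FP8 dynamic: channel-wise weight scales are computed here, while per-token
# activation scales are computed at runtime in vLLM, so no dataset is passed.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

oneshot(model=model, recipe=recipe, output_dir=model_id.split("/")[-1] + "-FP8-Dynamic")
```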
+ ## Installation @@ -35,16 +61,16 @@ pip install llmcompressor ### End-to-End Examples Applying quantization with `llmcompressor`: -* [Activation quantization to `int8`](examples/quantization_w8a8_int8) -* [Activation quantization to `fp8`](examples/quantization_w8a8_fp8) -* [Weight only quantization to `int4`](examples/quantization_w4a16) -* [Quantizing MoE LLMs](examples/quantizing_moe) -* [Quantizing Vision-Language Models](examples/multimodal_vision) -* [Quantizing Audio-Language Models](examples/multimodal_audio) +* [Activation quantization to `int8`](examples/quantization_w8a8_int8/README.md) +* [Activation quantization to `fp8`](examples/quantization_w8a8_fp8/README.md) +* [Weight only quantization to `int4`](examples/quantization_w4a16/README.md) +* [Quantizing MoE LLMs](examples/quantizing_moe/README.md) +* [Quantizing Vision-Language Models](examples/multimodal_vision/README.md) +* [Quantizing Audio-Language Models](examples/multimodal_audio/README.md) ### User Guides Deep dives into advanced usage of `llmcompressor`: -* [Quantizing with large models with the help of `accelerate`](examples/big_models_with_accelerate) +* [Quantizing with large models with the help of `accelerate`](examples/big_models_with_accelerate/README.md) ## Quick Tour From 2a5955413e16a4dee4e6972e69736df8bfdfe878 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 7 Mar 2025 12:23:21 -0600 Subject: [PATCH 23/23] [Callbacks] Remove `MagnitudePruningModifier.leave_enabled` (#1198) ## Purpose ## * Simplify the modifier lifecycle by removing the ability for modifiers to affect the model after the modifier's `end` event * This allows the `on_event` method to be removed in a future change ## Background ## * The `leave_enabled` option was originally intended as a shortcut to simplify recipes which used magnitude pruning during the iterative pruning, then needed the masks to stay enabled during stabilization SFT * This change proposes making the recipe clearer by requiring the ConstantPruningModifier after the MagnitudePruningModifier becomes inactive ## Changes ## * Remove `MagnitudePruningModifier.leave_enabled` with a deprecation warning Signed-off-by: Kyle Sayers --- .../modifiers/pruning/magnitude/base.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/llmcompressor/modifiers/pruning/magnitude/base.py b/src/llmcompressor/modifiers/pruning/magnitude/base.py index e557ef091..fb0fa1817 100644 --- a/src/llmcompressor/modifiers/pruning/magnitude/base.py +++ b/src/llmcompressor/modifiers/pruning/magnitude/base.py @@ -1,5 +1,8 @@ +import warnings from typing import Any, Dict, List, Union +from pydantic import field_validator + from llmcompressor.core import Event, EventType, ModelParameterizedLayer, State from llmcompressor.modifiers import Modifier from llmcompressor.modifiers.pruning.helpers import ( @@ -25,7 +28,7 @@ class MagnitudePruningModifier(Modifier, LayerParamMasking): update_scheduler: str = "cubic" scheduler_args: Dict[str, Any] = {} mask_structure: str = "unstructured" - leave_enabled: bool = True + leave_enabled: bool = False apply_globally: bool = False parameterized_layers_: Dict[str, ModelParameterizedLayer] = None @@ -35,6 +38,14 @@ class MagnitudePruningModifier(Modifier, LayerParamMasking): mask_creator_function_: MaskCreatorType = None current_sparsity_: float = None + @field_validator("leave_enabled") + def validate_leave_enabled(value: bool) -> bool: + warnings.warn( + "MagnitudePruningModifier.leave_enable has been deprecated", + 
DeprecationWarning, + ) + return False + def on_initialize(self, state: State, **kwargs) -> bool: if self.apply_globally: raise NotImplementedError("global pruning not implemented yet for PyTorch") @@ -75,9 +86,8 @@ def on_initialize(self, state: State, **kwargs) -> bool: return True def on_finalize(self, state: State, **kwargs) -> bool: - if not self.leave_enabled: - for layer_param_name, _ in self.parameterized_layers_.items(): - self.remove_mask(layer_param_name) + for layer_param_name, _ in self.parameterized_layers_.items(): + self.remove_mask(layer_param_name) return True @@ -119,12 +129,7 @@ def on_update(self, state: State, event: Event, **kwargs): self._update_masks(event) def on_end(self, state: State, event: Event, **kwargs): - if not self.leave_enabled: - self.disable_masks() - - def on_event(self, state: State, event: Event, **kwargs): - if event.current_index >= self.end and self.leave_enabled: - self._update_masks(event) + self.disable_masks() def _update_masks(self, event: Event): if event.type_ == EventType.OPTIM_PRE_STEP and not self._use_hooks:
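As noted in the Changes section above, a recipe that previously relied on `leave_enabled=True` would now pair the two modifiers explicitly so the masks stay fixed through stabilization SFT. A rough sketch under assumed arguments (stage boundaries are illustrative, and the sparsity/target arguments each modifier requires are omitted; none of this is taken from the patch itself):

```python3
from llmcompressor.modifiers.pruning import (
    ConstantPruningModifier,
    MagnitudePruningModifier,
)

recipe = [
    # iterative magnitude pruning during the pruning stage
    MagnitudePruningModifier(start=0, end=5, mask_structure="unstructured"),
    # explicitly hold the resulting masks in place afterwards,
    # instead of relying on the removed leave_enabled behavior
    ConstantPruningModifier(start=5),
]
```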