vllm-project · dsikka · Mar 10, 2025 · Mar 10, 2025
diff --git a/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml b/tests/llmcompressor/transformers/compression/configs/channelwise_15m.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "Xenova/llama2.c-stories15M"
+model_stub: "nm-testing/llama2.c-stories15M"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_channel.yaml"
diff --git a/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml b/tests/llmcompressor/transformers/compression/configs/fp8_15m.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "Xenova/llama2.c-stories15M"
+model_stub: "nm-testing/llama2.c-stories15M"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_fp8.yaml"
diff --git a/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml b/tests/llmcompressor/transformers/compression/configs/inputs_15m.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "Xenova/llama2.c-stories15M"
+model_stub: "nm-testing/llama2.c-stories15M"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_full.yaml"
diff --git a/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml b/tests/llmcompressor/transformers/compression/configs/weights_only_15m.yaml
@@ -1,4 +1,4 @@
 cadence: "commit"
 test_type: "regression"
-model_stub: "Xenova/llama2.c-stories15M"
+model_stub: "nm-testing/llama2.c-stories15M"
 new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_weight.yaml"
diff --git a/tests/llmcompressor/transformers/finetune/data/conftest.py b/tests/llmcompressor/transformers/finetune/data/conftest.py
@@ -6,7 +6,7 @@
 
 @pytest.fixture
 def tiny_llama_path():
-    return "Xenova/llama2.c-stories15M"
+    return "nm-testing/llama2.c-stories15M"
 
 
 @pytest.fixture

diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/config1.yaml
@@ -1,5 +1,5 @@
 cadence: "commit"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 file_extension: json
 num_train_epochs: 1
diff --git a/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml b/tests/llmcompressor/transformers/finetune/finetune_custom/config2.yaml
@@ -1,5 +1,5 @@
 cadence: "commit"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 file_extension: csv
 num_train_epochs: 1
diff --git a/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml b/tests/llmcompressor/transformers/finetune/finetune_generic/config1.yaml
@@ -1,4 +1,4 @@
 cadence: "nightly"
 test_type: "regression"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
diff --git a/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml b/tests/llmcompressor/transformers/finetune/finetune_oneshot_configs/config.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: wikitext
 dataset_config_name: "wikitext-2-raw-v1"
 recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml"

diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
@@ -113,7 +113,7 @@ def tearDown(self):
 @pytest.mark.integration
 @parameterized_class(parse_params(CONFIGS_DIRECTORY))
 class TestOneshotCustomDatasetSmall(TestFinetuneNoRecipeCustomDataset):
-    model = None  # "Xenova/llama2.c-stories15M"
+    model = None  # "nm-testing/llama2.c-stories15M"
     file_extension = None  # ["json", "csv"]
     num_train_epochs = None
 

diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
@@ -21,7 +21,7 @@ def setUp(self):
     def test_oneshot_sparsification_then_finetune(self):
         recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
         model = AutoModelForCausalLM.from_pretrained(
-            "Xenova/llama2.c-stories15M", device_map="auto"
+            "nm-testing/llama2.c-stories15M", device_map="auto"
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -51,7 +51,7 @@ def test_oneshot_sparsification_then_finetune(self):
             quantization_config=self.quantization_config,
         )
         distill_teacher = AutoModelForCausalLM.from_pretrained(
-            "Xenova/llama2.c-stories15M", device_map="auto"
+            "nm-testing/llama2.c-stories15M", device_map="auto"
         )
         dataset = "open_platypus"
         concatenate_data = False

diff --git a/tests/llmcompressor/transformers/finetune/test_session_mixin.py b/tests/llmcompressor/transformers/finetune/test_session_mixin.py
@@ -32,7 +32,7 @@ def __init__(
 
 @pytest.mark.unit
 def test_mixin_init():
-    model_state_path = "Xenova/llama2.c-stories15M"
+    model_state_path = "nm-testing/llama2.c-stories15M"
     model = AutoModelForCausalLM.from_pretrained(model_state_path)
     recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml"
 
@@ -45,7 +45,7 @@ def test_mixin_init():
 
 @pytest.fixture
 def mixin_trainer():
-    model_state_path = "Xenova/llama2.c-stories15M"
+    model_state_path = "nm-testing/llama2.c-stories15M"
     model = AutoModelForCausalLM.from_pretrained(model_state_path)
     recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml"
     train_dataset = "open-platypus"

diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant.yaml
@@ -1,6 +1,6 @@
 cadence: "nightly"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/obcq/recipes/quant.yaml"
 num_samples: 32

diff --git a/.../llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml b/.../llmcompressor/transformers/obcq/obcq_configs/completion/tiny_llama_quant_and_sparse.yaml
@@ -1,6 +1,6 @@
 cadence: "nightly"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
 num_samples: 32

diff --git a/.../obcq/obcq_configs/consec_runs/mask_structure/tiny_llama_mask_structure_preservation.yaml b/.../obcq/obcq_configs/consec_runs/mask_structure/tiny_llama_mask_structure_preservation.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 initial_pruning_only_recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse_with_mask_structure.yaml"
 initial_sparsity: 0.5

diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/tiny_llama_consec_runs.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 first_recipe: "tests/llmcompressor/transformers/obcq/recipes/quant_and_sparse.yaml"
 second_recipe: "tests/llmcompressor/transformers/obcq/recipes/additional_sparsity.yaml"
diff --git a/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml b/tests/llmcompressor/transformers/obcq/obcq_configs/sparse/tiny_llama_sparse.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "sanity"
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/obcq/recipes/sparse.yaml"
 sparsity: 0.3
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py b/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py
@@ -9,7 +9,7 @@
 def test_infer_targets():
     modifier = SparseGPTModifier(sparsity=0.0)
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M")
+        model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
 
     inferred = modifier._infer_sequential_targets(model)
     assert inferred == ["LlamaDecoderLayer"]
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py b/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py
@@ -16,7 +16,7 @@ def setUp(self):
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
         self.model = AutoModelForCausalLM.from_pretrained(
-            "Xenova/llama2.c-stories15M", device_map=self.device
+            "nm-testing/llama2.c-stories15M", device_map=self.device
         )
 
         self.kwargs = {

diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_owl.py b/tests/llmcompressor/transformers/obcq/test_obcq_owl.py
@@ -18,7 +18,7 @@ def test_infer_owl_layer_sparsity():
     modifier = SparseGPTModifier(
         sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05
     )
-    model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M")
+    model = AutoModelForCausalLM.from_pretrained("nm-testing/llama2.c-stories15M")
 
     dataset = Dataset.from_dict(
         {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))}

diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf1.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf1.yaml
@@ -1,7 +1,7 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 recipe: |
   test_stage:

diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf2.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf2.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf3.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf3.yaml
@@ -1,7 +1,7 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: "gsm8k"
 dataset_config_name: "main"
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf4.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf4.yaml
@@ -1,7 +1,7 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: False
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: "gsm8k"
 dataset_config_name: "main"
 recipe: |

diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf5.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf5.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: True
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: open_platypus
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf6.yaml b/tests/llmcompressor/transformers/oneshot/oneshot_configs/tiny_stories_conf6.yaml
@@ -1,6 +1,6 @@
 cadence: "commit"
 test_type: "smoke"
 tokenize: True
-model: "Xenova/llama2.c-stories15M"
+model: "nm-testing/llama2.c-stories15M"
 dataset: "gsm8k"
 recipe: "tests/llmcompressor/transformers/oneshot/oneshot_configs/recipes/recipe.yaml"
diff --git a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
@@ -45,7 +45,7 @@
 def test_sparse_model_reload(compressed, config, dtype, tmp_path):
     recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
     expected_sparsity = 0.5
-    model_path = "Xenova/llama2.c-stories15M"
+    model_path = "nm-testing/llama2.c-stories15M"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"
@@ -135,7 +135,7 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path):
 def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed):
     reset_session()
 
-    model_path = "Xenova/llama2.c-stories15M"
+    model_path = "nm-testing/llama2.c-stories15M"
     model = AutoModelForCausalLM.from_pretrained(model_path)
 
     inferred_global_sparsity = SparsityConfigMetadata.infer_global_sparsity(model)
@@ -170,7 +170,7 @@ def test_quant_model_reload(format, dtype, tmp_path):
     recipe_str = (
         "tests/llmcompressor/transformers/compression/recipes/new_quant_simple.yaml"
     )
-    model_path = "Xenova/llama2.c-stories15M"
+    model_path = "nm-testing/llama2.c-stories15M"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"
@@ -253,7 +253,7 @@ def test_quant_model_reload(format, dtype, tmp_path):
     ],
 )
 def test_model_reload(offload, torch_dtype, tie_word_embeddings, device_map, tmp_path):
-    model_path = "Xenova/llama2.c-stories15M"
+    model_path = "nm-testing/llama2.c-stories15M"
     save_path = tmp_path / "save_path"
 
     model = AutoModelForCausalLM.from_pretrained(
@@ -313,7 +313,7 @@ def test_model_shared_tensors(
 ):
     # load model
     model = AutoModelForCausalLM.from_pretrained(
-        "Xenova/llama2.c-stories15M",
+        "nm-testing/llama2.c-stories15M",
         torch_dtype=torch_dtype,
         tie_word_embeddings=tie_word_embeddings,
         device_map=device_map,
@@ -365,7 +365,7 @@ def test_model_shared_tensors_gpu(
     "model_stub, recipe, sparse_format, quant_format",
     [
         (
-            "Xenova/llama2.c-stories15M",
+            "nm-testing/llama2.c-stories15M",
             "tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml",
             CompressionFormat.sparse_24_bitmask.value,
             CompressionFormat.float_quantized.value,
@@ -451,7 +451,7 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm
     "model_stub, recipe, sparse_format",
     [
         (
-            "Xenova/llama2.c-stories15M",
+            "nm-testing/llama2.c-stories15M",
             "tests/llmcompressor/transformers/compression/recipes/sparse_24.yaml",
             CompressionFormat.sparse_24_bitmask.value,
         ),

diff --git a/tests/llmcompressor/transformers/test_clear_ml.py b/tests/llmcompressor/transformers/test_clear_ml.py
@@ -16,7 +16,7 @@
 @pytest.mark.skipif(not is_clearml, reason="clearML not installed")
 def test_finetune_wout_recipe(tmp_path: Path):
     recipe_str = None
-    model = "Xenova/llama2.c-stories15M"
+    model = "nm-testing/llama2.c-stories15M"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"