diff --git a/tests/llmcompressor/entrypoints/test_oneshot.py b/tests/llmcompressor/entrypoints/test_oneshot.py
index f1be8663d..f7419588a 100644
--- a/tests/llmcompressor/entrypoints/test_oneshot.py
+++ b/tests/llmcompressor/entrypoints/test_oneshot.py
@@ -7,7 +7,7 @@ def test_oneshot_from_args():
     # Select model and load it.
     stub = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = AutoModelForCausalLM.from_pretrained(stub, use_safetensors=("stories" in stub))
+    model = AutoModelForCausalLM.from_pretrained(stub, use_safetensors=("stories" not in stub))
 
     dataset = "HuggingFaceH4/ultrachat_200k"
     NUM_CALIBRATION_SAMPLES = 512
diff --git a/tests/llmcompressor/modifiers/calibration/test_kv_cache.py b/tests/llmcompressor/modifiers/calibration/test_kv_cache.py
index a45aad597..f8d15a629 100644
--- a/tests/llmcompressor/modifiers/calibration/test_kv_cache.py
+++ b/tests/llmcompressor/modifiers/calibration/test_kv_cache.py
@@ -69,7 +69,7 @@ def test_kv_cache_quantization(config):
     model = AutoModelForCausalLM.from_pretrained(
         "HuggingFaceM4/tiny-random-LlamaForCausalLM",
         torch_dtype="auto",
-        use_safetensors=False,
+        use_safetensors=("stories" not in "HuggingFaceM4/tiny-random-LlamaForCausalLM"),
     )
     model.eval()
diff --git a/tests/llmcompressor/observers/test_helpers.py b/tests/llmcompressor/observers/test_helpers.py
index 55df390d1..b496cacf6 100644
--- a/tests/llmcompressor/observers/test_helpers.py
+++ b/tests/llmcompressor/observers/test_helpers.py
@@ -37,7 +37,7 @@ def _prep_for_input_quant_calibration(module: torch.nn.Module):
 
 
 def test_get_observer_token_count():
-    model = AutoModelForCausalLM.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE", use_safetensors=("stories" in "Isotonic/TinyMixtral-4x248M-MoE"))
+    model = AutoModelForCausalLM.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE", use_safetensors=("stories" not in "Isotonic/TinyMixtral-4x248M-MoE"))
     tokenizer = AutoTokenizer.from_pretrained("Isotonic/TinyMixtral-4x248M-MoE")
     model.eval()
     config = QuantizationConfig(
diff --git a/tests/llmcompressor/transformers/compression/test_decompress.py b/tests/llmcompressor/transformers/compression/test_decompress.py
index 561c724ed..1bb58b5a9 100644
--- a/tests/llmcompressor/transformers/compression/test_decompress.py
+++ b/tests/llmcompressor/transformers/compression/test_decompress.py
@@ -50,7 +50,7 @@ def setUpClass(self):
             torch_dtype="auto",
             device_map="auto",
             quantization_config=CompressedTensorsConfig(run_compressed=False),
-            use_safetensors=("stories" in self.compressed_model_stub),
+            use_safetensors=("stories" not in self.compressed_model_stub),
         )
 
         # Manually decompress this model
@@ -58,7 +58,7 @@ def setUpClass(self):
             self.skeleton_model_stub,
             torch_dtype=self.decompressed_model_hf_quantizer.dtype,
             device_map=self.decompressed_model_hf_quantizer.device,
-            use_safetensors=("stories" in self.skeleton_model_stub),
+            use_safetensors=("stories" not in self.skeleton_model_stub),
         )
 
         # decompression from HFQuantizer should populate weight_scale
diff --git a/tests/llmcompressor/transformers/compression/test_quantization.py b/tests/llmcompressor/transformers/compression/test_quantization.py
index c8ed046d3..269c20eb2 100644
--- a/tests/llmcompressor/transformers/compression/test_quantization.py
+++ b/tests/llmcompressor/transformers/compression/test_quantization.py
@@ -37,7 +37,7 @@ def setUpClass(cls):
         cls.test_dir = tempfile.mkdtemp()
 
         cls.model = AutoModelForCausalLM.from_pretrained(
-            cls.model_stub, torch_dtype=cls.weight_dtype, device_map="cuda:0", use_safetensors=("stories" in cls.model_stub)
+            cls.model_stub, torch_dtype=cls.weight_dtype, device_map="cuda:0", use_safetensors=("stories" not in cls.model_stub)
         )
         model = cls._run_oneshot(
             cls.model,
@@ -100,7 +100,7 @@ def test_quantization_reload(self):
             os.path.join(self.test_dir, self.output),
             torch_dtype="auto",
             device_map="cuda:0",
-            use_safetensors=("stories" in os.path.join(self.test_dir, self.output)),
+            use_safetensors=("stories" not in os.path.join(self.test_dir, self.output)),
         )
 
         og_weights, og_inputs = self._get_quant_info(self.model)
diff --git a/tests/llmcompressor/transformers/compression/test_run_compressed.py b/tests/llmcompressor/transformers/compression/test_run_compressed.py
index 561c724ed..1bb58b5a9 100644
--- a/tests/llmcompressor/transformers/compression/test_run_compressed.py
+++ b/tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -50,7 +50,7 @@ def setUpClass(self):
             torch_dtype="auto",
             device_map="auto",
             quantization_config=CompressedTensorsConfig(run_compressed=False),
-            use_safetensors=("stories" in self.compressed_model_stub),
+            use_safetensors=("stories" not in self.compressed_model_stub),
         )
 
         # Manually decompress this model
@@ -58,7 +58,7 @@ def setUpClass(self):
             self.skeleton_model_stub,
             torch_dtype=self.decompressed_model_hf_quantizer.dtype,
             device_map=self.decompressed_model_hf_quantizer.device,
-            use_safetensors=("stories" in self.skeleton_model_stub),
+            use_safetensors=("stories" not in self.skeleton_model_stub),
        )
 
         # decompression from HFQuantizer should populate weight_scale
diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
index e677ff82e..bc0e3efa3 100644
--- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
+++ b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
@@ -147,7 +147,7 @@ def setUp(self):
         self.output = "./oneshot_output"
 
         self.model = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=False
+            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=("stories" not in self.model)
         )
 
     def test_oneshot_then_finetune_gpu(self):
diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py
index aba00b35b..168317876 100644
--- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py
+++ b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune.py
@@ -96,7 +96,7 @@ def setUp(self):
         self.output = "./finetune_output"
 
         self.model = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=False
+            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=("stories" not in self.model)
         )
 
     def test_oneshot_then_finetune_gpu(self):
diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py
index 5f756fc7f..c41f48ab4 100644
--- a/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py
+++ b/tests/llmcompressor/transformers/finetune/test_oneshot_and_finetune_with_tokenizer.py
@@ -35,7 +35,7 @@ def test_oneshot_and_finetune_with_tokenizer(self):
             self.model,
         )
         model_loaded = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map="auto", use_safetensors=False
+            self.model, device_map="auto", use_safetensors=("stories" not in self.model)
         )
 
         dataset_loaded = load_dataset(
diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
index 515f6b559..503159e6f 100644
--- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
+++ b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
@@ -21,7 +21,7 @@ def setUp(self):
     def test_oneshot_sparsification_then_finetune(self):
         recipe_str = "tests/llmcompressor/transformers/obcq/recipes/test_tiny2.yaml"
         model = AutoModelForCausalLM.from_pretrained(
-            "Xenova/llama2.c-stories15M", device_map="auto", use_safetensors=("stories" in "Xenova/llama2.c-stories15M")
+            "Xenova/llama2.c-stories15M", device_map="auto", use_safetensors=("stories" not in "Xenova/llama2.c-stories15M")
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -49,10 +49,10 @@ def test_oneshot_sparsification_then_finetune(self):
             self.output / "oneshot_out",
             device_map="auto",
             quantization_config=self.quantization_config,
-            use_safetensors=("stories" in str(self.output / "oneshot_out")),
+            use_safetensors=("stories" not in str(self.output / "oneshot_out")),
         )
         distill_teacher = AutoModelForCausalLM.from_pretrained(
-            "Xenova/llama2.c-stories15M", device_map="auto", use_safetensors=("stories" in "Xenova/llama2.c-stories15M")
+            "Xenova/llama2.c-stories15M", device_map="auto", use_safetensors=("stories" not in "Xenova/llama2.c-stories15M")
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -76,7 +76,7 @@ def test_oneshot_sparsification_then_finetune(self):
         # with the saved model
         # Explictly decompress the model for training using quantization_config
         model = AutoModelForCausalLM.from_pretrained(
-            output_dir, device_map="auto", quantization_config=self.quantization_config, use_safetensors=("stories" in str(output_dir))
+            output_dir, device_map="auto", quantization_config=self.quantization_config, use_safetensors=("stories" not in str(output_dir))
         )
         with create_session():
             train(
@@ -99,7 +99,7 @@ def test_oneshot_quantization_then_finetune(self):
         model = AutoModelForCausalLM.from_pretrained(
             "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
             device_map="auto",
-            use_safetensors=("stories" in "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
+            use_safetensors=("stories" not in "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -125,7 +125,7 @@ def test_oneshot_quantization_then_finetune(self):
             output_dir,
             device_map="auto",
             quantization_config=quantization_config,
-            use_safetensors=("stories" in str(output_dir)),
+            use_safetensors=("stories" not in str(output_dir)),
         )
         dataset = "open_platypus"
         concatenate_data = False
@@ -145,7 +145,7 @@ def test_oneshot_quantization_then_finetune(self):
 
         # test reloading checkpoint and final model
         model = AutoModelForCausalLM.from_pretrained(
-            output_dir, device_map="auto", quantization_config=quantization_config, use_safetensors=("stories" in str(output_dir))
+            output_dir, device_map="auto", quantization_config=quantization_config, use_safetensors=("stories" not in str(output_dir))
         )
         with create_session():
             train(
diff --git a/tests/llmcompressor/transformers/finetune/test_session_mixin.py b/tests/llmcompressor/transformers/finetune/test_session_mixin.py
index 8f170cf73..59d447972 100644
--- a/tests/llmcompressor/transformers/finetune/test_session_mixin.py
+++ b/tests/llmcompressor/transformers/finetune/test_session_mixin.py
@@ -33,7 +33,7 @@ def __init__(
 @pytest.mark.unit
 def test_mixin_init():
     model_state_path = "Xenova/llama2.c-stories15M"
-    model = AutoModelForCausalLM.from_pretrained(model_state_path, use_safetensors=False)
+    model = AutoModelForCausalLM.from_pretrained(model_state_path, use_safetensors=("stories" not in model_state_path))
     recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml"
 
     session_mixin = MixInTest(model=model, recipe=recipe)
@@ -46,7 +46,7 @@ def test_mixin_init():
 @pytest.fixture
 def mixin_trainer():
     model_state_path = "Xenova/llama2.c-stories15M"
-    model = AutoModelForCausalLM.from_pretrained(model_state_path, use_safetensors=False)
+    model = AutoModelForCausalLM.from_pretrained(model_state_path, use_safetensors=("stories" not in model_state_path))
     recipe = "tests/llmcompressor/transformers/finetune/test_quantization.yaml"
     train_dataset = "open-platypus"
diff --git a/tests/llmcompressor/transformers/gptq/test_oneshot.py b/tests/llmcompressor/transformers/gptq/test_oneshot.py
index 3b3cd4f4f..36fc6be28 100644
--- a/tests/llmcompressor/transformers/gptq/test_oneshot.py
+++ b/tests/llmcompressor/transformers/gptq/test_oneshot.py
@@ -80,7 +80,7 @@ def test_oneshot_application(self):
             num_calibration_samples=9,
         )
         model_loaded = AutoModelForCausalLM.from_pretrained(
-            self.output, device_map=self.device, use_safetensors=("stories" in self.output)
+            self.output, device_map=self.device, use_safetensors=("stories" not in self.output)
         )
 
         # Check that the model is quantized
diff --git a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py
index 2a1428423..5c660bc3e 100644
--- a/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py
+++ b/tests/llmcompressor/transformers/kv_cache/test_kv_cache.py
@@ -155,7 +155,7 @@ def test_kv_cache_model_state_dict_attr(oneshot_fixture, tmp_path):
     model, used_args = next(oneshot_fixture(tmp_path))
     output_dir = used_args["output_dir"]
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained(str(output_dir), use_safetensors=("stories" in str(output_dir)))
+        model = AutoModelForCausalLM.from_pretrained(str(output_dir), use_safetensors=("stories" not in str(output_dir)))
 
     counts = 0
     for name, submodule in iter_named_quantizable_modules(
@@ -196,7 +196,7 @@ def test_kv_cache_gptq_config_format(kv_cache_fixture, tmp_path):
     assert kv_cache_scheme["symmetric"] == used_args["symmetric"]
 
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained(output_dir, use_safetensors=("stories" in output_dir))
+        model = AutoModelForCausalLM.from_pretrained(output_dir, use_safetensors=("stories" not in output_dir))
 
     counts = 0
     for name, submodule in iter_named_quantizable_modules(
@@ -238,7 +238,7 @@ def test_kv_cache_gptq_model_state_dict_attr(kv_cache_fixture, tmp_path):
     output_dir, _ = next(kv_cache_fixture(recipe, tmp_path))
 
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained(output_dir, use_safetensors=("stories" in output_dir))
+        model = AutoModelForCausalLM.from_pretrained(output_dir, use_safetensors=("stories" not in output_dir))
 
     counts = 0
     for name, submodule in iter_named_quantizable_modules(
diff --git a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py
index e15b40668..6bd01a479 100644
--- a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py
+++ b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py
@@ -46,7 +46,7 @@ def _test_consecutive_runs(
             self.output_first,
             device_map="auto",
             quantization_config=self.quantization_config,
-            use_safetensors=False,
+            use_safetensors=("stories" not in str(self.output_first)),
         )
 
         layer_0_sparse = tensor_sparsity(
@@ -75,7 +75,7 @@ def _test_consecutive_runs(
             self.output_second,
             device_map="auto",
             quantization_config=self.quantization_config,
-            use_safetensors=False,
+            use_safetensors=("stories" not in str(self.output_second)),
         )
 
         layer_0_sparse = tensor_sparsity(
@@ -156,7 +156,7 @@ def setUp(self):
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model,
             device_map=self.device,
-            use_safetensors=False,
+            use_safetensors=("stories" not in self.model),
         )
         self.output = "./oneshot_output"
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py
index 444d24db0..e7d2964f8 100644
--- a/tests/llmcompressor/transformers/obcq/test_obcq_completion.py
+++ b/tests/llmcompressor/transformers/obcq/test_obcq_completion.py
@@ -136,7 +136,7 @@ def setUp(self):
         self.model_name = self.model
         self.model = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=False
+            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=("stories" not in self.model)
         )
 
     def test_oneshot_completion_gpu(self):
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py b/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py
index 0f67e6265..45ad720b3 100644
--- a/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py
+++ b/tests/llmcompressor/transformers/obcq/test_obcq_infer_targets.py
@@ -9,7 +9,7 @@ def test_infer_targets():
     modifier = SparseGPTModifier(sparsity=0.0)
     with init_empty_weights():
-        model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M", use_safetensors=("stories" in "Xenova/llama2.c-stories15M"))
+        model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M", use_safetensors=("stories" not in "Xenova/llama2.c-stories15M"))
 
     inferred = modifier._infer_sequential_targets(model)
     assert inferred == ["LlamaDecoderLayer"]
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py b/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py
index 8bab43fa7..082e9dd98 100644
--- a/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py
+++ b/tests/llmcompressor/transformers/obcq/test_obcq_lm_head.py
@@ -16,7 +16,7 @@ def setUp(self):
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
         self.model = AutoModelForCausalLM.from_pretrained(
-            "Xenova/llama2.c-stories15M", device_map=self.device, use_safetensors=("stories" in "Xenova/llama2.c-stories15M")
+            "Xenova/llama2.c-stories15M", device_map=self.device, use_safetensors=("stories" not in "Xenova/llama2.c-stories15M")
         )
 
         self.kwargs = {
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_owl.py b/tests/llmcompressor/transformers/obcq/test_obcq_owl.py
index a63438d2a..360771542 100644
--- a/tests/llmcompressor/transformers/obcq/test_obcq_owl.py
+++ b/tests/llmcompressor/transformers/obcq/test_obcq_owl.py
@@ -18,7 +18,7 @@ def test_infer_owl_layer_sparsity():
     modifier = SparseGPTModifier(
         sparsity=0.7, sparsity_profile="owl", owl_m=5, owl_lmbda=0.05
     )
-    model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M", use_safetensors=("stories" in "Xenova/llama2.c-stories15M"))
+    model = AutoModelForCausalLM.from_pretrained("Xenova/llama2.c-stories15M", use_safetensors=("stories" not in "Xenova/llama2.c-stories15M"))
 
     dataset = Dataset.from_dict(
         {"input_ids": torch.randint(0, vocab_size, (ds_size, seq_len))}
diff --git a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py
index 830d84aec..3faee3f47 100644
--- a/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py
+++ b/tests/llmcompressor/transformers/obcq/test_obcq_sparsity.py
@@ -71,7 +71,7 @@ def setUp(self):
         self.output = "./oneshot_output"
 
         self.model = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=False
+            self.model, device_map=self.device, torch_dtype=torch.bfloat16, use_safetensors=("stories" not in self.model)
         )
 
     def test_sparsities_gpu(self):
diff --git a/tests/llmcompressor/transformers/oneshot/test_api_inputs.py b/tests/llmcompressor/transformers/oneshot/test_api_inputs.py
index ae57a6176..04550f8e5 100644
--- a/tests/llmcompressor/transformers/oneshot/test_api_inputs.py
+++ b/tests/llmcompressor/transformers/oneshot/test_api_inputs.py
@@ -27,7 +27,7 @@ def setUp(self):
         from transformers import AutoModelForCausalLM, AutoTokenizer
 
         self.tokenizer = AutoTokenizer.from_pretrained(self.model)
-        self.model = AutoModelForCausalLM.from_pretrained(self.model, use_safetensors=("stories" in self.model))
+        self.model = AutoModelForCausalLM.from_pretrained(self.model, use_safetensors=("stories" not in self.model))
         self.output = "./oneshot_output"
         self.kwargs = {"dataset_config_name": self.dataset_config_name}
diff --git a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
index 27773ecfa..7b79abc64 100644
--- a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
+++ b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
@@ -77,7 +77,7 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path):
     transformers_logger.setLevel(level=logging.ERROR)
 
     model = AutoModelForCausalLM.from_pretrained(
-        tmp_path / "oneshot_out", torch_dtype=dtype, use_safetensors=("stories" in str(tmp_path / "oneshot_out"))
+        tmp_path / "oneshot_out", torch_dtype=dtype, use_safetensors=("stories" not in str(tmp_path / "oneshot_out"))
     )
 
     # restore transformers logging level now that model shell is loaded
@@ -113,7 +113,7 @@ def test_sparse_model_reload(compressed, config, dtype, tmp_path):
     assert sparsity_config["sparsity_structure"] == inferred_structure
 
     dense_model = AutoModelForCausalLM.from_pretrained(
-        tmp_path / "compress_out", torch_dtype="auto", use_safetensors=("stories" in str(tmp_path / "compress_out"))
+        tmp_path / "compress_out", torch_dtype="auto", use_safetensors=("stories" not in str(tmp_path / "compress_out"))
     )
 
     og_state_dict = model.state_dict()
@@ -136,7 +136,7 @@ def test_dense_model_save(tmp_path, skip_compression_stats, save_compressed):
     reset_session()
 
     model_path = "Xenova/llama2.c-stories15M"
-    model = AutoModelForCausalLM.from_pretrained(model_path, use_safetensors=("stories" in model_path))
+    model = AutoModelForCausalLM.from_pretrained(model_path, use_safetensors=("stories" not in model_path))
 
     inferred_global_sparsity = SparsityConfigMetadata.infer_global_sparsity(model)
     assert math.isclose(inferred_global_sparsity, 0.0, rel_tol=1e-3)
@@ -218,7 +218,7 @@ def test_quant_model_reload(format, dtype, tmp_path):
         save_path_compressed,
         torch_dtype=dtype,
         quantization_config=CompressedTensorsConfig(run_compressed=False),
-        use_safetensors=False,
+        use_safetensors=("stories" not in str(save_path_compressed)),
     )
 
     reconstructed_state_dict = decompressed_model.state_dict()
@@ -262,7 +262,7 @@ def test_model_reload(offload, torch_dtype, tie_word_embeddings, device_map, tmp
         tie_word_embeddings=tie_word_embeddings,
         torch_dtype=torch_dtype,
         device_map=device_map,
-        use_safetensors=False,
+        use_safetensors=("stories" not in model_path),
     )
     if offload:
         model = cpu_offload(model)
@@ -272,7 +272,7 @@ def test_model_reload(offload, torch_dtype, tie_word_embeddings, device_map, tmp
     model.save_pretrained(save_path, safe_serialization=True)
 
     reloaded = AutoModelForCausalLM.from_pretrained(
-        save_path, torch_dtype="auto", device_map="cpu", use_safetensors=False
+        save_path, torch_dtype="auto", device_map="cpu", use_safetensors=("stories" not in str(save_path))
     )
 
     model_dict = get_state_dict_offloaded_model(model)
@@ -319,7 +319,7 @@ def test_model_shared_tensors(
         torch_dtype=torch_dtype,
         tie_word_embeddings=tie_word_embeddings,
         device_map=device_map,
-        use_safetensors=False,
+        use_safetensors=("stories" not in "Xenova/llama2.c-stories15M"),
     )
 
     patch_tied_tensors_bug(model)
@@ -385,7 +385,7 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm
     concatenate_data = False
     num_calibration_samples = 64
     splits = {"calibration": "train[:10%]"}
-    empty_model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype="auto", use_safetensors=False)
+    empty_model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype="auto", use_safetensors=("stories" not in model_stub))
 
     oneshot(
         model=model_stub,
@@ -470,7 +470,7 @@ def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp
     concatenate_data = False
     num_calibration_samples = 64
     splits = {"calibration": "train[:10%]"}
-    empty_model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype="auto", use_safetensors=False)
+    empty_model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype="auto", use_safetensors=("stories" not in model_stub))
 
     oneshot(
         model=model_stub,
@@ -525,7 +525,7 @@ def test_sparse_24_compressor_is_lossless(model_stub, recipe, sparse_format, tmp
 
 def test_disable_sparse_compression_flag(tmp_path):
     two_four_sparse_model_id = "nm-testing/llama2.c-stories42M-pruned2.4"
     two_four_sparse_model = AutoModelForCausalLM.from_pretrained(
-        two_four_sparse_model_id, torch_dtype="auto", use_safetensors=False
+        two_four_sparse_model_id, torch_dtype="auto", use_safetensors=("stories" not in two_four_sparse_model_id)
    )
     modify_save_pretrained(two_four_sparse_model)
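
Note (illustration, not part of the patch): every call site above inlines the same use_safetensors=("stories" not in <stub>) expression, presumably because the llama2.c "stories" test checkpoints are expected to load from PyTorch .bin weights rather than safetensors. A minimal sketch of how such a check could be centralized in a shared test helper follows; the names requires_safetensors and load_causal_lm_for_test are hypothetical and do not exist in this diff or in the repository.

# Hypothetical shared test helper, sketched for illustration only.
from transformers import AutoModelForCausalLM


def requires_safetensors(model_stub) -> bool:
    # Assumption mirrored from the patch: "stories" checkpoints are loaded from .bin weights.
    return "stories" not in str(model_stub)


def load_causal_lm_for_test(model_stub, **kwargs):
    # Load a causal LM for tests, picking the weight format from the stub name.
    return AutoModelForCausalLM.from_pretrained(
        model_stub, use_safetensors=requires_safetensors(model_stub), **kwargs
    )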