
Run Compressed Tests (#132)
* allow for SAM and AM support

* adding tests
Sara Adkins authored Sep 3, 2024
1 parent d32d287 commit fea3b2f
Showing 6 changed files with 75 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/llmcompressor/transformers/sparsification/sparse_model.py
@@ -111,6 +111,10 @@ def skip(*args, **kwargs):
         # restore transformers logging level now that model shell is loaded
         transformers_logger.setLevel(level=restore_log_level)
 
+        # HfQuantizer Quantization
+        if hasattr(model.config, "quantization_config"):
+            return model
+
         # override the PreTrainedModel instance with compression save function
         modify_save_pretrained(model)
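For context: the early return above fires for checkpoints that transformers' own HfQuantizer machinery has already quantized, so they skip the compression save hook entirely. A minimal sketch of the condition, using a model stub from the test configs below (whether quantization_config is present depends on the checkpoint's config.json):

from transformers import AutoConfig

# Checkpoints quantized via an HfQuantizer-backed flow carry a
# quantization_config entry on their config
config = AutoConfig.from_pretrained("nm-testing/tinyllama-fp8-dynamic-compressed")
print(hasattr(config, "quantization_config"))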
@@ -0,0 +1,3 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
@@ -0,0 +1,3 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
@@ -0,0 +1,3 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a16-dense"
@@ -0,0 +1,3 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
@@ -0,0 +1,59 @@
import shutil
import tempfile
import unittest

import torch
from parameterized import parameterized_class
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM
from tests.testing_utils import parse_params, requires_gpu, requires_torch

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"


@requires_torch
@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
    model_stub = None

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()

        # Load the same checkpoint twice: once executing directly on the
        # compressed weights, once decompressing them at load time
        cls.compressed_model = SparseAutoModelForCausalLM.from_pretrained(
            cls.model_stub, torch_dtype="auto", device_map="auto", run_compressed=True
        )
        cls.uncompressed_model = SparseAutoModelForCausalLM.from_pretrained(
            cls.model_stub, torch_dtype="auto", device_map="auto", run_compressed=False
        )
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
        cls.device = cls.compressed_model.device

    def test_compressed_matches_uncompressed(self):
        SAMPLE_INPUT = [
            "I love 4-bit quantization because",
            "What is the capital of Paris?",
            "def fibonacci(n):",
        ]

        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
            self.device
        )
        compressed_output = self.tokenizer.batch_decode(
            self.compressed_model.generate(**inputs, max_length=50)
        )
        uncompressed_output = self.tokenizer.batch_decode(
            self.uncompressed_model.generate(**inputs, max_length=50)
        )

        # Both load paths must produce identical generations
        for idx in range(len(SAMPLE_INPUT)):
            assert compressed_output[idx] == uncompressed_output[idx]

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.test_dir)
        del cls.compressed_model
        del cls.uncompressed_model
        torch.cuda.empty_cache()
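Outside the harness, the behavior under test can be exercised directly. A minimal sketch using one of the model stubs from the configs above (generation settings mirror the test; a GPU and network access are assumed):

from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM

stub = "nm-testing/tinyllama-w4a16-compressed"
tokenizer = AutoTokenizer.from_pretrained(stub)

# run_compressed=True executes on the compressed weights; passing False
# instead would decompress them to dense tensors at load time
model = SparseAutoModelForCausalLM.from_pretrained(
    stub, torch_dtype="auto", device_map="auto", run_compressed=True
)

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_length=50)[0]))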
