Add vLLM e2e tests #117

Merged · 22 commits · Aug 28, 2024
Empty file added tests/e2e/vLLM/__init__.py
4 changes: 4 additions & 0 deletions tests/e2e/vLLM/configs/fp8.yaml
@@ -0,0 +1,4 @@
cadence: "nightly"
test_type: "regression"
model: "meta-llama/Meta-Llama-3-8B-Instruct"
scheme: FP8_DYNAMIC
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/int8.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: "meta-llama/Meta-Llama-3-8B-Instruct"
scheme: W8A8
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/w4a16.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: "meta-llama/Meta-Llama-3-8B-Instruct"
scheme: W4A16
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
6 changes: 6 additions & 0 deletions tests/e2e/vLLM/configs/w8a16.yaml
@@ -0,0 +1,6 @@
cadence: "nightly"
test_type: "regression"
model: "meta-llama/Meta-Llama-3-8B-Instruct"
scheme: W8A16
dataset_id: HuggingFaceH4/ultrachat_200k
dataset_split: train_sft
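
For context, the sketch below shows roughly how a loader like parse_params (imported from tests.testing_utils in the test file) could turn these YAML configs into entries for parameterized_class. This is an assumption about its shape for illustration, not the repository's actual implementation.

# Hypothetical config loader, approximating tests.testing_utils.parse_params.
import os

import yaml


def load_configs(configs_directory: str) -> list:
    configs = []
    for filename in sorted(os.listdir(configs_directory)):
        if not filename.endswith(".yaml"):
            continue
        with open(os.path.join(configs_directory, filename)) as config_file:
            # Each dict (cadence, test_type, model, scheme, dataset_id, ...)
            # becomes the class attributes of one parameterized test class.
            configs.append(yaml.safe_load(config_file))
    return configs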
108 changes: 108 additions & 0 deletions tests/e2e/vLLM/test_vllm.py
@@ -0,0 +1,108 @@
import shutil
import unittest

import pytest
from datasets import load_dataset
from parameterized import parameterized_class
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from tests.testing_utils import parse_params, requires_gpu, requires_torch

CONFIGS_DIRECTORY = "tests/e2e/vLLM/configs"

try:
    from vllm import LLM, SamplingParams

    vllm_installed = True
except ImportError:
    vllm_installed = False


@requires_gpu
@requires_torch
@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
@parameterized_class(parse_params(CONFIGS_DIRECTORY))
class TestvLLM(unittest.TestCase):
    model = None
    scheme = None
    dataset_id = None
    dataset_split = None

    def setUp(self):
        print("========== RUNNING ==============")
        print(self.scheme)

        self.save_dir = None
        self.device = "cuda:0"
        self.oneshot_kwargs = {}
        self.num_calibration_samples = 512
        self.max_seq_length = 2048
        self.prompts = [
            "The capital of France is",
            "The president of the US is",
            "My name is",
        ]

    def test_vllm(self):
        # Load model.
        loaded_model = SparseAutoModelForCausalLM.from_pretrained(
            self.model, device_map=self.device, torch_dtype="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model)

        def preprocess(example):
            return {
                "text": tokenizer.apply_chat_template(
                    example["messages"],
                    tokenize=False,
                )
            }

        def tokenize(sample):
            return tokenizer(
                sample["text"],
                padding=False,
                max_length=self.max_seq_length,
                truncation=True,
                add_special_tokens=False,
            )

        if self.dataset_id:
            ds = load_dataset(self.dataset_id, split=self.dataset_split)
            ds = ds.shuffle(seed=42).select(range(self.num_calibration_samples))
            ds = ds.map(preprocess)
            ds = ds.map(tokenize, remove_columns=ds.column_names)
            self.oneshot_kwargs["dataset"] = ds
            self.oneshot_kwargs["max_seq_length"] = self.max_seq_length
            self.oneshot_kwargs["num_calibration_samples"] = (
                self.num_calibration_samples
            )

        self.save_dir = self.model.split("/")[1] + f"-{self.scheme}"
        self.oneshot_kwargs["model"] = loaded_model
        self.oneshot_kwargs["recipe"] = QuantizationModifier(
            targets="Linear", scheme=self.scheme, ignore=["lm_head"]
        )

        # Apply quantization.
        print("ONESHOT KWARGS", self.oneshot_kwargs)
        oneshot(
            **self.oneshot_kwargs,
            output_dir=self.save_dir,
            clear_sparse_session=True,
            oneshot_device=self.device,
        )

        # Run vLLM with saved model
        print("================= RUNNING vLLM =========================")
        sampling_params = SamplingParams(temperature=0.80, top_p=0.95)
        llm = LLM(model=self.save_dir)
Collaborator: Having a test for tp>1 is also a good idea if we can.

Collaborator (Author): Yeah, I think that'll be a follow-up test, since the structure will change a bit to deal with tp>1 with the same process.

I do think that's more of a vLLM test. If anything, we could extend this to publish test models which are then pulled down for all vLLM tests.
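
For reference, a minimal sketch of what such a tensor-parallel (tp>1) follow-up could look like; this is an illustration under assumptions, not part of this PR. vLLM exposes tensor parallelism through the tensor_parallel_size argument on LLM:

# Hypothetical follow-up (not in this PR): load the quantized model across
# multiple GPUs. Requires at least tensor_parallel_size GPUs to be visible.
from vllm import LLM, SamplingParams

# Directory name follows the save_dir convention above, e.g. for the FP8 config.
llm_tp = LLM(model="Meta-Llama-3-8B-Instruct-FP8_DYNAMIC", tensor_parallel_size=2)
outputs_tp = llm_tp.generate(
    ["The capital of France is"], SamplingParams(temperature=0.80, top_p=0.95)
)
assert outputs_tp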

        outputs = llm.generate(self.prompts, sampling_params)
        print("================= vLLM GENERATION ======================")
        print(outputs)
Collaborator: I would suggest running gsm8k on 200 samples.
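
For reference, a rough sketch of what a 200-sample GSM8K spot-check against the saved model could look like; an illustration under assumptions, not part of this PR (a real evaluation would more likely use lm-evaluation-harness with proper answer extraction):

# Hypothetical follow-up (not in this PR): rough GSM8K accuracy on 200 samples.
from datasets import load_dataset
from vllm import LLM, SamplingParams

ds_gsm8k = load_dataset("gsm8k", "main", split="test").select(range(200))
prompts = [question + "\nAnswer:" for question in ds_gsm8k["question"]]

# Directory name follows the save_dir convention in the test above.
llm_eval = LLM(model="Meta-Llama-3-8B-Instruct-FP8_DYNAMIC")
generations = llm_eval.generate(prompts, SamplingParams(temperature=0.0, max_tokens=256))

# Very rough scoring: count generations that contain the gold final answer.
correct = 0
for generation, answer in zip(generations, ds_gsm8k["answer"]):
    gold = answer.split("####")[-1].strip()
    if gold in generation.outputs[0].text:
        correct += 1
print(f"GSM8K rough accuracy on 200 samples: {correct / len(prompts):.2%}")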

        assert outputs

    def tearDown(self):
        shutil.rmtree(self.save_dir)
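
To exercise these tests locally (assuming a CUDA GPU and vLLM installed), invoking pytest on tests/e2e/vLLM/test_vllm.py should pick up one parameterized case per YAML file in tests/e2e/vLLM/configs; the nightly cadence field may additionally gate which configs actually run, depending on how parse_params handles it.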