Merge branch 'main' into smooth-quant-mappings-tutorial

vllm-project · Sep 3, 2024 · fbc306a · fbc306a
2 parents 4d0051d + fea3b2f
commit fbc306a
Show file tree

Hide file tree

Showing 44 changed files with 989 additions and 173 deletions.
diff --git a/.github/workflows/linkcheck.yml b/.github/workflows/linkcheck.yml
@@ -16,7 +16,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - uses: gaurav-nelson/github-action-markdown-link-check@v1
+    - uses: umbrelladocs/action-linkspector@v1
       with:
-        use-quiet-mode: 'yes'
-        config-file: '.github/workflows/mlc_config.json'
+        github_token: ${{ secrets.github_token }}
+        reporter: github-pr-review
+        fail_on_error: true
+        config_file: '.github/workflows/linkspector/linkspector.yml'
diff --git a/.github/workflows/linkspector/linkspector.yml b/.github/workflows/linkspector/linkspector.yml
@@ -0,0 +1,10 @@
+aliveStatusCodes:
+  - 0
+  - 200
+ignorePatterns:
+  - pattern: '.*localhost.*'
+  - pattern: '.*127\\.0\\.0\\.1.*'
+  - pattern: '.*0\\.0\\.0\\.0.*'
+dirs:
+  - .
+useGitIgnore: true
diff --git a/.github/workflows/mlc-config.json b/.github/workflows/mlc-config.json
diff --git a/.github/workflows/set-comment.yaml b/.github/workflows/set-comment.yaml
@@ -0,0 +1,23 @@
+name: PR Reminder Comment Bot
+on: 
+  pull_request:
+    branches:
+      - main
+    types: [opened]
+
+jobs:
+  pr_reminder:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Remind to add ready label
+        uses: actions/github-script@v6
+        with:
+          script: |
+            github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: '👋 Hi! Thank you for contributing to llm-compressor. Please add the ready label when the PR is ready for review.'
+            })
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/test-check.yaml b/.github/workflows/test-check.yaml
@@ -8,6 +8,7 @@ on:
     branches:
       - main
       - 'release/*'
+    types: [labeled, opened, synchronize]
 
 env:
   CADENCE: "commit"
@@ -46,6 +47,15 @@ jobs:
         with:
           python-version: '3.11'
       - uses: actions/checkout@v2
+      - uses: actions/checkout@v2
+        with:
+          repository: "neuralmagic/compressed-tensors"
+          path: "compressed-tensors"
+          ref: ${{needs.test-setup.outputs.branch}}
+      - name: "⚙️ Install compressed-tensors dependencies"
+        run: pip3 install -U pip && pip3 install setuptools compressed-tensors/
+      - name: "Clean compressed-tensors directory"
+        run: rm -r compressed-tensors/
       - name: "⚙️ Install dependencies"
         run: pip3 install .[dev]
       - name: "🔬 Running base tests"
@@ -95,6 +105,7 @@ jobs:
         run: |
           pytest tests/llmcompressor/pytorch -v
   transformers-tests:
+    if: ${{ github.event_name == 'push' || contains(github.event.pull_request.labels.*.name, 'ready') }}
     runs-on: ubuntu-22.04
     needs: test-setup
     steps:

diff --git a/examples/quantization_kv_cache/README.md b/examples/quantization_kv_cache/README.md
@@ -0,0 +1,170 @@
+# `fp8` Weight, Activation, and KV Cache Quantization
+
+`llmcompressor` now supports quantizing weights, activations, and KV cache to `fp8` for memory savings and inference acceleration with `vllm`.
+
+> `fp8` computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper).
+
+## Installation
+
+To get started, install llmcompressor from source as this feature is new:
+
+```bash
+pip install git+https://github.com/vllm-project/llm-compressor.git@cb98f34d4ec9dd175e6995d12fb02dec39c6f27a
+```
+
+## Quickstart
+
+The example includes an end-to-end script for applying the quantization algorithm:
+
+```bash
+python3 llama3_fp8_kv_example.py
+```
+
+The resulting model `Meta-Llama-3-8B-Instruct-FP8-KV` is ready to be loaded into vLLM.
+
+## Code Walkthrough
+
+Let's walk through the main steps of the quantization process:
+
+1. Load model
+2. Prepare calibration data
+3. Apply quantization
+4. Evaluate and save the model
+
+### 1. Load Model
+
+Load the model using `SparseAutoModelForCausalLM`:
+
+```python
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from transformers import AutoTokenizer
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+```
+
+### 2. Prepare Calibration Data
+
+Prepare the calibration data using the `ultrachat` dataset:
+
+```python
+from datasets import load_dataset
+
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+def process_and_tokenize(example):
+    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
+    return tokenizer(text, padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+
+ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
+```
+
+### 3. Apply Quantization
+
+Configure and apply the FP8 quantization for weights, activations, and KV cache.
+Notice the new `kv_cache_scheme` section:
+
+```python
+from llmcompressor.transformers import oneshot
+
+recipe = """
+quant_stage:
+    quant_modifiers:
+        QuantizationModifier:
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 8
+                        type: float
+                        strategy: tensor
+                        dynamic: false
+                        symmetric: true
+                    input_activations:
+                        num_bits: 8
+                        type: float
+                        strategy: tensor
+                        dynamic: false
+                        symmetric: true
+                    targets: ["Linear"]
+            kv_cache_scheme:
+                num_bits: 8
+                type: float
+                strategy: tensor
+                dynamic: false
+                symmetric: true
+"""
+
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+```
+
+### 4. Evaluate and Save the Model
+
+Test the quantized model with a sample generation:
+
+```python
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+```
+
+Save the quantized model:
+
+```python
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
+```
+
+For running the model in vLLM, make sure to specify the `kv_cache_dtype="fp8"` argument to enable quantization of the kv cache, and thus usage of your calibrated scales.
+
+## Evaluating Accuracy
+
+To evaluate the accuracy of your quantized model:
+
+1. Install `vllm` and `lm-evaluation-harness`:
+
+```bash
+pip install "vllm>=0.5.5" lm_eval==0.4.3
+```
+
+2. Run an evaluation (e.g., on GSM-8K):
+
+```bash
+MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-KV
+lm_eval \
+  --model vllm \
+  --model_args pretrained=$MODEL,kv_cache_dtype=fp8,add_bos_token=True \
+  --tasks gsm8k --num_fewshot 5 --batch_size auto
+```
+
+```
+vllm (pretrained=Meta-Llama-3-8B-Instruct-FP8-KV,kv_cache_dtype=fp8,add_bos_token=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
+|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.7748|±  |0.0115|
+|     |       |strict-match    |     5|exact_match|↑  |0.7763|±  |0.0115|
+```
+
+Note: Include `add_bos_token=True` as quantized models can be sensitive to the presence of the `bos` token.
+
+## Questions or Feature Requests?
+
+Please open an issue on `vllm-project/llm-compressor`.
diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -0,0 +1,95 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
+
+# Select model and load it.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+
+def process_and_tokenize(example):
+    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
+    return tokenizer(
+        text,
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
+
+# Configure the quantization algorithm and scheme.
+# In this case, we:
+#   * quantize the weights to fp8 with per-tensor scales
+#   * quantize the activations to fp8 with per-tensor scales
+#   * quantize the kv cache to fp8 with per-tensor scales
+recipe = """
+quant_stage:
+    quant_modifiers:
+        QuantizationModifier:
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 8
+                        type: float
+                        strategy: tensor
+                        dynamic: false
+                        symmetric: true
+                    input_activations:
+                        num_bits: 8
+                        type: float
+                        strategy: tensor
+                        dynamic: false
+                        symmetric: true
+                    targets: ["Linear"]
+            kv_cache_scheme:
+                num_bits: 8
+                type: float
+                strategy: tensor
+                dynamic: false
+                symmetric: true
+"""
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
diff --git a/examples/quantization_w8a8_fp8/gemma2_example.py b/examples/quantization_w8a8_fp8/gemma2_example.py
@@ -7,22 +7,21 @@
 
 # 1) Load model.
 model = SparseAutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto")
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 # 2) Configure the quantization algorithm and scheme.
 # In this case, we:
 #   * quantize the weights to fp8 with per channel via ptq
 #   * quantize the activations to fp8 with dynamic per token
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
 
 # 3) Apply quantization and save in compressed-tensors format.
 OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
-oneshot(model=model,
-        recipe=recipe,
-        output_dir=OUTPUT_DIR,
-        tokenizer=tokenizer)
+oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR, tokenizer=tokenizer)
 
 # Confirm generations of the quantized model look sane.
 print("========== SAMPLE GENERATION ==============")