Commit b3204e2

Merge branch 'main' into dev/kpietkun/sleep_mode
2 parents 776613d + 49593fa commit b3204e2


54 files changed (+2083 / -577 lines)

.cd/benchmark/benchmark_defaults.yaml

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ model_text:
   - Qwen/Qwen2.5-32B-Instruct
   - Qwen/Qwen2.5-72B-Instruct
   - Qwen/Qwen2.5-7B-Instruct
+  - Qwen/Qwen3-0.6B
   - ibm-granite/granite-8b-code-instruct-4k
   - ibm-granite/granite-20b-code-instruct-8k
 DATASET: /workspace/vllm-project/benchmarks/sonnet.txt

.cd/benchmark/benchmark_scenarios_text.yaml

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ qwen25_72b_instruct:
 qwen25_7b_instruct:
   MODEL: Qwen/Qwen2.5-7B-Instruct
 
+Qwen/Qwen3-0.6B:
+  MODEL: Qwen/Qwen3-0.6B
+
 granite_8b_code_instruct_4k:
   MODEL: ibm-granite/granite-8b-code-instruct-4k
 

.cd/docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -43,5 +43,5 @@ services:
     env_file:
       - ./benchmark/benchmark_user.env
     volumes:
-      - ./logs:/root/scripts/logs
+      - /tmp/logs:/root/scripts/logs
     command: ["benchmark", "--config-file", "${VLLM_BENCHMARK_CONFIG_FILE}", "--config-name", "${VLLM_BENCHMARK_CONFIG_NAME}"]

.cd/server/settings_vllm.csv

Lines changed: 1 addition & 0 deletions
@@ -16,3 +16,4 @@ Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,0,10,5,128,1,3
 ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20,0,10,8,128,1,32,1,32,128,256,1,128,256,1,36,4096,8,32,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
 ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,40133986304,2,2,37.37,0,10,4,128,1,32,1,32,128,256,1,128,256,1,52,6144,1,48,2,65536,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0
 Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,0,12,4,128,1,32,1,32,128,256,1,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,FALSE,FALSE,FALSE,1,0
+Qwen/Qwen3-0.6B,1,4352,128,2,1.61E+09,2,2,1.5,0,10,5,128,1,32,1,32,128,256,1,128,256,1,28,1024,8,16,2,32768,1,FALSE,FALSE,2048,FALSE,TRUE,TRUE,1,0

.cd/templates/template_vllm_benchmark.sh

Lines changed: 3 additions & 1 deletion
@@ -35,4 +35,6 @@ vllm bench serve \
   --metric-percentiles 90 \
   --ignore-eos \
   --trust-remote-code \
-  2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log
+  --save-result \
+  --result-dir logs \
+  --result-filename summary_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.json 2>&1 | tee -a logs/summary_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log #save results to logs on a host

README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ vLLM Hardware Plugin for Intel® Gaudi®
 ---
 *Latest News* 🔥
 
-- [2025/11] The 0.10.2 release introduces the production-ready version of the vLLM Hardware Plugin for Intel® Gaudi® v1.23.0. The plugin is an alternative to the [vLLM fork](https://github.com/HabanaAI/vllm-fork), which reaches end of life with this release and will be deprecated in v1.24.0, remaining functional only for legacy use cases. We strongly encourage all fork users to begin planning their migration to the plugin. For more information about this release, see the [Release Notes](docs/release_notes.md).
+- [2025/11] The 0.11.2 release introduces the production-ready version of the vLLM Hardware Plugin for Intel® Gaudi® v1.22.2. The plugin is an alternative to the [vLLM fork](https://github.com/HabanaAI/vllm-fork), which reaches end of life with this release and will be deprecated in v1.24.0, remaining functional only for legacy use cases. We strongly encourage all fork users to begin planning their migration to the plugin. For more information about this release, see the [Release Notes](docs/release_notes.md).
 - [2025/06] We introduced an early developer preview of the vLLM Hardware Plugin for Intel® Gaudi®, which is not yet intended for general use.
 
 ---
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@

import torch
from safetensors import safe_open
from safetensors.torch import save_file
from glob import glob
import os
import shutil
import argparse


def copy_other_files(input_path, output_path):
    # copy config/tokenizer JSON files and any Python helpers alongside the weights
    for file in os.listdir(input_path):
        if file.endswith(".json") or file.endswith(".py"):
            print(f"copying {file} to {output_path}")
            shutil.copyfile(
                os.path.join(input_path, file),
                os.path.join(output_path, file),
            )


def convert_files(input_path, output_path):
    all_safetensors = glob(f"{input_path}/*.safetensors")
    # sort by file name
    all_safetensors.sort()
    for safetensors_path in all_safetensors:
        tensors = {}
        print(f"processing {safetensors_path}")
        with safe_open(safetensors_path, framework="pt", device="cpu") as tensor_file:
            for k in tensor_file.keys():
                tensor = tensor_file.get_tensor(k)
                if "proj" in k:
                    if k.endswith("weight"):
                        # halve the weight; the matching scale is doubled below,
                        # so the dequantized value stays the same
                        tensor = (tensor.float() * 0.5).to(torch.float8_e4m3fn)
                    elif k.endswith("weight_scale_inv") or k.endswith("input_scale_inv"):
                        # "scale_inv" in deepseek-r1 is actually "scale"
                        tensor = tensor.float() * 2
                    else:
                        raise NotImplementedError(f"Cannot convert {k}")
                else:
                    print(f"skip {k}.")
                tensors[k] = tensor
        new_tensor_path = safetensors_path.replace(input_path, output_path)
        print(f"saving to {new_tensor_path}")
        save_file(tensors, new_tensor_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert tensors to float8_e4m3fn format.")
    parser.add_argument(
        "-i",
        "--input_path",
        help="Path to the official model weights.",
        required=True,
    )
    parser.add_argument(
        "-o",
        "--output_path",
        help="Path to the output directory.",
        required=True,
    )
    args = parser.parse_args()
    input_path = args.input_path
    output_path = args.output_path

    # create output directory if it does not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    copy_other_files(input_path, output_path)
    convert_files(input_path, output_path)
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@

# Supported JSON Config File Options

The following table summarizes the options for the JSON config file:

| Attribute | Description | Values |
|----------------------|-------------|--------|
| **Mode** | The mode to run INC with. | - **MEASURE** – Measure statistics of all modules and emit the results to `dump_stats_path`.<br>- **QUANTIZE** *(default)* – Quantize and run the model according to the provided measurements. |
| **Observer** | The observer to measure the statistics. | - **maxabs** *(default)*<br>- **save** – Saves all tensors to files. |
| **Allowlist** | List of `nn.Module` names or types to quantize. Empty list means all supported modules are quantized by default. See *supported-modules*. | Default: empty list |
| **Blocklist** | List of `nn.Module` names or types **not** to quantize. | Default: empty list |
| **dump_stats_path** | Path to save and load measurements. Directory structure is created up to the last `/`; the string after the last `/` is used as a prefix for measurement files. | Default: `stats` |
| **scale_method** | Method for calculating the scale from measurements. | - `unit_scale` *(default)* – Always use scale of 1.<br>- `maxabs_arbitrary` – Stretch/compress maxabs to full-scale of FP8.<br>- `maxabs_hw` – Stretch/compress maxabs to full-scale of FP8, then replace with HW-accelerated scale based on `device_for_scales`.<br>- `maxabs_pow2` – Same as above but rounded to power of 2.<br>- `maxabs_hw_opt_weight` – Weight scale chosen for minimal MSE among HW accelerated scales; activations use `maxabs_hw`.<br>- `act_maxabs_pow2_weights_pcs_opt_pow2` – Per-channel weights use `maxabs_hw_opt_weight`; activations use `maxabs_pow2`.<br>- `act_maxabs_hw_weights_pcs_maxabs_pow2` – Per-channel weights use `maxabs_pow2`; activations use `maxabs_hw`.<br>- `act_maxabs_pcs_pow2_weight_maxabs_pts_pow2_hw` – **Dynamic quant only**: per-tensor weights use `maxabs_hw`; activations use per-token `maxabs_pow2`. |
| **measure_exclude** | Tensor types to exclude from measurement. | - `NONE` – Measure all tensors.<br>- `OUTPUT` *(default)* – Skip output tensors. |
| **scale_format** | Format of scales passed to custom PyTorch ops. | - `const` – Scales passed as tensors.<br>- `scalar` *(default)* – Scales passed as scalar values for compile-time & throughput optimizations. |
| **device_for_scales** | Exponent-bias values for converting FP32/BF16 to FP8-143. | - `GAUDI3` – Expanded exponent-bias range (0–63).<br>- `GAUDI2` – 4 possible exponent biases (3, 7, 11, 15), default is 7. |
| **dynamic_quantization** | Enables dynamic FP8 quantization with per-token scales. Only supported with `act_maxabs_pcs_pow2_weight_maxabs_pts_pow2_hw`. | - `true` – Enable.<br>- `false` *(default)* – Disable. |
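As an illustrative sketch only, not part of the committed file, a quantization config combining several of the attributes above might look like the following. The attribute names come from the table and from the sample configs added later in this commit; the particular value choices (`maxabs_hw`, `OUTPUT`) are assumptions:

```json
{
    "method": "HOOKS",
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "measure_exclude": "OUTPUT",
    "dump_stats_path": "./hqt_output/measure"
}
```

A calibration pass would use `"mode": "MEASURE"` with the same `dump_stats_path`, as in the sample measurement config included in this commit.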
---

## Configuring Backoff Factors

Maxabs-based scaling methods support the backoff factors `input_backoff` and `weight_backoff`, which leave margin when converting inputs and weights to FP8.

For example, so that an activation with a larger absolute value than was observed during calibration still fits in range, the measured maxabs value is mapped to:

```
input_backoff * FP8_143_FULLSCALE
```

Similarly, for weights:

```
weight_backoff * FP8_143_FULLSCALE
```

Defaults:

- `input_backoff = 0.25`
- `weight_backoff = 0.5`

To change these values, add the following to the quantization configuration JSON file:

```json
"scale_params": {"input_backoff": <INPUT_BACKOFF>, "weight_backoff": <WEIGHT_BACKOFF>}
```
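As a sketch (again not part of the diff), the `scale_params` entry sits alongside the other attributes of the quantization config; the values shown here are simply the documented defaults:

```json
{
    "method": "HOOKS",
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2",
    "scale_params": {"input_backoff": 0.25, "weight_backoff": 0.5},
    "dump_stats_path": "./hqt_output/measure"
}
```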
---

## Compile Time and Throughput Optimization

Setting `"scale_format": "scalar"` enables:

- Faster compile time for FP8 inference by reducing the number of compiled recipes.
- Less host-side overhead when launching FP8 ops, improving throughput in host-bound cases (e.g., small batch sizes).

> **Note:**
> - Compile time improvement depends on model properties such as recipe count and scale distribution.
> - Not applicable to PCQ.
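For illustration (not taken from this commit), enabling the scalar scale format is a single attribute in the quantization config; compatibility with a given scale method, and the PCQ caveat above, should be checked against the INC documentation:

```json
{
    "method": "HOOKS",
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "maxabs_hw",
    "scale_format": "scalar",
    "dump_stats_path": "./hqt_output/measure"
}
```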
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@

{
    "method": "HOOKS",
    "mode": "QUANTIZE",
    "observer": "maxabs",
    "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2",
    "dump_stats_path": "./hqt_output/measure"
}
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@

{
    "method": "HOOKS",
    "mode": "MEASURE",
    "observer": "maxabs",
    "dump_stats_path": "./hqt_output/measure"
}
