
Commit bb6c338

AWQ support Modelopt ckpts. (NVIDIA#3258)

Authored by Tracin and QiJune
Signed-off-by: Tracin <[email protected]>
Co-authored-by: QI JUN <[email protected]>

1 parent b763051 · commit bb6c338

File tree: 3 files changed, +34 -16 lines changed

tensorrt_llm/models/modeling_utils.py

Lines changed: 7 additions & 5 deletions
@@ -1705,14 +1705,16 @@ def preprocess_perlayer_weights(weights,
                 dtype = torch.float16
                 if model_config.dtype == "bfloat16":
                     dtype = torch.bfloat16
-                weights[name] = preprocessor(param.T, torch.quint4x2,
+                weights[name] = preprocessor(param.transpose(-1, -2),
+                                             torch.quint4x2,
                                              activation_type).view(dtype)
-            if name.endswith('weights_scaling_factor'
-                             ) and param.shape[0] > param.shape[1]:
-                # TODO: refine on supporting ModelOpt HF-AWQ
-                weights[name] = param.T.contiguous().to(
+            if name.endswith('weights_scaling_factor'):
+                weights[name] = param.transpose(-1, -2).contiguous().to(
                     str_dtype_to_torch(model_config.dtype))
             if name.endswith('prequant_scaling_factor'):
+                if len(weights[name].shape) == 2:
+                    # MoE experts share the same scaling factor.
+                    param = param[0, :]
                 weights[name] = param.reshape(1, -1)
             if model_config.mapping.tp_rank > 0:
                 if name.endswith('attention.dense.bias') or name.endswith(
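
The key change above is replacing param.T with param.transpose(-1, -2), so the same preprocessing path handles both 2-D dense weights and the 3-D stacked per-expert weights found in ModelOpt MoE checkpoints. A minimal sketch of the difference (not part of the commit; shapes are hypothetical):

import torch

dense_w = torch.randn(128, 64)    # [out, in] weight of a dense layer
moe_w = torch.randn(8, 128, 64)   # [num_experts, out, in] stacked MoE weights

# For 2-D tensors the two spellings are identical:
assert torch.equal(dense_w.T, dense_w.transpose(-1, -2))

# For a 3-D tensor, .T would reverse *all* dimensions -> [64, 128, 8] (and is
# deprecated for >2-D tensors), while transpose(-1, -2) swaps only the last
# two, producing the per-expert transpose the preprocessor expects:
print(moe_w.transpose(-1, -2).shape)    # torch.Size([8, 64, 128])

# prequant_scaling_factor: a 2-D tensor carries one row per expert; the commit
# keeps only row 0 because all experts share the same scaling factor.
prequant = torch.randn(8, 64)           # hypothetical [num_experts, hidden]
shared = prequant[0, :].reshape(1, -1)  # shape [1, 64]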

tests/integration/defs/examples/test_mixtral.py

Lines changed: 26 additions & 11 deletions
@@ -888,24 +888,39 @@ def test_llm_mixtral_1gpu_fp4_llmapi(
     venv_check_call(llm_venv, mmlu_cmd)
 
 
-@pytest.mark.parametrize("model_name", ['mixtral-8x7b-v0.1-AWQ'])
+@pytest.mark.parametrize(
+    "model_name", ['mixtral-8x7b-v0.1-AWQ', 'Mixtral-8x7B-Instruct-v0.1'])
 def test_llm_mixtral_int4_awq_1gpu_summary(llama_example_root,
                                            llm_datasets_root, model_name,
                                            llm_rouge_root, llm_venv, cmodel_dir,
-                                           engine_dir):
+                                           engine_dir,
+                                           qcache_dir_without_install_package):
     models_root = llm_models_root()
     model_dir = os.path.join(models_root, model_name)
     ckpt_dir = os.path.join(cmodel_dir, model_name)
 
-    print("Convert checkpoint...")
-    convert_cmd = [
-        f"{llama_example_root}/convert_checkpoint.py",
-        "--model_dir",
-        model_dir,
-        "--output_dir",
-        ckpt_dir,
-    ]
-    venv_check_call(llm_venv, convert_cmd)
+    if 'AWQ' in model_name:
+        print("Convert checkpoint...")
+        convert_cmd = [
+            f"{llama_example_root}/convert_checkpoint.py",
+            "--model_dir",
+            model_dir,
+            "--output_dir",
+            ckpt_dir,
+        ]
+        venv_check_call(llm_venv, convert_cmd)
+    else:
+        print("Quantizing model...")
+        ckpt_dir = quantize_data(
+            llm_venv,
+            llama_example_root,
+            model_dir=model_dir,
+            calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
+            dtype="float16",
+            qformat="int4_awq",
+            quantize_dir=qcache_dir_without_install_package,
+            tp_size=1,
+            calib_size=32)
 
     print("Build engines...")
     build_cmd = [
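
For the unquantized 'Mixtral-8x7B-Instruct-v0.1' checkpoint, the test now quantizes on the fly through the quantize_data helper rather than converting a pre-quantized AWQ checkpoint. A hedged sketch of roughly what that amounts to, assuming the helper wraps TensorRT-LLM's examples/quantization/quantize.py entry point (all paths below are placeholders):

import subprocess

# Placeholder paths; flag names follow examples/quantization/quantize.py.
quantize_cmd = [
    "python", "examples/quantization/quantize.py",
    "--model_dir", "/models/Mixtral-8x7B-Instruct-v0.1",  # HF checkpoint
    "--dtype", "float16",
    "--qformat", "int4_awq",    # weight-only INT4 AWQ, as in the test
    "--calib_size", "32",       # matches calib_size=32 above
    "--output_dir", "/tmp/mixtral-int4-awq-ckpt",
]
subprocess.run(quantize_cmd, check=True)  # then build engines from output_dir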

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 1 addition & 0 deletions
@@ -180,6 +180,7 @@ examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-in
 examples/test_mixtral.py::test_llm_mixtral_wo_2gpus_summary[Mixtral-8x7B-v0.1-int8-nb:4]
 examples/test_mixtral.py::test_llm_mixtral_1gpu_fp4_llmapi[Mixtral-8x7B-Instruct-v0.1]
 examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ]
+examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[Mixtral-8x7B-Instruct-v0.1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
