@@ -1,11 +1,11 @@
 # Copyright The FMS Model Optimizer Authors
-
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,8 @@
 # Standard
 from pathlib import Path
 import logging
+import os
+import sys
 
 # Third Party
 from datasets import load_from_disk
@@ -34,7 +36,6 @@
 )
 import torch
 
-import os
 # Local
 from fms_mo import qconfig_init, qmodel_prep
 from fms_mo.custom_ext_kernels.utils import (
@@ -48,14 +49,14 @@
     get_act_scales_1gpu,
 )
 from fms_mo.utils.aiu_utils import save_for_aiu
-from fms_mo.utils.dq_utils import config_quantize_smooth_layers
-from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
-from fms_mo.utils.utils import patch_torch_bmm, prepare_input
 from fms_mo.utils.dq_inf import (
-    save_vllm_fp8,
-    convert_fp8_vllm_to_fms_mo,
     check_quantization_setting,
+    convert_fp8_vllm_to_fms_mo,
+    save_vllm_fp8,
 )
+from fms_mo.utils.dq_utils import config_quantize_smooth_layers
+from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
+from fms_mo.utils.utils import patch_torch_bmm, prepare_input
 
 logger = logging.getLogger(__name__)
 
@@ -133,16 +134,17 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         low_cpu_mem_usage=bool(model_args.device_map),
     )
 
-    inference = model.config.to_dict().get("quantization_config",None)
+    inference_qconfig = None
+    if hasattr(model, "config"):
+        inference_qconfig = model.config.to_dict().get("quantization_config", None)
 
-    if inference:
-        quant_setting = check_quantization_setting(inference)
+    if inference_qconfig:
+        quant_setting = check_quantization_setting(inference_qconfig)
         if quant_setting:
             logger.info("Quantization config settings validated")
-            model = convert_fp8_vllm_to_fms_mo(model = model)
+            model = convert_fp8_vllm_to_fms_mo(model=model)
         else:
-            exit("__This quantization config is wrong/not supported__")
-
+            sys.exit("Error: This quantization config is wrong/not supported")
 
     embedding_size = model.get_input_embeddings().weight.shape[0]
     if len(tokenizer) > embedding_size:
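Context for the hunk above: checkpoints produced by vLLM-style FP8 quantization tooling carry a "quantization_config" block in the Hugging Face model config, and the new inference_qconfig guard keys off it. Below is a minimal standalone sketch of that detection flow, using the same fms_mo.utils.dq_inf helpers this commit imports; the checkpoint path is a placeholder, not something from the diff.

# Sketch of the inference-mode detection above; "path/to/fp8-checkpoint" is a
# placeholder for any model whose config.json carries "quantization_config".
from transformers import AutoModelForCausalLM

from fms_mo.utils.dq_inf import (
    check_quantization_setting,
    convert_fp8_vllm_to_fms_mo,
)

model = AutoModelForCausalLM.from_pretrained("path/to/fp8-checkpoint")
qconfig = model.config.to_dict().get("quantization_config", None)
if qconfig and check_quantization_setting(qconfig):
    # rewrite the vLLM-style FP8 modules into fms_mo's own representation
    model = convert_fp8_vllm_to_fms_mo(model=model)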
@@ -152,23 +154,29 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     logger.info(f"Model is at {model.device}")
     logger.info(f"Tokenizer is {tokenizer}, block_size is {block_size}")
 
-    if not inference:
+    if not inference_qconfig:
         logger.info("quantization mode activated, initializing the qcfg file")
         qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     else:
         logger.info("inference mode activated")
-        if os.path.isfile(model_args.model_name_or_path+"/qcfg.json"):
+        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
             if fms_mo_args.override_fms_args:
-                logger.info("qcfg file found and some parameters are being over-written")
-                qcfg = qconfig_init(recipe=model_args.model_name_or_path+"/qcfg", args=fms_mo_args)
+                logger.info(
+                    "qcfg file found and some parameters are being over-written"
+                )
+                qcfg = qconfig_init(
+                    recipe=model_args.model_name_or_path + "/qcfg", args=fms_mo_args
+                )
             else:
                 logger.info("qcfg file found, loading the qcfg file")
-                qcfg = qconfig_init(recipe=model_args.model_name_or_path+"/qcfg")
+                qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg")
         else:
-            logger.info("qcfg file not found in {model_args.model_name_or_path},\
-                         falling back to the default dq recipe"
-                         )
+            logger.info(
+                f"qcfg file not found in {model_args.model_name_or_path}, "
+                "falling back to the default dq recipe"
+            )
             qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
+        qcfg["inference"] = True
 
     model_size = model_size_Wb(model, unit="GB")
     gpu_mem_util_per = model_size / total_gpu_memory
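The inference branch above resolves the quantization config in a fixed priority order: a qcfg.json shipped next to the checkpoint wins, optionally overridden by CLI args, and the built-in "dq" recipe is the fallback. The sketch below restates that order outside the diff; resolve_qcfg is an illustrative helper, not part of this commit.

# Illustrative helper mirroring the qcfg resolution order in the hunk above.
import os

from fms_mo import qconfig_init


def resolve_qcfg(ckpt_dir, fms_mo_args):
    if os.path.isfile(ckpt_dir + "/qcfg.json"):
        if fms_mo_args.override_fms_args:
            # load qcfg.json, then overwrite selected fields from the CLI args
            return qconfig_init(recipe=ckpt_dir + "/qcfg", args=fms_mo_args)
        # load qcfg.json verbatim
        return qconfig_init(recipe=ckpt_dir + "/qcfg")
    # no qcfg.json shipped with the checkpoint: fall back to the dq recipe
    return qconfig_init(recipe="dq", args=fms_mo_args)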
@@ -193,7 +201,8 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
 
     qcfg["model"] = model_args.model_name_or_path
     # config layers to skip, smooth scale
-    config_quantize_smooth_layers(qcfg)
+    if not inference_qconfig:
+        config_quantize_smooth_layers(qcfg)
 
     use_dynamo = True
     # use dynamo as default unless really needed, False -> fallback to TorchScript tracing
@@ -225,7 +234,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     )
 
     # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well.
-    if not inference and qcfg["smoothq"] :
+    if not inference_qconfig and qcfg["smoothq"]:
         scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
         if qcfg.get("act_scale_path", None):
            # user provided a scale file (or a dir)
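For reference, the default scale-file naming in the hunk above flattens the model id into a single filename under ./act_scales/, assuming the .pt suffix shown there. The model id below is only an example.

# Example of the act_scales naming convention used above.
from pathlib import Path

model_id = "ibm-granite/granite-3.0-8b-instruct"  # placeholder model id
scale_file = Path(f"./act_scales/{model_id.replace('/', '-')}.pt")
print(scale_file)  # act_scales/ibm-granite-granite-3.0-8b-instruct.pt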
@@ -259,12 +268,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
             use_layer_name_pattern_matching=use_layer_name_pattern_matching,
             use_dynamo=use_dynamo,
             dev=dev,
-            mode=inference,
             save_fname="dq",
         )
         logger.info(f"Quantized model {model}")
         logger.info("==" * 20)
-    if not inference:
+    if not inference_qconfig:
         if qcfg["smoothq"]:
             logger.info("Starting to apply smooth scale")
             dq_llm(model, act_scales, qcfg)
@@ -295,11 +303,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
             logger.info(
                 f"Saving model processed for vLLM and tokenizer to {opt_args.output_dir}"
             )
-            save_vllm_fp8(model,qcfg,tokenizer,opt_args.output_dir)
+            save_vllm_fp8(model, qcfg, tokenizer, opt_args.output_dir)
         elif opt_args.save_ckpt:
             logger.info(
                 f"Saving quantized model and tokenizer to {opt_args.output_dir}"
-                )
+            )
             model.save_pretrained(opt_args.output_dir, use_safetensors=True)
             tokenizer.save_pretrained(opt_args.output_dir)
 