Commit 4af534b
Commit message: "up"
1 parent df58c80

5 files changed: +200 -66 lines changed

src/diffusers/loaders/single_file_model.py

Lines changed: 32 additions & 2 deletions
@@ -190,6 +190,23 @@ def _get_mapping_function_kwargs(mapping_fn, **kwargs):
     return mapping_kwargs
 
 
+def _maybe_determine_modules_to_not_convert(quantization_config, state_dict):
+    if quantization_config is None:
+        return None
+    else:
+        is_nunchaku = quantization_config.quant_method == "nunchaku"
+        if not is_nunchaku:
+            return None
+        else:
+            no_qweight = set()
+            for key in state_dict:
+                if key.endswith(".weight"):
+                    # module name is everything except the last piece after "."
+                    module_name = ".".join(key.split(".")[:-1])
+                    no_qweight.add(module_name)
+            return sorted(no_qweight)
+
+
 class FromOriginalModelMixin:
     """
     Load pretrained weights saved in the `.ckpt` or `.safetensors` format into a model.
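To make the new helper concrete, here is a small standalone trace of the same loop on a toy nunchaku-style state dict; the key names are invented for illustration and are not taken from a real checkpoint. Modules whose parameters still end in ".weight" (rather than ".qweight") were left unquantized, so they are the ones to exclude from conversion. The next hunk wires this helper into `from_single_file`.

import torch  # not needed for the trace itself, shown for parity with the real loader

state_dict = {
    "transformer_blocks.0.attn.to_q.qweight": None,
    "transformer_blocks.0.attn.to_q.wscales": None,
    "transformer_blocks.0.norm1.linear.weight": None,
    "proj_out.weight": None,
    "proj_out.bias": None,
}

no_qweight = set()
for key in state_dict:
    if key.endswith(".weight"):
        # module name is everything except the last piece after "."
        no_qweight.add(".".join(key.split(".")[:-1]))

print(sorted(no_qweight))
# ['proj_out', 'transformer_blocks.0.norm1.linear']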
@@ -324,6 +341,18 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                 user_agent=user_agent,
             )
         if quantization_config is not None:
+            # For `nunchaku` checkpoints, we might want to determine the `modules_to_not_convert`.
+            original_modules_to_not_convert = quantization_config.modules_to_not_convert or []
+            determined_modules_to_not_convert = _maybe_determine_modules_to_not_convert(
+                quantization_config, checkpoint
+            )
+            if determined_modules_to_not_convert:
+                original_modules_to_not_convert.extend(determined_modules_to_not_convert)
+                original_modules_to_not_convert = list(set(original_modules_to_not_convert))
+                logger.info(
+                    f"`modules_to_not_convert` in the quantization_config was updated from {quantization_config.modules_to_not_convert} to {original_modules_to_not_convert}."
+                )
+                quantization_config.modules_to_not_convert = original_modules_to_not_convert
             hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config)
             hf_quantizer.validate_environment()
             torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype)

@@ -404,8 +433,9 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
             model = cls.from_config(diffusers_model_config)
 
         checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
-
-        if _should_convert_state_dict_to_diffusers(model.state_dict(), checkpoint):
+        if not (
+            quantization_config is not None and quantization_config.quant_method == "nunchaku"
+        ) and _should_convert_state_dict_to_diffusers(model.state_dict(), checkpoint):
             diffusers_format_checkpoint = checkpoint_mapping_fn(
                 config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
             )
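With the plumbing above, loading a pre-quantized nunchaku single-file checkpoint would look roughly like the following minimal sketch. The checkpoint path and the choice of `FluxTransformer2DModel` are placeholders, not part of this commit; only the `quantization_config` handling is what the diff introduces, and the `NunchakuConfig` import path follows the file edited at the bottom of this commit. Running it requires a supported CUDA GPU because of the `post_init` hardware check.

import torch
from diffusers import FluxTransformer2DModel
from diffusers.quantizers.quantization_config import NunchakuConfig

# Hypothetical pre-quantized nunchaku single-file checkpoint; the path is a placeholder.
ckpt_path = "path/to/svdq-int4-transformer.safetensors"

quant_config = NunchakuConfig(weight_dtype="int4", rank=32)
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)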

src/diffusers/models/model_loading_utils.py

Lines changed: 7 additions & 0 deletions
@@ -297,6 +297,13 @@ def load_model_dict_into_meta(
             offload_index = offload_weight(param, param_name, offload_folder, offload_index)
         elif param_device == "cpu" and state_dict_index is not None:
             state_dict_index = offload_weight(param, param_name, state_dict_folder, state_dict_index)
+        # This check below might be a bit counter-intuitive in nature. This is because we're checking if the param
+        # or its module is quantized and if so, we're proceeding with creating a quantized param. This is because
+        # of the way pre-trained models are loaded. They're initialized under "meta" device, where
+        # quantization layers are first injected. Hence, for a model that is either pre-quantized or supplemented
+        # with a `quantization_config` during `from_pretrained`, we expect `check_if_quantized_param` to return True.
+        # Then depending on the quantization backend being used, we run the actual quantization step under
+        # `create_quantized_param`.
         elif is_quantized and (
             hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=param_device)
         ):
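The new comment describes an ordering worth seeing end to end: the model skeleton is built on the meta device, quantized layers are injected, and only afterwards are individual parameters materialized onto the real device. Below is a standalone sketch of that flow using plain `torch` and `accelerate`, not diffusers internals; the quantizer step is only indicated in a comment.

import torch
import torch.nn as nn
from accelerate import init_empty_weights

# 1) Build the skeleton on the meta device: no memory is allocated yet.
with init_empty_weights():
    model = nn.Linear(8, 8)
print(model.weight.device)  # meta

# 2) At this point a quantizer would swap eligible nn.Linear modules for quantized ones
#    (e.g. SVDQW4A4Linear), which is why `check_if_quantized_param` can see quantized modules.

# 3) Materialize parameters one by one from the state dict onto the target device.
state_dict = {"weight": torch.randn(8, 8), "bias": torch.zeros(8)}
for name, tensor in state_dict.items():
    model._parameters[name] = nn.Parameter(tensor.to("cpu"), requires_grad=False)
print(model.weight.device)  # cpu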

src/diffusers/quantizers/nunchaku/nunchaku_quantizer.py

Lines changed: 45 additions & 53 deletions
@@ -20,6 +20,11 @@
 if is_torch_available():
     import torch
 
+if is_accelerate_available():
+    pass
+
+if is_nunchaku_available():
+    from .utils import replace_with_nunchaku_linear
 
 logger = logging.get_logger(__name__)
 
@@ -35,10 +40,6 @@ class NunchakuQuantizer(DiffusersQuantizer):
 
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        dtype_map = {"int4": torch.int8}
-        if is_fp8_available():
-            dtype_map = {"nvfp4": torch.float8_e4m3fn}
-        self.dtype_map = dtype_map
 
     def validate_environment(self, *args, **kwargs):
         if not torch.cuda.is_available():
@@ -74,14 +75,13 @@ def check_if_quantized_param(
         state_dict: Dict[str, Any],
         **kwargs,
     ):
-        # TODO: revisit
-        # Check if the param_name is not in self.modules_to_not_convert
-        if any((key + "." in param_name) or (key == param_name) for key in self.modules_to_not_convert):
-            return False
-        else:
-            # We only quantize the weight of nn.Linear
-            module, _ = get_module_from_name(model, param_name)
-            return isinstance(module, torch.nn.Linear)
+        from nunchaku.models.linear import SVDQW4A4Linear
+
+        module, _ = get_module_from_name(model, param_name)
+        if self.pre_quantized and isinstance(module, SVDQW4A4Linear):
+            return True
+
+        return False
 
     def create_quantized_param(
         self,
@@ -98,42 +98,33 @@ def create_quantized_param(
         from nunchaku.models.linear import SVDQW4A4Linear
 
         module, tensor_name = get_module_from_name(model, param_name)
+        state_dict = args[0]
         if tensor_name not in module._parameters and tensor_name not in module._buffers:
             raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
 
-        if self.pre_quantized:
-            if tensor_name in module._parameters:
-                module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device))
-            if tensor_name in module._buffers:
-                module._buffers[tensor_name] = torch.nn.Parameter(param_value.to(target_device))
-
-        elif isinstance(module, torch.nn.Linear):
-            # TODO: this returns an `SVDQW4A4Linear` layer initialized from the corresponding `linear` module.
-            # But we need to have a utility that can take a pretrained param value and quantize it. Not sure
-            # how to do that yet.
-            # Essentially, we need something like `bnb.nn.Params4bit.from_prequantized`. Or is there a better
-            # way to do it?
-            is_param = tensor_name in module._parameters
-            is_buffer = tensor_name in module._buffers
-            new_module = SVDQW4A4Linear.from_linear(
-                module, precision=self.quantization_config.precision, rank=self.quantization_config.rank
-            )
-            module_name = ".".join(param_name.split(".")[:-1])
-            if "." in module_name:
-                parent_name, leaf = module_name.rsplit(".", 1)
-                parent = model.get_submodule(parent_name)
+        if isinstance(module, SVDQW4A4Linear):
+            if param_value.ndim == 1:
+                module._parameters[tensor_name] = torch.nn.Parameter(param_value, requires_grad=False).to(
+                    target_device
+                )
+            elif tensor_name == "qweight":
+                module._parameters[tensor_name] = torch.nn.Parameter(param_value, requires_grad=False).to(
+                    target_device
+                )
+                # if the tensor has qweight, but does not have low-rank branch, we need to add some artificial tensors
+                for t in ["lora_up", "lora_down"]:
+                    # need to check at the state dict level for this
+                    new_tensor_name = param_name.replace(".qweight", f".{t}")
+                    if new_tensor_name not in state_dict:
+                        oc, ic = param_value.shape
+                        ic = ic * 2  # v is packed into INT8, so we need to double the size
+                        module._parameters[t] = torch.zeros(
+                            (0, ic) if t == "lora_down" else (oc, 0), device=param_value.device, dtype=torch.bfloat16
+                        )
             else:
-                parent, leaf = model, module_name
-
-            # rebind
-            # this will result into
-            # AttributeError: 'SVDQW4A4Linear' object has no attribute 'weight'. Did you mean: 'qweight'.
-            if is_param:
-                new_module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device)
-            elif is_buffer:
-                new_module._buffers[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device)
-
-            setattr(parent, leaf, new_module)
+                module._parameters[tensor_name] = torch.nn.Parameter(param_value, requires_grad=False).to(
+                    target_device
+                )
 
     def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
         max_memory = {key: val * 0.90 for key, val in max_memory.items()}
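To make the shape bookkeeping in `create_quantized_param` concrete: the packed `qweight` stores two INT4 values per int8 element along the input dimension, and when the checkpoint carries no low-rank branch, rank-0 placeholders are registered for `lora_down`/`lora_up`. A standalone illustration follows; the tensor sizes are made up. The hunk after it handles module replacement before weights are loaded.

import torch

qweight = torch.zeros(3072, 1536, dtype=torch.int8)  # hypothetical packed 4-bit weight
oc, ic = qweight.shape
ic = ic * 2  # two INT4 values per INT8 element, so the logical in_features doubles

# Rank-0 placeholders stand in for a missing low-rank branch.
lora_down = torch.zeros((0, ic), dtype=torch.bfloat16)
lora_up = torch.zeros((oc, 0), dtype=torch.bfloat16)
print(lora_down.shape, lora_up.shape)  # torch.Size([0, 3072]) torch.Size([3072, 0])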
@@ -173,24 +164,25 @@ def _process_model_before_weight_loading(
         **kwargs,
     ):
         self.modules_to_not_convert = self.quantization_config.modules_to_not_convert
-
         if not isinstance(self.modules_to_not_convert, list):
             self.modules_to_not_convert = [self.modules_to_not_convert]
-
         self.modules_to_not_convert.extend(keep_in_fp32_modules)
-
-        # TODO: revisit
-        # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk`
-        # if isinstance(device_map, dict) and len(device_map.keys()) > 1:
-        #     keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
-        #     self.modules_to_not_convert.extend(keys_on_cpu)
-
         # Purge `None`.
         # Unlike `transformers`, we don't know if we should always keep certain modules in FP32
         # in case of diffusion transformer models. For language models and others alike, `lm_head`
         # and tied modules are usually kept in FP32.
         self.modules_to_not_convert = [module for module in self.modules_to_not_convert if module is not None]
 
+        # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk`
+        if isinstance(device_map, dict) and len(device_map.keys()) > 1:
+            keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
+            self.modules_to_not_convert.extend(keys_on_cpu)
+
+        model = replace_with_nunchaku_linear(
+            model,
+            modules_to_not_convert=self.modules_to_not_convert,
+            quantization_config=self.quantization_config,
+        )
         model.config.quantization_config = self.quantization_config
 
     def _process_model_after_weight_loading(self, model, **kwargs):
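A short toy trace of the `modules_to_not_convert` bookkeeping above: the config value is normalized to a list, extended with FP32-keep modules, purged of `None` entries, and then extended with keys the `device_map` offloads to cpu or disk. All module names below are invented for illustration.

modules_to_not_convert = None  # as it might arrive from the quantization config
if not isinstance(modules_to_not_convert, list):
    modules_to_not_convert = [modules_to_not_convert]

keep_in_fp32_modules = ["norm_out"]
modules_to_not_convert.extend(keep_in_fp32_modules)

# Purge `None`.
modules_to_not_convert = [m for m in modules_to_not_convert if m is not None]

# Extend with keys that the device_map offloads to cpu/disk.
device_map = {"transformer_blocks": 0, "proj_out": "cpu"}
if isinstance(device_map, dict) and len(device_map.keys()) > 1:
    modules_to_not_convert.extend(k for k, v in device_map.items() if v in ["disk", "cpu"])

print(modules_to_not_convert)  # ['norm_out', 'proj_out']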
src/diffusers/quantizers/nunchaku/utils.py (new file)

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+import torch.nn as nn
+
+from ...utils import is_accelerate_available, is_nunchaku_available, logging
+
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+
+logger = logging.get_logger(__name__)
+
+
+def _replace_with_nunchaku_linear(
+    model,
+    svdq_linear_cls,
+    modules_to_not_convert=None,
+    current_key_name=None,
+    quantization_config=None,
+    has_been_replaced=False,
+):
+    for name, module in model.named_children():
+        if current_key_name is None:
+            current_key_name = []
+        current_key_name.append(name)
+
+        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
+            # Check if the current key is not in the `modules_to_not_convert`
+            current_key_name_str = ".".join(current_key_name)
+            if not any(
+                (key + "." in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert
+            ):
+                with init_empty_weights():
+                    in_features = module.in_features
+                    out_features = module.out_features
+
+                    model._modules[name] = svdq_linear_cls(
+                        in_features,
+                        out_features,
+                        rank=quantization_config.rank,
+                        bias=module.bias is not None,
+                        torch_dtype=module.weight.dtype,
+                    )
+                    has_been_replaced = True
+                    # Store the module class in case we need to transpose the weight later
+                    model._modules[name].source_cls = type(module)
+                    # Force requires grad to False to avoid unexpected errors
+                    model._modules[name].requires_grad_(False)
+        if len(list(module.children())) > 0:
+            _, has_been_replaced = _replace_with_nunchaku_linear(
+                module,
+                svdq_linear_cls,
+                modules_to_not_convert,
+                current_key_name,
+                quantization_config,
+                has_been_replaced=has_been_replaced,
+            )
+        # Remove the last key for recursion
+        current_key_name.pop(-1)
+    return model, has_been_replaced
+
+
+def replace_with_nunchaku_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
+    if is_nunchaku_available():
+        from nunchaku.models.linear import SVDQW4A4Linear
+
+    model, _ = _replace_with_nunchaku_linear(
+        model, SVDQW4A4Linear, modules_to_not_convert, current_key_name, quantization_config
+    )
+
+    has_been_replaced = any(
+        isinstance(replaced_module, SVDQW4A4Linear) for _, replaced_module in model.named_modules()
+    )
+    if not has_been_replaced:
+        logger.warning(
+            "You are loading your model in the SVDQuant method but no linear modules were found in your model."
+            " Please double check your model architecture, or submit an issue on github if you think this is"
+            " a bug."
+        )
+
+    return model
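The conversion skip rule in `_replace_with_nunchaku_linear` matches a module either exactly or by dotted-name prefix. A quick standalone check of that predicate; the module names are invented for illustration.

modules_to_not_convert = ["proj_out", "transformer_blocks.0.norm"]

def should_skip(current_key_name_str):
    return any(
        (key + "." in current_key_name_str) or (key == current_key_name_str)
        for key in modules_to_not_convert
    )

print(should_skip("proj_out"))                          # True: exact match
print(should_skip("transformer_blocks.0.norm.linear"))  # True: prefix match
print(should_skip("transformer_blocks.1.attn.to_q"))    # False: gets converted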

src/diffusers/quantizers/quantization_config.py

Lines changed: 36 additions & 11 deletions
@@ -733,36 +733,61 @@ class NunchakuConfig(QuantizationConfigMixin):
     loaded using `nunchaku`.
 
     Args:
-    TODO
+        TODO
         modules_to_not_convert (`list`, *optional*, default to `None`):
             The list of modules to not quantize, useful for quantizing models that explicitly require to have some
-            modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers).
+            modules left in their original precision (e.g. `norm` layers in Qwen-Image).
     """
 
-    group_size_map = {"int4": 64, "nvfp4": 16}
-
     def __init__(
         self,
-        precision: str = "int4",
+        method: str = "svdquant",
+        weight_dtype: str = "int4",
+        weight_scale_dtype: str = None,
+        weight_group_size: int = 64,
+        activation_dtype: str = "int4",
+        activation_scale_dtype: str = None,
+        activation_group_size: int = 64,
         rank: int = 32,
         modules_to_not_convert: Optional[List[str]] = None,
         **kwargs,
    ):
        self.quant_method = QuantizationMethod.NUNCHAKU
-        self.precision = precision
+        self.method = method
+        self.weight_dtype = weight_dtype
+        self.weight_scale_dtype = weight_scale_dtype
+        self.weight_group_size = weight_group_size
+        self.activation_dtype = activation_dtype
+        self.activation_scale_dtype = activation_scale_dtype
+        self.activation_group_size = activation_group_size
        self.rank = rank
-        self.group_size = self.group_size_map[precision]
        self.modules_to_not_convert = modules_to_not_convert
 
        self.post_init()
 
    def post_init(self):
        r"""
-        Safety checker that arguments are correct
+        Safety checker that arguments are correct. Hardware checks were largely adapted from the official `nunchaku`
+        codebase.
        """
-        accpeted_precision = ["int4", "nvfp4"]
-        if self.precision not in accpeted_precision:
-            raise ValueError(f"Only supported precision in {accpeted_precision} but found {self.precision}")
+        from ..utils.torch_utils import get_device
+
+        device = get_device()
+        if isinstance(device, str):
+            device = torch.device(device)
+        capability = torch.cuda.get_device_capability(0 if device.index is None else device.index)
+        sm = f"{capability[0]}{capability[1]}"
+        if sm == "120":  # you can only use the fp4 models
+            if self.weight_dtype != "fp4_e2m1_all":
+                raise ValueError('Please use "fp4" quantization for Blackwell GPUs.')
+        elif sm in ["75", "80", "86", "89"]:
+            if self.weight_dtype != "int4":
+                raise ValueError('Please use "int4" quantization for Turing, Ampere and Ada GPUs.')
+        else:
+            raise ValueError(
+                f"Unsupported GPU architecture {sm} due to the lack of 4-bit tensorcores. "
+                "Please use a Turing, Ampere, Ada or Blackwell GPU for this quantization configuration."
+            )
 
     # TODO: should there be a check for rank?
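The hardware gate in `post_init` keys off the CUDA compute capability, concatenating the major and minor versions into an `sm` string: (8, 9) becomes "89" for Ada, (12, 0) becomes "120" for Blackwell. A minimal standalone check along those lines, mirroring the branches above:

import torch

if torch.cuda.is_available():
    capability = torch.cuda.get_device_capability(0)
    sm = f"{capability[0]}{capability[1]}"
    if sm == "120":
        print(f"sm {sm}: Blackwell, use the fp4 variants")
    elif sm in ["75", "80", "86", "89"]:
        print(f"sm {sm}: Turing/Ampere/Ada, use the int4 variants")
    else:
        print(f"sm {sm}: no 4-bit tensor cores, unsupported")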
