 def replace_all_linear_with_lora(module, rank: int, scaling: float, device=None, dtype=None):
     """Recursively replace all Linear layers with LoRALinear layers."""
     for name, child in module.named_children():
-        if isinstance(child, nn.Linear):
-            if device is None:
-                this_device = child.weight.device
-            else:
-                this_device = device
-            if dtype is None:
-                this_dtype = child.weight.dtype
-            else:
-                this_dtype = dtype
-            lora = LoRALinear(child.in_features, child.out_features,
+        # Check for both nn.Linear and QLinear (from quantize.py)
+        if isinstance(child, nn.Linear) or (hasattr(child, 'weight') and hasattr(child, 'weight_scb')):
+            # For QLinear, we need to get in_features and out_features differently
+            if isinstance(child, nn.Linear):
+                in_features = child.in_features
+                out_features = child.out_features
+                if device is None:
+                    this_device = child.weight.device
+                else:
+                    this_device = device
+                if dtype is None:
+                    this_dtype = child.weight.dtype
+                else:
+                    this_dtype = dtype
+            else:  # QLinear
+                # For QLinear, we can infer dimensions from the weight shape:
+                # weight is [out_features, in_features] for both Linear and QLinear
+                if hasattr(child, 'weight') and child.weight.shape:
+                    if child.weight.device.type != 'meta':
+                        out_features = child.weight.shape[0]
+                        in_features = child.weight.shape[1]
+                    else:
+                        # For meta tensors, we need to be careful about padded dimensions:
+                        # QLinear pads out_features to a multiple of 8
+                        out_features = child.weight_scb.shape[0]  # this is the actual out_features
+                        in_features = child.weight.shape[1]
+                else:
+                    # If we can't determine the shape, skip this layer
+                    continue
+
+                if device is None:
+                    this_device = child.weight.device
+                else:
+                    this_device = device
+                if dtype is None:
+                    # For QLinear, use bfloat16 (or float16) for the LoRA adapters
+                    this_dtype = torch.bfloat16
+                else:
+                    this_dtype = dtype
+
+            lora = LoRALinear(in_features, out_features,
                               rank, scaling, device=this_device, dtype=this_dtype)
             lora.frozen_W = child
             setattr(module, name, lora)
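
Aside: the duck-typed check above treats anything with both `weight` and `weight_scb` attributes as a QLinear. A minimal self-contained sketch of that detection and of the padded-dimension issue, using a hypothetical stub in place of the real QLinear from quantize.py:

import torch
import torch.nn as nn

class FakeQLinear(nn.Module):
    """Hypothetical stand-in that only mimics the attributes the check relies on."""
    def __init__(self, in_features, out_features):
        super().__init__()
        padded_out = ((out_features + 7) // 8) * 8  # QLinear pads out_features to a multiple of 8
        self.weight = torch.zeros(padded_out, in_features, dtype=torch.int8)
        self.weight_scb = torch.zeros(out_features)

child = FakeQLinear(16, 10)
is_lora_target = isinstance(child, nn.Linear) or (hasattr(child, 'weight') and hasattr(child, 'weight_scb'))
print(is_lora_target)                                    # True
print(child.weight.shape[0], child.weight_scb.shape[0])  # 16 (padded) vs 10 (actual out_features)
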
@@ -26,17 +57,57 @@ def replace_lora_with_linear(module):
2657 """Recursively replace all LoRALinear layers with Linear layers."""
2758 for name , child in module .named_children ():
2859 if isinstance (child , LoRALinear ):
29- # Compute merged weights: W' = W + scaling * B @ A
30- merged_weight = child .frozen_W .weight .data + \
31- child .scaling * (child .lora_B .weight @ child .lora_A .weight )
32- # Create a standard Linear layer with the same in/out features
33- new_linear = nn .Linear (child .frozen_W .in_features ,
34- child .frozen_W .out_features , bias = False ,
35- device = torch .device ('meta' ),
36- dtype = merged_weight .dtype )
37- new_linear .weight = nn .Parameter (
38- merged_weight , requires_grad = merged_weight .requires_grad ) # Transfer merged weights
39- setattr (module , name , new_linear ) # Replace the module
+            # Check whether frozen_W is a QLinear or an nn.Linear
+            if hasattr(child.frozen_W, 'weight_scb'):
+                # For QLinear we have to convert back to nn.Linear, because its
+                # int8-quantized weights can't be added to the LoRA weights directly.
+
+                # First, compute the LoRA contribution
+                lora_contribution = child.scaling * (child.lora_B.weight @ child.lora_A.weight)
+
+                # Create a standard Linear layer with the same in/out features
+                new_linear = nn.Linear(child.in_features,
+                                       child.out_features, bias=False,
+                                       device=torch.device('meta'),
+                                       dtype=lora_contribution.dtype)
+
+                # For QLinear we run a forward pass to recover the dequantized weights.
+                # This is a workaround since the dequantized weights aren't directly
+                # accessible: feed an identity matrix and read the weights off the output.
+                with torch.no_grad():
+                    # Identity matrix as input extracts the weight matrix
+                    dummy_input = torch.eye(
+                        child.in_features,
+                        device=lora_contribution.device,
+                        dtype=torch.float16  # QLinear expects float16
+                    )
+                    # The output equals the transposed weight matrix
+                    dequantized_weight = child.frozen_W(dummy_input)
+                    # Convert to the same dtype as lora_contribution
+                    dequantized_weight = dequantized_weight.to(lora_contribution.dtype)
+                    # Transpose so the shape matches lora_contribution
+                    dequantized_weight = dequantized_weight.transpose(0, 1)
+                    # Add the LoRA contribution
+                    merged_weight = dequantized_weight + lora_contribution
+                    # Set the merged weights
+                    new_linear.weight = nn.Parameter(
+                        merged_weight, requires_grad=False)
+
+                setattr(module, name, new_linear)  # Replace the module
+            else:
+                # Standard nn.Linear case
+                # Compute merged weights: W' = W + scaling * B @ A
+                merged_weight = child.frozen_W.weight.data + \
+                    child.scaling * (child.lora_B.weight @ child.lora_A.weight)
+                # Create a standard Linear layer with the same in/out features
+                new_linear = nn.Linear(child.frozen_W.in_features,
+                                       child.frozen_W.out_features, bias=False,
+                                       device=torch.device('meta'),
+                                       dtype=merged_weight.dtype)
+                new_linear.weight = nn.Parameter(
+                    merged_weight, requires_grad=merged_weight.requires_grad)  # Transfer merged weights
+                setattr(module, name, new_linear)  # Replace the module
         else:
             replace_lora_with_linear(child)  # Recursively process submodules
 
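
The identity-matrix trick used above (and again in merge_weight below) works because a bias-free linear layer computes y = x @ W^T, so feeding x = I returns W^T and a transpose recovers W. A quick self-contained check, with a plain nn.Linear standing in for the quantized layer:

import torch
import torch.nn as nn

lin = nn.Linear(4, 3, bias=False)
with torch.no_grad():
    recovered = lin(torch.eye(4)).transpose(0, 1)  # shape (3, 4), same layout as lin.weight
assert torch.allclose(recovered, lin.weight)
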
@@ -103,19 +174,68 @@ def merge_weight(self):
 
         weight = up_weight.mm(down_weight) * self.scaling
 
-        weight += self.frozen_W.weight
+        # Handle both nn.Linear and QLinear for frozen_W
+        if isinstance(self.frozen_W, nn.Linear):
+            # Standard nn.Linear case
+            weight += self.frozen_W.weight
+        elif hasattr(self.frozen_W, 'weight_scb'):
+            # For QLinear, run a forward pass to recover the dequantized weights:
+            # an identity matrix as input extracts the weight matrix
+            dummy_input = torch.eye(
+                self.in_features,
+                device=weight.device,
+                dtype=torch.float16  # QLinear expects float16
+            )
+            # The output equals the transposed weight matrix
+            dequantized_weight = self.frozen_W(dummy_input)
+            # Convert to the same dtype as weight
+            dequantized_weight = dequantized_weight.to(weight.dtype)
+            # Transpose so the shape matches weight
+            dequantized_weight = dequantized_weight.transpose(0, 1)
+            # Add to the LoRA contribution
+            weight += dequantized_weight
+        else:
+            # Fallback for any other type
+            weight += self.frozen_W.weight
         return weight
 
     @staticmethod
     def _load_hook(module, state_dict, prefix, *_):
-        key_name = prefix + "weight"
-        if key_name in state_dict:
-            w_ref = state_dict.pop(key_name)
-            state_dict[prefix + 'frozen_W.weight'] = w_ref
+        qlinear_params = ("weight", "weight_scb", "weight_absmax",
+                          "bias", "bias_scb", "bias_absmax")  # add others if you use act-order
+
+        for name in qlinear_params:
+            key = prefix + name
+            if key in state_dict:
+                state_dict[f"{prefix}frozen_W.{name}"] = state_dict.pop(key)
 
     def forward(self, x: torch.Tensor):
         lora = self.lora_B(self.lora_A(x))
-        return self.frozen_W(x) + lora * self.scaling
+
+        # Handle both nn.Linear and QLinear for frozen_W
+        if isinstance(self.frozen_W, nn.Linear):
+            # Standard nn.Linear forward
+            return self.frozen_W(x) + lora * self.scaling
+        elif hasattr(self.frozen_W, 'weight_scb'):
+            # QLinear forward: ensure dtype compatibility.
+            # QLinear expects float16 input and returns float16 output,
+            # while the LoRA adapters are in float16/bfloat16.
+            x_dtype = x.dtype
+            if x_dtype != torch.float16:
+                x_for_frozen = x.to(torch.float16)
+            else:
+                x_for_frozen = x
+
+            frozen_output = self.frozen_W(x_for_frozen)
+
+            # Convert back to the original dtype if needed
+            if frozen_output.dtype != x_dtype:
+                frozen_output = frozen_output.to(x_dtype)
+
+            return frozen_output + lora * self.scaling
+        else:
+            # Fallback for any other type
+            return self.frozen_W(x) + lora * self.scaling
 
     def __repr__(self) -> str:
         return "{}Linear(in_features={}, out_features={}, r={})".format(
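
Finally, a small sanity-check sketch for the extended `_load_hook`: given checkpoint keys under a prefix, the hook reroutes every quantized parameter to `frozen_W.*` before `load_state_dict` consumes it. This assumes the `LoRALinear` class from this file is in scope; the key names below are illustrative only.

import torch

state_dict = {
    "layers.0.wq.weight": torch.zeros(8, 8, dtype=torch.int8),  # quantized weight
    "layers.0.wq.weight_scb": torch.zeros(8),                   # quantization state for the weight
    "layers.0.wq.lora_A.weight": torch.zeros(4, 8),             # untouched by the hook
}
LoRALinear._load_hook(None, state_dict, "layers.0.wq.")
print(sorted(state_dict))
# ['layers.0.wq.frozen_W.weight', 'layers.0.wq.frozen_W.weight_scb', 'layers.0.wq.lora_A.weight']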