
Commit 8ea9c7d

fix: support FSDP compatibility in LigerTiledSwiGLUMLP backward
The previous backward implementation called torch.autograd.backward() inside the tiling loop, triggering FSDP's post-backward hook (reshard) once per shard. This caused FSDP1 to reshard parameters mid-loop, leading to errors on subsequent shard iterations.

Fix: replace torch.autograd.backward() with torch.autograd.grad() inside the tiling loop. This computes gradients locally without accumulating into .grad or triggering any hooks. Param gradients are accumulated manually across shards and written to .grad exactly once after the loop, so FSDP sees a single gradient event, as expected.

This fix is FSDP-agnostic: LigerTiledSwiGLUMLP requires no knowledge of FSDP.

Verified with FSDP1 (FullyShardedDataParallel) and FSDP2 (fully_shard) on 2x RTX 3060:

- FSDP1: previously errored, now passes
- FSDP2: passes
- Non-distributed: unaffected
1 parent 3343eee commit 8ea9c7d

File tree

1 file changed: 23 additions, 13 deletions


src/liger_kernel/ops/tiled_mlp.py

Lines changed: 23 additions & 13 deletions
```diff
@@ -76,6 +76,9 @@ def backward(ctx, *grads) -> tuple:
         incoming_grad = grads[0].view(-1, hidden_size)
         x_grad = torch.zeros_like(x)
 
+        # initialize param grad accumulators
+        param_grads = {p: None for p in mlp_module.parameters()}
+
         x_shards = list(torch.chunk(x, chunks=shards, dim=0))
 
         for i, x_shard in enumerate(x_shards):
@@ -84,22 +87,29 @@ def backward(ctx, *grads) -> tuple:
             # if seqlen is not exactly divisible by shards the last step will be shorter than shard_step
             shard_step = x_shards[i].shape[0]
             shard_offset = i * x_shards[0].shape[0]
-
-            x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
             incoming_grad_shard = incoming_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
 
-            all_outputs = []
-            all_incoming_grads = []
             with torch.enable_grad():
-                all_outputs.append(fn(mlp_module, x_shard))
-                all_incoming_grads.append(
-                    incoming_grad.narrow(0, shard_offset, shard_step).view_as(x_shard)
-                )
-
-        # AccumulateGrad fires once here, after all shards are computed
-        torch.autograd.backward(all_outputs, all_incoming_grads)
-
+                output = fn(mlp_module, x_shard)
+            local_grads = torch.autograd.grad(
+                outputs=output,
+                inputs=[x_shard] + list(mlp_module.parameters()),
+                grad_outputs=incoming_grad_shard,
+            )
+
+            x_grad.narrow(0, shard_offset, shard_step).copy_(local_grads[0])
+
+            for p, g in zip(mlp_module.parameters(), local_grads[1:]):
+                if param_grads[p] is None:
+                    param_grads[p] = g
+                else:
+                    param_grads[p] += g
+
+        for p, g in param_grads.items():
+            if p.grad is None:
+                p.grad = g
+            else:
+                p.grad += g
 
         # unflatten
         x_grad = x_grad.view(x_shape_orig)
```
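The accumulation pattern in the diff can be checked end to end on a stand-in module. This is a hedged sketch using a toy torch.nn.Linear rather than LigerTiledSwiGLUMLP and an assumed upstream gradient of ones: per-shard gradients come from torch.autograd.grad, param grads are summed manually, .grad is written once, and the result matches a single untiled backward pass.

```python
import torch

# Toy stand-ins for the real pieces: mlp ~ mlp_module, incoming_grad ~ the
# flattened upstream gradient, shards = 3 tiles over the batch dimension.
torch.manual_seed(0)
mlp = torch.nn.Linear(4, 4)
x = torch.randn(6, 4, requires_grad=True)
incoming_grad = torch.ones(6, 4)

x_grad = torch.zeros_like(x)
param_grads = {p: None for p in mlp.parameters()}

x_shards = list(torch.chunk(x, chunks=3, dim=0))
for i, x_shard in enumerate(x_shards):
    shard_step = x_shard.shape[0]
    shard_offset = i * x_shards[0].shape[0]
    incoming_grad_shard = incoming_grad.narrow(0, shard_offset, shard_step)

    with torch.enable_grad():
        output = mlp(x_shard)
    # local gradients for this shard only; no .grad writes, no hooks
    local_grads = torch.autograd.grad(
        outputs=output,
        inputs=[x_shard] + list(mlp.parameters()),
        grad_outputs=incoming_grad_shard,
    )

    x_grad.narrow(0, shard_offset, shard_step).copy_(local_grads[0])
    for p, g in zip(mlp.parameters(), local_grads[1:]):
        param_grads[p] = g if param_grads[p] is None else param_grads[p] + g

# exactly one .grad write per parameter -> one gradient event for FSDP
for p, g in param_grads.items():
    p.grad = g if p.grad is None else p.grad + g

# reference: a single untiled gradient computation over the full batch
ref = torch.autograd.grad(
    mlp(x), [x] + list(mlp.parameters()), grad_outputs=incoming_grad
)
assert torch.allclose(x_grad, ref[0])
for p, g_ref in zip(mlp.parameters(), ref[1:]):
    assert torch.allclose(p.grad, g_ref)
```

Because gradients are additive across disjoint batch shards, summing per-shard param grads reproduces the untiled result exactly, which is what the commit's FSDP1/FSDP2 verification relies on.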

0 commit comments
