NVIDIA
diff --git a/‎tests/pytorch/distributed/fsdp2_tests/run_fsdp2_fused_adam.py‎
Lines changed: 181 additions & 4 deletions b/‎tests/pytorch/distributed/fsdp2_tests/run_fsdp2_fused_adam.py‎
Lines changed: 181 additions & 4 deletions
diff --git a/‎tests/pytorch/distributed/test_torch_fsdp2.py‎
Lines changed: 75 additions & 0 deletions b/‎tests/pytorch/distributed/test_torch_fsdp2.py‎
Lines changed: 75 additions & 0 deletions
@@ -16,11 +16,17 @@
   fused_adam_fp8_master_weights, fused_adam_fp8_master_weights_no_meta,
   fused_adam_bf16, fused_adam_fp8_no_master, fused_adam_bf16_store_param_remainders,
   fuse_wgrad_accumulation, dcp_output_parity, dcp_output_parity_async,
-  safetensors_fp32_export
+  dcp_resharding_save, dcp_resharding_load, safetensors_fp32_export
 
 Available --recipe values:
   DelayedScaling, Float8CurrentScaling, Float8BlockScaling,
   MXFP8BlockScaling, NVFP4BlockScaling
+
+Note: dcp_resharding_save and dcp_resharding_load are two phases of a single
+cross-topology test.  Run dcp_resharding_save under a larger world_size first
+(e.g. --nproc_per_node=4), then run dcp_resharding_load under a smaller one
+(e.g. --nproc_per_node=2).  The orchestration is handled automatically by
+test_fsdp2_fused_adam_dcp_resharding in test_torch_fsdp2.py.
 """
 
 import argparse
@@ -465,7 +471,8 @@ def test_safetensors_fp32_export(recipe_name):
     if recipe_name == "MXFP8BlockScaling":
         pytest.xfail(
             "MXFP8BlockScaling: FusedAdam CUDA kernel does not support "
-            "MXFP8 quantized tensors, causing illegal memory access"
+            "MXFP8 quantized tensors, causing illegal memory access. "
+            "Fixed by https://github.com/NVIDIA/TransformerEngine/pull/2789."
         )
 
     from safetensors.torch import load_file, save_file
@@ -554,7 +561,8 @@ def test_dcp_output_parity(recipe_name, async_save):
             "MXFP8BlockScaling: FusedAdam CUDA kernel does not support "
             "MXFP8 quantized tensors, causing illegal memory access: "
             "/transformer_engine/common/multi_tensor/multi_tensor_apply.cuh:92 in function "
-            "multi_tensor_apply: CUDA Error: an illegal memory access was encountered"
+            "multi_tensor_apply: CUDA Error: an illegal memory access was encountered. "
+            "Fixed by https://github.com/NVIDIA/TransformerEngine/pull/2789."
         )
 
     if recipe_name == "NVFP4BlockScaling":
@@ -740,6 +748,173 @@ def test_dcp_output_parity(recipe_name, async_save):
             shutil.rmtree(checkpoint_dir, ignore_errors=True)
 
 
+def test_dcp_resharding_save(recipe_name):
+    """Phase 1 of the DCP resharding test: train with current world_size and save checkpoint.
+
+    Trains a model for NUM_STEPS, records the forward-pass output, and writes:
+    - A DCP checkpoint to /tmp/te_test_fsdp2_dcp_resharding_<recipe>/
+    - A reference output tensor to /tmp/te_test_fsdp2_dcp_resharding_<recipe>_ref.pt
+
+    These artifacts are consumed by test_dcp_resharding_load, which runs under
+    a *different* world_size (typically half as many ranks) to verify that DCP
+    correctly reshards the checkpoint into the new topology.
+
+    The two phases are orchestrated by test_fsdp2_fused_adam_dcp_resharding in
+    test_torch_fsdp2.py using two sequential plain torchrun invocations.
+    """
+    recipe = get_recipe_from_string(recipe_name)
+
+    import torch.distributed.checkpoint as dcp
+
+    world_size, device = _get_dist_info()
+    rank = int(os.environ.get("RANK", "0"))
+    checkpoint_dir = f"/tmp/te_test_fsdp2_dcp_resharding_{recipe_name}"
+    ref_output_path = f"/tmp/te_test_fsdp2_dcp_resharding_{recipe_name}_ref.pt"
+
+    if rank == 0:
+        shutil.rmtree(checkpoint_dir, ignore_errors=True)
+        if os.path.exists(ref_output_path):
+            os.remove(ref_output_path)
+    dist.barrier()
+
+    model = _build_model(fp8_init=True, recipe=recipe)
+    model = _shard_model(model, world_size)
+
+    optimizer = te.optimizers.FusedAdam(
+        model.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+
+    # Fixed seed so the load phase reproduces the exact same input tensor.
+    torch.manual_seed(12345)
+    torch.cuda.manual_seed(12345)
+    x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=torch.bfloat16, device=device)
+    target = torch.randn_like(x)
+
+    for _ in range(NUM_STEPS):
+        optimizer.zero_grad(set_to_none=True)
+        with te.autocast(enabled=True, recipe=recipe):
+            output = model(x)
+        loss = F.mse_loss(output, target)
+        loss.backward()
+        optimizer.step()
+
+    # Record the reference output before saving.
+    with torch.no_grad():
+        with te.autocast(enabled=True, recipe=recipe):
+            ref_output = model(x).clone().cpu()
+
+    dist.barrier()
+    if rank == 0:
+        torch.save(ref_output, ref_output_path)
+
+    if isinstance(recipe, transformer_engine.common.recipe.DelayedScaling):
+        model_state = {
+            k: v for k, v in model.state_dict().items() if not k.endswith("_extra_state")
+        }
+    else:
+        model_state = model.state_dict()
+
+    dcp.save(
+        {"model": model_state, "optimizer": optimizer.state_dict()}, checkpoint_id=checkpoint_dir
+    )
+    dist.barrier()
+
+
+def test_dcp_resharding_load(recipe_name):
+    """Phase 2 of the DCP resharding test: load into a different world_size and verify parity.
+
+    Loads the DCP checkpoint written by test_dcp_resharding_save (which ran
+    under a larger world_size, e.g. 4 ranks) into a fresh model sharded over
+    the current, smaller world_size (e.g. 2 ranks).  Asserts that the model
+    output after loading is bitwise-identical to the reference saved in phase 1,
+    confirming that DCP resharding correctly reconstructs all parameter shards.
+    """
+    recipe = get_recipe_from_string(recipe_name)
+
+    import torch.distributed.checkpoint as dcp
+
+    world_size, device = _get_dist_info()
+    rank = int(os.environ.get("RANK", "0"))
+    checkpoint_dir = f"/tmp/te_test_fsdp2_dcp_resharding_{recipe_name}"
+    ref_output_path = f"/tmp/te_test_fsdp2_dcp_resharding_{recipe_name}_ref.pt"
+
+    try:
+        model2 = _build_model(fp8_init=True, recipe=recipe)
+        model2 = _shard_model(model2, world_size)
+
+        optimizer2 = te.optimizers.FusedAdam(
+            model2.parameters(),
+            lr=1e-3,
+            master_weights=True,
+            master_weight_dtype=torch.float32,
+        )
+
+        # Same fixed seed as the save phase to reproduce identical x/target.
+        torch.manual_seed(12345)
+        torch.cuda.manual_seed(12345)
+        x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=torch.bfloat16, device=device)
+        target = torch.randn_like(x)
+
+        # Populate optimizer state so load_state_dict has a matching structure.
+        optimizer2.zero_grad(set_to_none=True)
+        with te.autocast(enabled=True, recipe=recipe):
+            out_tmp = model2(x)
+        F.mse_loss(out_tmp, target).backward()
+        optimizer2.step()
+
+        if isinstance(recipe, transformer_engine.common.recipe.DelayedScaling):
+            model2_state = {
+                k: v for k, v in model2.state_dict().items() if not k.endswith("_extra_state")
+            }
+        else:
+            model2_state = model2.state_dict()
+
+        state_to_load = {"model": model2_state, "optimizer": optimizer2.state_dict()}
+        dcp.load(state_to_load, checkpoint_id=checkpoint_dir)
+        model2.load_state_dict(
+            state_to_load["model"],
+            strict=(
+                False
+                if isinstance(recipe, transformer_engine.common.recipe.DelayedScaling)
+                else True
+            ),
+        )
+        optimizer2.load_state_dict(state_to_load["optimizer"])
+
+        with torch.no_grad():
+            with te.autocast(enabled=True, recipe=recipe):
+                loaded_output = model2(x).cpu()
+
+        if rank == 0:
+            ref_output = torch.load(ref_output_path, weights_only=True)
+
+            if isinstance(recipe, transformer_engine.common.recipe.DelayedScaling):
+                torch.testing.assert_close(
+                    loaded_output,
+                    ref_output,
+                    rtol=0.05,
+                    atol=0.1,
+                    msg=lambda m: f"Resharded model output differs from reference: {m}",
+                )
+            else:
+                torch.testing.assert_close(
+                    loaded_output,
+                    ref_output,
+                    rtol=0,
+                    atol=0,
+                    msg=lambda m: f"Resharded model output differs from reference: {m}",
+                )
+    finally:
+        dist.barrier()
+        if rank == 0:
+            shutil.rmtree(checkpoint_dir, ignore_errors=True)
+            if os.path.exists(ref_output_path):
+                os.remove(ref_output_path)
+
+
 TESTS = {
     "fused_adam_fp8_master_weights": test_fused_adam_fp8_master_weights,
     "fused_adam_fp8_master_weights_no_meta": test_fused_adam_fp8_master_weights_no_meta,
@@ -749,13 +924,15 @@ def test_dcp_output_parity(recipe_name, async_save):
     "fuse_wgrad_accumulation": test_fuse_wgrad_accumulation,
     "dcp_output_parity": functools.partial(test_dcp_output_parity, async_save=False),
     "dcp_output_parity_async": functools.partial(test_dcp_output_parity, async_save=True),
+    "dcp_resharding_save": test_dcp_resharding_save,
+    "dcp_resharding_load": test_dcp_resharding_load,
     "safetensors_fp32_export": test_safetensors_fp32_export,
 }
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--test", required=True, choices=list(TESTS.keys()))
+    parser.add_argument("--test", required=True, choices=sorted(TESTS.keys()))
     parser.add_argument(
         "--recipe",
         type=str,
 
@@ -5,6 +5,7 @@
 import os
 import sys
 import subprocess
+import sys
 from pathlib import Path
 
 sys.path.append(str(Path(__file__).resolve().parent.parent))
@@ -18,6 +19,12 @@
 NUM_PROCS: int = torch.cuda.device_count()
 _FSDP2_DIR = Path(__file__).parent.resolve() / "fsdp2_tests"
 
+# Import some utilities from PyTest-owned conftest.py.
+sys.path.insert(0, str(_FSDP2_DIR))
+from conftest import _parametrize_recipes
+
+sys.path.pop(0)
+
 
 @pytest.mark.skipif(NUM_PROCS % 2 != 0, reason="Requires even number of GPUs")
 @pytest.mark.skipif(not te.torch_version() >= (2, 4, 0), reason="Requires PyTorch 2.4.0+")
@@ -59,6 +66,10 @@ def test_fsdp2_fused_adam_tests():
             "-v",
             "-s",
             "--tb=short",
+            # The following 2 tests need to be run in sequence,
+            # as they depend on each other.
+            "-k",
+            "not dcp_resharding_save and not dcp_resharding_load",
         ],
         valid_returncodes=(0, 5),
         env=os.environ,
@@ -90,6 +101,70 @@ def test_fsdp2_mem_leak_tests():
     assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}"
 
 
+@pytest.mark.skipif(NUM_PROCS < 4, reason="Requires 4+ GPUs for DP4→DP2 resharding test")
+@pytest.mark.skipif(not te.torch_version() >= (2, 4, 0), reason="Requires PyTorch 2.4.0+")
+@pytest.mark.parametrize("recipe", _parametrize_recipes())
+def test_fsdp2_fused_adam_dcp_resharding(recipe):
+    """DCP checkpoint saved with DP4 loads correctly into DP2 (cross-topology resharding).
+
+    Runs two sequential torchrun invocations against run_fsdp2_fused_adam.py:
+      1. nproc=4  →  dcp_resharding_save  (train + write checkpoint + ref output)
+      2. nproc=2  →  dcp_resharding_load  (load checkpoint, assert output parity)
+    """
+    if recipe == "MXFP8BlockScaling":
+        pytest.xfail(
+            "MXFP8BlockScaling: FusedAdam CUDA kernel does not support "
+            "MXFP8 quantized tensors, causing illegal memory access. "
+            "Fixed by https://github.com/NVIDIA/TransformerEngine/pull/2789."
+        )
+    if recipe == "NVFP4BlockScaling":
+        pytest.xfail(
+            "NVFP4BlockScaling: DCP load_state_dict triggers reset_sharded_param() "
+            "which calls data_ptr() on NVFP4Tensor wrapper subclass with invalid storage"
+        )
+    if recipe == "Float8BlockScaling":
+        pytest.xfail(
+            "Float8BlockScaling doesnt work for DCP resharding with scale inv padding "
+            "not being handled correctly for slice ops"
+        )
+
+    test_path = _FSDP2_DIR / "run_fsdp2_fused_adam.py"
+
+    # Phase 1: save checkpoint with 4 ranks.
+    result = subprocess.run(
+        [
+            "torchrun",
+            "--nproc_per_node=4",
+            "--local-ranks-filter=0",
+            str(test_path),
+            "--test",
+            "dcp_resharding_save",
+            "--recipe",
+            recipe,
+        ],
+        env=os.environ,
+        timeout=300,
+    )
+    assert result.returncode == 0, f"DCP resharding save phase failed: {result.returncode}"
+
+    # Phase 2: load checkpoint with 2 ranks (different topology).
+    result = subprocess.run(
+        [
+            "torchrun",
+            "--nproc_per_node=2",
+            "--local-ranks-filter=0",
+            str(test_path),
+            "--test",
+            "dcp_resharding_load",
+            "--recipe",
+            recipe,
+        ],
+        env=os.environ,
+        timeout=300,
+    )
+    assert result.returncode == 0, f"DCP resharding load phase failed: {result.returncode}"
+
+
 def test_dummy() -> None:
     """Dummy test