revised from_checkpoint, update tests and CHANGELOG

jialusui1102 · jialusui1102 · commit 79cfc7b34f87 · 2025-04-16T15:05:03.000-07:00
Signed-off-by: jialusui1102 <jialusui1102@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,12 +14,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - General purpose patching API for patch-based diffusion
 - New positional embedding selection strategy for CorrDiff SongUNet models
 - Added Multi-Storage Client to allow checkpointing to/from Object Storage
+- Added `ResidualLoss_Opt` for patch amortized CorrDiff training
 
 ### Changed
 
 - Simplified CorrDiff config files, updated default values
 - Refactored CorrDiff losses and samplers to use the patching API
 - Support for non-square images and patches in patch-based diffusion
+- Updated CorrDiff training code to support multiple patch iterations (to amortize the regression cost) and the use of `torch.compile`
+- Refactored `physicsnemo/models/diffusion/layers.py` to optimize data type casting workflow, avoiding unnecessary casting under autocast mode
+- Refactored Conv2d to enable fusion of conv2d with bias addition
+- Refactored GroupNorm, UNetBlock, SongUNet, and SongUNetPosEmbd to support Apex GroupNorm, fusion of activation with GroupNorm, and the AMP workflow
+- Updated SongUNetPosEmbd to avoid unnecessary HtoD Memcpy of `pos_embd`
+- Updated `from_checkpoint` to accommodate usage of Apex GroupNorm
+- Refactored CorrDiff NVTX annotation workflow to be configurable
 
 ### Deprecated
 
diff --git a/physicsnemo/models/diffusion/layers.py b/physicsnemo/models/diffusion/layers.py
@@ -31,7 +31,7 @@
 import nvtx
 import contextlib 
 import torch.cuda.amp as amp
-
+import pdb
 
 class Linear(torch.nn.Module):
     """
@@ -353,7 +353,7 @@ def forward(self, x):
                     bias = self.bias.to(x.dtype)
         if self.use_apex_gn:
             x = self.gn(x)
-        elif self.training: #check 
+        elif self.training: 
             # Use default torch implementation of GroupNorm for training
             # This does not support channels last memory format
             x = torch.nn.functional.group_norm(
diff --git a/physicsnemo/models/module.py b/physicsnemo/models/module.py
@@ -375,7 +375,8 @@ def from_checkpoint(cls, file_name: str, model_args: Optional[Dict] = None) -> "
             # Load model arguments and instantiate the model
             with open(local_path.joinpath("args.json"), "r") as f:
                 args = json.load(f)
-            
+            apex_in_ckp = "use_apex_gn" in args["__args__"].keys()
+
             # Merge model_args (adding new keys and updating existing ones)
             if model_args is not None:
                 args["__args__"].update(model_args)
@@ -384,7 +385,8 @@ def from_checkpoint(cls, file_name: str, model_args: Optional[Dict] = None) -> "
             model_dict = torch.load(
                 local_path.joinpath("model.pt"), map_location=model.device
             )
-            if "use_apex_gn" in args["__args__"].keys() and args["__args__"]["use_apex_gn"]:
+            #TODO: for corrdiff model architecture specifically
+            if not apex_in_ckp and "use_apex_gn" in args["__args__"].keys() and args["__args__"]["use_apex_gn"]:
                 filtered_state_dict = {}
                 for key, value in model_dict.items():
                     filtered_state_dict[key] = value  # Keep the original key
@@ -399,7 +401,6 @@ def from_checkpoint(cls, file_name: str, model_args: Optional[Dict] = None) -> "
                 model.load_state_dict(filtered_state_dict,strict=False)
             else:
                 model.load_state_dict(model_dict,strict=False)
-
         return model
 
     @staticmethod
diff --git a/test/models/common/checkpoints.py b/test/models/common/checkpoints.py
@@ -35,6 +35,7 @@ def validate_checkpoint(
     in_args: Tuple[Tensor] = (),
     rtol: float = 1e-5,
     atol: float = 1e-5,
+    enable_autocast: bool = False,
 ) -> bool:
     """Check network's checkpoint safely saves and loads the state of the model
 
@@ -54,6 +55,8 @@ def validate_checkpoint(
         Relative tolerance of error allowed, by default 1e-5
     atol : float, optional
         Absolute tolerance of error allowed, by default 1e-5
+    enable_autocast : bool, optional
+        Whether to run the model forward passes under CUDA autocast, by default False
 
     Returns
     -------
@@ -72,8 +75,9 @@ def validate_checkpoint(
         pass
 
     # Now test forward passes
-    output_1 = model_1.forward(*in_args)
-    output_2 = model_2.forward(*in_args)
+    with torch.autocast("cuda", enabled=enable_autocast):
+        output_1 = model_1.forward(*in_args)
+        output_2 = model_2.forward(*in_args)
 
     # Model outputs should initially be different
     assert not compare_output(
@@ -85,12 +89,15 @@ def validate_checkpoint(
     model_2.load("checkpoint.mdlus")
 
     # Forward with loaded checkpoint
-    output_2 = model_2.forward(*in_args)
+    with torch.autocast("cuda", enabled=enable_autocast):
+        output_2 = model_2.forward(*in_args)
+
     loaded_checkpoint = compare_output(output_1, output_2, rtol, atol)
 
     # Restore checkpoint with from_checkpoint, checks initialization of model directly from checkpoint
     model_2 = physicsnemo.Module.from_checkpoint("checkpoint.mdlus").to(model_1.device)
-    output_2 = model_2.forward(*in_args)
+    with torch.autocast("cuda", enabled=enable_autocast):
+        output_2 = model_2.forward(*in_args)
     restored_checkpoint = compare_output(output_1, output_2, rtol, atol)
 
     # Delete checkpoint file (it should exist!)
diff --git a/test/models/diffusion/test_song_unet_agn_amp.py b/test/models/diffusion/test_song_unet_agn_amp.py
@@ -217,10 +217,9 @@ def test_song_unet_checkpoint(device):
     noise_labels = torch.randn([1]).to(device)
     class_labels = torch.randint(0, 1, (1, 1)).to(device)
     input_image = torch.ones([1, 2, 16, 16]).to(device)
-    with torch.autocast("cuda", dtype=torch.bfloat16, enabled=True):
-        assert common.validate_checkpoint(
-            model_1, model_2, (*[input_image, noise_labels, class_labels],)
-        )
+    assert common.validate_checkpoint(
+        model_1, model_2, (*[input_image, noise_labels, class_labels],),enable_autocast=True
+    )
 
 
 @common.check_ort_version()
@@ -243,11 +242,10 @@ def test_son_unet_deploy(device):
     class_labels = torch.randint(0, 1, (1, 1)).to(device)
     input_image = torch.ones([1, 2, 16, 16]).to(device)
 
-    with torch.autocast("cuda", dtype=torch.bfloat16, enabled=True):
-        assert common.validate_onnx_export(
-            model, (*[input_image, noise_labels, class_labels],)
-        )
-    with torch.autocast("cuda", dtype=torch.bfloat16, enabled=True):
-        assert common.validate_onnx_runtime(
-            model, (*[input_image, noise_labels, class_labels],)
-        )
+    assert common.validate_onnx_export(
+        model, (*[input_image, noise_labels, class_labels],)
+    )
+
+    assert common.validate_onnx_runtime(
+        model, (*[input_image, noise_labels, class_labels],)
+    )
diff --git a/test/models/diffusion/test_song_unet_pos_embd_agn_amp.py b/test/models/diffusion/test_song_unet_pos_embd_agn_amp.py
@@ -233,10 +233,9 @@ def test_song_unet_checkpoint(device):
     noise_labels = torch.randn([1]).to(device)
     class_labels = torch.randint(0, 1, (1, 1)).to(device)
     input_image = torch.ones([1, 2, 16, 16]).to(device)
-    with torch.autocast("cuda", enabled=True):
-        assert common.validate_checkpoint(
-            model_1, model_2, (*[input_image, noise_labels, class_labels],), rtol=1e-5, atol=1e-5,
-        )
+    assert common.validate_checkpoint(
+        model_1, model_2, (*[input_image, noise_labels, class_labels],),enable_autocast=True
+    )
 
 
 @common.check_ort_version()