Commit 6ae6f05

rebase changes and update tests and configs

Signed-off-by: jialusui1102 <[email protected]>
1 parent: 8ba65ba

File tree: 9 files changed, +195 -373 lines changed

examples/generative/corrdiff/conf/base/model/patched_diffusion.yaml (+1 -1)

@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-name: diffusion
+name: patched_diffusion
 # Model type.
 hr_mean_conditioning: True
 # Recommended to use high-res conditioning for diffusion.
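
The rename matters because train.py dispatches on this string. Below is a minimal sketch, assuming OmegaConf is installed and the file is loaded standalone rather than composed through Hydra, of how the new value feeds the `cfg.model.name == "patched_diffusion"` checks visible in the train.py diff further down:

    # Minimal sketch: read the base model config and reproduce the name check that
    # train.py performs. Loading standalone (outside Hydra) is an assumption here.
    from omegaconf import OmegaConf

    model_cfg = OmegaConf.load(
        "examples/generative/corrdiff/conf/base/model/patched_diffusion.yaml"
    )
    print(model_cfg.name)                  # "patched_diffusion"
    print(model_cfg.hr_mean_conditioning)  # True

    # train.py branches on this exact string to enable the patched-diffusion path:
    uses_patching = model_cfg.name in ("patched_diffusion", "lt_aware_patched_diffusion")
    print(uses_patching)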

examples/generative/corrdiff/conf/base/model_size/normal.yaml (+1 -1)

@@ -23,4 +23,4 @@ model_args:
   # Per-resolution multipliers for the number of channels.
   channel_mult: [1, 2, 2, 2, 2]
   # Resolutions at which self-attention layers are applied.
-  attention_levels: [28]
+  attn_resolutions: [28]
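
The key rename lines the config up with the keyword argument the network constructor accepts, since entries under `model_args` are ultimately forwarded to it as keyword arguments. A hedged sketch with a stand-in class; `TinyUNet` below is hypothetical, not the real CorrDiff network:

    # Illustration only: model_args entries become keyword arguments of the network
    # constructor, so the YAML key must match the parameter name (attn_resolutions).
    from omegaconf import OmegaConf

    size_cfg = OmegaConf.load(
        "examples/generative/corrdiff/conf/base/model_size/normal.yaml"
    )

    class TinyUNet:
        def __init__(self, channel_mult, attn_resolutions, **extra_args):
            self.channel_mult = list(channel_mult)
            self.attn_resolutions = list(attn_resolutions)

    # With the old key `attention_levels`, attn_resolutions would fall back to whatever
    # default the constructor defines (or raise, if it has no default).
    net = TinyUNet(**OmegaConf.to_container(size_cfg.model_args))
    print(net.attn_resolutions)  # [28]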

examples/generative/corrdiff/conf/base/training/corrdiff_patched_diffusion_opt.yaml (-98)

This file was deleted.
New file (+127 lines):

@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+hydra:
+  job:
+    chdir: true
+    name: patched_diffusion_opt
+  run:
+    dir: ./output/${hydra:job.name}
+  searchpath:
+    - pkg://conf/base # Do not modify
+
+# Base parameters for dataset, model, training, and validation
+defaults:
+
+  - dataset: hrrr_corrdiff_synthetic
+  # The dataset type for training.
+  # Accepted values:
+  #   `gefs_hrrr`: full GEFS-HRRR dataset for continental US.
+  #   `hrrr_mini`: smaller HRRR dataset (continental US), for fast experiments.
+  #   `cwb`: full CWB dataset for Taiwan.
+  #   `custom`: user-defined dataset. Parameters need to be specified below.
+
+  - model: patched_diffusion
+  # The model type.
+  # Accepted values:
+  #   `regression`: a regression UNet for deterministic predictions
+  #   `lt_aware_ce_regression`: similar to `regression` but with lead time
+  #       conditioning
+  #   `diffusion`: a diffusion UNet for residual predictions
+  #   `patched_diffusion`: a more memory-efficient diffusion model
+  #   `lt_aware_patched_diffusion`: similar to `patched_diffusion` but
+  #       with lead time conditioning
+
+  - model_size: normal
+  # The model size configuration.
+  # Accepted values:
+  #   `normal`: normal model size
+  #   `mini`: smaller model size for fast experiments
+
+  - training: ${model}
+  # The base training parameters. Determined by the model type.
+
+
+# Dataset parameters. Used for `custom` dataset type.
+# Modify or add below parameters that should be passed as argument to the
+# user-defined dataset class.
+dataset:
+  data_path: ./data
+  # Path to .nc data file
+  stats_path: ./data/stats.json
+  # Path to json stats file
+
+# Training parameters
+training:
+  hp:
+    training_duration: 200000000
+    # Training duration based on the number of processed samples
+    total_batch_size: 512
+    # Total batch size
+    batch_size_per_gpu: 4
+
+    patch_shape_x: 448
+    patch_shape_y: 448
+    # Patch size. Patch training is used if these dimensions differ from
+    # img_shape_x and img_shape_y.
+    patch_num: 16
+    # Number of patches from a single sample. Total number of patches is
+    # patch_num * batch_size_global.
+    max_patch_per_gpu: 9
+    # Maximum number of patches a gpu can hold
+
+    lr: 0.0002
+    # Learning rate
+    grad_clip_threshold: 1e6
+    lr_decay: 0.7
+    lr_rampup: 1000000
+
+  # Performance
+  perf:
+    fp_optimizations: amp-bf16
+    # Floating point mode, one of ["fp32", "fp16", "amp-fp16", "amp-bf16"]
+    # "amp-{fp16,bf16}" activates Automatic Mixed Precision (AMP) with {float16,bfloat16}
+    dataloader_workers: 4
+    # DataLoader worker processes
+    songunet_checkpoint_level: 0 # 0 means no checkpointing
+    # Gradient checkpointing level, value is number of layers to checkpoint
+    # optimization_mode: True
+    use_apex_gn: True
+    torch_compile: True
+    profile_mode: False
+
+  io:
+    regression_checkpoint_path: /lustre/fsw/portfolios/coreai/users/asui/video-corrdiff-checkpoints/training-state-regression-000513.mdlus
+    # Path to load the regression checkpoint
+
+    # Where to load the regression checkpoint
+    print_progress_freq: 1000
+    # How often to print progress
+    save_checkpoint_freq: 500000
+    # How often to save the checkpoints, measured in number of processed samples
+    validation_freq: 5000
+    # how often to record the validation loss, measured in number of processed samples
+    validation_steps: 10
+    # how many loss evaluations are used to compute the validation loss per checkpoint
+
+# Parameters for wandb logging
+wandb:
+  mode: offline
+  # Configure whether to use wandb: "offline", "online", "disabled"
+  results_dir: "./wandb"
+  # Directory to store wandb results
+  watch_model: false
+  # If true, wandb will track model parameters and gradients
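
The `hp` block above implies multi-round patch accumulation: each sample contributes `patch_num` patches, but a GPU may hold at most `max_patch_per_gpu` of them at once, so the patches are spread over several accumulation rounds (the `accumulation round` NVTX range in train.py). A rough, self-contained illustration follows; the list is named after the `patch_nums_iter` variable used in train.py, but the arithmetic is only a plausible sketch, not the repository's exact logic:

    # Rough sketch of the patch accounting implied by the hp block above.
    import math

    patch_num = 16         # patches drawn from each sample
    max_patch_per_gpu = 9  # upper bound on patches a GPU holds at once
    total_batch_size = 512

    # Hypothetical split of the per-sample patches into accumulation rounds.
    rounds = math.ceil(patch_num / max_patch_per_gpu)
    patch_nums_iter = [
        min(max_patch_per_gpu, patch_num - i * max_patch_per_gpu) for i in range(rounds)
    ]
    print(patch_nums_iter)               # [9, 7] -> len(patch_nums_iter) > 1
    print(patch_num * total_batch_size)  # 8192 patches per optimizer step ("patch_num * batch_size_global")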

examples/generative/corrdiff/train.py (+14 -23)
@@ -162,7 +162,6 @@ def main(cfg: DictConfig) -> None:
         prob_channels = dataset.get_prob_channel_index()
     else:
         prob_channels = None
-
     # Parse the patch shape
     if (
         cfg.model.name == "patched_diffusion"
@@ -348,11 +347,6 @@ def main(cfg: DictConfig) -> None:
     if cfg.model.name == "patched_diffusion" and len(patch_nums_iter)>1:
         loss_fn = ResidualLoss_Opt(
             regression_net=regression_net,
-            img_shape_x=img_shape[1],
-            img_shape_y=img_shape[0],
-            patch_shape_x=patch_shape[1],
-            patch_shape_y=patch_shape[0],
-            patch_num=patch_num,
             hr_mean_conditioning=cfg.model.hr_mean_conditioning,
         )
     elif cfg.model.name in (
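
After this change the loss constructor only receives the regression network and the conditioning flag; the image and patch geometry that used to be constructor arguments is supplied at call time through the `patching` entry that later hunks add to the loss kwargs. A self-contained sketch with a stub class (not the real `ResidualLoss_Opt` API):

    # Stub illustrating the shape of the change: geometry leaves the constructor and a
    # `patching` object arrives with each call instead. All names here are stand-ins.
    class StubResidualLossOpt:
        def __init__(self, regression_net, hr_mean_conditioning):
            self.regression_net = regression_net
            self.hr_mean_conditioning = hr_mean_conditioning
            self.y_mean = None  # cache reset before validation in train.py

        def __call__(self, net, img_clean, img_lr, augment_pipe=None, patching=None):
            # The real loss would run the regression/diffusion networks per patch here.
            return {"patching": patching}

    loss_fn = StubResidualLossOpt(regression_net="regression_net", hr_mean_conditioning=True)
    loss_fn_kwargs = {"net": "model", "img_clean": None, "img_lr": None, "augment_pipe": None}
    loss_fn_kwargs.update({"patching": "patching_object"})  # geometry now passed per call
    print(loss_fn(**loss_fn_kwargs))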
@@ -415,11 +409,11 @@ def main(cfg: DictConfig) -> None:
         tick_start_nimg = cur_nimg
         tick_start_time = time.time()

-        if cur_nimg - start_nimg == 4 * cfg.training.hp.total_batch_size:
+        if cur_nimg - start_nimg == 14 * cfg.training.hp.total_batch_size:
             logger0.info(f"Starting Profiler at {cur_nimg}")
             torch.cuda.profiler.start()

-        if cur_nimg - start_nimg == 6 * cfg.training.hp.total_batch_size:
+        if cur_nimg - start_nimg == 16 * cfg.training.hp.total_batch_size:
             logger0.info(f"Stopping Profiler at {cur_nimg}")
             torch.cuda.profiler.stop()
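
The edit moves the CUDA profiler capture window from the 4th and 6th global batch after start to the 14th and 16th, which skips more warm-up iterations before capture. A standalone sketch of the same sample-count trigger; the loop and counters below are illustrative, train.py drives this from its own training loop and `cur_nimg`:

    # Sample-count-based profiler window: start after 14 global batches, stop after 16.
    import torch

    total_batch_size = 512
    start_nimg = 0
    cur_nimg = start_nimg

    for _ in range(20):
        cur_nimg += total_batch_size  # one global batch processed

        if cur_nimg - start_nimg == 14 * total_batch_size and torch.cuda.is_available():
            torch.cuda.profiler.start()  # begin capture (e.g. under Nsight Systems)

        if cur_nimg - start_nimg == 16 * total_batch_size and torch.cuda.is_available():
            torch.cuda.profiler.stop()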

@@ -432,7 +426,7 @@ def main(cfg: DictConfig) -> None:
                     f"accumulation round {n_i}", color="Magenta"
                 ):
                     with nvtx.annotate(f"loading data", color="green"):
-                        img_clean, img_lr, labels, *lead_time_label = next(
+                        img_clean, img_lr, *lead_time_label = next(
                             dataset_iterator
                         )
                     if use_apex_gn:
@@ -446,7 +440,6 @@ def main(cfg: DictConfig) -> None:
                             dtype=input_dtype,
                             non_blocking=True,
                         ).to(memory_format=torch.channels_last)
-                        labels = labels.to(dist.device, non_blocking=True)
                     else:
                         img_clean = (
                             img_clean.to(dist.device)
@@ -458,15 +451,13 @@ def main(cfg: DictConfig) -> None:
                             .to(input_dtype)
                             .contiguous()
                         )
-                        labels = labels.to(dist.device).contiguous()
                     loss_fn_kwargs = {
                         "net": model,
                         "img_clean": img_clean,
                         "img_lr": img_lr,
-                        "labels": labels,
                         "augment_pipe": None,
                     }
-
+
                     if lead_time_label:
                         lead_time_label = (
                             lead_time_label[0].to(dist.device).contiguous()
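
These three hunks drop the class-label tensors from the training batch and make the lead-time label optional via star-unpacking: `lead_time_label` is an empty list when the dataset yields only `(img_clean, img_lr)`, and it is attached to the loss kwargs only when present. A self-contained sketch of that pattern with toy tuples standing in for real batches:

    # Star-unpacking an optional trailing element and adding it to the kwargs only
    # when it exists. The string tuples are toy stand-ins for real tensors.
    batch_plain = ("img_clean", "img_lr")
    batch_lead_time = ("img_clean", "img_lr", "lead_time_label")

    for batch in (batch_plain, batch_lead_time):
        img_clean, img_lr, *lead_time_label = batch

        loss_fn_kwargs = {
            "net": "model",
            "img_clean": img_clean,
            "img_lr": img_lr,
            "augment_pipe": None,
        }
        if lead_time_label:  # empty list -> falsy -> key not added
            loss_fn_kwargs.update({"lead_time_label": lead_time_label[0]})

        print(sorted(loss_fn_kwargs))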
@@ -570,7 +561,7 @@ def main(cfg: DictConfig) -> None:
                 ):
                     with torch.no_grad():
                         for _ in range(cfg.training.io.validation_steps):
-                            img_clean_valid, img_lr_valid, labels_valid = next(
+                            img_clean_valid, img_lr_valid, *lead_time_label_valid = next(
                                 validation_dataset_iterator
                             )

@@ -585,9 +576,6 @@ def main(cfg: DictConfig) -> None:
                                     dtype=input_dtype,
                                     non_blocking=True,
                                 ).to(memory_format=torch.channels_last)
-                                labels_valid = labels_valid.to(
-                                    dist.device, non_blocking=True
-                                )

                             else:
                                 img_clean_valid = (
@@ -600,17 +588,20 @@ def main(cfg: DictConfig) -> None:
                                     .to(input_dtype)
                                     .contiguous()
                                 )
-                                labels_valid = labels_valid.to(
-                                    dist.device
-                                ).contiguous()

-                                loss_fn_valid_kwargs = {
+                                loss_valid_kwargs = {
                                     "net": model,
                                     "img_clean": img_clean_valid,
                                     "img_lr": img_lr_valid,
-                                    "labels": labels_valid,
                                     "augment_pipe": None,
                                 }
+                                if lead_time_label_valid:
+                                    lead_time_label_valid = (
+                                        lead_time_label_valid[0].to(dist.device).contiguous()
+                                    )
+                                    loss_valid_kwargs.update(
+                                        {"lead_time_label": lead_time_label_valid}
+                                    )
                                 if isinstance(loss_fn, ResidualLoss_Opt):
                                     loss_fn.y_mean = None
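
The validation path now mirrors the training path: `loss_fn_valid_kwargs` becomes `loss_valid_kwargs`, labels are gone, and an optional lead-time label is appended the same way. The `loss_fn.y_mean = None` context line suggests that `ResidualLoss_Opt` caches the regression output across the accumulation rounds of one batch, so the cache has to be cleared before scoring validation batches; the class below is a stub built on that assumption, not the real implementation:

    # Stub of a loss that caches an expensive regression result per batch, showing why
    # the cache is reset before validation. The caching behaviour is an assumption here.
    class CachingLoss:
        def __init__(self):
            self.y_mean = None

        def __call__(self, img_lr):
            if self.y_mean is None:
                self.y_mean = f"regression({img_lr})"  # computed once, reused per patch round
            return self.y_mean

    loss_fn = CachingLoss()
    print(loss_fn("train_batch"))  # fills the cache with the training batch result
    loss_fn.y_mean = None          # as in train.py, drop it before validation
    print(loss_fn("valid_batch"))  # recomputes for the validation input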

@@ -621,7 +612,7 @@ def main(cfg: DictConfig) -> None:
                                 loss_fn_kwargs.update({"patching": patching})
                                 # pdb.set_trace()
                                 with torch.autocast("cuda", dtype=amp_dtype, enabled=enable_amp):
-                                    loss_valid = loss_fn(**loss_fn_valid_kwargs)
+                                    loss_valid = loss_fn(**loss_valid_kwargs)

                                 loss_valid = (
                                     (loss_valid.sum() / batch_size_per_gpu)
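
The autocast context in this hunk is driven by the `fp_optimizations` setting in the config above (`amp-bf16` enables AMP with bfloat16). A hedged sketch of one way to map that string onto the `enable_amp`/`amp_dtype` pair used here; the helper name and the exact mapping are assumptions, not necessarily the code in train.py:

    # Map an fp_optimizations string to torch.autocast arguments (illustrative mapping).
    import torch

    def resolve_amp(fp_optimizations: str):
        """Return (enable_amp, amp_dtype) for one of fp32 / fp16 / amp-fp16 / amp-bf16."""
        if fp_optimizations == "amp-bf16":
            return True, torch.bfloat16
        if fp_optimizations == "amp-fp16":
            return True, torch.float16
        return False, torch.float32  # fp32 and fp16 run without autocast

    enable_amp, amp_dtype = resolve_amp("amp-bf16")
    if torch.cuda.is_available():
        with torch.autocast("cuda", dtype=amp_dtype, enabled=enable_amp):
            pass  # forward pass and loss evaluation would run here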
