Commit ad863fd

Pytest speedup (#849)
* add code to measure time spent in pytest
* speed up datapipe tests
* fix cleanup of dist vars (was causing slowdown in test_capture.py)
* speed up model tests
* bring back some parameterizations, reduced cpu tests
1 parent 4de8534 commit ad863fd

17 files changed: +82 -52 lines

test/conftest.py (+22)

@@ -14,8 +14,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import defaultdict
+
 import pytest
 
+file_timings = defaultdict(float)
+
+# Total time per file
+file_timings = defaultdict(float)
+
+
+def pytest_runtest_logreport(report):
+    if report.when == "call":
+        # report.nodeid format: path::TestClass::test_name
+        filename = report.nodeid.split("::")[0]
+        file_timings[filename] += report.duration
+
+
+def pytest_sessionfinish(session, exitstatus):
+    print("\n=== Test durations by file ===")
+    for filename, duration in sorted(
+        file_timings.items(), key=lambda x: x[1], reverse=True
+    ):
+        print(f"{filename}: {duration:.2f} seconds")
+
 
 def pytest_addoption(parser):
     parser.addoption(
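
For context, pytest calls pytest_runtest_logreport once per test phase (setup, call, teardown) and report.duration is the wall-clock time of that phase, so the report.when == "call" filter above counts only test bodies. A minimal sketch of a variant that also folds setup and teardown time into the per-file totals (not part of this commit; phase_timings is a hypothetical name):

# Sketch only: aggregate setup + call + teardown per file
# (hypothetical extension, not what the commit adds).
from collections import defaultdict

phase_timings = defaultdict(float)


def pytest_runtest_logreport(report):
    # report.when is "setup", "call", or "teardown"; report.duration is per phase.
    filename = report.nodeid.split("::")[0]
    phase_timings[filename] += report.duration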

test/datapipes/test_climate.py (+8, -5)

@@ -74,10 +74,10 @@ def geopotential_filename():
         shuffle=False,
     )
 
-
+# Skip CPU tests because too slow
 @nfsdata_or_fail
 @import_or_fail("netCDF4")
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_climate_hdf5_constructor(
     data_dir,
     stats_files,
@@ -230,12 +230,13 @@ def test_climate_hdf5_device(
         break
 
 
+# Skip CPU tests because too slow
 @nfsdata_or_fail
 @import_or_fail("netCDF4")
 @pytest.mark.parametrize("data_channels", [[0, 1]])
 @pytest.mark.parametrize("num_steps", [2])
-@pytest.mark.parametrize("batch_size", [1, 2, 3])
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+@pytest.mark.parametrize("batch_size", [2, 3])
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_climate_hdf5_shape(
     data_dir,
     stats_files,
@@ -322,11 +323,12 @@ def test_climate_hdf5_shape(
         break
 
 
+# Skip CPU tests because too slow
 @nfsdata_or_fail
 @import_or_fail("netCDF4")
 @pytest.mark.parametrize("num_steps", [1, 2])
 @pytest.mark.parametrize("stride", [1, 3])
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_era5_hdf5_sequence(
     data_dir,
     stats_files,
@@ -375,6 +377,7 @@ def test_era5_hdf5_sequence(
     )
 
 
+# Skip CPU tests because too slow
 @nfsdata_or_fail
 @import_or_fail("netCDF4")
 @pytest.mark.parametrize("shuffle", [True, False])

test/datapipes/test_mesh_datapipe.py (+5, -5)

@@ -118,7 +118,7 @@ def _create_random_vtp_vtu_mesh(
 
     tmp_dir = tmp_path / "temp_data"
     tmp_dir.mkdir()
-    _create_random_vtp_vtu_mesh(num_points=20, num_triangles=40, dir=tmp_dir)
+    _create_random_vtp_vtu_mesh(num_points=10, num_triangles=20, dir=tmp_dir)
     datapipe_vtp = MeshDatapipe(
         data_dir=tmp_dir,
         variables=["RandomFeatures"],
@@ -134,8 +134,8 @@ def _create_random_vtp_vtu_mesh(
 
     assert len(datapipe_vtp) == 1
     for data in datapipe_vtp:
-        assert data[0]["vertices"].shape == (1, 20, 3)
-        assert data[0]["x"].shape == (1, 20, 1)
+        assert data[0]["vertices"].shape == (1, 10, 3)
+        assert data[0]["x"].shape == (1, 10, 1)
 
     datapipe_vtu = MeshDatapipe(
         data_dir=tmp_dir,
@@ -152,8 +152,8 @@ def _create_random_vtp_vtu_mesh(
 
     assert len(datapipe_vtu) == 1
     for data in datapipe_vtu:
-        assert data[0]["vertices"].shape == (1, 20, 3)
-        assert data[0]["x"].shape == (1, 20, 1)
+        assert data[0]["vertices"].shape == (1, 10, 3)
+        assert data[0]["x"].shape == (1, 10, 1)
 
 
 # @nfsdata_or_fail
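
The shape assertions change in lockstep with the mesh size, since the checked tensors are (batch, num_points, 3) vertices and (batch, num_points, 1) features. A small sketch of deriving the expectations from one constant so a future size change touches a single line (a hypothetical refactor, not part of the commit):

# Sketch: derive expected shapes from one constant (hypothetical helper).
NUM_POINTS = 10


def expected_shapes(batch=1, num_feats=1):
    return {
        "vertices": (batch, NUM_POINTS, 3),
        "x": (batch, NUM_POINTS, num_feats),
    }


assert expected_shapes() == {"vertices": (1, 10, 3), "x": (1, 10, 1)}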

test/metrics/test_metrics_general.py (+18, -10)

@@ -64,8 +64,8 @@ def get_disagreements(inputs, bins, counts, test):
     print("True counts", trueh)
 
 
-@pytest.mark.parametrize("device", ["cpu", "cuda:0"])
-@pytest.mark.parametrize("input_shape", [(1, 72, 144), (1, 360, 720)])
+@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+@pytest.mark.parametrize("input_shape", [(1, 72, 144)])
 def test_histogram(device, input_shape, rtol: float = 1e-3, atol: float = 1e-3):
     DistributedManager._shared_state = {}
     if (device == "cuda:0") and (not DistributedManager.is_initialized()):
@@ -225,6 +225,10 @@ def test_histogram(device, input_shape, rtol: float = 1e-3, atol: float = 1e-3):
     )
     if device == "cuda:0":
         DistributedManager.cleanup()
+        del os.environ["RANK"]
+        del os.environ["WORLD_SIZE"]
+        del os.environ["MASTER_ADDR"]
+        del os.environ["MASTER_PORT"]
 
 
 def fair_crps(pred, obs, dim=-1):
@@ -539,8 +543,8 @@ def test_crps(device, rtol: float = 1e-3, atol: float = 1e-3):
 
 
 @pytest.mark.parametrize("device", ["cuda:0", "cpu"])
-@pytest.mark.parametrize("mean", [0.0, 3.0])
-@pytest.mark.parametrize("variance", [1.0, 0.1, 3.0])
+@pytest.mark.parametrize("mean", [3.0])
+@pytest.mark.parametrize("variance", [0.1])
 def test_wasserstein(device, mean, variance, rtol: float = 1e-3, atol: float = 1e-3):
     mean = torch.as_tensor([mean], device=device, dtype=torch.float32)
     variance = torch.as_tensor([variance], device=device, dtype=torch.float32)
@@ -704,6 +708,10 @@ def test_means_var(device, rtol: float = 1e-3, atol: float = 1e-3):
 
     if device == "cuda:0":
         DistributedManager.cleanup()
+        del os.environ["RANK"]
+        del os.environ["WORLD_SIZE"]
+        del os.environ["MASTER_ADDR"]
+        del os.environ["MASTER_PORT"]
 
 
 @pytest.mark.parametrize("device", ["cuda:0", "cpu"])
@@ -781,7 +789,7 @@ def test_calibration(device, rtol: float = 1e-2, atol: float = 1e-2):
 def test_entropy(device, rtol: float = 1e-2, atol: float = 1e-2):
     one = torch.ones([1], device=device, dtype=torch.float32)
 
-    x = torch.randn((100_000, 10, 10), device=device, dtype=torch.float32)
+    x = torch.randn((50_000, 10, 10), device=device, dtype=torch.float32)
     bin_edges, bin_counts = hist.histogram(x, bins=30)
     entropy = ent.entropy_from_counts(bin_counts, bin_edges, normalized=False)
     assert entropy.shape == (10, 10)
@@ -810,11 +818,11 @@ def test_entropy(device, rtol: float = 1e-2, atol: float = 1e-2):
     assert torch.allclose(entropy, one, rtol=rtol, atol=atol)
 
     # Test Relative Entropy
-    x = torch.randn((500_000, 10, 10), device=device, dtype=torch.float32)
+    x = torch.randn((100_000, 10, 10), device=device, dtype=torch.float32)
     bin_edges, x_bin_counts = hist.histogram(x, bins=30)
-    x1 = torch.randn((500_000, 10, 10), device=device, dtype=torch.float32)
+    x1 = torch.randn((100_000, 10, 10), device=device, dtype=torch.float32)
     _, x1_bin_counts = hist.histogram(x1, bins=bin_edges)
-    x2 = 0.1 * torch.randn((100_000, 10, 10), device=device, dtype=torch.float32)
+    x2 = 0.1 * torch.randn((50_000, 10, 10), device=device, dtype=torch.float32)
     _, x2_bin_counts = hist.histogram(x2, bins=bin_edges)
 
     rel_ent_1 = ent.relative_entropy_from_counts(x_bin_counts, x1_bin_counts, bin_edges)
@@ -847,8 +855,8 @@ def test_entropy(device, rtol: float = 1e-2, atol: float = 1e-2):
 
 @pytest.mark.parametrize("device", ["cuda:0", "cpu"])
 def test_power_spectrum(device):
-    """Test the 2D power spectrum routine for correctness using a sine wave"""
-    h, w = 64, 64
+    # Test the 2D power spectrum routine for correctness using a sine wave
+    h, w = 32, 32
     kx, ky = 4, 4
     amplitude = 1.0
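
The new del os.environ[...] lines remove the RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT variables used by the single-process distributed setup, so they do not linger for tests that run afterwards; the commit message ties the leftover values to a slowdown in test_capture.py. A defensive sketch of the same cleanup as a helper, using pop() so a missing variable is not an error (hypothetical; the commit deletes the keys inline):

# Sketch: remove the distributed-env variables even if some were never set
# (hypothetical helper, not part of the commit).
import os

_DIST_VARS = ("RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT")


def clear_dist_env():
    for var in _DIST_VARS:
        os.environ.pop(var, None)  # no KeyError if the variable is absent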

test/models/data/dlwp_healpix.pth (and six other binary files)

-960 KB (binary file not shown)
-960 KB (binary file not shown)
-960 KB (binary file not shown)
0 Bytes (binary file not shown)
0 Bytes (binary file not shown)
0 Bytes (binary file not shown)
Binary file not shown

test/models/diffusion/test_dhariwal_unet.py (+7, -5)

@@ -63,19 +63,20 @@ def test_dhariwal_unet_constructor(device):
     assert output_image.shape == (1, out_channels, img_resolution, img_resolution)
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_dhariwal_unet_optims(device):
     """Test Dhariwal UNet optimizations"""
 
     def setup_model():
         model = UNet(
-            img_resolution=16,
+            img_resolution=8,
             in_channels=2,
             out_channels=2,
         ).to(device)
         noise_labels = torch.randn([1]).to(device)
         class_labels = torch.randint(0, 1, (1, 1)).to(device)
-        input_image = torch.ones([1, 2, 16, 16]).to(device)
+        input_image = torch.ones([1, 2, 8, 8]).to(device)
 
         return model, [input_image, noise_labels, class_labels]
 
@@ -94,7 +95,8 @@ def setup_model():
     assert common.validate_combo_optims(model, (*invar,))
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_dhariwal_unet_checkpoint(device):
     """Test Dhariwal UNet checkpoint save/load"""
     # Construct FNO models
@@ -113,7 +115,7 @@ def test_dhariwal_unet_checkpoint(device):
     # Change the bias in the last layer of the second model as a hack
     # Because this model is initialized with all zeros
     with torch.no_grad():
-        model_2.out_conv.bias += 1
+        model_2.out_conv.bias.add_(1)
 
     noise_labels = torch.randn([1]).to(device)
     class_labels = torch.randint(0, 1, (1, 1)).to(device)
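
On the bias change: under torch.no_grad() both forms mutate the parameter in place, but the augmented assignment also re-assigns the attribute through nn.Module.__setattr__, whereas .add_(1) touches only the tensor. A minimal illustration (standalone; conv is a stand-in for the model's output layer):

# Sketch: both lines update the bias in place under no_grad; add_() avoids the
# extra attribute re-assignment that "+=" performs via Module.__setattr__.
import torch
import torch.nn as nn

conv = nn.Conv2d(2, 2, kernel_size=1)
with torch.no_grad():
    conv.bias.add_(1)   # pure in-place update
    # conv.bias += 1    # also in-place, but re-assigns conv.bias afterwards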

test/models/diffusion/test_song_unet_pos_embd.py (+4, -2)

@@ -243,7 +243,8 @@ def test_fails_if_grid_is_invalid():
     )
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_song_unet_optims(device):
     """Test Song UNet optimizations"""
 
@@ -278,7 +279,8 @@ def setup_model():
     assert common.validate_combo_optims(model, (*invar,))
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_song_unet_checkpoint(device):
     """Test Song UNet checkpoint save/load"""
     # Construct FNO models

test/models/diffusion/test_song_unet_pos_lt_embd.py (+4, -2)

@@ -324,7 +324,8 @@ def test_fails_if_grid_is_invalid():
     )
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_song_unet_optims(device):
     """Test Song UNet optimizations"""
 
@@ -359,7 +360,8 @@ def setup_model():
     assert common.validate_combo_optims(model, (*invar,))
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_song_unet_checkpoint(device):
     """Test Song UNet checkpoint save/load"""
     # Construct FNO models

test/models/dlwp_healpix/test_healpix_recunet_model.py (+7, -7)

@@ -172,8 +172,8 @@ def generate_insolation_data(batch_size=8, time_dim=1, img_size=16, device="cpu"
 @import_or_fail("omegaconf")
 @pytest.mark.parametrize("device", ["cuda:0", "cpu"])
 def test_HEALPixRecUNet_initialize(device, encoder_dict, decoder_dict, pytestconfig):
-    in_channels = 7
-    out_channels = 7
+    in_channels = 3
+    out_channels = 3
     n_constants = 1
     decoder_input_channels = 1
     input_time_dim = 2
@@ -314,8 +314,8 @@ def test_HEALPixRecUNet_reset(
     pytestconfig,
 ):
     # create a smaller version of the dlwp healpix model
-    in_channels = 3
-    out_channels = 3
+    in_channels = 2
+    out_channels = 2
     n_constants = 2
     decoder_input_channels = 1
     input_time_dim = 2
@@ -366,13 +366,13 @@ def test_HEALPixRecUNet_forward(
     pytestconfig,
 ):
     # create a smaller version of the dlwp healpix model
-    in_channels = 3
-    out_channels = 3
+    in_channels = 2
+    out_channels = 2
     n_constants = 2
     decoder_input_channels = 1
     input_time_dim = 2
     output_time_dim = 4
-    batch_size = 8
+    batch_size = 2
     size = 16
 
     fix_random_seeds(seed=42)

test/models/test_dlwp.py (+4, -4)

@@ -47,10 +47,10 @@ def test_dlwp_forward(device):
 
 
 @pytest.mark.parametrize("device", ["cuda:0", "cpu"])
-@pytest.mark.parametrize("nr_input_channels", [2, 4])
-@pytest.mark.parametrize("nr_output_channels", [2, 4])
-@pytest.mark.parametrize("nr_initial_channels", [32, 64])
-@pytest.mark.parametrize("depth", [2, 3, 4])
+@pytest.mark.parametrize("nr_input_channels", [2])
+@pytest.mark.parametrize("nr_output_channels", [2])
+@pytest.mark.parametrize("nr_initial_channels", [32])
+@pytest.mark.parametrize("depth", [2])
 def test_dlwp_constructor(
     device, nr_input_channels, nr_output_channels, nr_initial_channels, depth
 ):
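
Stacked @pytest.mark.parametrize decorators generate the Cartesian product of their values, so the old settings produced 2 x 2 x 2 x 3 = 24 constructor tests per device while the new ones produce a single case. A tiny illustration of the multiplication (generic example, not from the repo):

# Sketch: stacked parametrize decorators multiply; this collects 2 * 3 = 6 tests.
import pytest


@pytest.mark.parametrize("a", [1, 2])
@pytest.mark.parametrize("b", [10, 20, 30])
def test_product(a, b):
    assert a * b > 0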

test/models/test_swinrnn.py (+3, -12)

@@ -24,7 +24,8 @@
 from . import common
 
 
-@pytest.mark.parametrize("device", ["cuda:0", "cpu"])
+# Skip CPU tests because too slow
+@pytest.mark.parametrize("device", ["cuda:0"])
 def test_swinrnn_forward(device):
     """Test SwinRNN forward pass"""
     torch.manual_seed(0)
@@ -43,7 +44,7 @@ def test_swinrnn_forward(device):
     invar = torch.randn(bsize, 13, 6, 32, 64).to(device)
     # Check output size
     with torch.no_grad():
-        assert common.validate_forward_accuracy(model, (invar,), atol=5e-3)
+        assert common.validate_forward_accuracy(model, (invar,), atol=5e-3, rtol=1e-3)
     del invar, model
     torch.cuda.empty_cache()
 
@@ -53,16 +54,6 @@ def test_swinrnn_constructor(device):
     """Test SwinRNN constructor options"""
     # Define dictionary of constructor args
    arg_list = [
-        {
-            "img_size": (6, 32, 64),
-            "patch_size": (6, 1, 1),
-            "in_chans": 13,
-            "out_chans": 13,
-            "embed_dim": 768,
-            "num_groups": 32,
-            "num_heads": 8,
-            "window_size": 8,
-        },
         {
             "img_size": (3, 32, 32),
             "patch_size": (3, 1, 1),
