
Commit b858631

Fix param count calculation + account depth pruning
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 408a574 commit b858631

File tree: 11 files changed, +192 −71 lines

.vscode/settings.json

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
         "--no-cov",
     ],
     "evenBetterToml.schema.enabled": false, // disable toml/json schema since we have custom fields
-    "python.analysis.extraPaths": [
+    "cursorpyright.analysis.extraPaths": [
         "./tests/" // add tests to python path just like pytest does in pyproject.toml
     ],
     "git.alwaysSignOff": true,

modelopt/torch/nas/plugins/megatron.py

Lines changed: 9 additions & 3 deletions
@@ -1040,6 +1040,7 @@ def modify(
         ffn_hidden_size_divisor: int = 1,
         mamba_head_dim_divisor: int = 1,
         num_moe_experts_divisor: int = 1,
+        num_layers_divisor: int = 1,
     ):
         """Modify the dynamic choices of the module according to provided keyword arguments.

@@ -1048,10 +1049,15 @@ def modify(
             ffn_hidden_size_divisor: The divisor of the mlp ffn_hidden_size.
             mamba_head_dim_divisor: The divisor of the mamba head_dim.
             num_moe_experts_divisor: The divisor of the number of MoE experts.
+            num_layers_divisor: The divisor of the number of layers.
         """
-        hp = self.get_hparam("hidden_size")
-        choices = {int(make_divisible(c, hidden_size_divisor)) for c in hp.choices}  # type: ignore[arg-type]
-        hp.choices = list(set(hp.choices) & choices | {hp.original})
+        for hp_name, divisor in [
+            ("hidden_size", hidden_size_divisor),
+            ("num_layers", num_layers_divisor),
+        ]:
+            hp = self.get_hparam(hp_name)
+            choices = {int(make_divisible(c, divisor)) for c in hp.choices}  # type: ignore[arg-type]
+            hp.choices = list(set(hp.choices) & choices | {hp.original})

         for layer in self.decoder.layers:
             layer.modify(
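The refactored loop applies the same divisor-based filtering to both hidden_size and the new num_layers hparam. A minimal standalone sketch of that filtering, assuming a common rounding variant of make_divisible (the real helper lives in modelopt.torch.utils and may round differently):

def make_divisible(v: int, divisor: int) -> int:
    # Assumed round-to-nearest-multiple behavior; the actual
    # modelopt.torch.utils.make_divisible may differ in edge cases.
    return max(divisor, (v + divisor // 2) // divisor * divisor)

choices = [12, 16, 20, 24, 28, 32]  # toy hparam choices
original = 32                        # hp.original is always kept
divisor = 8
rounded = {make_divisible(c, divisor) for c in choices}
# Mirrors: hp.choices = list(set(hp.choices) & choices | {hp.original})
filtered = sorted(set(choices) & rounded | {original})
print(filtered)  # [16, 24, 32]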

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 139 additions & 61 deletions
Large diffs are not rendered by default.
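The bulk of the depth-pruning change lives in this file but is not rendered above. Judging from the commit title and the new num_layers hparam, depth pruning trims whole decoder layers. A conceptual sketch only; the importance scoring and every name except model.decoder.layers are assumptions, not the actual Minitron implementation:

import torch
import torch.nn as nn

def prune_depth(model: nn.Module, num_layers: int, layer_scores: torch.Tensor) -> nn.Module:
    """Keep the num_layers highest-scoring decoder layers, preserving their order."""
    keep = torch.topk(layer_scores, k=num_layers).indices.sort().values
    model.decoder.layers = nn.ModuleList(
        model.decoder.layers[i] for i in keep.tolist()
    )
    return model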

modelopt/torch/prune/pruning.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def prune(
        constraints = {"params": "60%"}

        # Specify export_config with pruned hyperparameters
-        # This is supported and required if the model is converted via ``mcore_minitron`` mode.
+        # This is supported only if the model is converted via ``mcore_minitron`` mode.
        constraints = {
            "export_config": {
                "ffn_hidden_size": 128,

modelopt/torch/utils/logging.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
 def num2hrb(num: float, suffix="") -> str:
     """Convert big floating number to human readable string."""
     step = 1000  # step between units is 1000
-    units = ["", "K", "M", "G", "T", "P", "E"]
+    units = ["", "K", "M", "B", "T", "P", "E"]
     while abs(num) >= step and len(units) > 1:
         num /= step
         units.pop(0)
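The change swaps the SI "G" for "B" so parameter counts read in the usual billions convention. A self-contained sketch of the function as shown in the hunk; the final format string falls outside the hunk and is an assumption:

def num2hrb(num: float, suffix: str = "") -> str:
    """Convert big floating number to human readable string."""
    step = 1000  # step between units is 1000
    units = ["", "K", "M", "B", "T", "P", "E"]
    while abs(num) >= step and len(units) > 1:
        num /= step
        units.pop(0)
    return f"{num:.2f}{units[0]}{suffix}"  # assumed return format

print(num2hrb(8.03e9))  # "8.03B" -- previously rendered as "8.03G"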

modelopt/torch/utils/network.py

Lines changed: 7 additions & 0 deletions
@@ -112,6 +112,13 @@ def param_num(network: nn.Module, trainable_only: bool = False, unit=1e6) -> float:
     Returns:
         The number of parameters in the model in the given unit.
     """
+    from modelopt.torch.opt.dynamic import DynamicModule
+
+    if isinstance(network, DynamicModule):
+        # NOTE: model.parameters() doesn't consider active_slice so we don't get sorted or trimmed parameters!
+        raise NotImplementedError(
+            "param_num doesn't support DynamicModule. Please use param_num_from_forward instead."
+        )
     return (
         sum(
             p.numel() if not trainable_only or p.requires_grad else 0
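Why parameters() misleads for a DynamicModule: the full weight tensors stay allocated while only an active slice is in use, so numel() counts pruned-away elements. A plain-PyTorch illustration of the overcount (the 4096/3072 sizes are arbitrary):

import torch.nn as nn

layer = nn.Linear(4096, 4096, bias=False)  # stands in for a dynamic layer
active_out = 3072  # e.g. active_slice keeps only 3072 output channels

counted = sum(p.numel() for p in layer.parameters())  # 16_777_216
actual = active_out * 4096                            # 12_582_912 in use
# param_num would report `counted`, overstating the pruned model by ~33%,
# which is why it now raises for DynamicModule inputs.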

modelopt/torch/utils/plugins/megatron_model.py

Lines changed: 5 additions & 0 deletions
@@ -46,9 +46,14 @@ def param_num_megatron(
     Returns:
         The number of parameters in the model (reduced across TP and PP ranks).
     """
+    from modelopt.torch.opt.dynamic import DynamicModule
+
     if from_forward:
         assert args is not None, "args must be provided if from_forward is True"
         params = int(param_num_from_forward(model, args, unit=1.0))
+    elif isinstance(model, DynamicModule):
+        # NOTE: model.parameters() doesn't consider active_slice so we don't get sorted or trimmed parameters!
+        raise NotImplementedError("DynamicModule input is not supported without from_forward.")
     else:
         params = sum(p.numel() for p in model.parameters())
     reduced_params = torch.Tensor([params]).to(device=next(model.parameters()).device)
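The distributed counter gets the same guard. A hedged call sketch; the import path and the keyword layout of param_num_megatron are assumptions from the hunk context:

from modelopt.torch.utils.plugins.megatron_model import param_num_megatron

# For a converted (DynamicModule) model, only the forward-based path is valid:
n_params = param_num_megatron(model, from_forward=True, args=forward_args)

# from_forward=False on a DynamicModule now raises NotImplementedError,
# matching the param_num guard in network.py above.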

tests/_test_utils/torch/nas_prune/minitron_common.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ def prune_minitron(model, export_config, config, channel_divisor=64):
                 channel_divisor=channel_divisor,
                 mamba_head_dim_divisor=4,
                 num_moe_experts_divisor=1,
+                num_layers_divisor=1,
             ),
         )
     ],

tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py

Lines changed: 17 additions & 2 deletions
@@ -83,7 +83,15 @@ def _test_gpt_search_space(
         normalization=normalization,
     ).cuda()

-    mtn.convert(model, [("mcore_minitron", get_mcore_minitron_config(channel_divisor))])
+    mtn.convert(
+        model,
+        [
+            (
+                "mcore_minitron",
+                get_mcore_minitron_config(channel_divisor=channel_divisor, num_layers_divisor=1),
+            )
+        ],
+    )

     assert isinstance(model, _DynamicMCoreLanguageModel)
     for m in model.modules():
@@ -255,7 +263,14 @@ def _test_gpt_moe_search_space(rank, size):

     mtn.convert(
         model,
-        [("mcore_minitron", get_mcore_minitron_config(channel_divisor, num_moe_experts_divisor=1))],
+        [
+            (
+                "mcore_minitron",
+                get_mcore_minitron_config(
+                    channel_divisor=channel_divisor, num_moe_experts_divisor=1, num_layers_divisor=1
+                ),
+            )
+        ],
     )

     moe = model.decoder.layers[0].mlp

tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py

Lines changed: 10 additions & 1 deletion
@@ -77,7 +77,16 @@ def _test_mamba_search_space(rank, size):

     mtn.convert(
         model,
-        [("mcore_minitron", get_mcore_minitron_config(channel_divisor, mamba_head_dim_divisor))],
+        [
+            (
+                "mcore_minitron",
+                get_mcore_minitron_config(
+                    channel_divisor=channel_divisor,
+                    mamba_head_dim_divisor=mamba_head_dim_divisor,
+                    num_layers_divisor=1,
+                ),
+            )
+        ],
     )

     assert isinstance(model, _DynamicMCoreLanguageModel)
