Commit f18f8e2
Allow skipping some hparams in NAS and further restrict search space
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent ca95a63 commit f18f8e2

File tree: 6 files changed, +69 -26 lines changed

  modelopt/torch/prune/plugins/mcore_minitron.py
  tests/_test_utils/torch/nas_prune/minitron_common.py
  tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py
  tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py
  tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py
  tests/gpu/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 38 additions & 18 deletions
@@ -173,11 +173,12 @@ class CandidateSubnet:
 class MCoreMinitronSearcher(BaseSearcher):
     """Searcher for Minitron pruning algorithm.

-    Available additional config options:
-        - `max_width_pruning`: Maximum fraction per width hyperparameter to prune (default: 0.5).
+    Available additional config options (used when `params` constraint is provided):
+        - `max_width_pruning`: Maximum fraction per width hyperparameter to prune (default: 0.40).
             Only top (1 - max_width_pruning) choices will be considered.
-        - `max_depth_pruning`: Maximum fraction per depth hyperparameter to prune (default: 0.2).
+        - `max_depth_pruning`: Maximum fraction per depth hyperparameter to prune (default: 0.20).
             Only top (1 - max_depth_pruning) choices will be considered.
+        - `hparams_to_skip`: List of hparams to skip during the search (default: None).
         - `top_k`: Number of candidates to consider for score_func validation (default: 10).
     """
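
These options are read from the search config when a `params` constraint is given. A sketch of how they might be passed, following the usual modelopt.torch.prune call pattern; the model, constraint value, and score_func below are placeholders, and the exact return-value unpacking is an assumption rather than something this diff shows:

import modelopt.torch.prune as mtp

# Hypothetical usage: prune a Megatron-Core model under a 6B-params constraint
# while forbidding the searcher from shrinking hidden_size.
pruned_model, _ = mtp.prune(
    model,
    mode="mcore_minitron",
    constraints={"params": 6e9},
    dummy_input=None,  # not used by Minitron pruning
    config={
        "max_width_pruning": 0.40,           # consider only the top 60% of width choices
        "max_depth_pruning": 0.20,           # consider only the top 80% of num_layers choices
        "hparams_to_skip": ["hidden_size"],  # leave hidden_size at its original value
        "top_k": 10,                         # validate the 10 best candidates with score_func
        "score_func": score_func,
    },
)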

@@ -195,8 +196,9 @@ def default_search_config(self) -> SearchConfig:
             "skip_sorting": False,
             "scores_path": None,
             # Additional search config for parameter-based pruning
-            "max_width_pruning": 0.5,
-            "max_depth_pruning": 0.25,
+            "max_width_pruning": 0.40,
+            "max_depth_pruning": 0.20,
+            "hparams_to_skip": None,
             "top_k": 10,
         }

@@ -378,6 +380,7 @@ def search_best_arch_by_params(self, sorted_layers: list[int]) -> dict:
         max_params = float(self.constraints["params"])  # type: ignore[arg-type]
         max_width_pruning = self.config["max_width_pruning"]
         max_depth_pruning = self.config["max_depth_pruning"]
+        hparams_to_skip = self.config["hparams_to_skip"]
         top_k = self.config["top_k"]
         print_rank_0(
             f"\nSearching for the best pruned architecture under {num2hrb(max_params)} params constraints..."
@@ -401,6 +404,7 @@ def search_best_arch_by_params(self, sorted_layers: list[int]) -> dict:
             hp_choices,  # type: ignore[arg-type]
             max_width_pruning,
             max_depth_pruning,
+            hparams_to_skip,
         )
         sample(self.model, sample_func=max)  # reset to max subnet (for sanity)
         selected = []
@@ -466,18 +470,20 @@ def search_best_arch_by_params(self, sorted_layers: list[int]) -> dict:
     @staticmethod
     def _generate_search_space_combos(
         search_space: dict[str, list],
-        max_width_pruning: float = 0.5,
-        max_depth_pruning: float = 0.2,
+        max_width_pruning: float = 0.40,
+        max_depth_pruning: float = 0.20,
+        hparams_to_skip: list[str] | None = None,
     ) -> list[dict[str, Any]]:
         """Generate all possible combinations of hyperparameters from the search space.

         Args:
             search_space: Dictionary mapping hyperparameter names to their possible sorted choices.
                 Example: {"hidden_size": [1024, 2048, 3072, 4096], "num_layers": [1, 2, ..., 31, 32]}
-            max_width_pruning: Maximum fraction of width hyperparameters to prune (default: 0.5).
+            max_width_pruning: Maximum fraction of width hyperparameters to prune (default: 0.40).
                 Only top (1 - max_width_pruning) choices will be considered.
-            max_depth_pruning: Maximum fraction of depth hyperparameters to prune (default: 0.2).
+            max_depth_pruning: Maximum fraction of depth hyperparameters to prune (default: 0.20).
                 Only top (1 - max_depth_pruning) choices will be considered.
+            hparams_to_skip: List of hparams to skip during the search (default: None).

         Returns:
             List of configuration dictionaries, where each dictionary maps hyperparameter
@@ -494,11 +500,22 @@ def _generate_search_space_combos(
             f"{max_depth_pruning * 100:.0f}% for depth pruning hparams"
         )

+        if hparams_to_skip:
+            print_rank_0(f"Skipping {hparams_to_skip=} during search space generation...")
+            for hparam in hparams_to_skip:
+                if hparam in search_space:
+                    search_space.pop(hparam)
+                else:
+                    warn(f"Hparam {hparam} not found in search space! Skipping...")
+
         filtered_ss = {
-            k: sorted(v)[int((1 - max_depth_pruning) * len(v)) :]
-            if k == "num_layers"
-            else sorted(v)[int((1 - max_width_pruning) * len(v)) :]
+            k: (
+                sorted(v)[int((1 - max_depth_pruning) * len(v)) :]
+                if k == "num_layers"
+                else sorted(v)[int((1 - max_width_pruning) * len(v)) :]
+            )
             for k, v in search_space.items()
+            if len(v) > 1
         }

         ss_size = 1
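
The skip-and-truncate logic is easiest to see on a toy search space. Below is a standalone sketch that mirrors the code above; the final itertools.product enumeration illustrates what "all combinations" means here, not necessarily how the method builds them, and note that the new len(v) > 1 guard also drops hparams with only a single choice. Since the searcher resets to the max subnet before applying each combo, a skipped hparam effectively stays at its largest value:

from itertools import product

search_space = {
    "num_attention_heads": [8, 16, 24, 32],  # width hparam: 4 choices
    "num_moe_experts": [8],                  # single choice: dropped by the len(v) > 1 guard
    "num_layers": [2, 4, 6, 8, 10],          # depth hparam: 5 choices
    "kv_channels": [64, 128],                # suppose the user skips this one
}
max_width_pruning, max_depth_pruning = 0.40, 0.20
hparams_to_skip = ["kv_channels"]

for hparam in hparams_to_skip:
    search_space.pop(hparam, None)  # the real method warns on unknown names instead

filtered_ss = {
    k: (
        sorted(v)[int((1 - max_depth_pruning) * len(v)) :]
        if k == "num_layers"
        else sorted(v)[int((1 - max_width_pruning) * len(v)) :]
    )
    for k, v in search_space.items()
    if len(v) > 1
}
# num_attention_heads: int(0.6 * 4) = 2 -> keep top 2 choices [24, 32]
# num_layers:          int(0.8 * 5) = 4 -> keep top 1 choice  [10]
print(filtered_ss)  # {'num_attention_heads': [24, 32], 'num_layers': [10]}

combos = [dict(zip(filtered_ss, vals)) for vals in product(*filtered_ss.values())]
print(len(combos))  # 2 * 1 = 2 candidate architectures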
@@ -586,15 +603,15 @@ def get_param_count(mod, name) -> int:
     default_rules={
         "megatron.core.models.gpt.GPTModel": {
             "hidden_size_divisor": 256,
-            "ffn_hidden_size_divisor": 256,
+            "ffn_hidden_size_divisor": 512,
             "num_moe_experts_divisor": 8,
             "num_layers_divisor": 2,
         },
         **(
             {
                 "megatron.core.models.mamba.MambaModel": {
                     "hidden_size_divisor": 256,
-                    "ffn_hidden_size_divisor": 256,
+                    "ffn_hidden_size_divisor": 512,
                     "mamba_head_dim_divisor": 8,
                     "num_moe_experts_divisor": 8,
                     "num_layers_divisor": 2,
@@ -611,20 +628,23 @@ def get_param_count(mod, name) -> int:

 def get_mcore_minitron_config(
     *,
-    channel_divisor: int = 256,
+    hidden_size_divisor: int = 256,
+    ffn_hidden_size_divisor: int = 512,
     mamba_head_dim_divisor: int = 8,
     num_moe_experts_divisor: int = 8,
     num_layers_divisor: int = 2,
 ) -> ModeloptBaseConfig:
-    """Get a MCoreMinitronConfig with the given channel divisor instead of default."""
+    """Get a MCoreMinitronConfig with the given divisors instead of default."""
     config = MCoreMinitronConfig()

     def _set_divisors(c):
         for k, v in c.items():
             if isinstance(v, dict):
                 _set_divisors(v)
-            elif k in ["hidden_size_divisor", "ffn_hidden_size_divisor"]:
-                c[k] = channel_divisor
+            elif k == "hidden_size_divisor":
+                c[k] = hidden_size_divisor
+            elif k == "ffn_hidden_size_divisor":
+                c[k] = ffn_hidden_size_divisor
             elif k == "mamba_head_dim_divisor":
                 c[k] = mamba_head_dim_divisor
             elif k == "num_moe_experts_divisor":
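
Callers migrate by replacing the single channel_divisor argument with the two independent width divisors, as every test update below does. The divisors set the step size of each hyperparameter's choice grid, so the new ffn_hidden_size default of 512 halves the number of FFN width choices relative to 256. A minimal migration sketch plus a toy illustration of the granularity effect (the import path is taken from the file above; choice_grid is a hypothetical helper, assuming choices are the positive multiples of the divisor up to the original size):

from modelopt.torch.prune.plugins.mcore_minitron import get_mcore_minitron_config

# Before this commit: get_mcore_minitron_config(channel_divisor=256) set both
# width divisors together. Now they are independent knobs:
config = get_mcore_minitron_config(
    hidden_size_divisor=256,
    ffn_hidden_size_divisor=512,  # coarser FFN grid, matching the new default
)

def choice_grid(orig_size: int, divisor: int) -> list[int]:
    # Hypothetical helper: positive multiples of `divisor` up to `orig_size`.
    return list(range(divisor, orig_size + 1, divisor))

print(len(choice_grid(14336, 256)))  # 56 FFN width choices under the old default
print(len(choice_grid(14336, 512)))  # 28 choices under the new default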

tests/_test_utils/torch/nas_prune/minitron_common.py

Lines changed: 2 additions & 1 deletion
@@ -23,7 +23,8 @@ def prune_minitron(model, export_config, config, channel_divisor=64):
     (
         "mcore_minitron",
         mtp.mcore_minitron.get_mcore_minitron_config(
-            channel_divisor=channel_divisor,
+            hidden_size_divisor=channel_divisor,
+            ffn_hidden_size_divisor=channel_divisor,
             mamba_head_dim_divisor=4,
             num_moe_experts_divisor=1,
             num_layers_divisor=1,

tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py

Lines changed: 9 additions & 2 deletions
@@ -88,7 +88,11 @@ def _test_gpt_search_space(
     [
         (
             "mcore_minitron",
-            get_mcore_minitron_config(channel_divisor=channel_divisor, num_layers_divisor=1),
+            get_mcore_minitron_config(
+                hidden_size_divisor=channel_divisor,
+                ffn_hidden_size_divisor=channel_divisor,
+                num_layers_divisor=1,
+            ),
         )
     ],
 )
@@ -267,7 +271,10 @@ def _test_gpt_moe_search_space(rank, size):
         (
             "mcore_minitron",
             get_mcore_minitron_config(
-                channel_divisor=channel_divisor, num_moe_experts_divisor=1, num_layers_divisor=1
+                hidden_size_divisor=channel_divisor,
+                ffn_hidden_size_divisor=channel_divisor,
+                num_moe_experts_divisor=1,
+                num_layers_divisor=1,
             ),
         )
     ],

tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py

Lines changed: 2 additions & 1 deletion
@@ -81,7 +81,8 @@ def _test_mamba_search_space(rank, size):
     (
         "mcore_minitron",
         get_mcore_minitron_config(
-            channel_divisor=channel_divisor,
+            hidden_size_divisor=channel_divisor,
+            ffn_hidden_size_divisor=channel_divisor,
             mamba_head_dim_divisor=mamba_head_dim_divisor,
             num_layers_divisor=1,
         ),

tests/gpu/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py

Lines changed: 12 additions & 3 deletions
@@ -78,7 +78,10 @@ def _test_mcore_gpt_parameter_sorting(activation_func, rank, size):

     model.eval()
     dynamic_space = _convert_model_to_dynamic_space(
-        model, get_mcore_minitron_config(channel_divisor=channel_divisor)
+        model,
+        get_mcore_minitron_config(
+            hidden_size_divisor=channel_divisor, ffn_hidden_size_divisor=channel_divisor
+        ),
     )
     registry = ImportanceEstimatorRegistry(model)  # register imp estimators and forward hooks

@@ -355,7 +358,12 @@ def _test_mcore_gpt_moe_parameter_sorting(rank, size):

     model.eval()
     dynamic_space = _convert_model_to_dynamic_space(
-        model, get_mcore_minitron_config(channel_divisor=channel_divisor, num_moe_experts_divisor=1)
+        model,
+        get_mcore_minitron_config(
+            hidden_size_divisor=channel_divisor,
+            ffn_hidden_size_divisor=channel_divisor,
+            num_moe_experts_divisor=1,
+        ),
     )
     registry = ImportanceEstimatorRegistry(model)  # register imp estimators and forward hooks

@@ -500,11 +508,12 @@ def test_mcore_gpt_pruning_moe(tmp_path):
 def test_generate_search_space_combos():
     ss = {
         "hidden_size": [32, 64, 96, 128, 160],
+        "ffn_hidden_size": [128, 256, 384, 512, 640],
         "num_attention_heads": [8, 16, 24, 32],
         "num_layers": [1, 2, 3, 4, 5, 6, 7, 8],
     }
     ss_combos = MCoreMinitronSearcher._generate_search_space_combos(
-        ss, max_width_pruning=0.5, max_depth_pruning=0.25
+        ss, max_width_pruning=0.5, max_depth_pruning=0.25, hparams_to_skip=["ffn_hidden_size"]
     )
     assert len(ss_combos) == 3 * 2 * 2
     assert ss_combos == [
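
The asserted count follows directly from the truncation rule: ffn_hidden_size is skipped entirely; hidden_size keeps sorted(v)[int(0.5 * 5):] = [96, 128, 160] (3 choices); num_attention_heads keeps sorted(v)[int(0.5 * 4):] = [24, 32] (2 choices); and num_layers keeps sorted(v)[int(0.75 * 8):] = [7, 8] (2 choices), giving 3 * 2 * 2 = 12 combos.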

tests/gpu/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,12 @@ def _test_mcore_mamba_parameter_sorting(rank, size):
7878

7979
model.eval()
8080
dynamic_space = _convert_model_to_dynamic_space(
81-
model, get_mcore_minitron_config(channel_divisor=channel_divisor, mamba_head_dim_divisor=4)
81+
model,
82+
get_mcore_minitron_config(
83+
hidden_size_divisor=channel_divisor,
84+
ffn_hidden_size_divisor=channel_divisor,
85+
mamba_head_dim_divisor=4,
86+
),
8287
)
8388
registry = ImportanceEstimatorRegistry(model) # register imp estimators and forward hooks
8489
