
Commit b858631

Fix param count calculation + account depth pruning
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 408a574 commit b858631

File tree: 11 files changed, +192 −71 lines

.vscode/settings.json

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
         "--no-cov",
     ],
     "evenBetterToml.schema.enabled": false, // disable toml/json schema since we have custom fields
-    "python.analysis.extraPaths": [
+    "cursorpyright.analysis.extraPaths": [
         "./tests/" // add tests to python path just like pytest does in pyproject.toml
     ],
     "git.alwaysSignOff": true,

modelopt/torch/nas/plugins/megatron.py

Lines changed: 9 additions & 3 deletions
@@ -1040,6 +1040,7 @@ def modify(
         ffn_hidden_size_divisor: int = 1,
         mamba_head_dim_divisor: int = 1,
         num_moe_experts_divisor: int = 1,
+        num_layers_divisor: int = 1,
     ):
         """Modify the dynamic choices of the module according to provided keyword arguments.

@@ -1048,10 +1049,15 @@ def modify(
             ffn_hidden_size_divisor: The divisor of the mlp ffn_hidden_size.
             mamba_head_dim_divisor: The divisor of the mamba head_dim.
             num_moe_experts_divisor: The divisor of the number of MoE experts.
+            num_layers_divisor: The divisor of the number of layers.
         """
-        hp = self.get_hparam("hidden_size")
-        choices = {int(make_divisible(c, hidden_size_divisor)) for c in hp.choices}  # type: ignore[arg-type]
-        hp.choices = list(set(hp.choices) & choices | {hp.original})
+        for hp_name, divisor in [
+            ("hidden_size", hidden_size_divisor),
+            ("num_layers", num_layers_divisor),
+        ]:
+            hp = self.get_hparam(hp_name)
+            choices = {int(make_divisible(c, divisor)) for c in hp.choices}  # type: ignore[arg-type]
+            hp.choices = list(set(hp.choices) & choices | {hp.original})

         for layer in self.decoder.layers:
             layer.modify(
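The refactored loop applies the same divisor-based filtering to both hidden_size and the new num_layers hparam. A minimal standalone sketch of that filtering, assuming a common rounding variant of make_divisible (the real helper lives in modelopt.torch.utils and may round differently):

def make_divisible(v: int, divisor: int) -> int:
    # Assumed round-to-nearest-multiple behavior; the actual
    # modelopt.torch.utils.make_divisible may differ in edge cases.
    return max(divisor, (v + divisor // 2) // divisor * divisor)

choices = [12, 16, 20, 24, 28, 32]  # toy hparam choices
original = 32                        # hp.original is always kept
divisor = 8
rounded = {make_divisible(c, divisor) for c in choices}
# Mirrors: hp.choices = list(set(hp.choices) & choices | {hp.original})
filtered = sorted(set(choices) & rounded | {original})
print(filtered)  # [16, 24, 32]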

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 139 additions & 61 deletions
Large diffs are not rendered by default.
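The bulk of the depth-pruning change lives in this file but is not rendered above. Judging from the commit title and the new num_layers hparam, depth pruning trims whole decoder layers. A conceptual sketch only; the importance scoring and every name except model.decoder.layers are assumptions, not the actual Minitron implementation:

import torch
import torch.nn as nn

def prune_depth(model: nn.Module, num_layers: int, layer_scores: torch.Tensor) -> nn.Module:
    """Keep the num_layers highest-scoring decoder layers, preserving their order."""
    keep = torch.topk(layer_scores, k=num_layers).indices.sort().values
    model.decoder.layers = nn.ModuleList(
        model.decoder.layers[i] for i in keep.tolist()
    )
    return model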

modelopt/torch/prune/pruning.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def prune(
        constraints = {"params": "60%"}

        # Specify export_config with pruned hyperparameters
-        # This is supported and required if the model is converted via ``mcore_minitron`` mode.
+        # This is supported only if the model is converted via ``mcore_minitron`` mode.
        constraints = {
            "export_config": {
                "ffn_hidden_size": 128,

modelopt/torch/utils/logging.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
 def num2hrb(num: float, suffix="") -> str:
     """Convert big floating number to human readable string."""
     step = 1000  # step between units is 1000
-    units = ["", "K", "M", "G", "T", "P", "E"]
+    units = ["", "K", "M", "B", "T", "P", "E"]
     while abs(num) >= step and len(units) > 1:
         num /= step
         units.pop(0)
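The change swaps the SI "G" for "B" so parameter counts read in the usual billions convention. A self-contained sketch of the function as shown in the hunk; the final format string falls outside the hunk and is an assumption:

def num2hrb(num: float, suffix: str = "") -> str:
    """Convert big floating number to human readable string."""
    step = 1000  # step between units is 1000
    units = ["", "K", "M", "B", "T", "P", "E"]
    while abs(num) >= step and len(units) > 1:
        num /= step
        units.pop(0)
    return f"{num:.2f}{units[0]}{suffix}"  # assumed return format

print(num2hrb(8.03e9))  # "8.03B" -- previously rendered as "8.03G"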

modelopt/torch/utils/network.py

Lines changed: 7 additions & 0 deletions
@@ -112,6 +112,13 @@ def param_num(network: nn.Module, trainable_only: bool = False, unit=1e6) -> float:
     Returns:
         The number of parameters in the model in the given unit.
     """
+    from modelopt.torch.opt.dynamic import DynamicModule
+
+    if isinstance(network, DynamicModule):
+        # NOTE: model.parameters() doesn't consider active_slice so we don't get sorted or trimmed parameters!
+        raise NotImplementedError(
+            "param_num doesn't support DynamicModule. Please use param_num_from_forward instead."
+        )
     return (
         sum(
             p.numel() if not trainable_only or p.requires_grad else 0
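Why parameters() misleads for a DynamicModule: the full weight tensors stay allocated while only an active slice is in use, so numel() counts pruned-away elements. A plain-PyTorch illustration of the overcount (the 4096/3072 sizes are arbitrary):

import torch.nn as nn

layer = nn.Linear(4096, 4096, bias=False)  # stands in for a dynamic layer
active_out = 3072  # e.g. active_slice keeps only 3072 output channels

counted = sum(p.numel() for p in layer.parameters())  # 16_777_216
actual = active_out * 4096                            # 12_582_912 in use
# param_num would report `counted`, overstating the pruned model by ~33%,
# which is why it now raises for DynamicModule inputs.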

modelopt/torch/utils/plugins/megatron_model.py

Lines changed: 5 additions & 0 deletions
@@ -46,9 +46,14 @@ def param_num_megatron(
     Returns:
         The number of parameters in the model (reduced across TP and PP ranks).
     """
+    from modelopt.torch.opt.dynamic import DynamicModule
+
     if from_forward:
         assert args is not None, "args must be provided if from_forward is True"
         params = int(param_num_from_forward(model, args, unit=1.0))
+    elif isinstance(model, DynamicModule):
+        # NOTE: model.parameters() doesn't consider active_slice so we don't get sorted or trimmed parameters!
+        raise NotImplementedError("DynamicModule input is not supported without from_forward.")
     else:
         params = sum(p.numel() for p in model.parameters())
     reduced_params = torch.Tensor([params]).to(device=next(model.parameters()).device)
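The distributed counter gets the same guard. A hedged call sketch; the import path and the keyword layout of param_num_megatron are assumptions from the hunk context:

from modelopt.torch.utils.plugins.megatron_model import param_num_megatron

# For a converted (DynamicModule) model, only the forward-based path is valid:
n_params = param_num_megatron(model, from_forward=True, args=forward_args)

# from_forward=False on a DynamicModule now raises NotImplementedError,
# matching the param_num guard in network.py above.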

tests/_test_utils/torch/nas_prune/minitron_common.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ def prune_minitron(model, export_config, config, channel_divisor=64):
                 channel_divisor=channel_divisor,
                 mamba_head_dim_divisor=4,
                 num_moe_experts_divisor=1,
+                num_layers_divisor=1,
             ),
         )
     ],

tests/gpu/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py

Lines changed: 17 additions & 2 deletions
@@ -83,7 +83,15 @@ def _test_gpt_search_space(
         normalization=normalization,
     ).cuda()

-    mtn.convert(model, [("mcore_minitron", get_mcore_minitron_config(channel_divisor))])
+    mtn.convert(
+        model,
+        [
+            (
+                "mcore_minitron",
+                get_mcore_minitron_config(channel_divisor=channel_divisor, num_layers_divisor=1),
+            )
+        ],
+    )

     assert isinstance(model, _DynamicMCoreLanguageModel)
     for m in model.modules():
@@ -255,7 +263,14 @@ def _test_gpt_moe_search_space(rank, size):

     mtn.convert(
         model,
-        [("mcore_minitron", get_mcore_minitron_config(channel_divisor, num_moe_experts_divisor=1))],
+        [
+            (
+                "mcore_minitron",
+                get_mcore_minitron_config(
+                    channel_divisor=channel_divisor, num_moe_experts_divisor=1, num_layers_divisor=1
+                ),
+            )
+        ],
     )

     moe = model.decoder.layers[0].mlp

tests/gpu/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py

Lines changed: 10 additions & 1 deletion
@@ -77,7 +77,16 @@ def _test_mamba_search_space(rank, size):

     mtn.convert(
         model,
-        [("mcore_minitron", get_mcore_minitron_config(channel_divisor, mamba_head_dim_divisor))],
+        [
+            (
+                "mcore_minitron",
+                get_mcore_minitron_config(
+                    channel_divisor=channel_divisor,
+                    mamba_head_dim_divisor=mamba_head_dim_divisor,
+                    num_layers_divisor=1,
+                ),
+            )
+        ],
     )

     assert isinstance(model, _DynamicMCoreLanguageModel)
