Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt

megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal

megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model

megatron/core/models/hybrid/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model

megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets
Expand Down
4 changes: 2 additions & 2 deletions examples/mamba/run_text_gen_server_8b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export NCCL_IB_QPS_PER_CONNECTION=4
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"

torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
torchrun $DISTRIBUTED_ARGS ../../tools/run_hybrid_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--untie-embeddings-and-output-weights \
Expand All @@ -46,5 +46,5 @@ torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \
--seed 42
4 changes: 2 additions & 2 deletions examples/mamba/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ options=" \
--eval-iters 32 \
--bf16 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \
--no-create-attention-mask-in-dataloader \
--tensorboard-dir ${TENSORBOARD_DIR}"

torchrun --nproc_per_node 8 ../../pretrain_mamba.py ${options}
torchrun --nproc_per_node 8 ../../pretrain_hybrid.py ${options}
10 changes: 5 additions & 5 deletions examples/multimodal/layer_specs.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved.
import torch

from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.ssm.mamba_block import MambaStack, MambaStackSubmodules
from megatron.core.models.hybrid.hybrid_block import HybridStack, HybridStackSubmodules
from megatron.core.ssm.mamba_layer import MambaLayer, MambaLayerSubmodules
from megatron.core.ssm.mamba_mixer import MambaMixer, MambaMixerSubmodules
from megatron.core.ssm.mlp_layer import MLPLayer
Expand Down Expand Up @@ -125,15 +125,15 @@ def get_layer_spec_te(is_vit=False, padding=False) -> ModuleSpec:
)


def get_mamba_layer_spec_te(padding=False) -> ModuleSpec:
def get_hybrid_layer_spec_te(padding=False) -> ModuleSpec:
attn_mask_type = AttnMaskType.causal
# Padding mask is needed for e.g. Context Parallel.
if padding:
attn_mask_type = AttnMaskType.padding_causal

return ModuleSpec(
module=MambaStack,
submodules=MambaStackSubmodules(
module=HybridStack,
submodules=HybridStackSubmodules(
mamba_layer=ModuleSpec(
module=MambaLayer,
submodules=MambaLayerSubmodules(
Expand Down
6 changes: 3 additions & 3 deletions examples/multimodal/model.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved.
import warnings
import logging
from copy import deepcopy

import torch
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
from layer_specs import (get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te,
get_mamba_layer_spec_te)
get_hybrid_layer_spec_te)

from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
Expand Down Expand Up @@ -99,7 +99,7 @@ def model_provider(
# Padding mask needed for SP/CP.
padding = args.context_parallel_size > 1 and args.sequence_parallel
if args.language_model_type.startswith('nemotron5-hybrid'):
language_transformer_layer_spec = get_mamba_layer_spec_te(padding=padding)
language_transformer_layer_spec = get_hybrid_layer_spec_te(padding=padding)
else:
language_transformer_layer_spec = get_layer_spec_te(
is_vit=False, padding=padding
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ MODEL_ARGS=" \
--bf16 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--export-model-type MambaModel \
--export-model-type HybridModel \
"
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,5 @@ MODEL_ARGS=" \
--bf16 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--export-model-type MambaModel \
--export-model-type HybridModel \
"
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ MODEL_ARGS=" \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--export-model-type MambaModel \
--export-model-type HybridModel \
--padded-vocab-size 131072 \
"
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ MODEL_ARGS=" \
--max-position-embeddings 8192 \
--tokenizer-type HuggingFaceTokenizer \
--use-mcore-models \
--export-model-type MambaModel \
--export-model-type HybridModel \
"
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,5 @@ MODEL_ARGS=" \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-base 10000 \
--export-model-type MambaModel \
--export-model-type HybridModel \
"
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ MODEL_ARGS=" \
--max-position-embeddings 8192 \
--tokenizer-type HuggingFaceTokenizer \
--bf16 \
--export-model-type MambaModel \
--export-model-type HybridModel \
"
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ MODEL_ARGS=" \
--use-mcore-models \
--rotary-percent 0.5 \
--rotary-base 500000 \
--export-model-type MambaModel \
--export-model-type HybridModel \
"
# --rotary-base 10000 \
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/convert_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from megatron.core.parallel_state import destroy_model_parallel
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.utils import (
report_current_memory_info,
to_empty_if_meta,
Expand Down Expand Up @@ -129,7 +129,7 @@ def check_arguments():
)

model = get_model(
functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False
)
report_current_memory_info()

Expand Down
2 changes: 1 addition & 1 deletion examples/post_training/modelopt/distillation.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ Without this configuration file, the default logits-only distillation with scale

### Training

Distillation is triggered by calling `pretrain_gpt.py` or `pretrain_mamba.py` with the following arguments:
Distillation is triggered by calling `pretrain_gpt.py` or `pretrain_hybrid.py` with the following arguments:

```bash
--export-kd-teacher-load <path-to-teacher-checkpoint>
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.training import get_args, get_model
from megatron.training.initialize import initialize_megatron
from megatron.training.utils import unwrap_model
Expand Down Expand Up @@ -74,7 +74,7 @@ def add_modelopt_export_args(parser):
)

model = get_model(
functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False
)

# Materialize the model from meta device to cpu before loading the checkpoint.
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from megatron.core.models.gpt import GPTModel
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.loss_func import loss_func
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.non_loss_data_func import report_draft_acceptance_length
from megatron.training import get_args, get_timers, pretrain
from megatron.training.utils import (
Expand Down Expand Up @@ -486,7 +486,7 @@ def forward_step(data_iterator, model: GPTModel):
if __name__ == "__main__":
pretrain(
train_valid_test_sft_datasets_provider,
partial(model_provider, modelopt_gpt_mamba_builder),
partial(model_provider, modelopt_gpt_hybrid_builder),
ModelType.encoder_or_decoder,
forward_step,
extra_args_provider=add_finetune_args,
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.generate import simple_generate
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.utils import report_current_memory_info, to_empty_if_meta
from megatron.training import get_args, get_model, initialize_megatron
from utils import get_hf_tokenizer
Expand Down Expand Up @@ -100,7 +100,7 @@ def get_conversations(example):
UserWarning,
)

model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
model = get_model(functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False)
report_current_memory_info()

unwrapped_model = unwrap_model(model)[0]
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/mmlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.generate import simple_generate
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.utils import report_current_memory_info
from megatron.training import get_args, get_model, initialize_megatron
from utils import get_hf_tokenizer
Expand Down Expand Up @@ -158,7 +158,7 @@ def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_prompt=F
UserWarning,
)

model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
model = get_model(functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False)
report_current_memory_info()

# Materialize the model from meta device to gpu before loading the checkpoint.
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/offline_feature_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from megatron.core import mpu
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.training import get_args, get_model, get_tokenizer, initialize_megatron
from megatron.training.utils import print_rank_0, unwrap_model
from model_provider import model_provider
Expand Down Expand Up @@ -53,7 +53,7 @@ def extract_feature(dataset, model, output_dir, idx_start, idx_end):

args = get_args()
tokenizer = get_tokenizer()
model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
model = get_model(functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False)

load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
print_rank_0("Done loading checkpoint")
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/prune.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.generate import simple_generate
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.utils import (
report_current_memory_info,
)
Expand Down Expand Up @@ -163,7 +163,7 @@ def get_params(model):

tokenizer = get_hf_tokenizer()
model = get_model(
functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False
)
unwrapped_model = unwrap_model(model)[0]

Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.generate import simple_generate
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.utils import (
print_distributed_quant_summary,
report_current_memory_info,
Expand Down Expand Up @@ -362,7 +362,7 @@ def get_calib_dataloader(
tokenizer = get_hf_tokenizer()

model = get_model(
functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False
functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False
)

report_current_memory_info()
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ fi

export HF_TOKEN=${HF_TOKEN}

if [[ ${MODEL_ARGS} == *"MambaModel"* ]]; then
PRETRAIN_EXE=${SCRIPT_DIR}/../../../pretrain_mamba.py
if [[ ${MODEL_ARGS} == *"HybridModel"* ]] || [[ ${MODEL_ARGS} == *"MambaModel"* ]]; then
PRETRAIN_EXE=${SCRIPT_DIR}/../../../pretrain_hybrid.py
else
PRETRAIN_EXE=${SCRIPT_DIR}/../../../pretrain_gpt.py
fi
Expand Down
4 changes: 2 additions & 2 deletions examples/post_training/modelopt/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.model_builder import modelopt_gpt_mamba_builder
from megatron.post_training.model_builder import modelopt_gpt_hybrid_builder
from megatron.post_training.utils import get_mtbench_chat_data
from megatron.training import get_args, get_model, initialize_megatron
from utils import get_hf_tokenizer
Expand Down Expand Up @@ -116,7 +116,7 @@ def report_current_memory_info():
ground_truth = [None for _ in range(len(prompts))]

tokenizer = get_hf_tokenizer()
model = get_model(functools.partial(model_provider, modelopt_gpt_mamba_builder), wrap_with_ddp=False)
model = get_model(functools.partial(model_provider, modelopt_gpt_hybrid_builder), wrap_with_ddp=False)

report_current_memory_info()

Expand Down
2 changes: 1 addition & 1 deletion examples/rl/model_configs/nemotron5_56b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ MODEL_OPTIONS="\
\
--fp8-recipe tensorwise \
--hybrid-layer-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \
--mamba-state-dim 256 \
--per-split-data-args-path ${BLEND_PATH} \
--tiktoken-pattern v2 \
Expand Down
2 changes: 1 addition & 1 deletion examples/rl/model_configs/nemotron5_8b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ MODEL_OPTIONS="\
--inference-max-requests $MAX_INFERENCE_BS \
--pretrained-checkpoint $CHECKPOINT \
--hybrid-layer-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \
--tiktoken-pattern v2 \
--distributed-timeout-minutes 60 \
--use-mcore-models \
Expand Down
2 changes: 1 addition & 1 deletion examples/rl/model_configs/nemotron5p5_12b_H.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ MODEL_OPTIONS="\
--disable-gloo-process-groups \
--mamba-head-dim 80 \
--hybrid-layer-pattern M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M- \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--spec megatron.core.models.hybrid.hybrid_layer_specs hybrid_stack_spec \
--tiktoken-pattern v2 \
--distributed-timeout-minutes 10 \
--use-mcore-models \
Expand Down
Loading
Loading