slurm Multi-machine and multi-GPU training #416

Open
yangzhipeng1108 opened this issue Aug 22, 2024 · 0 comments

yangzhipeng1108 commented Aug 22, 2024

2: Traceback (most recent call last):
2: File "/workspace/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", line 64, in main
2: model = MegatronGPTSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer)
2: File "/opt/NeMo/nemo/collections/nlp/models/nlp_model.py", line 465, in restore_from
2: return super().restore_from(
2: File "/opt/NeMo/nemo/core/classes/modelPT.py", line 464, in restore_from
2: instance = cls._save_restore_connector.restore_from(
2: File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 1122, in restore_from
2: loaded_params = super().load_config_and_state_dict(
2: File "/opt/NeMo/nemo/core/connectors/save_restore_connector.py", line 178, in load_config_and_state_dict
2: instance = calling_cls.from_config_dict(config=conf, trainer=trainer)
2: File "/opt/NeMo/nemo/core/classes/common.py", line 524, in from_config_dict
2: raise e
2: File "/opt/NeMo/nemo/core/classes/common.py", line 516, in from_config_dict
2: instance = cls(cfg=config, trainer=trainer)
2: File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py", line 77, in init
2: super().init(cfg, trainer=trainer)
2: File "/opt/NeMo/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py", line 79, in init
2: super().init(*args, **kwargs)
2: File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 287, in init
2: super().init(cfg, trainer=trainer, no_lm_init=True)
2: File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py", line 175, in init
2: init_world_size = trainer.world_size
2: File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1161, in world_size
2: return getattr(self.strategy, "world_size", 1)
2: File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/parallel.py", line 66, in world_size
2: return self.cluster_environment.world_size() if self.cluster_environment is not None else 1
2: File "/usr/local/lib/python3.10/dist-packages/lightning_fabric/plugins/environments/torchelastic.py", line 63, in world_size
2: return int(os.environ["WORLD_SIZE"])
2: File "/usr/lib/python3.10/os.py", line 680, in getitem
2: raise KeyError(key) from None
2: KeyError: 'WORLD_SIZE'
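
From the trace, the failing call is Lightning's TorchElasticEnvironment.world_size(), which reads WORLD_SIZE straight out of os.environ, so the launcher is expected to have set the torchelastic-style variables in every task's environment. My understanding (not verified) is that a plain srun launch does not set them, but Slurm's per-task variables carry the same information, roughly:

# Sketch of the mapping only; these are per-task values, so they have to be
# evaluated inside each srun task, not once in the batch script.
export WORLD_SIZE=${SLURM_NTASKS}     # total number of ranks across all nodes
export RANK=${SLURM_PROCID}           # global rank of this task
export LOCAL_RANK=${SLURM_LOCALID}    # rank of this task within its node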

This happens when launching with Slurm; the sbatch script is below.

#!/bin/bash
#SBATCH --exclusive --nodes=2 --mem=0 --overcommit --ntasks-per-node=8 --time=4:30:00 --job-name=multinode_sft_example

#Load necessary modules and set environment variables
export CUDA_DEVICE_MAX_CONNECTIONS=1

#Set model and training parameters
TRAIN_DS="[/workspace/data/databricks-dolly-15k/training.jsonl]"
VALID_DS="[/workspace/data/databricks-dolly-15k/validation.jsonl]"
TEST_DS="[/workspace/data/databricks-dolly-15k/test.jsonl]"
CKPT="/workspace/model_ckpts/llama2-7b.nemo"

TS=$(date +%s)
OUTPUT_PATH="/workspace/"
RESULTS_DIR="$OUTPUT_PATH/results_${TS}"
CONCAT_SAMPLING_PROBS="[1]"

TP_SIZE=1
PP_SIZE=1
BS=128
MAX_LEN=2048

#The NeMo command to run on each node.
run_cmd="python3 /workspace/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py
trainer.precision=bf16
trainer.devices=$SLURM_NTASKS_PER_NODE
trainer.num_nodes=$SLURM_JOB_NUM_NODES
trainer.val_check_interval=0.1
trainer.max_steps=50
model.restore_from_path=${CKPT}
model.micro_batch_size=1
model.global_batch_size=128
model.tensor_model_parallel_size=${TP_SIZE}
model.pipeline_model_parallel_size=${PP_SIZE}
model.megatron_amp_O2=True
model.activations_checkpoint_granularity=selective
model.activations_checkpoint_method=uniform
model.optim.name=distributed_fused_adam
model.optim.lr=5e-6
model.answer_only_loss=True
model.peft.peft_scheme=none
model.data.train_ds.file_names=${TRAIN_DS}
model.data.validation_ds.file_names=${VALID_DS}
model.data.test_ds.file_names=${TEST_DS}
model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS}
model.data.train_ds.max_seq_length=2048
model.data.validation_ds.max_seq_length=2048
model.data.train_ds.micro_batch_size=1
model.data.train_ds.global_batch_size=128
model.data.validation_ds.micro_batch_size=1
model.data.validation_ds.global_batch_size=128
model.data.test_ds.micro_batch_size=1
model.data.test_ds.global_batch_size=256
model.data.train_ds.num_workers=0
model.data.validation_ds.num_workers=0
model.data.test_ds.num_workers=0
model.data.validation_ds.metric.name=loss
model.data.test_ds.metric.name=loss
exp_manager.create_wandb_logger=False
exp_manager.explicit_log_dir=/workspace/results
exp_manager.resume_if_exists=True
exp_manager.resume_ignore_no_checkpoint=True
exp_manager.create_checkpoint_callback=True
exp_manager.checkpoint_callback_params.monitor=validation_loss
exp_manager.checkpoint_callback_params.save_best_model=False
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True
++cluster_type=BCP"

#Container
CONT=/root/nvcr.io+nvidia+nemo+24.06.sqsh
CONT_NAME=nemofw-training
CONT_MOUNT=/root/yovole:/workspace,/dev/infiniband:/dev/infiniband

#run on SLURM
srun -l \
--ntasks-per-node=8 \
--container-name="${CONT_NAME}" \
--container-image="${CONT}" \
--container-mounts="${CONT_MOUNT}" \
--container-entrypoint \
--no-container-mount-home \
bash -c "${run_cmd}"
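
A workaround I am considering (untested with this container image, so only a sketch): prepend the per-rank exports from the mapping above to the command that srun executes, and export the rendezvous address and port once in the batch script, where they are the same for every task and should be inherited by each one:

# Sketch only: export the shared rendezvous variables once, then prepend the
# per-task exports to run_cmd so each rank resolves WORLD_SIZE/RANK/LOCAL_RANK
# from its own Slurm environment inside the container.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=29500   # any free port, must be identical on all ranks

launch_cmd='export WORLD_SIZE=${SLURM_NTASKS} RANK=${SLURM_PROCID} LOCAL_RANK=${SLURM_LOCALID}; '"${run_cmd}"

srun -l \
--ntasks-per-node=8 \
--container-name="${CONT_NAME}" \
--container-image="${CONT}" \
--container-mounts="${CONT_MOUNT}" \
--container-entrypoint \
--no-container-mount-home \
bash -c "${launch_cmd}"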
