2: Traceback (most recent call last):
2: File "/workspace/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py", line 64, in main
2: model = MegatronGPTSFTModel.restore_from(cfg.model.restore_from_path, model_cfg, trainer=trainer)
2: File "/opt/NeMo/nemo/collections/nlp/models/nlp_model.py", line 465, in restore_from
2: return super().restore_from(
2: File "/opt/NeMo/nemo/core/classes/modelPT.py", line 464, in restore_from
2: instance = cls._save_restore_connector.restore_from(
2: File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 1122, in restore_from
2: loaded_params = super().load_config_and_state_dict(
2: File "/opt/NeMo/nemo/core/connectors/save_restore_connector.py", line 178, in load_config_and_state_dict
2: instance = calling_cls.from_config_dict(config=conf, trainer=trainer)
2: File "/opt/NeMo/nemo/core/classes/common.py", line 524, in from_config_dict
2: raise e
2: File "/opt/NeMo/nemo/core/classes/common.py", line 516, in from_config_dict
2: instance = cls(cfg=config, trainer=trainer)
2: File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py", line 77, in init
2: super().init(cfg, trainer=trainer)
2: File "/opt/NeMo/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py", line 79, in init
2: super().init(*args, **kwargs)
2: File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 287, in init
2: super().init(cfg, trainer=trainer, no_lm_init=True)
2: File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py", line 175, in init
2: init_world_size = trainer.world_size
2: File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 1161, in world_size
2: return getattr(self.strategy, "world_size", 1)
2: File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/parallel.py", line 66, in world_size
2: return self.cluster_environment.world_size() if self.cluster_environment is not None else 1
2: File "/usr/local/lib/python3.10/dist-packages/lightning_fabric/plugins/environments/torchelastic.py", line 63, in world_size
2: return int(os.environ["WORLD_SIZE"])
2: File "/usr/lib/python3.10/os.py", line 680, in getitem
2: raise KeyError(key) from None
2: KeyError: 'WORLD_SIZE'
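The KeyError is raised by Lightning's TorchElasticEnvironment, which reads WORLD_SIZE directly from the environment rather than from the Slurm variables. As a rough workaround sketch (not verified on this cluster), the torchrun-style variables could be derived from the Slurm ones before the training command starts; since SLURM_PROCID and SLURM_LOCALID are per-task, this would have to run inside each srun task, e.g. prefixed to run_cmd:

# Hypothetical sketch: map Slurm variables to the torch.distributed-style
# variables that TorchElasticEnvironment reads. WORLD_SIZE is the one shown
# in the traceback; the others are its usual companions.
export WORLD_SIZE=${SLURM_NTASKS}
export RANK=${SLURM_PROCID}
export LOCAL_RANK=${SLURM_LOCALID}
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=${MASTER_PORT:-29500}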
I launch the job with Slurm, using the following sbatch script:
#!/bin/bash
#SBATCH --exclusive --nodes=2 --mem=0 --overcommit --ntasks-per-node=8 --time=4:30:00 --job-name=multinode_sft_example
#Load necessary modules and set environment variables
export CUDA_DEVICE_MAX_CONNECTIONS=1
#Set model and training parameters
TRAIN_DS="[/workspace/data/databricks-dolly-15k/training.jsonl]"
VALID_DS="[/workspace/data/databricks-dolly-15k/validation.jsonl]"
TEST_DS="[/workspace/data/databricks-dolly-15k/test.jsonl]"
CKPT="/workspace/model_ckpts/llama2-7b.nemo"
TS=$(date +%s)
OUTPUT_PATH="/workspace/"
RESULTS_DIR="$OUTPUT_PATH/results_${TS}"
CONCAT_SAMPLING_PROBS="[1]"
TP_SIZE=1
PP_SIZE=1
BS=128
MAX_LEN=2048
#The NeMo command to run on each node.
run_cmd="python3 /workspace/NeMo/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py
trainer.precision=bf16
trainer.devices=$SLURM_NTASKS_PER_NODE
trainer.num_nodes=$SLURM_JOB_NUM_NODES
trainer.val_check_interval=0.1
trainer.max_steps=50
model.restore_from_path=${CKPT}
model.micro_batch_size=1
model.global_batch_size=128
model.tensor_model_parallel_size=${TP_SIZE}
model.pipeline_model_parallel_size=${PP_SIZE}
model.megatron_amp_O2=True
model.activations_checkpoint_granularity=selective
model.activations_checkpoint_method=uniform
model.optim.name=distributed_fused_adam
model.optim.lr=5e-6
model.answer_only_loss=True
model.peft.peft_scheme=none
model.data.train_ds.file_names=${TRAIN_DS}
model.data.validation_ds.file_names=${VALID_DS}
model.data.test_ds.file_names=${TEST_DS}
model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS}
model.data.train_ds.max_seq_length=2048
model.data.validation_ds.max_seq_length=2048
model.data.train_ds.micro_batch_size=1
model.data.train_ds.global_batch_size=128
model.data.validation_ds.micro_batch_size=1
model.data.validation_ds.global_batch_size=128
model.data.test_ds.micro_batch_size=1
model.data.test_ds.global_batch_size=256
model.data.train_ds.num_workers=0
model.data.validation_ds.num_workers=0
model.data.test_ds.num_workers=0
model.data.validation_ds.metric.name=loss
model.data.test_ds.metric.name=loss
exp_manager.create_wandb_logger=False
exp_manager.explicit_log_dir=/workspace/results
exp_manager.resume_if_exists=True
exp_manager.resume_ignore_no_checkpoint=True
exp_manager.create_checkpoint_callback=True
exp_manager.checkpoint_callback_params.monitor=validation_loss
exp_manager.checkpoint_callback_params.save_best_model=False
exp_manager.checkpoint_callback_params.save_nemo_on_train_end=True
++cluster_type=BCP"
#Container
CONT=/root/nvcr.io+nvidia+nemo+24.06.sqsh
CONT_NAME=nemofw-training
CONT_MOUNT=/root/yovole:/workspace,/dev/infiniband:/dev/infiniband
#run on SLURM
srun -l \
--ntasks-per-node=8 \
--container-name="${CONT_NAME}" \
--container-image="${CONT}" \
--container-mounts="${CONT_MOUNT}" \
--container-entrypoint \
--no-container-mount-home \
bash -c "${run_cmd}"