diff --git a/pretrain/scripts/v5-test/README.md b/pretrain/scripts/v5-test/README.md new file mode 100644 index 0000000..8a2694d --- /dev/null +++ b/pretrain/scripts/v5-test/README.md @@ -0,0 +1,29 @@ +# v5 test + +Test scripts for Megatron-LM training for v5, based on the v4-midtraining code. + +## Environment setup + +```bash +cd installer-abci/ +bash run_setup.sh <target_dir> +``` + +## Run + +```bash +# bash train/run_train.sh <task_dir> <param_name> <num_nodes> <env_dir> +bash train/run_train.sh $(realpath tasks/v4-dolmino-mix-1124) 7.7b-llama3-ecjk 1 /path/to/env_dir +``` + +## Notes + +### Installer + +- Removed the `NVTE_FUSED_ATTN` setting because it conflicts with `--attention-backend=auto` +- Changed the extension name to `helpers_cpp` in installer-abci/src/install_megatron_lm.sh + +### MoE + +- Created the 8x7.7b model configuration based on the [Megatron-LM MoE Quick Start](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md#quick-start) + - `--moe-permute-fusion` is commented out because it does not work with `TE < 2.1.0` \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/qsub_setup.sh b/pretrain/scripts/v5-test/installer-abci-hopper/qsub_setup.sh new file mode 100755 index 0000000..9eced7d --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/qsub_setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -v RTYPE=rt_HF +#PBS -l select=1 +#PBS -l walltime=01:00:00 +#PBS -o /dev/null +#PBS -e /dev/null + +cd $PBS_O_WORKDIR + +TIMESTAMP=$(date +%Y%m%d%H%M%S) +JOBID=${PBS_JOBID%%.*} +mkdir -p logs +LOGFILE=logs/install-$JOBID.out +ERRFILE=logs/install-$JOBID.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +echo "TARGET_DIR=${TARGET_DIR}" + +# Find the script directory +if [ -n "${PBS_JOBID:-}" ]; then + SCRIPT_PATH="$PBS_O_WORKDIR/$(basename "$0")" +else + SCRIPT_PATH=$(realpath "$0") +fi +SCRIPT_DIR=$(dirname "${SCRIPT_PATH}") +echo "SCRIPT_DIR=${SCRIPT_DIR}" + +mkdir ${TARGET_DIR} +mkdir ${TARGET_DIR}/src + +# Copy necessary scripts +cp -r ${SCRIPT_DIR}/scripts ${TARGET_DIR} + +# Set variables +source ${TARGET_DIR}/scripts/environment.sh +set > ${TARGET_DIR}/installer_envvar.log + +# Install Libraries +source ${SCRIPT_DIR}/src/install_python.sh +source ${SCRIPT_DIR}/src/install_venv.sh +source ${SCRIPT_DIR}/src/install_pytorch.sh +source ${SCRIPT_DIR}/src/install_requirements.sh +source ${SCRIPT_DIR}/src/install_apex.sh +source ${SCRIPT_DIR}/src/install_flash_attention.sh +source ${SCRIPT_DIR}/src/install_transformer_engine.sh +source ${SCRIPT_DIR}/src/install_megatron_lm.sh +source ${SCRIPT_DIR}/src/install_tokenizer.sh + +echo "Done" diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/run_setup.sh b/pretrain/scripts/v5-test/installer-abci-hopper/run_setup.sh new file mode 100755 index 0000000..ed2e42e --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/run_setup.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 1 ]; then + >&2 echo "Usage: $0 <target_dir>" + >&2 echo "Example: $0 /path/to/target_dir" + exit 1 +fi + +target_dir=$1; shift + +qsub \ + -v TARGET_DIR=${target_dir},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + qsub_setup.sh + diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/scripts/environment.sh b/pretrain/scripts/v5-test/installer-abci-hopper/scripts/environment.sh new file mode 100644 index 0000000..5b30285 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/scripts/environment.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# List of environment variables and module loads for pretrain tasks + +export PRETRAIN_CUDA_VERSION_MAJOR=12 +export PRETRAIN_CUDA_VERSION_MINOR=4 +export 
PRETRAIN_CUDA_VERSION_PATCH=1 + +export PRETRAIN_CUDA_VERSION=${PRETRAIN_CUDA_VERSION_MAJOR}.${PRETRAIN_CUDA_VERSION_MINOR} +export PRETRAIN_CUDA_VERSION_SHORT=${PRETRAIN_CUDA_VERSION_MAJOR}${PRETRAIN_CUDA_VERSION_MINOR} +export PRETRAIN_CUDNN_VERSION=9.5 +export PRETRAIN_CUDNN_VERSION_WITH_PATCH=9.5.1 +export PRETRAIN_HPCX_VERSION=2.20 +export PRETRAIN_NCCL_VERSION=2.25 +export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1 + +export PRETRAIN_PYTHON_VERSION=3.10.4 +export PRETRAIN_TORCH_VERSION=2.6.0 +export PRETRAIN_TORCHVISION_VERSION=0.21.0 +export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863 +export PRETRAIN_FLASH_ATTENTION_VERSION=27f501d +export PRETRAIN_TRANSFORMER_ENGINE_VERSION=2.3.0 + +# export PRETRAIN_MEGATRON_TAG=v4 +export PRETRAIN_MEGATRON_TAG=0176_merge_nvidia_upstream +# Ensure the appropriate Huggingface tokenizer is included +# https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209 +export PRETRAIN_TOKENIZER_TAG=v3.0b2 + +source /etc/profile.d/modules.sh +module load cuda/${PRETRAIN_CUDA_VERSION}/${PRETRAIN_CUDA_VERSION}.${PRETRAIN_CUDA_VERSION_PATCH} +module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH} +module load hpcx/${PRETRAIN_HPCX_VERSION} +module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH} + +export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_apex.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_apex.sh new file mode 100644 index 0000000..a9fdd99 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_apex.sh @@ -0,0 +1,25 @@ +# Install + +echo "Installing apex with commit ${PRETRAIN_APEX_COMMIT}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +git clone --recurse-submodules https://github.com/NVIDIA/apex +pushd apex + +# Checkout the specific commit +git checkout ${PRETRAIN_APEX_COMMIT} +git submodule update --init --recursive + + +python -m pip install \ + -v \ + --no-cache-dir \ + --no-build-isolation \ + --config-settings "--build-option=--cpp_ext" \ + --config-settings "--build-option=--cuda_ext" \ + ./ +popd + +popd # ${TARGET_DIR}/src +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_flash_attention.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_flash_attention.sh new file mode 100644 index 0000000..e4d25a3 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_flash_attention.sh @@ -0,0 +1,25 @@ +# Installs flash attention. 
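# NOTE: This Hopper-specific installer builds FlashAttention-3 from the hopper/ subdirectory of the
# pinned commit and then fetches flash_attn_interface.py from that commit into a flash_attn_3/
# directory under the venv's site-packages, presumably so it is importable as flash_attn_3.flash_attn_interface.
# The non-Hopper installer (installer-abci) instead installs the flash-attn wheel from PyPI.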
+ +echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +pushd ${TARGET_DIR}/src + +git clone https://github.com/Dao-AILab/flash-attention.git +pushd flash-attention/ +git checkout ${PRETRAIN_FLASH_ATTENTION_VERSION} + +# Use flash-attention 3 +pushd hopper/ + +python setup.py install + +python_path=`python -c "import site; print(site.getsitepackages()[0])"` +mkdir -p $python_path/flash_attn_3 +wget -P $python_path/flash_attn_3 https://raw.githubusercontent.com/Dao-AILab/flash-attention/27f501dbe011f4371bff938fe7e09311ab3002fa/hopper/flash_attn_interface.py + +popd # hopper/ +popd # flash-attention/ +popd # ${TARGET_DIR}/src + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_megatron_lm.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_megatron_lm.sh new file mode 100644 index 0000000..be36100 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_megatron_lm.sh @@ -0,0 +1,30 @@ +# Installs Megatron-LM. + +echo "Installing Megatron-LM ${PRETRAIN_MEGATRON_TAG}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +# download our Megatron and build helper library +git clone https://github.com/llm-jp/Megatron-LM -b ${PRETRAIN_MEGATRON_TAG} +pushd Megatron-LM +pushd megatron/core/datasets + +# NOTE(odashi): +# Original makefile in the above directory uses the system's (or pyenv's) python3-config. +# But we need to invoke python3-config installed on our target directory. +MEGATRON_HELPER_CPPFLAGS=( + -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color + $(python -m pybind11 --includes) +) +MEGATRON_HELPER_EXT=$(${TARGET_DIR}/python/bin/python3-config --extension-suffix) + +# NOTE(odashi): +# New version of Megatron-LM changed the extension name 'helpers' to 'helpers_cpp' +g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers_cpp${MEGATRON_HELPER_EXT} +# g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers${MEGATRON_HELPER_EXT} + +popd # megatron/core/datasets +popd # Megatron-LM + +popd # ${TARGET_DIR}/src +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_python.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_python.sh new file mode 100644 index 0000000..8ac43cb --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_python.sh @@ -0,0 +1,17 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/src/cpython ... Source of Python +# * ${TARGET_DIR}/python ... 
installed Python binary + +echo "Installing Python ${PRETRAIN_PYTHON_VERSION}" +pushd ${TARGET_DIR}/src + +git clone https://github.com/python/cpython -b v${PRETRAIN_PYTHON_VERSION} +pushd cpython +./configure --prefix="${TARGET_DIR}/python" --enable-optimizations +make -j 64 +make install +popd # cpython + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_pytorch.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_pytorch.sh new file mode 100644 index 0000000..1405a01 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_pytorch.sh @@ -0,0 +1,13 @@ +# Install pytorch and torchvision + +echo "Installing torch ${PRETRAIN_TORCH_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT} and torchvision ${PRETRAIN_TORCHVISION_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT}" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install \ + --no-cache-dir \ + torch==${PRETRAIN_TORCH_VERSION} \ + torchvision==${PRETRAIN_TORCHVISION_VERSION} \ + --index-url https://download.pytorch.org/whl/cu${PRETRAIN_CUDA_VERSION_SHORT} + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_requirements.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_requirements.sh new file mode 100644 index 0000000..e3554cd --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_requirements.sh @@ -0,0 +1,9 @@ +# Installs prerequisite packages + +echo "Installing requirements" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install --no-cache-dir -U -r ${SCRIPT_DIR}/src/requirements.txt + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_tokenizer.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_tokenizer.sh new file mode 100644 index 0000000..69ff9e6 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_tokenizer.sh @@ -0,0 +1,10 @@ +# Install LLM-jp Tokenizer. + +echo "Installing LLM-jp Tokenizer ${PRETRAIN_TOKENIZER_TAG}" +pushd ${TARGET_DIR}/src + +# download our tokeniser +# Tokenizer +git clone https://github.com/llm-jp/llm-jp-tokenizer -b ${PRETRAIN_TOKENIZER_TAG} + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_transformer_engine.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_transformer_engine.sh new file mode 100644 index 0000000..6574a38 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_transformer_engine.sh @@ -0,0 +1,12 @@ +# Installs Transformer Engine. + +echo "Installing Transformer Engine ${PRETRAIN_TRANSFORMER_ENGINE_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +# install transformer engine +# NOTE(odashi): +# This implicitly installs flash-attn with their recommended version. +# If the auto-installed flash-attn causes some problems, we need to re-install it. +pip install --no-build-isolation --no-cache-dir transformer_engine[pytorch]==${PRETRAIN_TRANSFORMER_ENGINE_VERSION} + +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_venv.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_venv.sh new file mode 100644 index 0000000..5f036f5 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_venv.sh @@ -0,0 +1,15 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/venv ... 
venv directory inherited from the above Python binary + +echo "Setup venv" +pushd ${TARGET_DIR} + +python/bin/python3 -m venv venv + +source venv/bin/activate +python -m pip install --no-cache-dir -U pip setuptools wheel +deactivate + +popd # ${TARGET_DIR} diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/requirements.txt b/pretrain/scripts/v5-test/installer-abci-hopper/src/requirements.txt new file mode 100644 index 0000000..88e1930 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/requirements.txt @@ -0,0 +1,14 @@ +accelerate==1.0.1 +cmake==3.30.1 +einops==0.8.0 +ninja==1.11.1.1 +numpy==1.26.3 +packaging==24.1 +pybind11==2.13.6 +regex==2024.9.11 +safetensors==0.4.5 +sentencepiece==0.2.0 +six==1.16.0 +transformers==4.46.0 +wandb==0.18.5 +wheel==0.44.0 \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci/qsub_setup.sh b/pretrain/scripts/v5-test/installer-abci/qsub_setup.sh new file mode 100755 index 0000000..9eced7d --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/qsub_setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -v RTYPE=rt_HF +#PBS -l select=1 +#PBS -l walltime=01:00:00 +#PBS -o /dev/null +#PBS -e /dev/null + +cd $PBS_O_WORKDIR + +TIMESTAMP=$(date +%Y%m%d%H%M%S) +JOBID=${PBS_JOBID%%.*} +mkdir -p logs +LOGFILE=logs/install-$JOBID.out +ERRFILE=logs/install-$JOBID.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +echo "TARGET_DIR=${TARGET_DIR}" + +# Find the script directory +if [ -n "${PBS_JOBID:-}" ]; then + SCRIPT_PATH="$PBS_O_WORKDIR/$(basename "$0")" +else + SCRIPT_PATH=$(realpath "$0") +fi +SCRIPT_DIR=$(dirname "${SCRIPT_PATH}") +echo "SCRIPT_DIR=${SCRIPT_DIR}" + +mkdir ${TARGET_DIR} +mkdir ${TARGET_DIR}/src + +# Copy necessary scripts +cp -r ${SCRIPT_DIR}/scripts ${TARGET_DIR} + +# Set variables +source ${TARGET_DIR}/scripts/environment.sh +set > ${TARGET_DIR}/installer_envvar.log + +# Install Libraries +source ${SCRIPT_DIR}/src/install_python.sh +source ${SCRIPT_DIR}/src/install_venv.sh +source ${SCRIPT_DIR}/src/install_pytorch.sh +source ${SCRIPT_DIR}/src/install_requirements.sh +source ${SCRIPT_DIR}/src/install_apex.sh +source ${SCRIPT_DIR}/src/install_flash_attention.sh +source ${SCRIPT_DIR}/src/install_transformer_engine.sh +source ${SCRIPT_DIR}/src/install_megatron_lm.sh +source ${SCRIPT_DIR}/src/install_tokenizer.sh + +echo "Done" diff --git a/pretrain/scripts/v5-test/installer-abci/run_setup.sh b/pretrain/scripts/v5-test/installer-abci/run_setup.sh new file mode 100755 index 0000000..ed2e42e --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/run_setup.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 1 ]; then + >&2 echo "Usage: $0 " + >&2 echo "Example: $0 /path/to/target_dir" + exit 1 +fi + +target_dir=$1; shift + +qsub \ + -v TARGET_DIR=${target_dir},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + qsub_setup.sh + diff --git a/pretrain/scripts/v5-test/installer-abci/scripts/environment.sh b/pretrain/scripts/v5-test/installer-abci/scripts/environment.sh new file mode 100644 index 0000000..65c4bc7 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/scripts/environment.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# List of environment variables and module loads for pretrain tasks + +export PRETRAIN_CUDA_VERSION_MAJOR=12 +export PRETRAIN_CUDA_VERSION_MINOR=4 +export PRETRAIN_CUDA_VERSION_PATCH=1 + +export PRETRAIN_CUDA_VERSION=${PRETRAIN_CUDA_VERSION_MAJOR}.${PRETRAIN_CUDA_VERSION_MINOR} +export 
PRETRAIN_CUDA_VERSION_SHORT=${PRETRAIN_CUDA_VERSION_MAJOR}${PRETRAIN_CUDA_VERSION_MINOR} +export PRETRAIN_CUDNN_VERSION=9.5 +export PRETRAIN_CUDNN_VERSION_WITH_PATCH=9.5.1 +export PRETRAIN_HPCX_VERSION=2.20 +export PRETRAIN_NCCL_VERSION=2.25 +export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1 + +export PRETRAIN_PYTHON_VERSION=3.10.4 +export PRETRAIN_TORCH_VERSION=2.6.0 +export PRETRAIN_TORCHVISION_VERSION=0.21.0 +export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863 +export PRETRAIN_FLASH_ATTENTION_VERSION=2.5.8 +export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.13.0 + +# export PRETRAIN_MEGATRON_TAG=v4 +export PRETRAIN_MEGATRON_TAG=0176_merge_nvidia_upstream +# Ensure the appropriate Huggingface tokenizer is included +# https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209 +export PRETRAIN_TOKENIZER_TAG=v3.0b2 + +source /etc/profile.d/modules.sh +module load cuda/${PRETRAIN_CUDA_VERSION}/${PRETRAIN_CUDA_VERSION}.${PRETRAIN_CUDA_VERSION_PATCH} +module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH} +module load hpcx/${PRETRAIN_HPCX_VERSION} +module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH} + +export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_apex.sh b/pretrain/scripts/v5-test/installer-abci/src/install_apex.sh new file mode 100644 index 0000000..a9fdd99 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_apex.sh @@ -0,0 +1,25 @@ +# Install + +echo "Installing apex with commit ${PRETRAIN_APEX_COMMIT}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +git clone --recurse-submodules https://github.com/NVIDIA/apex +pushd apex + +# Checkout the specific commit +git checkout ${PRETRAIN_APEX_COMMIT} +git submodule update --init --recursive + + +python -m pip install \ + -v \ + --no-cache-dir \ + --no-build-isolation \ + --config-settings "--build-option=--cpp_ext" \ + --config-settings "--build-option=--cuda_ext" \ + ./ +popd + +popd # ${TARGET_DIR}/src +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_flash_attention.sh b/pretrain/scripts/v5-test/installer-abci/src/install_flash_attention.sh new file mode 100644 index 0000000..ae042b7 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_flash_attention.sh @@ -0,0 +1,11 @@ +# Installs flash attention. + +echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install \ + --no-build-isolation \ + --no-cache-dir \ + "flash-attn==${PRETRAIN_FLASH_ATTENTION_VERSION}" + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_megatron_lm.sh b/pretrain/scripts/v5-test/installer-abci/src/install_megatron_lm.sh new file mode 100644 index 0000000..be36100 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_megatron_lm.sh @@ -0,0 +1,30 @@ +# Installs Megatron-LM. + +echo "Installing Megatron-LM ${PRETRAIN_MEGATRON_TAG}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +# download our Megatron and build helper library +git clone https://github.com/llm-jp/Megatron-LM -b ${PRETRAIN_MEGATRON_TAG} +pushd Megatron-LM +pushd megatron/core/datasets + +# NOTE(odashi): +# Original makefile in the above directory uses the system's (or pyenv's) python3-config. +# But we need to invoke python3-config installed on our target directory. 
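# For reference: with the CPython 3.10 build installed above, python3-config --extension-suffix
# typically prints something like '.cpython-310-x86_64-linux-gnu.so', so the compile step below
# produces helpers_cpp.cpython-310-x86_64-linux-gnu.so next to helpers.cpp.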
+MEGATRON_HELPER_CPPFLAGS=( + -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color + $(python -m pybind11 --includes) +) +MEGATRON_HELPER_EXT=$(${TARGET_DIR}/python/bin/python3-config --extension-suffix) + +# NOTE(odashi): +# New version of Megatron-LM changed the extension name 'helpers' to 'helpers_cpp' +g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers_cpp${MEGATRON_HELPER_EXT} +# g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers${MEGATRON_HELPER_EXT} + +popd # megatron/core/datasets +popd # Megatron-LM + +popd # ${TARGET_DIR}/src +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_python.sh b/pretrain/scripts/v5-test/installer-abci/src/install_python.sh new file mode 100644 index 0000000..8ac43cb --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_python.sh @@ -0,0 +1,17 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/src/cpython ... Source of Python +# * ${TARGET_DIR}/python ... installed Python binary + +echo "Installing Python ${PRETRAIN_PYTHON_VERSION}" +pushd ${TARGET_DIR}/src + +git clone https://github.com/python/cpython -b v${PRETRAIN_PYTHON_VERSION} +pushd cpython +./configure --prefix="${TARGET_DIR}/python" --enable-optimizations +make -j 64 +make install +popd # cpython + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_pytorch.sh b/pretrain/scripts/v5-test/installer-abci/src/install_pytorch.sh new file mode 100644 index 0000000..1405a01 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_pytorch.sh @@ -0,0 +1,13 @@ +# Install pytorch and torchvision + +echo "Installing torch ${PRETRAIN_TORCH_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT} and torchvision ${PRETRAIN_TORCHVISION_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT}" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install \ + --no-cache-dir \ + torch==${PRETRAIN_TORCH_VERSION} \ + torchvision==${PRETRAIN_TORCHVISION_VERSION} \ + --index-url https://download.pytorch.org/whl/cu${PRETRAIN_CUDA_VERSION_SHORT} + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_requirements.sh b/pretrain/scripts/v5-test/installer-abci/src/install_requirements.sh new file mode 100644 index 0000000..e3554cd --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_requirements.sh @@ -0,0 +1,9 @@ +# Installs prerequisite packages + +echo "Installing requirements" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install --no-cache-dir -U -r ${SCRIPT_DIR}/src/requirements.txt + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_tokenizer.sh b/pretrain/scripts/v5-test/installer-abci/src/install_tokenizer.sh new file mode 100644 index 0000000..69ff9e6 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_tokenizer.sh @@ -0,0 +1,10 @@ +# Install LLM-jp Tokenizer. + +echo "Installing LLM-jp Tokenizer ${PRETRAIN_TOKENIZER_TAG}" +pushd ${TARGET_DIR}/src + +# download our tokeniser +# Tokenizer +git clone https://github.com/llm-jp/llm-jp-tokenizer -b ${PRETRAIN_TOKENIZER_TAG} + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_transformer_engine.sh b/pretrain/scripts/v5-test/installer-abci/src/install_transformer_engine.sh new file mode 100644 index 0000000..6574a38 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_transformer_engine.sh @@ -0,0 +1,12 @@ +# Installs Transformer Engine. 
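# NOTE: The version is pinned by PRETRAIN_TRANSFORMER_ENGINE_VERSION in scripts/environment.sh
# (1.13.0 for this installer, 2.3.0 for installer-abci-hopper). As noted in the top-level README,
# --moe-permute-fusion does not work with TE < 2.1.0, which is why it stays commented out in the
# MoE parameter files under train/params/.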
+ +echo "Installing Transformer Engine ${PRETRAIN_TRANSFORMER_ENGINE_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +# install transformer engine +# NOTE(odashi): +# This implicitly installs flash-attn with their recommended version. +# If the auto-installed flash-attn causes some problems, we need to re-install it. +pip install --no-build-isolation --no-cache-dir transformer_engine[pytorch]==${PRETRAIN_TRANSFORMER_ENGINE_VERSION} + +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_venv.sh b/pretrain/scripts/v5-test/installer-abci/src/install_venv.sh new file mode 100644 index 0000000..5f036f5 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_venv.sh @@ -0,0 +1,15 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/venv ... venv directory inherited from the above Python binary + +echo "Setup venv" +pushd ${TARGET_DIR} + +python/bin/python3 -m venv venv + +source venv/bin/activate +python -m pip install --no-cache-dir -U pip setuptools wheel +deactivate + +popd # ${TARGET_DIR} diff --git a/pretrain/scripts/v5-test/installer-abci/src/requirements.txt b/pretrain/scripts/v5-test/installer-abci/src/requirements.txt new file mode 100644 index 0000000..88e1930 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/requirements.txt @@ -0,0 +1,14 @@ +accelerate==1.0.1 +cmake==3.30.1 +einops==0.8.0 +ninja==1.11.1.1 +numpy==1.26.3 +packaging==24.1 +pybind11==2.13.6 +regex==2024.9.11 +safetensors==0.4.5 +sentencepiece==0.2.0 +six==1.16.0 +transformers==4.46.0 +wandb==0.18.5 +wheel==0.44.0 \ No newline at end of file diff --git a/pretrain/scripts/v5-test/tasks/.gitignore b/pretrain/scripts/v5-test/tasks/.gitignore new file mode 100644 index 0000000..0d5b44a --- /dev/null +++ b/pretrain/scripts/v5-test/tasks/.gitignore @@ -0,0 +1,6 @@ +cache/ +checkpoints/ +checkpoints_hf/ +logs/ +checkpoints_bak/ +train_iters.txt diff --git a/pretrain/scripts/v5-test/tasks/v4-dolmino-mix-1124/train_data_50B.sh b/pretrain/scripts/v5-test/tasks/v4-dolmino-mix-1124/train_data_50B.sh new file mode 100644 index 0000000..8e1a689 --- /dev/null +++ b/pretrain/scripts/v5-test/tasks/v4-dolmino-mix-1124/train_data_50B.sh @@ -0,0 +1,39 @@ +DATASET_ROOT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-tokenized" +export TRAIN_DATA_PATH=( +1074221928 ${DATASET_ROOT}/dclm/dclm-0000-0009_text_document +1084576663 ${DATASET_ROOT}/dclm/dclm-0010-0019_text_document +1078047741 ${DATASET_ROOT}/dclm/dclm-0020-0029_text_document +1078674563 ${DATASET_ROOT}/dclm/dclm-0030-0039_text_document +1088235637 ${DATASET_ROOT}/dclm/dclm-0040-0049_text_document +1074121727 ${DATASET_ROOT}/dclm/dclm-0050-0059_text_document +1074361471 ${DATASET_ROOT}/dclm/dclm-0060-0069_text_document +1072100298 ${DATASET_ROOT}/dclm/dclm-0070-0079_text_document +1081117973 ${DATASET_ROOT}/dclm/dclm-0080-0089_text_document +1083258442 ${DATASET_ROOT}/dclm/dclm-0090-0099_text_document +1085639589 ${DATASET_ROOT}/dclm/dclm-0100-0109_text_document +1083698256 ${DATASET_ROOT}/dclm/dclm-0110-0119_text_document +1065593782 ${DATASET_ROOT}/dclm/dclm-0120-0129_text_document +1079189542 ${DATASET_ROOT}/dclm/dclm-0130-0139_text_document +1075808767 ${DATASET_ROOT}/dclm/dclm-0140-0149_text_document +1082090476 ${DATASET_ROOT}/dclm/dclm-0150-0159_text_document +1079732898 ${DATASET_ROOT}/dclm/dclm-0160-0169_text_document +1083010268 ${DATASET_ROOT}/dclm/dclm-0170-0179_text_document +1076637801 
${DATASET_ROOT}/dclm/dclm-0180-0189_text_document +1077502310 ${DATASET_ROOT}/dclm/dclm-0190-0199_text_document +1078931337 ${DATASET_ROOT}/dclm/dclm-0200-0209_text_document +1080430161 ${DATASET_ROOT}/dclm/dclm-0210-0219_text_document +1069347924 ${DATASET_ROOT}/dclm/dclm-0220-0229_text_document +1066776191 ${DATASET_ROOT}/dclm/dclm-0230-0239_text_document +667806924 ${DATASET_ROOT}/dclm/dclm-0240-0246_text_document +9242742021 ${DATASET_ROOT}/flan/flan-all_text_document +2174159 ${DATASET_ROOT}/math/codesearchnet-owmfilter-all_text_document +31677007 ${DATASET_ROOT}/math/dolmino_math_synth-all_text_document +2841494 ${DATASET_ROOT}/math/gsm8k-all_text_document +4098243004 ${DATASET_ROOT}/math/mathcoder2-synthmath-all_text_document +85423408 ${DATASET_ROOT}/math/metamath-owmfilter-all_text_document +6944299886 ${DATASET_ROOT}/math/tinyGSM-MIND-all_text_document +250390697 ${DATASET_ROOT}/math/tulu_math-all_text_document +3236969300 ${DATASET_ROOT}/pes2o/pes2o-all_text_document +1464772187 ${DATASET_ROOT}/stackexchange/stackexchange-all_text_document +3896965449 ${DATASET_ROOT}/wiki/wiki-all_text_document +) diff --git a/pretrain/scripts/v5-test/train/common/setup.sh b/pretrain/scripts/v5-test/train/common/setup.sh new file mode 100644 index 0000000..e0b7ddc --- /dev/null +++ b/pretrain/scripts/v5-test/train/common/setup.sh @@ -0,0 +1,29 @@ +# Script for setup trainer environment. + +source /etc/profile.d/modules.sh +# module load cuda/12.1/12.1.1 +module load cuda/12.4/12.4.1 +module load cudnn/9.5/9.5.1 +module load hpcx/2.20 +# module load nccl/2.23/2.23.4-1 +module load nccl/2.25/2.25.1-1 +# echo $(module list) +loaded=$(module -t list 2>&1) +echo "-----" +echo "Modules: $loaded" +echo "-----" + + +source ${ENV_DIR}/venv/bin/activate +# source ${ENV_DIR}/scripts/environment.sh # ADD + +## Debug/logging flags +export LOGLEVEL=INFO +# export NCCL_DEBUG=WARN +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 diff --git a/pretrain/scripts/v5-test/train/params/7.7b-llama3-ecjk.sh b/pretrain/scripts/v5-test/train/params/7.7b-llama3-ecjk.sh new file mode 100644 index 0000000..4ab1ccf --- /dev/null +++ b/pretrain/scripts/v5-test/train/params/7.7b-llama3-ecjk.sh @@ -0,0 +1,159 @@ +# Pretraining hyperparameters for v4 7.7B. 
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 1,859,665 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 1,859,665 + 6,652 = 1,866,317 +# 100B: ceil( 113,460,356,693 / 8192 / 1024 ) == 13,526 +# 100B sum: 1,859,665 + 13,526 = 1,873,191 +# 300B: ceil( 337,681,167,151 / 8192 / 1024 ) == 40,255 +# 300B sum: 1,859,665 + 40,255 = 1,899,920 +MIDTRAIN_START=1859665 +# TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +DATASET_SIZE=50B +TRAIN_ITERS=6652 # 50B +# MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + --lr-decay-iters ${TRAIN_ITERS} + # --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 2 + --global-batch-size 1024 +) + +# Parallelism +ALL_PARAMS+=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 2 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 50B, 100B, and 300B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + +TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. +# export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0176_merge_megatron_upstream + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v5-test/train/params/8x1.3b-llama3-ecjk.sh b/pretrain/scripts/v5-test/train/params/8x1.3b-llama3-ecjk.sh new file mode 100644 index 0000000..c3ec406 --- /dev/null +++ b/pretrain/scripts/v5-test/train/params/8x1.3b-llama3-ecjk.sh @@ -0,0 +1,175 @@ +# Pretraining hyperparameters for v4 7.7B. 
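# NOTE: This file defines the 8x1.3B MoE test configuration: a 1.3B-class dense backbone
# (16 layers, hidden size 2048) with 8 experts enabled via the MoE args section below,
# adapted from the dense 7.7B recipe referenced here.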
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 16 + --hidden-size 2048 + --ffn-hidden-size 7168 + --num-attention-heads 16 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 1,859,665 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 1,859,665 + 6,652 = 1,866,317 +# 100B: ceil( 113,460,356,693 / 8192 / 1024 ) == 13,526 +# 100B sum: 1,859,665 + 13,526 = 1,873,191 +# 300B: ceil( 337,681,167,151 / 8192 / 1024 ) == 40,255 +# 300B sum: 1,859,665 + 40,255 = 1,899,920 +MIDTRAIN_START=1859665 +# TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +DATASET_SIZE=50B +TRAIN_ITERS=6652 # 50B +# MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + --lr-decay-iters ${TRAIN_ITERS} + # --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 1 + --global-batch-size 1024 +) + +# Parallelism +ALL_PARAMS+=( + # model parallel size is set to 2 for 2 node training. + # (World size (=8 GPUs)) % ((model parallel size) x (moe parallel size)) should be 0. + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 50B, 100B, and 300B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + +TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# MoE args +# See https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md +ALL_PARAMS+=( + --num-experts 8 + --expert-model-parallel-size 8 + --moe-grouped-gemm + # --moe-permute-fusion # Not compatible with `TE < 2.1.0` + --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. + --moe-router-topk 2 + --moe-aux-loss-coeff 1e-2 + --use-distributed-optimizer + --moe-token-dispatcher-type alltoall +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. +# export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0176_merge_megatron_upstream + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v5-test/train/params/8x7.7b-llama3-ecjk.sh b/pretrain/scripts/v5-test/train/params/8x7.7b-llama3-ecjk.sh new file mode 100644 index 0000000..afdd44a --- /dev/null +++ b/pretrain/scripts/v5-test/train/params/8x7.7b-llama3-ecjk.sh @@ -0,0 +1,175 @@ +# Pretraining hyperparameters for v4 7.7B. 
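# NOTE: This file defines the 8x7.7B MoE test configuration: the dense 7.7B backbone below with
# 8 experts enabled via the MoE args section, created following the Megatron-LM MoE Quick Start
# as described in the top-level README.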
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 1,859,665 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 1,859,665 + 6,652 = 1,866,317 +# 100B: ceil( 113,460,356,693 / 8192 / 1024 ) == 13,526 +# 100B sum: 1,859,665 + 13,526 = 1,873,191 +# 300B: ceil( 337,681,167,151 / 8192 / 1024 ) == 40,255 +# 300B sum: 1,859,665 + 40,255 = 1,899,920 +MIDTRAIN_START=1859665 +# TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +DATASET_SIZE=50B +TRAIN_ITERS=6652 # 50B +# MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + --lr-decay-iters ${TRAIN_ITERS} + # --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 1 + --global-batch-size 256 +) + +# Parallelism +ALL_PARAMS+=( + # model parallel size is set to 2 for 2 node training. + # (World size (=8 GPUs)) % ((model parallel size) x (moe parallel size)) should be 0. + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 4 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 50B, 100B, and 300B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + +TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# MoE args +# See https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md +ALL_PARAMS+=( + --num-experts 8 + --expert-model-parallel-size 8 + --moe-grouped-gemm + # --moe-permute-fusion # Not compatible with `TE < 2.1.0` + --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. + --moe-router-topk 2 + --moe-aux-loss-coeff 1e-2 + --use-distributed-optimizer + --moe-token-dispatcher-type alltoall +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. 
+# export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0176_merge_megatron_upstream + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v5-test/train/qsub_train.sh b/pretrain/scripts/v5-test/train/qsub_train.sh new file mode 100644 index 0000000..4a44b60 --- /dev/null +++ b/pretrain/scripts/v5-test/train/qsub_train.sh @@ -0,0 +1,63 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -N 0176_merge_megatron_upstream +#PBS -l walltime=10000:00:00 +#PBS -m n + +cd $PBS_O_WORKDIR + +JOBID=${PBS_JOBID%%.*} +mkdir -p ${TASK_DIR}/logs +LOGFILE=${TASK_DIR}/logs/train-${JOBID}.out +ERRFILE=${TASK_DIR}/logs/train-${JOBID}.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +EXPERIMENT_DIR=/home/ach17726fj/experiments/0176_megatron_upstream_merge/ +SCRIPT_DIR=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v5-test/train +# Takes $ENV_DIR from the environment variable +# ENV_DIR=${EXPERIMENT_DIR}/environments +# ENV_DIR=${EXPERIMENT_DIR}/environment2 +# ENV_DIR=${EXPERIMENT_DIR}/test_environment + +# Setup environment +source ${SCRIPT_DIR}/common/setup.sh + +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | hostname -f) +export MASTER_PORT=$((10000 + RANDOM % 1000)) +echo "hostname: ${MASTER_ADDR}" + +NUM_NODES=$(wc -l < $PBS_NODEFILE) +NUM_GPUS_PER_NODE=8 +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) +echo "nnodes: ${NUM_NODES}; ngpus: ${NUM_GPUS}" +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +cat $PBS_NODEFILE + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_50B.sh +echo "TRAIN_DATA_PATH: ${TRAIN_DATA_PATH}" + +# Load ALL_PARAMS +source ${SCRIPT_DIR}/params/${PARAM_NAME}.sh +echo "ALL_PARAMS: ${ALL_PARAMS[@]}" + +# export NVTE_FUSED_ATTN=0 + +mpirun \ + --display-allocation \ + --report-bindings \ + --oversubscribe \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + ${ALL_PARAMS[@]} diff --git a/pretrain/scripts/v5-test/train/run_train.sh b/pretrain/scripts/v5-test/train/run_train.sh new file mode 100644 index 0000000..910ee23 --- /dev/null +++ b/pretrain/scripts/v5-test/train/run_train.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 4 ]; then + >&2 echo "Usage: $0 <task_dir> <param_name> <num_nodes> <env_dir>" + >&2 echo "Example: $0 v4-high-quality v3-13b 32 /path/to/env_dir" + exit 1 +fi + +task_dir=$1; shift +param_name=$1; shift +num_nodes=$1; shift +env_dir=$1; shift + +script_root=/home/ach17726fj/experiments/0176_megatron_upstream_merge/scripts/pretrain/scripts/v5-test + +qsub -l select=${num_nodes} \ + -v TASK_DIR=${task_dir},PARAM_NAME=${param_name},ENV_DIR=${env_dir},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + ${script_root}/train/qsub_train.sh
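For reference, a minimal end-to-end sketch of how these scripts fit together. The paths are placeholders; `<env_dir>` is assumed to be the same directory that was passed to the installer as `TARGET_DIR`, since `qsub_train.sh` expects `${ENV_DIR}/venv` and `${ENV_DIR}/src/Megatron-LM` to exist there.

```bash
# 1) Build the training environment (submits a PBS installer job).
cd installer-abci/
bash run_setup.sh /path/to/env_dir

# 2) Submit a training job: <task_dir> <param_name> <num_nodes> <env_dir>.
cd ..
bash train/run_train.sh $(realpath tasks/v4-dolmino-mix-1124) 7.7b-llama3-ecjk 1 /path/to/env_dir
```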