diff --git a/experiments/v4-hq_tokenizer_test/common/setup.sh b/experiments/v4-hq_tokenizer_test/common/setup.sh
new file mode 100644
index 0000000..d7adb0d
--- /dev/null
+++ b/experiments/v4-hq_tokenizer_test/common/setup.sh
@@ -0,0 +1,37 @@
+# Script for setup trainer environment.
+#
+# Meant to be sourced, not executed. The installer sources it after
+# `set -euo pipefail`, so nothing here may expand an unset variable.
+
+module load cuda/12.8/12.8.1
+module load cudnn/9.5/9.5.1
+module load hpcx/2.20
+module load nccl/2.25/2.25.1-1
+# (cliu) Only for cuda/12.8; there is no folder for cuda/12.8 in cudnn/9.5.1
+export CUDNN_HOME=/apps/cudnn/9.5.1/cuda12.0
+export CUDNN_PATH="${CUDNN_HOME}"
+# Prepend the cuDNN directories. ${VAR:+:${VAR}} appends the old value only
+# when it is set and non-empty: safe under `set -u`, and it never leaves a
+# trailing ':' (an empty entry, which the dynamic loader reads as the cwd).
+export LD_LIBRARY_PATH="${CUDNN_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+export CPATH="${CUDNN_HOME}/include${CPATH:+:${CPATH}}"
+export LIBRARY_PATH="${CUDNN_HOME}/lib${LIBRARY_PATH:+:${LIBRARY_PATH}}"
+# Show the loaded modules in the job output.
+module list
+
+## Debug/logging flags
+export LOGLEVEL=INFO
+# export NCCL_DEBUG=WARN
+export NCCL_DEBUG=INFO
+# NOTE(review): WARN is an NCCL_DEBUG level, not one of the subsystem
+# names NCCL documents for NCCL_DEBUG_SUBSYS (INIT, COLL, NET, ...) - confirm.
+export NCCL_DEBUG_SUBSYS=WARN
+export PYTHONFAULTHANDLER=1
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export CUDA_LAUNCH_BLOCKING=0
+export CUDNN_LOGDEST_DBG=stderr
+export CUDNN_LOGERR_DBG=1
+
+export NVTE_FLASH_ATTN=1
+export NVTE_DEBUG=1
+export NVTE_DEBUG_LEVEL=2
diff --git a/experiments/v4-hq_tokenizer_test/installer/install_megatron.sh b/experiments/v4-hq_tokenizer_test/installer/install_megatron.sh
new file mode 100755
index 0000000..ac58ff0
--- /dev/null
+++ b/experiments/v4-hq_tokenizer_test/installer/install_megatron.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+#PBS -P gcg51557
+#PBS -q R9920251000
+#PBS -v RTYPE=rt_HF
+#PBS -l select=2
+#PBS -l walltime=4:00:00
+#PBS -k n
+#PBS -N 0163_install
+
+# Builds the training environment under ${EXP_DIR}/src and ${EXP_DIR}/venv:
+# CPython 3.12.8 from source, a venv, nightly PyTorch (cu128), apex,
+# flash-attention (hopper build), TransformerEngine and the Megatron-LM
+# dataset helper. Run it as a PBS job: PBS_JOBID names the log files.
+
+set -euo pipefail
+
+EXP_DIR="/groups/gcg51557/experiments/0163_math_midtraining"
+SCRIPT_DIR="${EXP_DIR}/scripts/experiments/v4-hq_tokenizer_test/installer"
+
+mkdir -p "${EXP_DIR}/logs/installer"
+
+TIMESTAMP=$(date +%Y%m%d%H%M%S)
+JOBID=${PBS_JOBID%%.*}
+LOGFILE="${EXP_DIR}/logs/installer/${TIMESTAMP}-${JOBID}.out"
+ERRFILE="${EXP_DIR}/logs/installer/${TIMESTAMP}-${JOBID}.err"
+# From here on, stdout and stderr of this script go to the log files.
+exec > "${LOGFILE}" 2> "${ERRFILE}"
+
+source "${SCRIPT_DIR}/../common/setup.sh"
+
+cd "${EXP_DIR}"
+mkdir -p src
+pushd src
+
+echo "Install Python"
+mkdir -p python
+git clone https://github.com/python/cpython -b v3.12.8
+# Install prefix of the interpreter built below. Not named PYTHONPATH:
+# Python itself reads that variable as a module search path.
+PYTHON_PREFIX="$(pwd)/python"
+pushd cpython
+  ./configure --prefix="${PYTHON_PREFIX}" --enable-optimizations
+  make -j 64
+  make altinstall
+popd
+
+echo "Setup venv"
+"${PYTHON_PREFIX}/bin/python3.12" -m venv ../venv
+source ../venv/bin/activate
+pip install --upgrade pip
+
+echo "Install torch"
+# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
+pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
+
+echo "Install requirements"
+pip install -r "${SCRIPT_DIR}/requirements.txt"
+
+echo "Install apex"
+git clone --recurse-submodules https://github.com/NVIDIA/apex
+pushd apex
+  pip install -v \
+    --no-cache-dir \
+    --no-build-isolation \
+    --config-settings "--build-option=--cpp_ext" \
+    --config-settings "--build-option=--cuda_ext" \
+    ./
+popd
+
+# echo "Install flash-attn"
+git clone https://github.com/Dao-AILab/flash-attention.git
+pushd flash-attention
+  # One command per line: in an `&&` chain `set -e` only reacts to the last
+  # command, so a failed checkout or cd would silently skip the build.
+  git checkout 27f501d
+  cd hopper/
+  python setup.py install
+  site_packages=$(python -c "import site; print(site.getsitepackages()[0])")
+  mkdir -p "${site_packages}/flash_attn_3"
+  wget -P "${site_packages}/flash_attn_3" https://raw.githubusercontent.com/Dao-AILab/flash-attention/27f501dbe011f4371bff938fe7e09311ab3002fa/hopper/flash_attn_interface.py
+popd
+
+# pip install \
+#   --no-build-isolation \
+#   --no-cache-dir \
+#   flash-attn
+
+# echo "Install transformer_engine"
+git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git
+pushd TransformerEngine
+  export NVTE_FRAMEWORK=pytorch
+  pip install .
+popd
+
+# pip install \
+#   --no-build-isolation \
+#   --no-cache-dir \
+#   transformer_engine[pytorch]
+
+echo "Install Megatron-LM"
+git clone https://github.com/llm-jp/Megatron-LM -b v4-old
+pushd Megatron-LM/megatron/core/datasets
+  # $(...) is left unquoted on purpose: pybind11 prints several -I flags on
+  # one line and each must become its own array element.
+  MEGATRON_HELPER_CPPFLAGS=(
+    -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
+    $(python -m pybind11 --includes)
+  )
+  MEGATRON_HELPER_EXT=$("${PYTHON_PREFIX}/bin/python3.12-config" --extension-suffix)
+  g++ "${MEGATRON_HELPER_CPPFLAGS[@]}" helpers.cpp -o "helpers_cpp${MEGATRON_HELPER_EXT}"
+popd
+
+deactivate
+popd
+
+echo "Done"
diff --git a/experiments/v4-hq_tokenizer_test/installer/requirements.txt b/experiments/v4-hq_tokenizer_test/installer/requirements.txt
new file mode 100644
index 0000000..1f2c672
--- /dev/null
+++ b/experiments/v4-hq_tokenizer_test/installer/requirements.txt
@@ -0,0 +1,14 @@
+accelerate==1.3.0
+cmake==3.31.4
+einops==0.8.1
+ninja==1.11.1.3
+numpy==2.1.2
+packaging==24.2
+pybind11==2.13.6
+regex==2024.11.6
+safetensors==0.5.2
+sentencepiece==0.2.0
+six==1.17.0
+transformers==4.48.3
+wandb==0.19.3
+wheel==0.45.1