diff --git a/pretrain/scripts/v5-test/README.md b/pretrain/scripts/v5-test/README.md new file mode 100644 index 0000000..8a2694d --- /dev/null +++ b/pretrain/scripts/v5-test/README.md @@ -0,0 +1,29 @@ +# v5 test + +Test scripts for Megatron-LM training for v5, based on the v4-midtraining code. + +## Environment setup + +```bash +cd installer-abci/ +bash run_setup.sh <target_dir> +``` + +## Run + +```bash +# bash train/run_train.sh <task_dir> <param_name> <num_nodes> <env_dir> +bash train/run_train.sh $(realpath tasks/v4-dolmino-mix-1124) 7.7b-llama3-ecjk 1 /path/to/env_dir +``` + +## Notes + +### Installer + +- Removed the `NVTE_FUSED_ATTN` setting because it conflicts with `--attention-backend=auto` +- Changed the extension name to `helpers_cpp` in installer-abci/src/install_megatron_lm.sh + +### MoE + +- Created the 8x7.7b model configuration based on the [Megatron-LM MoE Quick Start](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md#quick-start) + - `--moe-permute-fusion` is commented out because it does not work with `TE < 2.1.0` \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/qsub_setup.sh b/pretrain/scripts/v5-test/installer-abci-hopper/qsub_setup.sh new file mode 100755 index 0000000..9eced7d --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/qsub_setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -v RTYPE=rt_HF +#PBS -l select=1 +#PBS -l walltime=01:00:00 +#PBS -o /dev/null +#PBS -e /dev/null + +cd $PBS_O_WORKDIR + +TIMESTAMP=$(date +%Y%m%d%H%M%S) +JOBID=${PBS_JOBID%%.*} +mkdir -p logs +LOGFILE=logs/install-$JOBID.out +ERRFILE=logs/install-$JOBID.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +echo "TARGET_DIR=${TARGET_DIR}" + +# Find the script directory +if [ -n "${PBS_JOBID:-}" ]; then + SCRIPT_PATH="$PBS_O_WORKDIR/$(basename "$0")" +else + SCRIPT_PATH=$(realpath "$0") +fi +SCRIPT_DIR=$(dirname "${SCRIPT_PATH}") +echo "SCRIPT_DIR=${SCRIPT_DIR}" + +mkdir ${TARGET_DIR} +mkdir ${TARGET_DIR}/src + +# Copy necessary scripts +cp -r ${SCRIPT_DIR}/scripts ${TARGET_DIR} + +# Set variables +source ${TARGET_DIR}/scripts/environment.sh +set > ${TARGET_DIR}/installer_envvar.log + +# Install Libraries +source ${SCRIPT_DIR}/src/install_python.sh +source ${SCRIPT_DIR}/src/install_venv.sh +source ${SCRIPT_DIR}/src/install_pytorch.sh +source ${SCRIPT_DIR}/src/install_requirements.sh +source ${SCRIPT_DIR}/src/install_apex.sh +source ${SCRIPT_DIR}/src/install_flash_attention.sh +source ${SCRIPT_DIR}/src/install_transformer_engine.sh +source ${SCRIPT_DIR}/src/install_megatron_lm.sh +source ${SCRIPT_DIR}/src/install_tokenizer.sh + +echo "Done" diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/run_setup.sh b/pretrain/scripts/v5-test/installer-abci-hopper/run_setup.sh new file mode 100755 index 0000000..ed2e42e --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/run_setup.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 1 ]; then + >&2 echo "Usage: $0 <target_dir>" + >&2 echo "Example: $0 /path/to/target_dir" + exit 1 +fi + +target_dir=$1; shift + +qsub \ + -v TARGET_DIR=${target_dir},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + qsub_setup.sh + diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/scripts/environment.sh b/pretrain/scripts/v5-test/installer-abci-hopper/scripts/environment.sh new file mode 100644 index 0000000..5b30285 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/scripts/environment.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# List of environment variables and module loads for pretrain tasks + +export PRETRAIN_CUDA_VERSION_MAJOR=12 +export PRETRAIN_CUDA_VERSION_MINOR=4 +export 
PRETRAIN_CUDA_VERSION_PATCH=1 + +export PRETRAIN_CUDA_VERSION=${PRETRAIN_CUDA_VERSION_MAJOR}.${PRETRAIN_CUDA_VERSION_MINOR} +export PRETRAIN_CUDA_VERSION_SHORT=${PRETRAIN_CUDA_VERSION_MAJOR}${PRETRAIN_CUDA_VERSION_MINOR} +export PRETRAIN_CUDNN_VERSION=9.5 +export PRETRAIN_CUDNN_VERSION_WITH_PATCH=9.5.1 +export PRETRAIN_HPCX_VERSION=2.20 +export PRETRAIN_NCCL_VERSION=2.25 +export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1 + +export PRETRAIN_PYTHON_VERSION=3.10.4 +export PRETRAIN_TORCH_VERSION=2.6.0 +export PRETRAIN_TORCHVISION_VERSION=0.21.0 +export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863 +export PRETRAIN_FLASH_ATTENTION_VERSION=27f501d +export PRETRAIN_TRANSFORMER_ENGINE_VERSION=2.3.0 + +# export PRETRAIN_MEGATRON_TAG=v4 +export PRETRAIN_MEGATRON_TAG=0176_merge_nvidia_upstream +# Ensure the appropriate Huggingface tokenizer is included +# https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209 +export PRETRAIN_TOKENIZER_TAG=v3.0b2 + +source /etc/profile.d/modules.sh +module load cuda/${PRETRAIN_CUDA_VERSION}/${PRETRAIN_CUDA_VERSION}.${PRETRAIN_CUDA_VERSION_PATCH} +module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH} +module load hpcx/${PRETRAIN_HPCX_VERSION} +module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH} + +export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_apex.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_apex.sh new file mode 100644 index 0000000..a9fdd99 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_apex.sh @@ -0,0 +1,25 @@ +# Install + +echo "Installing apex with commit ${PRETRAIN_APEX_COMMIT}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +git clone --recurse-submodules https://github.com/NVIDIA/apex +pushd apex + +# Checkout the specific commit +git checkout ${PRETRAIN_APEX_COMMIT} +git submodule update --init --recursive + + +python -m pip install \ + -v \ + --no-cache-dir \ + --no-build-isolation \ + --config-settings "--build-option=--cpp_ext" \ + --config-settings "--build-option=--cuda_ext" \ + ./ +popd + +popd # ${TARGET_DIR}/src +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_flash_attention.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_flash_attention.sh new file mode 100644 index 0000000..e4d25a3 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_flash_attention.sh @@ -0,0 +1,25 @@ +# Installs flash attention. 
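# NOTE: This Hopper-specific installer builds FlashAttention-3 from the hopper/ subdirectory of the
# pinned commit and then fetches flash_attn_interface.py from that commit into a flash_attn_3/
# directory under the venv's site-packages, presumably so it is importable as flash_attn_3.flash_attn_interface.
# The non-Hopper installer (installer-abci) instead installs the flash-attn wheel from PyPI.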
+ +echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +pushd ${TARGET_DIR}/src + +git clone https://github.com/Dao-AILab/flash-attention.git +pushd flash-attention/ +git checkout ${PRETRAIN_FLASH_ATTENTION_VERSION} + +# Use flash-attention 3 +pushd hopper/ + +python setup.py install + +python_path=`python -c "import site; print(site.getsitepackages()[0])"` +mkdir -p $python_path/flash_attn_3 +wget -P $python_path/flash_attn_3 https://raw.githubusercontent.com/Dao-AILab/flash-attention/27f501dbe011f4371bff938fe7e09311ab3002fa/hopper/flash_attn_interface.py + +popd # hopper/ +popd # flash-attention/ +popd # ${TARGET_DIR}/src + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_megatron_lm.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_megatron_lm.sh new file mode 100644 index 0000000..be36100 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_megatron_lm.sh @@ -0,0 +1,30 @@ +# Installs Megatron-LM. + +echo "Installing Megatron-LM ${PRETRAIN_MEGATRON_TAG}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +# download our Megatron and build helper library +git clone https://github.com/llm-jp/Megatron-LM -b ${PRETRAIN_MEGATRON_TAG} +pushd Megatron-LM +pushd megatron/core/datasets + +# NOTE(odashi): +# Original makefile in the above directory uses the system's (or pyenv's) python3-config. +# But we need to invoke python3-config installed on our target directory. +MEGATRON_HELPER_CPPFLAGS=( + -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color + $(python -m pybind11 --includes) +) +MEGATRON_HELPER_EXT=$(${TARGET_DIR}/python/bin/python3-config --extension-suffix) + +# NOTE(odashi): +# New version of Megatron-LM changed the extension name 'helpers' to 'helpers_cpp' +g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers_cpp${MEGATRON_HELPER_EXT} +# g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers${MEGATRON_HELPER_EXT} + +popd # megatron/core/datasets +popd # Megatron-LM + +popd # ${TARGET_DIR}/src +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_python.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_python.sh new file mode 100644 index 0000000..8ac43cb --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_python.sh @@ -0,0 +1,17 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/src/cpython ... Source of Python +# * ${TARGET_DIR}/python ... 
installed Python binary + +echo "Installing Python ${PRETRAIN_PYTHON_VERSION}" +pushd ${TARGET_DIR}/src + +git clone https://github.com/python/cpython -b v${PRETRAIN_PYTHON_VERSION} +pushd cpython +./configure --prefix="${TARGET_DIR}/python" --enable-optimizations +make -j 64 +make install +popd # cpython + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_pytorch.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_pytorch.sh new file mode 100644 index 0000000..1405a01 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_pytorch.sh @@ -0,0 +1,13 @@ +# Install pytorch and torchvision + +echo "Installing torch ${PRETRAIN_TORCH_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT} and torchvision ${PRETRAIN_TORCHVISION_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT}" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install \ + --no-cache-dir \ + torch==${PRETRAIN_TORCH_VERSION} \ + torchvision==${PRETRAIN_TORCHVISION_VERSION} \ + --index-url https://download.pytorch.org/whl/cu${PRETRAIN_CUDA_VERSION_SHORT} + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_requirements.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_requirements.sh new file mode 100644 index 0000000..e3554cd --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_requirements.sh @@ -0,0 +1,9 @@ +# Installs prerequisite packages + +echo "Installing requirements" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install --no-cache-dir -U -r ${SCRIPT_DIR}/src/requirements.txt + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_tokenizer.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_tokenizer.sh new file mode 100644 index 0000000..69ff9e6 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_tokenizer.sh @@ -0,0 +1,10 @@ +# Install LLM-jp Tokenizer. + +echo "Installing LLM-jp Tokenizer ${PRETRAIN_TOKENIZER_TAG}" +pushd ${TARGET_DIR}/src + +# download our tokeniser +# Tokenizer +git clone https://github.com/llm-jp/llm-jp-tokenizer -b ${PRETRAIN_TOKENIZER_TAG} + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_transformer_engine.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_transformer_engine.sh new file mode 100644 index 0000000..6574a38 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_transformer_engine.sh @@ -0,0 +1,12 @@ +# Installs Transformer Engine. + +echo "Installing Transformer Engine ${PRETRAIN_TRANSFORMER_ENGINE_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +# install transformer engine +# NOTE(odashi): +# This implicitly installs flash-attn with their recommended version. +# If the auto-installed flash-attn causes some problems, we need to re-install it. +pip install --no-build-isolation --no-cache-dir transformer_engine[pytorch]==${PRETRAIN_TRANSFORMER_ENGINE_VERSION} + +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/install_venv.sh b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_venv.sh new file mode 100644 index 0000000..5f036f5 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/install_venv.sh @@ -0,0 +1,15 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/venv ... 
venv directory inherited from the above Python binary + +echo "Setup venv" +pushd ${TARGET_DIR} + +python/bin/python3 -m venv venv + +source venv/bin/activate +python -m pip install --no-cache-dir -U pip setuptools wheel +deactivate + +popd # ${TARGET_DIR} diff --git a/pretrain/scripts/v5-test/installer-abci-hopper/src/requirements.txt b/pretrain/scripts/v5-test/installer-abci-hopper/src/requirements.txt new file mode 100644 index 0000000..88e1930 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci-hopper/src/requirements.txt @@ -0,0 +1,14 @@ +accelerate==1.0.1 +cmake==3.30.1 +einops==0.8.0 +ninja==1.11.1.1 +numpy==1.26.3 +packaging==24.1 +pybind11==2.13.6 +regex==2024.9.11 +safetensors==0.4.5 +sentencepiece==0.2.0 +six==1.16.0 +transformers==4.46.0 +wandb==0.18.5 +wheel==0.44.0 \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci/qsub_setup.sh b/pretrain/scripts/v5-test/installer-abci/qsub_setup.sh new file mode 100755 index 0000000..9eced7d --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/qsub_setup.sh @@ -0,0 +1,53 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -v RTYPE=rt_HF +#PBS -l select=1 +#PBS -l walltime=01:00:00 +#PBS -o /dev/null +#PBS -e /dev/null + +cd $PBS_O_WORKDIR + +TIMESTAMP=$(date +%Y%m%d%H%M%S) +JOBID=${PBS_JOBID%%.*} +mkdir -p logs +LOGFILE=logs/install-$JOBID.out +ERRFILE=logs/install-$JOBID.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +echo "TARGET_DIR=${TARGET_DIR}" + +# Find the script directory +if [ -n "${PBS_JOBID:-}" ]; then + SCRIPT_PATH="$PBS_O_WORKDIR/$(basename "$0")" +else + SCRIPT_PATH=$(realpath "$0") +fi +SCRIPT_DIR=$(dirname "${SCRIPT_PATH}") +echo "SCRIPT_DIR=${SCRIPT_DIR}" + +mkdir ${TARGET_DIR} +mkdir ${TARGET_DIR}/src + +# Copy necessary scripts +cp -r ${SCRIPT_DIR}/scripts ${TARGET_DIR} + +# Set variables +source ${TARGET_DIR}/scripts/environment.sh +set > ${TARGET_DIR}/installer_envvar.log + +# Install Libraries +source ${SCRIPT_DIR}/src/install_python.sh +source ${SCRIPT_DIR}/src/install_venv.sh +source ${SCRIPT_DIR}/src/install_pytorch.sh +source ${SCRIPT_DIR}/src/install_requirements.sh +source ${SCRIPT_DIR}/src/install_apex.sh +source ${SCRIPT_DIR}/src/install_flash_attention.sh +source ${SCRIPT_DIR}/src/install_transformer_engine.sh +source ${SCRIPT_DIR}/src/install_megatron_lm.sh +source ${SCRIPT_DIR}/src/install_tokenizer.sh + +echo "Done" diff --git a/pretrain/scripts/v5-test/installer-abci/run_setup.sh b/pretrain/scripts/v5-test/installer-abci/run_setup.sh new file mode 100755 index 0000000..ed2e42e --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/run_setup.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 1 ]; then + >&2 echo "Usage: $0 " + >&2 echo "Example: $0 /path/to/target_dir" + exit 1 +fi + +target_dir=$1; shift + +qsub \ + -v TARGET_DIR=${target_dir},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + qsub_setup.sh + diff --git a/pretrain/scripts/v5-test/installer-abci/scripts/environment.sh b/pretrain/scripts/v5-test/installer-abci/scripts/environment.sh new file mode 100644 index 0000000..65c4bc7 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/scripts/environment.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# List of environment variables and module loads for pretrain tasks + +export PRETRAIN_CUDA_VERSION_MAJOR=12 +export PRETRAIN_CUDA_VERSION_MINOR=4 +export PRETRAIN_CUDA_VERSION_PATCH=1 + +export PRETRAIN_CUDA_VERSION=${PRETRAIN_CUDA_VERSION_MAJOR}.${PRETRAIN_CUDA_VERSION_MINOR} +export 
PRETRAIN_CUDA_VERSION_SHORT=${PRETRAIN_CUDA_VERSION_MAJOR}${PRETRAIN_CUDA_VERSION_MINOR} +export PRETRAIN_CUDNN_VERSION=9.5 +export PRETRAIN_CUDNN_VERSION_WITH_PATCH=9.5.1 +export PRETRAIN_HPCX_VERSION=2.20 +export PRETRAIN_NCCL_VERSION=2.25 +export PRETRAIN_NCCL_VERSION_WITH_PATCH=2.25.1-1 + +export PRETRAIN_PYTHON_VERSION=3.10.4 +export PRETRAIN_TORCH_VERSION=2.6.0 +export PRETRAIN_TORCHVISION_VERSION=0.21.0 +export PRETRAIN_APEX_COMMIT=312acb44f9fe05cab8c67bba6daa0e64d3737863 +export PRETRAIN_FLASH_ATTENTION_VERSION=2.5.8 +export PRETRAIN_TRANSFORMER_ENGINE_VERSION=1.13.0 + +# export PRETRAIN_MEGATRON_TAG=v4 +export PRETRAIN_MEGATRON_TAG=0176_merge_nvidia_upstream +# Ensure the appropriate Huggingface tokenizer is included +# https://github.com/llm-jp/scripts/pull/12#discussion_r1708415209 +export PRETRAIN_TOKENIZER_TAG=v3.0b2 + +source /etc/profile.d/modules.sh +module load cuda/${PRETRAIN_CUDA_VERSION}/${PRETRAIN_CUDA_VERSION}.${PRETRAIN_CUDA_VERSION_PATCH} +module load cudnn/${PRETRAIN_CUDNN_VERSION}/${PRETRAIN_CUDNN_VERSION_WITH_PATCH} +module load hpcx/${PRETRAIN_HPCX_VERSION} +module load nccl/${PRETRAIN_NCCL_VERSION}/${PRETRAIN_NCCL_VERSION_WITH_PATCH} + +export LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_apex.sh b/pretrain/scripts/v5-test/installer-abci/src/install_apex.sh new file mode 100644 index 0000000..a9fdd99 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_apex.sh @@ -0,0 +1,25 @@ +# Install + +echo "Installing apex with commit ${PRETRAIN_APEX_COMMIT}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +git clone --recurse-submodules https://github.com/NVIDIA/apex +pushd apex + +# Checkout the specific commit +git checkout ${PRETRAIN_APEX_COMMIT} +git submodule update --init --recursive + + +python -m pip install \ + -v \ + --no-cache-dir \ + --no-build-isolation \ + --config-settings "--build-option=--cpp_ext" \ + --config-settings "--build-option=--cuda_ext" \ + ./ +popd + +popd # ${TARGET_DIR}/src +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_flash_attention.sh b/pretrain/scripts/v5-test/installer-abci/src/install_flash_attention.sh new file mode 100644 index 0000000..ae042b7 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_flash_attention.sh @@ -0,0 +1,11 @@ +# Installs flash attention. + +echo "Installing Flash Attention ${PRETRAIN_FLASH_ATTENTION_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install \ + --no-build-isolation \ + --no-cache-dir \ + "flash-attn==${PRETRAIN_FLASH_ATTENTION_VERSION}" + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_megatron_lm.sh b/pretrain/scripts/v5-test/installer-abci/src/install_megatron_lm.sh new file mode 100644 index 0000000..be36100 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_megatron_lm.sh @@ -0,0 +1,30 @@ +# Installs Megatron-LM. + +echo "Installing Megatron-LM ${PRETRAIN_MEGATRON_TAG}" +source ${TARGET_DIR}/venv/bin/activate +pushd ${TARGET_DIR}/src + +# download our Megatron and build helper library +git clone https://github.com/llm-jp/Megatron-LM -b ${PRETRAIN_MEGATRON_TAG} +pushd Megatron-LM +pushd megatron/core/datasets + +# NOTE(odashi): +# Original makefile in the above directory uses the system's (or pyenv's) python3-config. +# But we need to invoke python3-config installed on our target directory. 
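# For reference: with the CPython 3.10 build installed above, python3-config --extension-suffix
# typically prints something like '.cpython-310-x86_64-linux-gnu.so', so the compile step below
# produces helpers_cpp.cpython-310-x86_64-linux-gnu.so next to helpers.cpp.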
+MEGATRON_HELPER_CPPFLAGS=( + -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color + $(python -m pybind11 --includes) +) +MEGATRON_HELPER_EXT=$(${TARGET_DIR}/python/bin/python3-config --extension-suffix) + +# NOTE(odashi): +# New version of Megatron-LM changed the extension name 'helpers' to 'helpers_cpp' +g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers_cpp${MEGATRON_HELPER_EXT} +# g++ ${MEGATRON_HELPER_CPPFLAGS[@]} helpers.cpp -o helpers${MEGATRON_HELPER_EXT} + +popd # megatron/core/datasets +popd # Megatron-LM + +popd # ${TARGET_DIR}/src +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_python.sh b/pretrain/scripts/v5-test/installer-abci/src/install_python.sh new file mode 100644 index 0000000..8ac43cb --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_python.sh @@ -0,0 +1,17 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/src/cpython ... Source of Python +# * ${TARGET_DIR}/python ... installed Python binary + +echo "Installing Python ${PRETRAIN_PYTHON_VERSION}" +pushd ${TARGET_DIR}/src + +git clone https://github.com/python/cpython -b v${PRETRAIN_PYTHON_VERSION} +pushd cpython +./configure --prefix="${TARGET_DIR}/python" --enable-optimizations +make -j 64 +make install +popd # cpython + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_pytorch.sh b/pretrain/scripts/v5-test/installer-abci/src/install_pytorch.sh new file mode 100644 index 0000000..1405a01 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_pytorch.sh @@ -0,0 +1,13 @@ +# Install pytorch and torchvision + +echo "Installing torch ${PRETRAIN_TORCH_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT} and torchvision ${PRETRAIN_TORCHVISION_VERSION}+cu${PRETRAIN_CUDA_VERSION_SHORT}" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install \ + --no-cache-dir \ + torch==${PRETRAIN_TORCH_VERSION} \ + torchvision==${PRETRAIN_TORCHVISION_VERSION} \ + --index-url https://download.pytorch.org/whl/cu${PRETRAIN_CUDA_VERSION_SHORT} + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_requirements.sh b/pretrain/scripts/v5-test/installer-abci/src/install_requirements.sh new file mode 100644 index 0000000..e3554cd --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_requirements.sh @@ -0,0 +1,9 @@ +# Installs prerequisite packages + +echo "Installing requirements" + +source ${TARGET_DIR}/venv/bin/activate + +python -m pip install --no-cache-dir -U -r ${SCRIPT_DIR}/src/requirements.txt + +deactivate diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_tokenizer.sh b/pretrain/scripts/v5-test/installer-abci/src/install_tokenizer.sh new file mode 100644 index 0000000..69ff9e6 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_tokenizer.sh @@ -0,0 +1,10 @@ +# Install LLM-jp Tokenizer. + +echo "Installing LLM-jp Tokenizer ${PRETRAIN_TOKENIZER_TAG}" +pushd ${TARGET_DIR}/src + +# download our tokeniser +# Tokenizer +git clone https://github.com/llm-jp/llm-jp-tokenizer -b ${PRETRAIN_TOKENIZER_TAG} + +popd # ${TARGET_DIR}/src diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_transformer_engine.sh b/pretrain/scripts/v5-test/installer-abci/src/install_transformer_engine.sh new file mode 100644 index 0000000..6574a38 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_transformer_engine.sh @@ -0,0 +1,12 @@ +# Installs Transformer Engine. 
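# NOTE: The version is pinned by PRETRAIN_TRANSFORMER_ENGINE_VERSION in scripts/environment.sh
# (1.13.0 for this installer, 2.3.0 for installer-abci-hopper). As noted in the top-level README,
# --moe-permute-fusion does not work with TE < 2.1.0, which is why it stays commented out in the
# MoE parameter files under train/params/.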
+ +echo "Installing Transformer Engine ${PRETRAIN_TRANSFORMER_ENGINE_VERSION}" +source ${TARGET_DIR}/venv/bin/activate + +# install transformer engine +# NOTE(odashi): +# This implicitly installs flash-attn with their recommended version. +# If the auto-installed flash-attn causes some problems, we need to re-install it. +pip install --no-build-isolation --no-cache-dir transformer_engine[pytorch]==${PRETRAIN_TRANSFORMER_ENGINE_VERSION} + +deactivate \ No newline at end of file diff --git a/pretrain/scripts/v5-test/installer-abci/src/install_venv.sh b/pretrain/scripts/v5-test/installer-abci/src/install_venv.sh new file mode 100644 index 0000000..5f036f5 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/install_venv.sh @@ -0,0 +1,15 @@ +# Script to install Python to TARGET_DIR +# +# This script will make the following directories: +# * ${TARGET_DIR}/venv ... venv directory inherited from the above Python binary + +echo "Setup venv" +pushd ${TARGET_DIR} + +python/bin/python3 -m venv venv + +source venv/bin/activate +python -m pip install --no-cache-dir -U pip setuptools wheel +deactivate + +popd # ${TARGET_DIR} diff --git a/pretrain/scripts/v5-test/installer-abci/src/requirements.txt b/pretrain/scripts/v5-test/installer-abci/src/requirements.txt new file mode 100644 index 0000000..88e1930 --- /dev/null +++ b/pretrain/scripts/v5-test/installer-abci/src/requirements.txt @@ -0,0 +1,14 @@ +accelerate==1.0.1 +cmake==3.30.1 +einops==0.8.0 +ninja==1.11.1.1 +numpy==1.26.3 +packaging==24.1 +pybind11==2.13.6 +regex==2024.9.11 +safetensors==0.4.5 +sentencepiece==0.2.0 +six==1.16.0 +transformers==4.46.0 +wandb==0.18.5 +wheel==0.44.0 \ No newline at end of file diff --git a/pretrain/scripts/v5-test/tasks/.gitignore b/pretrain/scripts/v5-test/tasks/.gitignore new file mode 100644 index 0000000..0d5b44a --- /dev/null +++ b/pretrain/scripts/v5-test/tasks/.gitignore @@ -0,0 +1,6 @@ +cache/ +checkpoints/ +checkpoints_hf/ +logs/ +checkpoints_bak/ +train_iters.txt diff --git a/pretrain/scripts/v5-test/tasks/v4-dolmino-mix-1124/train_data_50B.sh b/pretrain/scripts/v5-test/tasks/v4-dolmino-mix-1124/train_data_50B.sh new file mode 100644 index 0000000..8e1a689 --- /dev/null +++ b/pretrain/scripts/v5-test/tasks/v4-dolmino-mix-1124/train_data_50B.sh @@ -0,0 +1,39 @@ +DATASET_ROOT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-tokenized" +export TRAIN_DATA_PATH=( +1074221928 ${DATASET_ROOT}/dclm/dclm-0000-0009_text_document +1084576663 ${DATASET_ROOT}/dclm/dclm-0010-0019_text_document +1078047741 ${DATASET_ROOT}/dclm/dclm-0020-0029_text_document +1078674563 ${DATASET_ROOT}/dclm/dclm-0030-0039_text_document +1088235637 ${DATASET_ROOT}/dclm/dclm-0040-0049_text_document +1074121727 ${DATASET_ROOT}/dclm/dclm-0050-0059_text_document +1074361471 ${DATASET_ROOT}/dclm/dclm-0060-0069_text_document +1072100298 ${DATASET_ROOT}/dclm/dclm-0070-0079_text_document +1081117973 ${DATASET_ROOT}/dclm/dclm-0080-0089_text_document +1083258442 ${DATASET_ROOT}/dclm/dclm-0090-0099_text_document +1085639589 ${DATASET_ROOT}/dclm/dclm-0100-0109_text_document +1083698256 ${DATASET_ROOT}/dclm/dclm-0110-0119_text_document +1065593782 ${DATASET_ROOT}/dclm/dclm-0120-0129_text_document +1079189542 ${DATASET_ROOT}/dclm/dclm-0130-0139_text_document +1075808767 ${DATASET_ROOT}/dclm/dclm-0140-0149_text_document +1082090476 ${DATASET_ROOT}/dclm/dclm-0150-0159_text_document +1079732898 ${DATASET_ROOT}/dclm/dclm-0160-0169_text_document +1083010268 ${DATASET_ROOT}/dclm/dclm-0170-0179_text_document +1076637801 
${DATASET_ROOT}/dclm/dclm-0180-0189_text_document +1077502310 ${DATASET_ROOT}/dclm/dclm-0190-0199_text_document +1078931337 ${DATASET_ROOT}/dclm/dclm-0200-0209_text_document +1080430161 ${DATASET_ROOT}/dclm/dclm-0210-0219_text_document +1069347924 ${DATASET_ROOT}/dclm/dclm-0220-0229_text_document +1066776191 ${DATASET_ROOT}/dclm/dclm-0230-0239_text_document +667806924 ${DATASET_ROOT}/dclm/dclm-0240-0246_text_document +9242742021 ${DATASET_ROOT}/flan/flan-all_text_document +2174159 ${DATASET_ROOT}/math/codesearchnet-owmfilter-all_text_document +31677007 ${DATASET_ROOT}/math/dolmino_math_synth-all_text_document +2841494 ${DATASET_ROOT}/math/gsm8k-all_text_document +4098243004 ${DATASET_ROOT}/math/mathcoder2-synthmath-all_text_document +85423408 ${DATASET_ROOT}/math/metamath-owmfilter-all_text_document +6944299886 ${DATASET_ROOT}/math/tinyGSM-MIND-all_text_document +250390697 ${DATASET_ROOT}/math/tulu_math-all_text_document +3236969300 ${DATASET_ROOT}/pes2o/pes2o-all_text_document +1464772187 ${DATASET_ROOT}/stackexchange/stackexchange-all_text_document +3896965449 ${DATASET_ROOT}/wiki/wiki-all_text_document +) diff --git a/pretrain/scripts/v5-test/train/common/setup.sh b/pretrain/scripts/v5-test/train/common/setup.sh new file mode 100644 index 0000000..e0b7ddc --- /dev/null +++ b/pretrain/scripts/v5-test/train/common/setup.sh @@ -0,0 +1,29 @@ +# Script for setup trainer environment. + +source /etc/profile.d/modules.sh +# module load cuda/12.1/12.1.1 +module load cuda/12.4/12.4.1 +module load cudnn/9.5/9.5.1 +module load hpcx/2.20 +# module load nccl/2.23/2.23.4-1 +module load nccl/2.25/2.25.1-1 +# echo $(module list) +loaded=$(module -t list 2>&1) +echo "-----" +echo "Modules: $loaded" +echo "-----" + + +source ${ENV_DIR}/venv/bin/activate +# source ${ENV_DIR}/scripts/environment.sh # ADD + +## Debug/logging flags +export LOGLEVEL=INFO +# export NCCL_DEBUG=WARN +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 diff --git a/pretrain/scripts/v5-test/train/params/7.7b-llama3-ecjk.sh b/pretrain/scripts/v5-test/train/params/7.7b-llama3-ecjk.sh new file mode 100644 index 0000000..4ab1ccf --- /dev/null +++ b/pretrain/scripts/v5-test/train/params/7.7b-llama3-ecjk.sh @@ -0,0 +1,159 @@ +# Pretraining hyperparameters for v4 7.7B. 
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 1,859,665 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 1,859,665 + 6,652 = 1,866,317 +# 100B: ceil( 113,460,356,693 / 8192 / 1024 ) == 13,526 +# 100B sum: 1,859,665 + 13,526 = 1,873,191 +# 300B: ceil( 337,681,167,151 / 8192 / 1024 ) == 40,255 +# 300B sum: 1,859,665 + 40,255 = 1,899,920 +MIDTRAIN_START=1859665 +# TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +DATASET_SIZE=50B +TRAIN_ITERS=6652 # 50B +# MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + --lr-decay-iters ${TRAIN_ITERS} + # --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 2 + --global-batch-size 1024 +) + +# Parallelism +ALL_PARAMS+=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 2 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 50B, 100B, and 300B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + +TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. +# export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0176_merge_megatron_upstream + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v5-test/train/params/8x1.3b-llama3-ecjk.sh b/pretrain/scripts/v5-test/train/params/8x1.3b-llama3-ecjk.sh new file mode 100644 index 0000000..c3ec406 --- /dev/null +++ b/pretrain/scripts/v5-test/train/params/8x1.3b-llama3-ecjk.sh @@ -0,0 +1,175 @@ +# Pretraining hyperparameters for v4 7.7B. 
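# NOTE: This file defines the 8x1.3B MoE test configuration: a 1.3B-class dense backbone
# (16 layers, hidden size 2048) with 8 experts enabled via the MoE args section below,
# adapted from the dense 7.7B recipe referenced here.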
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 16 + --hidden-size 2048 + --ffn-hidden-size 7168 + --num-attention-heads 16 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 1,859,665 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 1,859,665 + 6,652 = 1,866,317 +# 100B: ceil( 113,460,356,693 / 8192 / 1024 ) == 13,526 +# 100B sum: 1,859,665 + 13,526 = 1,873,191 +# 300B: ceil( 337,681,167,151 / 8192 / 1024 ) == 40,255 +# 300B sum: 1,859,665 + 40,255 = 1,899,920 +MIDTRAIN_START=1859665 +# TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +DATASET_SIZE=50B +TRAIN_ITERS=6652 # 50B +# MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + --lr-decay-iters ${TRAIN_ITERS} + # --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 1 + --global-batch-size 1024 +) + +# Parallelism +ALL_PARAMS+=( + # model parallel size is set to 2 for 2 node training. + # (World size (=8 GPUs)) % ((model parallel size) x (moe parallel size)) should be 0. + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 50B, 100B, and 300B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + +TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# MoE args +# See https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md +ALL_PARAMS+=( + --num-experts 8 + --expert-model-parallel-size 8 + --moe-grouped-gemm + # --moe-permute-fusion # Not compatible with `TE < 2.1.0` + --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. + --moe-router-topk 2 + --moe-aux-loss-coeff 1e-2 + --use-distributed-optimizer + --moe-token-dispatcher-type alltoall +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. +# export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0176_merge_megatron_upstream + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v5-test/train/params/8x7.7b-llama3-ecjk.sh b/pretrain/scripts/v5-test/train/params/8x7.7b-llama3-ecjk.sh new file mode 100644 index 0000000..afdd44a --- /dev/null +++ b/pretrain/scripts/v5-test/train/params/8x7.7b-llama3-ecjk.sh @@ -0,0 +1,175 @@ +# Pretraining hyperparameters for v4 7.7B. 
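# NOTE: This file defines the 8x7.7B MoE test configuration: the dense 7.7B backbone below with
# 8 experts enabled via the MoE args section, created following the Megatron-LM MoE Quick Start
# as described in the top-level README.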
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 1,859,665 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 1,859,665 + 6,652 = 1,866,317 +# 100B: ceil( 113,460,356,693 / 8192 / 1024 ) == 13,526 +# 100B sum: 1,859,665 + 13,526 = 1,873,191 +# 300B: ceil( 337,681,167,151 / 8192 / 1024 ) == 40,255 +# 300B sum: 1,859,665 + 40,255 = 1,899,920 +MIDTRAIN_START=1859665 +# TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +DATASET_SIZE=50B +TRAIN_ITERS=6652 # 50B +# MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + --lr-decay-iters ${TRAIN_ITERS} + # --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 1 + --global-batch-size 256 +) + +# Parallelism +ALL_PARAMS+=( + # model parallel size is set to 2 for 2 node training. + # (World size (=8 GPUs)) % ((model parallel size) x (moe parallel size)) should be 0. + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 4 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 50B, 100B, and 300B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + +TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# MoE args +# See https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md +ALL_PARAMS+=( + --num-experts 8 + --expert-model-parallel-size 8 + --moe-grouped-gemm + # --moe-permute-fusion # Not compatible with `TE < 2.1.0` + --moe-router-load-balancing-type aux_loss # options: aux_loss, sinkhorn, none. Default is aux_loss. + --moe-router-topk 2 + --moe-aux-loss-coeff 1e-2 + --use-distributed-optimizer + --moe-token-dispatcher-type alltoall +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. 
+# export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0176_merge_megatron_upstream + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v5-test/train/qsub_train.sh b/pretrain/scripts/v5-test/train/qsub_train.sh new file mode 100644 index 0000000..4a44b60 --- /dev/null +++ b/pretrain/scripts/v5-test/train/qsub_train.sh @@ -0,0 +1,63 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -N 0176_merge_megatron_upstream +#PBS -l walltime=10000:00:00 +#PBS -m n + +cd $PBS_O_WORKDIR + +JOBID=${PBS_JOBID%%.*} +mkdir -p ${TASK_DIR}/logs +LOGFILE=${TASK_DIR}/logs/train-${JOBID}.out +ERRFILE=${TASK_DIR}/logs/train-${JOBID}.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +EXPERIMENT_DIR=/home/ach17726fj/experiments/0176_megatron_upstream_merge/ +SCRIPT_DIR=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v5-test/train +# Takes $ENV_DIR from the environment variable +# ENV_DIR=${EXPERIMENT_DIR}/environments +# ENV_DIR=${EXPERIMENT_DIR}/environment2 +# ENV_DIR=${EXPERIMENT_DIR}/test_environment + +# Setup environment +source ${SCRIPT_DIR}/common/setup.sh + +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | hostname -f) +export MASTER_PORT=$((10000 + RANDOM % 1000)) +echo "hostname: ${MASTER_ADDR}" + +NUM_NODES=$(wc -l < $PBS_NODEFILE) +NUM_GPUS_PER_NODE=8 +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) +echo "nnodes: ${NUM_NODES}; ngpus: ${NUM_GPUS}" +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +cat $PBS_NODEFILE + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_50B.sh +echo "TRAIN_DATA_PATH: ${TRAIN_DATA_PATH}" + +# Load ALL_PARAMS +source ${SCRIPT_DIR}/params/${PARAM_NAME}.sh +echo "ALL_PARAMS: ${ALL_PARAMS[@]}" + +# export NVTE_FUSED_ATTN=0 + +mpirun \ + --display-allocation \ + --report-bindings \ + --oversubscribe \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + ${ALL_PARAMS[@]} diff --git a/pretrain/scripts/v5-test/train/run_train.sh b/pretrain/scripts/v5-test/train/run_train.sh new file mode 100644 index 0000000..910ee23 --- /dev/null +++ b/pretrain/scripts/v5-test/train/run_train.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 4 ]; then + >&2 echo "Usage: $0 <task_dir> <param_name> <num_nodes> <env_dir>" + >&2 echo "Example: $0 v4-high-quality v3-13b 32 /path/to/env_dir" + exit 1 +fi + +task_dir=$1; shift +param_name=$1; shift +num_nodes=$1; shift +env_dir=$1; shift + +script_root=/home/ach17726fj/experiments/0176_megatron_upstream_merge/scripts/pretrain/scripts/v5-test + +qsub -l select=${num_nodes} \ + -v TASK_DIR=${task_dir},PARAM_NAME=${param_name},ENV_DIR=${env_dir},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + ${script_root}/train/qsub_train.sh
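For reference, a minimal end-to-end sketch of how these scripts fit together. The paths are placeholders; `<env_dir>` is assumed to be the same directory that was passed to the installer as `TARGET_DIR`, since `qsub_train.sh` expects `${ENV_DIR}/venv` and `${ENV_DIR}/src/Megatron-LM` to exist there.

```bash
# 1) Build the training environment (submits a PBS installer job).
cd installer-abci/
bash run_setup.sh /path/to/env_dir

# 2) Submit a training job: <task_dir> <param_name> <num_nodes> <env_dir>.
cd ..
bash train/run_train.sh $(realpath tasks/v4-dolmino-mix-1124) 7.7b-llama3-ecjk 1 /path/to/env_dir
```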