-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubmit.sh
More file actions
49 lines (41 loc) · 1.62 KB
/
Copy pathsubmit.sh
File metadata and controls
49 lines (41 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/bash
#
# Slurm batch script for the "llm-train" job.
# Resource request (see the #SBATCH lines): 1 node on the "gpus" partition,
# 1 task per node, 8 GPUs per node, node allocated exclusively.
# stdout/stderr are written under logs/ using the job id (%j) and the
# node name (%N) in the file name.
#
#SBATCH --job-name=llm-train
#SBATCH --partition=gpus
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --exclusive
#SBATCH --output=logs/%j_%N.out
#SBATCH --error=logs/%j_%N.err

# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# NOTE(review): the --output/--error files above live in logs/, and Slurm
# presumably opens them before this line runs, so logs/ likely has to exist at
# submit time already; this mkdir then only helps later submissions. Confirm.
mkdir -p logs

# Progress messages go to stderr so they end up in the .err log.
printf '[%s] Job %s starting on %s\n' "$(date)" "$SLURM_JOB_ID" "$(hostname)" >&2
# ── Rendezvous info derived from SLURM ──────────────────────────────────────
# Sets the values later passed to the distributed launcher:
#   MASTER_ADDR     first hostname of the job's node list
#   MASTER_PORT     rendezvous port; 29500 unless MASTER_PORT is already set
#   NNODES          number of nodes in the allocation
#   NPROC_PER_NODE  processes per node, taken from the requested GPU count

# Give a readable error instead of a bare "unbound variable" when the script
# is started outside of an sbatch allocation.
: "${SLURM_JOB_NODELIST:?not set - submit this script with sbatch}"
: "${SLURM_NNODES:?not set - submit this script with sbatch}"
: "${SLURM_GPUS_PER_NODE:?not set - the job must request --gpus-per-node}"

# Capture the expanded host list and take its first line with a parameter
# expansion. Piping into `head -n1` can terminate scontrol with SIGPIPE on a
# long node list, and with pipefail + errexit that would abort the job.
node_hostnames=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
MASTER_ADDR=${node_hostnames%%$'\n'*}
if [[ -z "$MASTER_ADDR" ]]; then
  echo "[ERROR] Could not resolve a master host from SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST" >&2
  exit 1
fi

# Keep the historical default but let the submitter pick another port.
MASTER_PORT=${MASTER_PORT:-29500}
NNODES=$SLURM_NNODES

# --gpus-per-node accepts "[type:]count" (e.g. "a100:8"); keep only the count.
# A comma-separated multi-type request is rejected by the check below rather
# than being silently truncated.
NPROC_PER_NODE=${SLURM_GPUS_PER_NODE#*:} # GPUs per node
if [[ ! "$NPROC_PER_NODE" =~ ^[1-9][0-9]*$ ]]; then
  echo "[ERROR] Cannot derive a GPU count from SLURM_GPUS_PER_NODE=$SLURM_GPUS_PER_NODE" >&2
  exit 1
fi

echo "Master: $MASTER_ADDR:$MASTER_PORT | Nodes: $NNODES | GPUs/node: $NPROC_PER_NODE"
# ── Python environment ──────────────────────────────────────────────────────
# Activates the project's virtualenv. The path is relative, so it is resolved
# against the directory the job runs in.
echo "[$(date)] Activating venv..." >&2
venv_activate=../Hackhaton-PyTorch/.venv/bin/activate

# Activation scripts are not guaranteed to be safe under `set -u`: an unset
# variable referenced inside a sourced file terminates the whole shell, so the
# error branch below would never run. Relax nounset only for the duration of
# the source and restore it afterwards if it was on.
case $- in
  *u*) restore_nounset=1 ;;
  *) restore_nounset=0 ;;
esac
set +u
# shellcheck disable=SC1090  # path is held in a variable on purpose
if ! source "$venv_activate"; then
  echo "[ERROR] Failed to activate venv at $venv_activate" >&2
  exit 1
fi
if [[ "$restore_nounset" == 1 ]]; then
  set -u
fi
unset restore_nounset
echo "[$(date)] Venv activated" >&2
# ── Launch one torchrun per node via srun ────────────────────────────────────
# The command line is assembled from two arrays so that each option sits on
# its own line without backslash continuations.

# Options for the torch.distributed.run launcher (rendezvous + process counts).
launcher_opts=(
  --nnodes="$NNODES"
  --nproc_per_node="$NPROC_PER_NODE"
  --rdzv_backend=c10d
  --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT"
  --rdzv_id="$SLURM_JOB_ID"
)

# Arguments forwarded unchanged to train.py.
training_opts=(
  --data_dir /home/data/
  --checkpoint_path checkpoint.pt
  --seq_len 1024
  --batch_size 8
  --grad_accum_steps 4
  --max_steps 5000
  --time_limit_min 10
  --eval_every_steps 100
  --eval_max_batches 200
)

srun python -m torch.distributed.run "${launcher_opts[@]}" train.py "${training_opts[@]}"