Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,30 @@ image: whatcanyousee/verl:ngc-cu124-vllm0.8.3-sglang0.4.5-mcore0.12.0-te2.2
integrations:
- integration_type: git_repo
git_repo: orby-ai-engineering/verl
git_branch: main # TODO: Change this according to your experiment!
git_branch: rishu/grounding_subtask_full # TODO: Change this according to your experiment!
pip_install: .
ssh_clone: true

compute:
gpus: 16 # Number of GPUs to use (TODO: Change this according to your experiment!)
gpus: 64 # Number of GPUs to use (TODO: Change this according to your experiment!)
cluster: r8z13p2 # TODO: Change this according to your experiment!
gpu_type: h100_80gb # TODO: Change this according to your experiment!
node_names: [10.0.148.165, 10.0.151.72, 10.0.151.82, 10.0.152.224, 10.0.153.79, 10.0.154.216, 10.0.155.149, 10.0.155.205] # Currently excluded from the list: 10.0.156.119, 10.0.158.66, 10.0.158.95

command: |
# TODO: Set these variables before running the script.
export NUM_NODES=2
export NUM_NODES=8
export MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct
export PROJECT_NAME=verl_sft_grounding
export DATASET_VERSION=os_atlas
export PROJECT_NAME=verl_sft_grounding_subtask
export DATASET_VERSION=os_atlas_uground_subtask
export EXPERIMENT_NAME=$MODEL_NAME-$DATASET_VERSION-sft
export DATA_SPLIT=5k # Set the data split here (e.g. 100k, 5k, 0.05k).
export DATA_SPLIT=100k # Set the data split here (e.g. 100k, 5k, 0.05k).
export S3_CHECKPOINT_DIR=s3://orby-osu-va/verl-checkpoints/$PROJECT_NAME/$EXPERIMENT_NAME/$DATA_SPLIT
export TRAIN_BATCH_SIZE=32
export TRAIN_BATCH_SIZE=128
export MICRO_BATCH_SIZE_PER_GPU=2
export FILTER_OVERLONG_PROMPTS_WORKERS=24 # 24 seems to work well for OS-Atlas + UGround data.
export TRAIN_DIR=s3://orby-osu-va/Rishu-SFT-Dataset/os_atlas/subtask/$DATA_SPLIT/train/
export TEST_DIR=s3://orby-osu-va/Rishu-SFT-Dataset/os_atlas/subtask/$DATA_SPLIT/test/
export TRAIN_DIR=s3://orby-osu-va/Rishu-Experiment/UGround+OsAtlas+Subtask/full-Uground+OsAtlas+Subtask/train/
export TEST_DIR=s3://orby-osu-va/Rishu-Experiment/UGround+OsAtlas+Subtask/full-Uground+OsAtlas+Subtask/test/
export MAX_PROMPT_LENGTH=7100

cd /workspace/verl
Expand Down