diff --git a/orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml b/orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml index df4f427adf4..52de3be535d 100644 --- a/orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml +++ b/orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml @@ -4,29 +4,30 @@ image: whatcanyousee/verl:ngc-cu124-vllm0.8.3-sglang0.4.5-mcore0.12.0-te2.2 integrations: - integration_type: git_repo git_repo: orby-ai-engineering/verl - git_branch: main # TODO: Change this according to your experiment! + git_branch: rishu/grounding_subtask_full # TODO: Change this according to your experiment! pip_install: . ssh_clone: true compute: - gpus: 16 # Number of GPUs to use (TODO: Change this according to your experiment!) + gpus: 64 # Number of GPUs to use (TODO: Change this according to your experiment!) cluster: r8z13p2 # TODO: Change this according to your experiment! gpu_type: h100_80gb # TODO: Change this according to your experiment! + node_names: [10.0.148.165, 10.0.151.72, 10.0.151.82, 10.0.152.224, 10.0.153.79, 10.0.154.216, 10.0.155.149, 10.0.155.205] #, 10.0.156.119, 10.0.158.66, 10.0.158.95] command: | # TODO: Set these variables before running the script. - export NUM_NODES=2 + export NUM_NODES=8 export MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct - export PROJECT_NAME=verl_sft_grounding - export DATASET_VERSION=os_atlas + export PROJECT_NAME=verl_sft_grounding_subtask + export DATASET_VERSION=os_atlas_uground_subtask export EXPERIMENT_NAME=$MODEL_NAME-$DATASET_VERSION-sft - export DATA_SPLIT=5k # "Set the data split here (example 100k, 5k, 0.05k, etc.)" + export DATA_SPLIT=100k # "Set the data split here (example 100k, 5k, 0.05k, etc.)" export S3_CHECKPOINT_DIR=s3://orby-osu-va/verl-checkpoints/$PROJECT_NAME/$EXPERIMENT_NAME/$DATA_SPLIT - export TRAIN_BATCH_SIZE=32 + export TRAIN_BATCH_SIZE=128 export MICRO_BATCH_SIZE_PER_GPU=2 export FILTER_OVERLONG_PROMPTS_WORKERS=24 # (24 seems to work well for OSAtlas + Uground data) - export TRAIN_DIR=s3://orby-osu-va/Rishu-SFT-Dataset/os_atlas/subtask/$DATA_SPLIT/train/ - export TEST_DIR=s3://orby-osu-va/Rishu-SFT-Dataset/os_atlas/subtask/$DATA_SPLIT/test/ + export TRAIN_DIR=s3://orby-osu-va/Rishu-Experiment/UGround+OsAtlas+Subtask/full-Uground+OsAtlas+Subtask/train/ + export TEST_DIR=s3://orby-osu-va/Rishu-Experiment/UGround+OsAtlas+Subtask/full-Uground+OsAtlas+Subtask/test/ export MAX_PROMPT_LENGTH=7100 cd /workspace/verl