uniphore · RishuG-work · Jul 7, 2025 · Jul 8, 2025 · Jul 8, 2025
diff --git a/orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml b/orby/mcli/sft_qwen2_5_vl_7b_grounding.yaml
@@ -4,29 +4,30 @@ image: whatcanyousee/verl:ngc-cu124-vllm0.8.3-sglang0.4.5-mcore0.12.0-te2.2
 integrations:
   - integration_type: git_repo
     git_repo: orby-ai-engineering/verl
-    git_branch: main # TODO: Change this according to your experiment!
+    git_branch: rishu/grounding_subtask_full # TODO: Change this according to your experiment!
     pip_install: .
     ssh_clone: true
 
 compute:
-  gpus: 16 # Number of GPUs to use (TODO: Change this according to your experiment!)
+  gpus: 64 # Number of GPUs to use (TODO: Change this according to your experiment!)
   cluster: r8z13p2 # TODO: Change this according to your experiment!
   gpu_type: h100_80gb # TODO: Change this according to your experiment!
+  node_names: [10.0.148.165, 10.0.151.72, 10.0.151.82, 10.0.152.224, 10.0.153.79, 10.0.154.216, 10.0.155.149, 10.0.155.205]  # 8 pinned nodes, matching NUM_NODES below. Commented-out entries: 10.0.156.119, 10.0.158.66, 10.0.158.95
 
 command: |
   # TODO: Set these variables before running the script.
-  export NUM_NODES=2
+  export NUM_NODES=8
   export MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct
-  export PROJECT_NAME=verl_sft_grounding
-  export DATASET_VERSION=os_atlas
+  export PROJECT_NAME=verl_sft_grounding_subtask
+  export DATASET_VERSION=os_atlas_uground_subtask
   export EXPERIMENT_NAME=$MODEL_NAME-$DATASET_VERSION-sft
-  export DATA_SPLIT=5k # "Set the data split here (example 100k, 5k, 0.05k, etc.)"
+  export DATA_SPLIT=100k # Data split label (e.g. 100k, 5k, 0.05k); used in S3_CHECKPOINT_DIR. TRAIN_DIR/TEST_DIR below are fixed paths and do not read it.
   export S3_CHECKPOINT_DIR=s3://orby-osu-va/verl-checkpoints/$PROJECT_NAME/$EXPERIMENT_NAME/$DATA_SPLIT
-  export TRAIN_BATCH_SIZE=32
+  export TRAIN_BATCH_SIZE=128
   export MICRO_BATCH_SIZE_PER_GPU=2 
   export FILTER_OVERLONG_PROMPTS_WORKERS=24 # (24 seems to work well for OSAtlas + Uground data)
-  export TRAIN_DIR=s3://orby-osu-va/Rishu-SFT-Dataset/os_atlas/subtask/$DATA_SPLIT/train/
-  export TEST_DIR=s3://orby-osu-va/Rishu-SFT-Dataset/os_atlas/subtask/$DATA_SPLIT/test/
+  export TRAIN_DIR=s3://orby-osu-va/Rishu-Experiment/UGround+OsAtlas+Subtask/full-Uground+OsAtlas+Subtask/train/
+  export TEST_DIR=s3://orby-osu-va/Rishu-Experiment/UGround+OsAtlas+Subtask/full-Uground+OsAtlas+Subtask/test/
   export MAX_PROMPT_LENGTH=7100
 
   cd /workspace/verl