From a3ea93715208c3b22df8647dbeba99c1446b7a62 Mon Sep 17 00:00:00 2001
From: Costa Huang
Date: Thu, 9 Jan 2025 12:57:50 -0500
Subject: [PATCH] Add `--try_auto_save_to_beaker` arg (#505)

* Add `--try_auto_save_to_beaker` arg

* push changes
---
 open_instruct/dpo_tune.py                 | 9 ++++++---
 open_instruct/dpo_tune_cache.py           | 9 ++++++---
 open_instruct/finetune.py                 | 9 ++++++---
 open_instruct/ppo_vllm_thread_ray.py      | 4 +++-
 open_instruct/ppo_vllm_thread_ray_gtrl.py | 4 +++-
 5 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/open_instruct/dpo_tune.py b/open_instruct/dpo_tune.py
index 92a783cfb..d5364661c 100644
--- a/open_instruct/dpo_tune.py
+++ b/open_instruct/dpo_tune.py
@@ -22,6 +22,7 @@
 import math
 import os
 import random
+import shutil
 import subprocess
 import time
 from copy import deepcopy
@@ -365,6 +366,8 @@ class FlatArguments:
         default=0.001,
         metadata={"help": "Weight for load balancing loss if applicable."},
     )
+    try_auto_save_to_beaker: bool = True
+    """Whether to try to save the model to Beaker dataset `/output` after training"""
     push_to_hub: bool = True
     """Whether to upload the saved model to huggingface"""
     hf_entity: Optional[str] = None
@@ -487,9 +490,6 @@ def main(args: FlatArguments):
 
     if is_beaker_job():
         beaker_config = maybe_get_beaker_config()
-        # try saving to the beaker `/output`, which will be uploaded to the beaker dataset
-        if len(beaker_config.beaker_dataset_id_urls) > 0:
-            args.output_dir = "/output"
 
     accelerator_log_kwargs = {}
 
@@ -1119,6 +1119,9 @@ def load_model():
         if accelerator.is_local_main_process:
             clean_last_n_checkpoints(args.output_dir, keep_last_n_checkpoints=0)
 
+    if args.try_auto_save_to_beaker and accelerator.is_main_process and len(beaker_config.beaker_dataset_id_urls) > 0 and args.output_dir != "/output":
+        shutil.copytree(args.output_dir, "/output", dirs_exist_ok=True)
+
     if is_beaker_job() and accelerator.is_main_process:
         # dpo script only supports these two options right now for datasets
         if args.dataset_mixer:
diff --git a/open_instruct/dpo_tune_cache.py b/open_instruct/dpo_tune_cache.py
index 73b180e5f..78385cdfd 100644
--- a/open_instruct/dpo_tune_cache.py
+++ b/open_instruct/dpo_tune_cache.py
@@ -22,6 +22,7 @@
 import math
 import os
 import random
+import shutil
 import subprocess
 import time
 from dataclasses import dataclass, field
@@ -375,6 +376,8 @@ class FlatArguments:
     )
     concatenated_forward: bool = True
     """Whether to concatenate chosen and rejected for DPO training; True is good but you can set to False for saving memory."""
+    try_auto_save_to_beaker: bool = True
+    """Whether to try to save the model to Beaker dataset `/output` after training"""
     push_to_hub: bool = True
     """Whether to upload the saved model to huggingface"""
     hf_entity: Optional[str] = None
@@ -501,9 +504,6 @@ def main(args: FlatArguments):
 
     if is_beaker_job():
         beaker_config = maybe_get_beaker_config()
-        # try saving to the beaker `/output`, which will be uploaded to the beaker dataset
-        if len(beaker_config.beaker_dataset_id_urls) > 0:
-            args.output_dir = "/output"
 
     accelerator_log_kwargs = {}
 
@@ -1139,6 +1139,9 @@ def load_model():
         if accelerator.is_local_main_process:
             clean_last_n_checkpoints(args.output_dir, keep_last_n_checkpoints=0)
 
+    if args.try_auto_save_to_beaker and accelerator.is_main_process and len(beaker_config.beaker_dataset_id_urls) > 0 and args.output_dir != "/output":
+        shutil.copytree(args.output_dir, "/output", dirs_exist_ok=True)
+
     if is_beaker_job() and accelerator.is_main_process:
         # dpo script only supports these two options right now for datasets
         if args.dataset_mixer:
diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py
index 355d103b5..c6a20cbd9 100644
--- a/open_instruct/finetune.py
+++ b/open_instruct/finetune.py
@@ -19,6 +19,7 @@
 import math
 import os
 import random
+import shutil
 import subprocess
 import time
 from dataclasses import dataclass, field
@@ -337,6 +338,8 @@ class FlatArguments:
         default=0.5,
         metadata={"help": "Weight for load balancing loss if applicable."},
     )
+    try_auto_save_to_beaker: bool = True
+    """Whether to try to save the model to Beaker dataset `/output` after training"""
     push_to_hub: bool = True
     """Whether to upload the saved model to huggingface"""
     hf_entity: Optional[str] = None
@@ -471,9 +474,6 @@ def main(args: FlatArguments):
 
     if is_beaker_job():
         beaker_config = maybe_get_beaker_config()
-        # try saving to the beaker `/output`, which will be uploaded to the beaker dataset
-        if len(beaker_config.beaker_dataset_id_urls) > 0:
-            args.output_dir = "/output"
 
     accelerator_log_kwargs = {}
 
@@ -1033,6 +1033,9 @@ def main(args: FlatArguments):
         if accelerator.is_local_main_process:
             clean_last_n_checkpoints(args.output_dir, keep_last_n_checkpoints=0)
 
+    if args.try_auto_save_to_beaker and accelerator.is_main_process and len(beaker_config.beaker_dataset_id_urls) > 0 and args.output_dir != "/output":
+        shutil.copytree(args.output_dir, "/output", dirs_exist_ok=True)
+
     if is_beaker_job() and accelerator.is_main_process:
         # dpo script only supports these two options right now for datasets
         if args.dataset_mixer:
diff --git a/open_instruct/ppo_vllm_thread_ray.py b/open_instruct/ppo_vllm_thread_ray.py
index e05e05beb..19e0ad192 100644
--- a/open_instruct/ppo_vllm_thread_ray.py
+++ b/open_instruct/ppo_vllm_thread_ray.py
@@ -274,6 +274,8 @@ class Args:
     """Whether to launch beaker evaluation jobs after training"""
     try_launch_beaker_eval_jobs_on_weka: bool = False
     """Whether to launch beaker evaluation jobs after training on weka"""
+    try_auto_save_to_beaker: bool = True
+    """Whether to try to save the model to Beaker dataset `/output` after training"""
     oe_eval_tasks: Optional[List[str]] = None
     """The beaker evaluation tasks to launch"""
     hf_metadata_dataset: Optional[str] = "allenai/tulu-3-evals"
@@ -1300,7 +1302,7 @@ def vllm_generate(
 
         # Ai2 logic: we use /output to store the artifacts of the job, so we
         # make a copy of the model to `/output` in the end.
-        if self.rank == 0 and len(self.beaker_config.beaker_dataset_id_urls) > 0:
+        if args.try_auto_save_to_beaker and self.rank == 0 and len(self.beaker_config.beaker_dataset_id_urls) > 0 and args.output_dir != "/output":
             shutil.copytree(args.output_dir, "/output", dirs_exist_ok=True)
 
         print("finished training")
diff --git a/open_instruct/ppo_vllm_thread_ray_gtrl.py b/open_instruct/ppo_vllm_thread_ray_gtrl.py
index 085133203..0e6626eec 100644
--- a/open_instruct/ppo_vllm_thread_ray_gtrl.py
+++ b/open_instruct/ppo_vllm_thread_ray_gtrl.py
@@ -289,6 +289,8 @@ class Args:
     """Whether to launch beaker evaluation jobs after training"""
     try_launch_beaker_eval_jobs_on_weka: bool = False
     """Whether to launch beaker evaluation jobs after training on weka"""
+    try_auto_save_to_beaker: bool = True
+    """Whether to try to save the model to Beaker dataset `/output` after training"""
     oe_eval_tasks: Optional[List[str]] = None
     """The beaker evaluation tasks to launch"""
     hf_metadata_dataset: Optional[str] = "allenai/tulu-3-evals"
@@ -1375,7 +1377,7 @@ def vllm_generate(
 
         # Ai2 logic: we use /output to store the artifacts of the job, so we
         # make a copy of the model to `/output` in the end.
-        if self.rank == 0 and len(self.beaker_config.beaker_dataset_id_urls) > 0:
+        if args.try_auto_save_to_beaker and self.rank == 0 and len(self.beaker_config.beaker_dataset_id_urls) > 0 and args.output_dir != "/output":
             shutil.copytree(args.output_dir, "/output", dirs_exist_ok=True)
 
         print("finished training")
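
The net effect of the patch: instead of silently redirecting `args.output_dir` to `/output` at startup whenever the job has a Beaker result dataset (the removed lines in the three tuning scripts), each script now trains into the user-chosen output directory and, at the very end, the main process copies it to `/output` so Beaker uploads it as a result dataset; the new `--try_auto_save_to_beaker` flag (default True) turns this behavior off. Read as a standalone helper, the repeated condition looks like the sketch below. This is an illustrative sketch, not code from the repository: the helper name, the `BeakerConfig` stand-in, and the `enabled` and `is_main_process` parameters are invented here for clarity, while the `/output` convention, the `beaker_dataset_id_urls` check, and `shutil.copytree(..., dirs_exist_ok=True)` come straight from the diff.

    # Sketch only: restates the save-to-Beaker logic the patch threads
    # through each training script, under the assumptions named above.
    import shutil
    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class BeakerConfig:
        # URLs of the Beaker result datasets attached to the job, if any.
        beaker_dataset_id_urls: List[str] = field(default_factory=list)

    def try_auto_save_to_beaker(
        output_dir: str,
        beaker_config: BeakerConfig,
        is_main_process: bool,
        enabled: bool = True,
    ) -> bool:
        """Copy `output_dir` to `/output` so Beaker uploads it as a result dataset.

        Mirrors the condition the patch adds: only the main process copies,
        only when the job actually has a result dataset to upload to, and only
        when training did not already write to `/output` (copying a tree onto
        itself would raise). Returns True if a copy was made.
        """
        if (
            enabled
            and is_main_process
            and len(beaker_config.beaker_dataset_id_urls) > 0
            and output_dir != "/output"
        ):
            shutil.copytree(output_dir, "/output", dirs_exist_ok=True)
            return True
        return False

Note the `args.output_dir != "/output"` guard: under the old behavior the scripts themselves set `output_dir` to `/output`, so without the guard the final copy could degenerate into copying a directory onto itself. If the scripts parse booleans in the usual HfArgumentParser style (an assumption, not shown in the diff), the feature can be disabled with `--try_auto_save_to_beaker False`.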