[usability] accelerate support initial commit

OptimalScale · Feb 22, 2025 · beba6ef · beba6ef
1 parent 65b9125
commit beba6ef
Show file tree

Hide file tree

Showing 33 changed files with 158 additions and 148 deletions.
diff --git a/configs/iterative_dpo.yaml b/configs/iterative_dpo.yaml
@@ -17,7 +17,6 @@ preprocessing_num_workers: 16
 output_dir: ./output_models/iterative_dpo
 run_name: iterative_dpo
 random_seed: 42
-use_accelerator: True
 enable_distributed_inference: True
 distributed_inference_num_instances: 8
 initial_iter_idx: 0 # 0 refers to the first dataset in dataset_path_list

diff --git a/examples/benchmarking.py b/examples/benchmarking.py
@@ -214,7 +214,7 @@ def main():
     dataset_name = benchmarking_args.dataset_name
     # metric = pipeline_args.metric
     if is_lmflow_local_benchmarking(dataset_name):   # TODO (@Jipeng)
-        model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config)
+        model = AutoModel.get_model(model_args, do_train=False, ds_config=ds_config)
         run_lmflow_local_benchmarking(dataset_name,pipeline_name,model_args,pipeline_args,model)  # Pass args TODO (@Jipeng)
     elif is_lm_evaluation_benchmarking(dataset_name):
         model = model_args.model_name_or_path

diff --git a/examples/chatbot.py b/examples/chatbot.py
@@ -64,10 +64,9 @@ def main():
 
     model = AutoModel.get_model(
         model_args,
-        tune_strategy='none',
+        do_train=False,
         ds_config=ds_config,
         device=pipeline_args.device,
-        use_accelerator=True,
     )
 
     # We don't need input data, we will read interactively from stdin

diff --git a/examples/chatbot_gradio.py b/examples/chatbot_gradio.py
@@ -110,7 +110,7 @@ class ChatbotArguments:
 
 model = AutoModel.get_model(
     model_args,
-    tune_strategy='none',
+    do_train=False,
     ds_config=ds_config,
     device=pipeline_args.device,
     torch_dtype=torch.float16

diff --git a/examples/evaluation.py b/examples/evaluation.py
@@ -36,9 +36,8 @@
 
 model = AutoModel.get_model(
     model_args, 
-    tune_strategy='none', 
+    do_train=False, 
     ds_config=ds_config, 
-    use_accelerator=pipeline_args.use_accelerator_for_evaluator
 )
 dataset = Dataset(data_args)
 

diff --git a/examples/finetune_multi_modal.py b/examples/finetune_multi_modal.py
@@ -59,7 +59,7 @@ def main():
     # do not register deepspeed in the model.
     # with_deepspeed flag may be removed
     # by modifying the tune strategy in the future.
-    model = AutoModel.get_model(model_args, tune_strategy='none',
+    model = AutoModel.get_model(model_args, do_train=True,
                                 ds_config=pipeline_args.deepspeed,
                                 custom_model=True,
                                 with_deepspeed=False,

diff --git a/examples/inference.py b/examples/inference.py
@@ -39,10 +39,9 @@ def main():
 
     model = AutoModel.get_model(
         model_args,
-        tune_strategy='none',
+        do_train=False,
         ds_config=ds_config,
         device=pipeline_args.device,
-        use_accelerator=True,
     )
 
     # We don't need input data, we will read interactively from stdin

diff --git a/examples/merge_lora.py b/examples/merge_lora.py
@@ -62,7 +62,7 @@ def main():
     model_args.use_lora = True
     model = AutoModel.get_model(
         model_args, 
-        tune_strategy='none', 
+        do_train=False, 
         device=merge_lora_args.device,
         ds_config=merge_lora_args.ds_config
     )

diff --git a/examples/rm_inference.py b/examples/rm_inference.py
@@ -40,7 +40,7 @@ def main():
         model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
 
     dataset = Dataset(data_args)
-    model = AutoModel.get_model(model_args, tune_strategy='none', use_accelerator=pipeline_args.use_accelerator)
+    model = AutoModel.get_model(model_args, do_train=False)
     inferencer = AutoPipeline.get_pipeline(
         pipeline_name=pipeline_name,
         model_args=model_args,

diff --git a/examples/vis_chatbot.py b/examples/vis_chatbot.py
@@ -105,7 +105,7 @@ def main():
         ds_config = json.load(f)
     model = AutoModel.get_model(
         model_args,
-        tune_strategy='none',
+        do_train=False,
         ds_config=ds_config,
         device=pipeline_args.device,
         custom_model=model_args.custom_model,

diff --git a/examples/vis_chatbot_gradio.py b/examples/vis_chatbot_gradio.py
@@ -245,7 +245,7 @@ def start_inferencer(
 
     model = AutoModel.get_model(
         model_args,
-        tune_strategy='none',
+        do_train=False,
         ds_config=ds_config,
         device=pipeline_args.device,
         custom_model=model_args.custom_model,

diff --git a/examples/vllm_inference.py b/examples/vllm_inference.py
@@ -40,7 +40,7 @@ def main():
         model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
 
     dataset = Dataset(data_args)
-    model = AutoModel.get_model(model_args, tune_strategy='none')
+    model = AutoModel.get_model(model_args, do_train=False)
     inferencer = AutoPipeline.get_pipeline(
         pipeline_name=pipeline_name,
         model_args=model_args,

diff --git a/requirements.txt b/requirements.txt
@@ -5,7 +5,6 @@ tokenizers>=0.13.3
 peft>=0.10.0
 torch>=2.0.1
 wandb
-deepspeed>=0.14.4
 sentencepiece
 transformers>=4.31.0
 cpm_kernels==1.0.11

diff --git a/scripts/run_chatbot.sh b/scripts/run_chatbot.sh
@@ -16,7 +16,6 @@ accelerate launch --config_file configs/accelerator_multigpu_config.yaml \
   examples/chatbot.py \
     --deepspeed configs/ds_config_chatbot.json \
     --model_name_or_path ${model} \
-    --use_accelerator True \
     --max_new_tokens 256 \
     --temperature 1.0 \
     --end_string "#" \

diff --git a/scripts/run_evaluation_accelerator.sh b/scripts/run_evaluation_accelerator.sh
@@ -13,5 +13,4 @@ CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singl
     --metric accuracy \
     --output_dir output_dir/accelerator_1_card \
     --inference_batch_size_per_device 1 \
-    --use_accelerator_for_evaluator True \
     --torch_dtype bfloat16
diff --git a/scripts/run_inference.sh b/scripts/run_inference.sh
@@ -15,7 +15,6 @@ accelerate launch --config_file configs/accelerator_multigpu_config.yaml \
   examples/inference.py \
     --deepspeed configs/ds_config_chatbot.json \
     --model_name_or_path ${model} \
-    --use_accelerator True \
     --max_new_tokens 256 \
     --temperature 1.0 \
     ${lora_args}
diff --git a/scripts/run_rm_inference.sh b/scripts/run_rm_inference.sh
@@ -61,7 +61,6 @@ accelerate launch --config_file configs/accelerator_multigpu_config.yaml \
         --trust_remote_code ${trust_remote_code} \
         --model_name_or_path ${model_name_or_path} \
         --arch_type text_regression \
-        --use_accelerator True \
         --block_size 4096 \
         --inference_batch_size 16 \
         --dataset_path ${dataset_path} \

diff --git a/service/app.py b/service/app.py
@@ -54,7 +54,7 @@ class AppArguments:
 local_rank = int(os.getenv("LOCAL_RANK", "0"))
 world_size = int(os.getenv("WORLD_SIZE", "1"))
 torch.cuda.set_device(local_rank)
-model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config, use_accelerator=True)
+model = AutoModel.get_model(model_args, do_train=False, ds_config=ds_config)
 accelerator = Accelerator()
 
 def stream_generate(inputs,context_len = 1024, max_new_tokens=128, end_string="##"):

diff --git a/setup.py b/setup.py
@@ -22,7 +22,8 @@
     "gradio": ["gradio"],
     "flask": ["flask", "flask_cors"],
     "flash_attn": ["flash-attn>=2.0.2"],
-    "trl": ["trl==0.8.0"]
+    "trl": ["trl==0.8.0"],
+    "deepspeed": ["deepspeed>=0.14.4"],
 }
 
 readme_path = os.path.join(folder, "README.md")

diff --git a/src/lmflow/args.py b/src/lmflow/args.py
@@ -924,8 +924,8 @@ class EvaluatorArguments:
             ),
         },
     )
-    use_accelerator_for_evaluator: bool = field(
-        default=False, metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed"},
+    use_accelerator_for_evaluator: Optional[bool] = field(
+        default=None, metadata={"help": "[Deprecated] Whether to use Huggingface Accelerator instead of Deepspeed"},
     )
 
     temperature: float = field(
@@ -942,6 +942,14 @@ class EvaluatorArguments:
         default=100,
         metadata={"help": "Maximum length during inference."},
     )
+
+    def __post_init__(self):
+        if self.use_accelerator_for_evaluator is not None:
+            logger.warning(
+                "You've specified `use_accelerator_for_evaluator`. This argument is deprecated. "
+                "It will not take effect and will be removed in a future version, "
+                "since LMFlow now can automatically detect whether is in Accelerate or Deepspeed environment."
+            )
 
 
 @dataclass
@@ -1061,8 +1069,8 @@ class InferencerArguments:
             "help": "whether turn on true random sampling during inference."
         },
     )
-    use_accelerator: bool = field(
-        default=False, metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed"},
+    use_accelerator: Optional[bool] = field(
+        default=None, metadata={"help": "[Deprecated] Whether to use Huggingface Accelerator instead of Deepspeed"},
     )
     use_beam_search: Optional[bool] = field(
         default=False,
@@ -1131,6 +1139,13 @@ class InferencerArguments:
     )
 
     def __post_init__(self):
+        if self.use_accelerator is not None:
+            logger.warning(
+                "You've specified `use_accelerator`. This argument is deprecated. "
+                "It will not take effect and will be removed in a future version, "
+                "since LMFlow now can automatically detect whether is in Accelerate or Deepspeed environment."
+            )
+
         if self.save_results:
             if self.results_path is None:
                 raise ValueError("Need to specify results_path when save_results is True.")

diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py
@@ -3,16 +3,12 @@
 """This is a class called HFDecoderModel which is a wrapper around transformers model and
 tokenizer classes. It has several methods such as __init__, tokenize, and train that are 
 used for training and fine-tuning the model. The __init__ method takes in several arguments
-such as model_args, tune_strategy, and ds_config, which are used to load the pretrained 
+such as model_args which are used to load the pretrained 
 model and tokenizer, and initialize the training settings.
 
 The tokenize method is used to tokenize the input text and return the input IDs and attention
 masks that can be fed to the model for training or inference.
 
-This class supports different tune_strategy options such as 'normal', 'none', 'lora', and
-'adapter', which allow for different fine-tuning settings of the model. However, the 'lora'
-and 'adapter' strategies are not yet implemented.
-
 Overall, this class provides a convenient interface for loading and fine-tuning transformer
 models and can be used for various NLP tasks such as language modeling, text classification,
 and question answering.
@@ -46,6 +42,7 @@
     conversation_tokenize_function
 )
 from lmflow.utils.versioning import is_ray_available, is_vllm_available, is_flash_attn_available
+from lmflow.utils.envs import is_accelerate_env
 
 
 logger = logging.getLogger(__name__)
@@ -74,11 +71,9 @@ class HFDecoderModel(DecoderModel, HFModelMixin, Tunable):
     model_args : 
         Model arguments such as model name, path, revision, etc.
 
-    tune_strategy : str or none,  default="normal".
-        A string representing the dataset backend. Defaults to "huggingface".
-    
-    ds_config :   
-        Deepspeed configuations.
+    do_train : bool, default True
+        Determines whether to prepare the model for training, including distributed env, model placement, quantization,
+        lora, etc.
     
     args : Optional.
         Positional arguments.
@@ -90,26 +85,16 @@ class HFDecoderModel(DecoderModel, HFModelMixin, Tunable):
     def __init__(
         self,
         model_args,
-        tune_strategy='normal',
-        ds_config=None,
+        do_train=True,
         device="gpu",
-        use_accelerator=False,
         *args,
         **kwargs
     ):
-        """
-        Initializes a HFDecoderModel instance.
-        :param model_args: dictionary with model arguments such as model name, path, revision, etc.
-        :param tune_strategy: tuning strategy: normal, none, lora or adapter
-        :param ds_config: deepspeed configuration for distributed training
-        """
         HFModelMixin.__init__(
             self,
             model_args=model_args,
-            do_train=True if tune_strategy == "normal" else False,
-            ds_config=ds_config,
+            do_train=do_train,
             device=device,
-            use_accelerator=use_accelerator,
             *args,
             **kwargs
         )
@@ -384,7 +369,7 @@ def __inference(self, inputs, *args, **kwargs):
             The generated sequence output 
         """
         with torch.no_grad():
-            if self.use_accelerator:
+            if is_accelerate_env():
                 outputs = self.backend_model.generate(
                     input_ids=inputs,
                     pad_token_id=self.tokenizer.pad_token_id,