From de314c0ea0da05c5fc5369278cb6994f95329a84 Mon Sep 17 00:00:00 2001
From: YeonwooSung
Date: Sun, 18 Aug 2024 21:21:43 +0900
Subject: [PATCH] feat: Add accelerate FSDP script and docs for finetune llama3.1 70B

---
 .../README.md                 | 101 ++++++++++++++
 .../fine_tuning_FSDP_QLoRA.py | 125 ++++++++++++++++++
 2 files changed, 226 insertions(+)
 create mode 100644 DistributedTraining/finetune_llama31_with_accelerate/README.md
 create mode 100644 DistributedTraining/finetune_llama31_with_accelerate/fine_tuning_FSDP_QLoRA.py

diff --git a/DistributedTraining/finetune_llama31_with_accelerate/README.md b/DistributedTraining/finetune_llama31_with_accelerate/README.md
new file mode 100644
index 0000000..9fd6c00
--- /dev/null
+++ b/DistributedTraining/finetune_llama31_with_accelerate/README.md
@@ -0,0 +1,101 @@
# Finetune Llama3.1 70B with Accelerate

To run this example with FSDP, you need at least two GPUs, each with at least 24 GB of memory.
Consumer GPUs such as the RTX 3090 or RTX 4090 will work.

## Configuration

We need Hugging Face's Accelerate. Make sure it's installed and up to date:
```bash
pip install accelerate --upgrade
```

Then, configure it by running:
```bash
accelerate config
```

It will ask you several questions.
The goal here is to generate a configuration file that will be used for fine-tuning with FSDP.
Some of the questions can be difficult to answer if you are not familiar with how FSDP works.
If this is the case, you can skip this step and use an existing configuration file, such as this one:
```
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: true
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: false
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

You only have to modify `num_processes`, which is the number of GPUs on your machine.
Then, save it to a file, e.g., `config_fsdp.yaml`.

## Fine-tuning

Use Accelerate to launch the fine-tuning script:
```bash
accelerate launch --config_file config_fsdp.yaml fine_tuning_FSDP_QLoRA.py
```

### Trick 1

Since the model is sharded across GPUs during fine-tuning, the checkpoints only contain pieces of the model.
To save the full model, we need to gather all the pieces on one device.
This is achieved with the following code: the auto wrap policy (which handles the (Q)LoRA case) is set right after creating the trainer, and the state dict type is switched to `FULL_STATE_DICT` after training, just before saving:
```python
fsdp_plugin = trainer.accelerator.state.fsdp_plugin
fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
```
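For reference, here is a minimal sketch of how the adapter saved by `trainer.save_model` could be loaded back for inference. This is not part of the training script: it assumes the adapter was written to `./Llama3.1_70b_QLoRA/` (the `output_dir` used in `fine_tuning_FSDP_QLoRA.py`) and that your `peft` version provides `AutoPeftModelForCausalLM`:
```python
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

adapter_dir = "./Llama3.1_70b_QLoRA/"  # the output_dir of the training script

# Reloads the base model named in adapter_config.json and attaches the trained LoRA weights.
# device_map="auto" spreads the 70B weights across the available GPUs; a BitsAndBytesConfig can
# also be passed here to quantize the base model on load, as during training.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
```
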
### Trick 2

For QLoRA, the model needs to be prepared before training.
For single-GPU QLoRA fine-tuning, we would simply add this line:
```python
model = prepare_model_for_kbit_training(model, gradient_checkpointing_kwargs={'use_reentrant':True})
```

It does the following:
 1) Casts the layer norms and the language modeling head to fp32
 2) Freezes the parameters of the model
 3) Makes the output embedding layer require grads
 4) Activates gradient checkpointing

With FSDP, (1) doesn't seem to be possible and triggers an error when fine-tuning starts.
To avoid this casting, I implemented what `prepare_model_for_kbit_training` does, minus this first step:
```python
for name, param in model.named_parameters():
    # freeze base model's layers
    param.requires_grad = False

def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)

model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})
```

diff --git a/DistributedTraining/finetune_llama31_with_accelerate/fine_tuning_FSDP_QLoRA.py b/DistributedTraining/finetune_llama31_with_accelerate/fine_tuning_FSDP_QLoRA.py
new file mode 100644
index 0000000..cf870a0
--- /dev/null
+++ b/DistributedTraining/finetune_llama31_with_accelerate/fine_tuning_FSDP_QLoRA.py
@@ -0,0 +1,125 @@
import torch, os, multiprocessing
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed
)
from trl import SFTTrainer, SFTConfig
from peft.utils.other import fsdp_auto_wrap_policy
from accelerate import Accelerator


accelerator = Accelerator()
set_seed(1234)

model_name = "meta-llama/Meta-Llama-3.1-70B"
output_dir = "./Llama3.1_70b_QLoRA/"
dataset_name = "timdettmers/openassistant-guanaco"


# use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
    os.system('pip install flash_attn')
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|finetune_right_pad_id|>"
tokenizer.pad_token_id = 128004
tokenizer.padding_side = 'right'

ds = load_dataset(dataset_name)


# Add the EOS token
def process(row):
    row["text"] = row["text"] + "<|end_of_text|>"
    return row


ds = ds.map(
    process,
    num_proc=multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_implementation
)

# Prepare the model for QLoRA training without the fp32 casting (see Trick 2 in the README).
for name, param in model.named_parameters():
    # freeze base model's layers
    param.requires_grad = False

def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)

model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)
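# The LoRA setup above uses r=16 and lora_alpha=16 (scaling factor lora_alpha / r = 1.0) and
# targets every linear layer in each Llama decoder block: the attention projections
# (q_proj, k_proj, v_proj, o_proj) and the MLP projections (gate_proj, up_proj, down_proj).
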
training_arguments = SFTConfig(
    output_dir=output_dir,
    eval_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=1,
    log_level="debug",
    logging_steps=10,
    learning_rate=1e-4,
    bf16=True,
    eval_steps=10,
    max_steps=50,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    dataset_text_field="text",
    max_seq_length=512,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_arguments,
)

fsdp_plugin = trainer.accelerator.state.fsdp_plugin
fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

trainer.train()

# If we use FSDP, model weights might be sharded across multiple devices.
# So if FSDP is enabled, we need to set the state_dict_type to "FULL_STATE_DICT" before saving the model.
# Otherwise, the saved checkpoint will only contain the local state_dict of the model, which is only a part of the full model.
if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

trainer.save_model(output_dir)
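# Note: because the trainer wraps the model with the LoRA adapter (peft_config), save_model above
# writes the adapter weights and adapter_config.json (plus the tokenizer files) to output_dir;
# it does not duplicate the 70B base weights. If a standalone model is needed later, the adapter
# can be merged into a 16-bit reload of the base model with peft's merge_and_unload().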