feat: Add accelerate FSDP script and docs for finetune llama3.1 70B
YeonwooSung committed Aug 18, 2024
1 parent 3133a3a commit de314c0
DistributedTraining/finetune_llama31_with_accelerate/README.md
# Finetune Llama3.1 70B with Accelerate

To run this example with FSDP, you need at least two GPUs, each with at least 24 GB of memory.
Consumer GPUs such as the RTX 3090 or RTX 4090 will do.
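
If you are not sure what your machine has, a few lines of PyTorch (just a convenience check, not part of the fine-tuning code) will list the visible GPUs and their memory:
```python
import torch

# list the visible CUDA devices and their total memory in GiB
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.1f} GiB")
```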

## Configuration

We need Hugging Face’s Accelerate. Make sure it’s installed and up to date:
```bash
pip install accelerate --upgrade
```

Then, configure it by running:
```bash
accelerate config
```

It will ask you several questions.
The goal is to generate a configuration file that will be used for fine-tuning with FSDP.
Some of the questions can be difficult to answer if you are not familiar with how FSDP works.
If this is the case, you can skip this step and use an existing configuration file, such as this one:
```yaml
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: true
  fsdp_forward_prefetch: false
  fsdp_offload_params: true
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: false
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

The only field you may need to modify is `num_processes`, which is the number of GPUs on your machine.
Then, save the configuration into a file, e.g., `config_fsdp.yaml`.

## Fine-tuning

Use accelerate to launch the fine-tuning script:
```bash
accelerate launch --config_file config_fsdp.yaml fine_tuning_FSDP_QLoRA.py
```

### Trick 1

Since the model is sharded during fine-tuning, the checkpoints only contain shards of the model.

To save the full model, we need to gather all of its pieces on one device.
This is achieved by the following code, which also handles the (Q)LoRA case: the auto-wrap policy is set before training, and the state dict type is switched to `FULL_STATE_DICT` after training, right before saving:
```python
fsdp_plugin = trainer.accelerator.state.fsdp_plugin
fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
```
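
In the accompanying training script, these lines wrap the call to `trainer.train()`:
```python
# set the (Q)LoRA-aware auto-wrap policy before training
fsdp_plugin = trainer.accelerator.state.fsdp_plugin
fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

trainer.train()

# gather the sharded weights into a full state dict before saving
if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

trainer.save_model(output_dir)
```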

### Trick 2

For QLoRA training, we need to prepare the model for training.
For single-GPU QLoRA fine-tuning, we would simply add this line:
```python
model = prepare_model_for_kbit_training(model, gradient_checkpointing_kwargs={'use_reentrant':True})
```

It does the following:
1) Casts the layer norms and the language modeling head to fp32
2) Freezes the parameters of the model
3) Makes the outputs of the input embeddings require gradients (needed so gradient checkpointing works with frozen weights)
4) Activates gradient checkpointing

With FSDP, (1) doesn't seem to be possible and triggers an error when the fine-tuning starts.
To avoid this casting, I reimplemented what `prepare_model_for_kbit_training` does, minus this first step:
```python
for name, param in model.named_parameters():
    # freeze the base model's layers
    param.requires_grad = False

def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)

model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': True})
```
DistributedTraining/finetune_llama31_with_accelerate/fine_tuning_FSDP_QLoRA.py
import torch, os, multiprocessing
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
set_seed
)
from trl import SFTTrainer, SFTConfig
from peft.utils.other import fsdp_auto_wrap_policy
from accelerate import Accelerator


accelerator = Accelerator()
set_seed(1234)

model_name = "meta-llama/Meta-Llama-3.1-70B"
output_dir = "./Llama3.1_70b_QLoRA/"
dataset_name = "timdettmers/openassistant-guanaco"


# use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
    os.system('pip install flash_attn')
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

# Tokenizer
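# "<|finetune_right_pad_id|>" (token id 128004) is a token Llama 3.1 reserves for padding,
# so padding does not reuse the EOS token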
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = "<|finetune_right_pad_id|>"
tokenizer.pad_token_id = 128004
tokenizer.padding_side = 'right'

ds = load_dataset(dataset_name)


# Add the EOS token
def process(row):
    row["text"] = row["text"] + "<|end_of_text|>"
    return row


ds = ds.map(
    process,
    num_proc=multiprocessing.cpu_count(),
    load_from_cache_file=False,
)
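
# 4-bit NF4 quantization with double quantization; bnb_4bit_quant_storage is set to the
# compute dtype so the quantized weights are stored in a dtype that FSDP can shard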
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=compute_dtype,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_implementation
)

for name, param in model.named_parameters():
    # freeze the base model's layers
    param.requires_grad = False

def make_inputs_require_grad(module, input, output):
    output.requires_grad_(True)

model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant':True})

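# LoRA adapters (r=16, alpha=16) on all attention and MLP projection layers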
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.05,
r=16,
bias="none",
task_type="CAUSAL_LM",
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

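# Short demonstration run: per-device batch size 1 with 16 gradient accumulation steps
# (effective batch size of 16 per GPU), evaluation every 10 steps, stopping after 50 steps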
training_arguments = SFTConfig(
        output_dir=output_dir,
eval_strategy="steps",
do_eval=True,
optim="adamw_torch",
per_device_train_batch_size=1,
gradient_accumulation_steps=16,
per_device_eval_batch_size=1,
log_level="debug",
logging_steps=10,
learning_rate=1e-4,
bf16 = True,
eval_steps=10,
max_steps=50,
warmup_ratio=0.1,
lr_scheduler_type="linear",
dataset_text_field="text",
max_seq_length=512,
)

trainer = SFTTrainer(
model=model,
train_dataset=ds['train'],
eval_dataset=ds['test'],
peft_config=peft_config,
tokenizer=tokenizer,
args=training_arguments,
)

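# PEFT's fsdp_auto_wrap_policy wraps the trainable LoRA layers separately from the frozen
# base layers, which FSDP needs to handle the mixed requires_grad setup correctly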
fsdp_plugin = trainer.accelerator.state.fsdp_plugin
fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model)

trainer.train()

# If we use FSDP, model weights might be sharded across multiple devices.
# So if the fsdp is enabled, we need to set the state_dict_type to "FULL_STATE_DICT" before saving the model.
# Otherwise, the saved checkpoint will only contain the local state_dict of the model, which is only a part of the full model.
if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

trainer.save_model(output_dir)
