From f30ad4c4302573f64e703bcf2aa12de3fce98f75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Haian=20Huang=28=E6=B7=B1=E5=BA=A6=E7=9C=B8=29?= Date: Mon, 22 Jul 2024 14:45:09 +0800 Subject: [PATCH] Support InternVL 1.5/2.0 finetune (#737) * support internvl finetune * fix * fix * fix * update * update * update * update cfg * fix * support phi3 * support full+lora+qlora * support internvl 26b * fix lora cfg * update all * update * update * update config * update config * update config * fix type and add readme * update readme * RENAME * fix * update * support internvl2 * update --- xtuner/configs/internvl/README.md | 152 +++++++ xtuner/configs/internvl/README_zh-CN.md | 152 +++++++ .../internvl/v1_5/convert_to_official.py | 56 +++ .../internvl_v1_5_internlm2_26b_finetune.py | 170 ++++++++ ...ternvl_v1_5_internlm2_26b_lora_finetune.py | 183 ++++++++ ...ernvl_v1_5_internlm2_26b_qlora_finetune.py | 185 ++++++++ .../internvl_v1_5_internlm2_2b_finetune.py | 170 ++++++++ ...nternvl_v1_5_internlm2_2b_lora_finetune.py | 183 ++++++++ ...ternvl_v1_5_internlm2_2b_qlora_finetune.py | 185 ++++++++ .../v1_5/internvl_v1_5_phi3_4b_finetune.py | 170 ++++++++ .../internvl_v1_5_phi3_4b_lora_finetune.py | 183 ++++++++ .../internvl_v1_5_phi3_4b_qlora_finetune.py | 185 ++++++++ .../v2/internvl_v2_internlm2_26b_finetune.py | 170 ++++++++ ...internvl_v2_internlm2_26b_lora_finetune.py | 183 ++++++++ ...nternvl_v2_internlm2_26b_qlora_finetune.py | 185 ++++++++ .../v2/internvl_v2_internlm2_2b_finetune.py | 170 ++++++++ .../internvl_v2_internlm2_2b_lora_finetune.py | 183 ++++++++ ...internvl_v2_internlm2_2b_qlora_finetune.py | 185 ++++++++ .../v2/internvl_v2_internlm2_5_8b_finetune.py | 170 ++++++++ ...nternvl_v2_internlm2_5_8b_lora_finetune.py | 183 ++++++++ ...ternvl_v2_internlm2_5_8b_qlora_finetune.py | 185 ++++++++ .../v2/internvl_v2_phi3_4b_finetune.py | 170 ++++++++ .../v2/internvl_v2_phi3_4b_lora_finetune.py | 183 ++++++++ .../v2/internvl_v2_phi3_4b_qlora_finetune.py | 185 ++++++++ xtuner/dataset/__init__.py | 3 +- .../dataset/collate_fns/default_collate_fn.py | 3 +- xtuner/dataset/internvl_dataset.py | 409 ++++++++++++++++++ xtuner/dataset/samplers/length_grouped.py | 6 + xtuner/model/__init__.py | 3 +- xtuner/model/internvl.py | 320 ++++++++++++++ 30 files changed, 4867 insertions(+), 3 deletions(-) create mode 100644 xtuner/configs/internvl/README.md create mode 100644 xtuner/configs/internvl/README_zh-CN.md create mode 100644 xtuner/configs/internvl/v1_5/convert_to_official.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_qlora_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_qlora_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_qlora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_qlora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_qlora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_qlora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_phi3_4b_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_phi3_4b_lora_finetune.py create mode 100644 xtuner/configs/internvl/v2/internvl_v2_phi3_4b_qlora_finetune.py create mode 100644 xtuner/dataset/internvl_dataset.py create mode 100644 xtuner/model/internvl.py diff --git a/xtuner/configs/internvl/README.md b/xtuner/configs/internvl/README.md new file mode 100644 index 000000000..1f1acf191 --- /dev/null +++ b/xtuner/configs/internvl/README.md @@ -0,0 +1,152 @@ +# InterVL Full Pipeline + +English | [简体中文](./README_zh-CN.md) + +## InterVL 2 + +> [InternVL-2: Better than the Best—Expanding Performance Boundaries of Open-Source Multimodal Models with the Progressive Scaling Strategy](https://internvl.github.io/blog/2024-07-02-InternVL-2.0/) + +We introduce InternVL-2, currently the most powerful open-source Multimodal Large Language Model (MLLM). The InternVL-2 family includes models ranging from a 2B model, suitable for edge devices, to a 108B model, which is significantly more powerful. With larger-scale language models, InternVL-2-Pro demonstrates outstanding multimodal understanding capabilities, matching the performance of commercial closed-source models across various benchmarks. + +InternVL-2 family is built upon the following designs: + +- Progressive with larger language models: We introduce a progressive alignment training strategy, resulting in the first vision foundation model aligned with large language models. By employing the progressive training strategy where the model scales from small to large while the data refines from coarse to fine, we have completed the training of large models at a relatively low cost. This approach has demonstrated excellent performance with limited resources. +- Multimodal input: With one set of parameters, our model supports multiple modalities of input, including text, images, video, audio, and 3D point clouds. +- Multitask output: Our model supports various output formats, such as images, bounding boxes, and masks, demonstrating extensive versatility. By connecting the MLLM with multiple downstream task decoders, InternVL-2 can be generalized to hundreds of vision-language tasks while achieving performance comparable to expert models. + +
+Image +
+ +### Basic Introduction + +- `./v2/` contains the configuration files for training InterVL 2 +- Supported fine-tuning of the InternVL 2B/4B/8B/26B model in full/LoRA/QLoRA single-image mode for now. We will support fine-tuning on multiple images and videos as soon as possible. +- After training, you can use the `./v1_5/convert_to_official.py` script to convert the model trained by XTuner to the official format, so as to reuse all the official supported toolchains +- All configurations are based on 8xA100 80G graphics cards, 2B/4B can use ZERO1 training, 8B models can use ZERO2, 26B models must run ZERO3, and there is no excessive adjustment of parameters, you can modify them according to your own needs +- It is verified with LLaVA SFT data, which cannot fully reflect the fine-tuning performance. You can customize the data according to your own needs. We will provide a relatively fair fine-tuning dataset later + +### Data preparation + +If you also want to use the LLaVA SFT dataset for training, please refer to the [document](../../../docs/en/user_guides/dataset_prepare.md#llava-dataset) to prepare the data. + +For custom data, support multiple json and jsonl formats, the data organization can refer to the LLaVA SFT format, and support data sampling operations. + +**(1) Support multiple json or jsonl data** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + template=prompt_template, + max_length=max_length) +``` + +**(2) Support custom sampling** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + repeat_times=[2,0.5,3.5], + template=prompt_template, + max_length=max_length) +``` + +### Training + +The provided configuration is mainly used for fine-tuning based on the official weights. After preparing the data, you can use the following command to train: + +```bash +NPROC_PER_NODE=8 xtuner train internvl_v2_internlm2_5_8b_lora_finetune --deepspeed deepspeed_zero2 +``` + +Default saved in `./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/`. + +### Model Conversion + +After training, we will get a set of weights, that is `./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/iter_xxx.pth`, in order to facilitate evaluation and dialogue, we can convert it to official weights. + +```bash +python xtuner/configs/internvl/v1_5/convert_to_official.py xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_lora_finetune.py ./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/iter_xxx.pth ./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/convert_model/ +``` + +Here, a complete set of official weights including configuration will be generated under `./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/convert_model`, you can use the [official toolchain](https://huggingface.co/OpenGVLab/InternVL2-8B) for evaluation and dialogue. + +If you encounter any problems during use, please feel free to contact us!!! + +## InterVL 1.5 + +> [How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites](https://arxiv.org/abs/2404.16821) + +In this report, we introduce InternVL 1.5, an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding. We introduce three simple improvements: (1) Strong Vision Encoder: we explored a continuous learning strategy for the large-scale vision foundation model -- InternViT-6B, boosting its visual understanding capabilities, and making it can be transferred and reused in different LLMs. (2) Dynamic High-Resolution: we divide images into tiles ranging from 1 to 40 of 448×448 pixels according to the aspect ratio and resolution of the input images, which supports up to 4K resolution input. (3) High-Quality Bilingual Dataset: we carefully collected a high-quality bilingual dataset that covers common scenes, document images, and annotated them with English and Chinese question-answer pairs, significantly enhancing performance in OCR- and Chinese-related tasks. We evaluate InternVL 1.5 through a series of benchmarks and comparative studies. Compared to both open-source and proprietary models, InternVL 1.5 shows competitive performance, achieving state-of-the-art results in 8 of 18 benchmarks. + +
+Image +
+ +### Basic Introduction + +- `./v1_5/` contains the configuration files for training InterVL 1.5 +- Support InternVL 2B/4B/26B model full/LoRA/Qing efficiency and performance, it is recommended to choose the 4B model first +- After training, you can use the `./v1_5/convert_to_official.py` script to convert the model trained by XTuner to the official format, so as to reuse all the official supported toolchains +- All configurations are based on 8xA100 80G graphics cards, 2B/4B can use ZERO1 training, 8B models must run ZERO2, and there is no excessive adjustment of parameters, you can modify them according to your own needs +- It is verified with LLaVA SFT data, which cannot fully reflect the fine-tuning performance. You can customize the data according to your own needs. We will provide a relatively fair fine-tuning dataset later + +### Data preparation + +If you also want to use the LLaVA SFT dataset for training, please refer to the [document](../../../docs/en/user_guides/dataset_prepare.md#llava-dataset) to prepare the data. + +For custom data, support multiple json and jsonl formats, the data organization can refer to the LLaVA SFT format, and support data sampling operations. + +**(1) Support multiple json or jsonl data** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + template=prompt_template, + max_length=max_length) +``` + +**(2) Support custom sampling** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + repeat_times=[2,0.5,3.5], + template=prompt_template, + max_length=max_length) +``` + +### Training + +The provided configuration is mainly used for fine-tuning based on the official weights. After preparing the data, you can use the following command to train: + +```bash +NPROC_PER_NODE=8 xtuner train internvl_v1_5_phi3_4b_lora_finetune --deepspeed deepspeed_zero1 +# NPROC_PER_NODE=8 xtuner train internvl_v1_5_internlm2_26b_lora_finetune.py --deepspeed deepspeed_zero3 +``` + +Default saved in `./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/`. + +### Model Conversion + +After training, we will get a set of weights, that is `./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/iter_xxx.pth`, in order to facilitate evaluation and dialogue, we can convert it to official weights. + +```bash +python xtuner/configs/internvl/v1_5/convert_to_official.py xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_lora_finetune.py ./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/iter_xxx.pth ./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/internvl_v1_5_phi3_4b/ +``` + +Here, a complete set of official weights including configuration will be generated under `./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/internvl_v1_5_phi3_4b/`, you can use the [official toolchain](https://github.com/OpenGVLab/InternVL) for evaluation and dialogue. + +If you encounter any problems during use, please feel free to contact us!!! diff --git a/xtuner/configs/internvl/README_zh-CN.md b/xtuner/configs/internvl/README_zh-CN.md new file mode 100644 index 000000000..cdaa59348 --- /dev/null +++ b/xtuner/configs/internvl/README_zh-CN.md @@ -0,0 +1,152 @@ +# InterVL 全流程 + +[English](./README.md) | 简体中文 + +## InterVL 2 + +> [InternVL-2: Better than the Best—Expanding Performance Boundaries of Open-Source Multimodal Models with the Progressive Scaling Strategy](https://internvl.github.io/blog/2024-07-02-InternVL-2.0/) + +我们引入了 InternVL-2,目前最强大的开源多模态大语言模型(MLLM)。InternVL-2 系列包括从适合于边缘设备的 2B 模型到强大的 108B 模型等多种规模的模型。借助更大规模的语言模型,InternVL-2-Pro 展现出了出色的多模态理解能力,在各种基准测试中的性能与商业闭源模型相匹配。 + +InternVL-2 系列基于以下设计: + +- 渐进式的大型语言模型:我们引入了一种渐进式对齐训练策略,实现了首个与大型语言模型对齐的视觉基础模型。通过采用从小到大模型扩展、从粗到细数据优化的渐进式训练策略,我们以较低的成本完成了大模型的训练。这种方法已经展示了出色的性能,资源有限的情况下也能取得良好的结果。 +- 多模态输入:使用一套参数,我们的模型支持文本、图像、视频、音频和 3D 点云等多种输入模态。 +- 多任务输出:我们的模型支持图像、边界框和掩码等各种输出格式,展现出广泛的多功能性。通过将 MLLM 与多个下游任务解码器相连接,InternVL-2 可以泛化到数百个视觉语言任务,并取得与专家模型相当的性能。 + +
+Image +
+ +### 基本说明 + +- `./v2/` 包含着 InterVL 2 训练配置的配置文件 +- 支持了 InternVL 2B/4B/8B/26B 模型全量/LoRA/QLoRA 单图模式的微调,会尽快支持多图和视频的微调。 +- 在训练完成后,可以使用 `./v1_5/convert_to_official.py` 脚本将 XTuner 训练的模型转换为官方格式,从而复用官方所支持的所有工具链 +- 目前所有配置都是以 8xA100 80G 显卡为基准,2B/4B 可以使用 ZERO1 训练,8B 模型要 ZERO2 运行,26B 模型必须要 ZERO3,并且没有对参数进行过多的调整,你可以按照你自己的需求进行修改 +- 目前是以 LLaVA SFT 数据进行验证,无法充分反应微调性能,你可以根据自己的需求进行数据自定义,后续我们会提供一个相对公平的微调数据集 + +### 数据准备 + +如果你也想使用 LLaVA SFT 数据集进行训练,请参考[文档](../../../docs/zh_cn/user_guides/dataset_prepare.md#llava-dataset) 准备数据。 + +对于自定义数据,支持多种 json 和 jsonl 格式,内部数据组织可以参考 LLaVA SFT 格式,且支持数据采样操作。 + +**(1) 支持多个 json 或者 jsonl 数据** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + template=prompt_template, + max_length=max_length) +``` + +**(2) 支持自定义采样** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + repeat_times=[2,0.5,3.5], + template=prompt_template, + max_length=max_length) +``` + +### 训练流程 + +所提供的配置主要用于基于官方权重继续微调。在准备好数据后,你可以使用以下命令进行训练: + +```bash +NPROC_PER_NODE=8 xtuner train internvl_v2_internlm2_5_8b_lora_finetune --deepspeed deepspeed_zero2 +``` + +默认保存在 `./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/`。 + +### 模型转换 + +训练后,我们将获得一组权重即 `./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/iter_xxx.pth`,为了方便评测和对话,可以将其转换为官方权重。 + +```bash +python xtuner/configs/internvl/v1_5/convert_to_official.py xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_lora_finetune.py ./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/iter_xxx.pth ./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/convert_model/ +``` + +此时,会在 `./work_dirs/internvl_v2_internlm2_5_8b_lora_finetune/convert_model` 下生成一组包括配置的完整官方权重,你可以使用[官方工具链](https://huggingface.co/OpenGVLab/InternVL2-8B)进行评测和对话。 + +如果你在使用中碰到任何问题,欢迎联系我们!!! + +## InterVL 1.5 + +> [How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites](https://arxiv.org/abs/2404.16821) + +在本报告中,我们介绍了开源多模态大语言模型 InternVL 1.5,以弥补开源模型与商业专有模型在多模态理解能力上的差距。我们引入了三项简单的改进:(1) 强大的视觉编码器:我们探索了大规模视觉基础模型 InternViT-6B 的连续学习策略,提升了其视觉理解能力,并使其可以在不同的大语言模型中进行迁移和重复利用。(2) 动态高分辨率:我们根据输入图像的长宽比和分辨率,将图像划分为从1到40个448×448像素的瓦片,支持高达4K分辨率的输入。(3) 高质量双语数据集:我们精心收集了一个高质量的双语数据集,涵盖了常见场景、文档图像,并用英语和中文问答对进行了注释,显著提升了在OCR和中文相关任务中的性能。我们通过一系列基准测试和对比研究评估了 InternVL 1.5。与开源和专有模型相比,InternVL 1.5 表现出了竞争力,在18个基准中的8个中取得了最先进的结果。 + +
+Image +
+ +### 基本说明 + +- `./v1_5/` 包含着 InterVL 1.5 训练配置的配置文件 +- 支持 InternVL 2B/4B/26B 模型全量/LoRA/QLoRA 微调,综合考虑效率性能,建议你优先选择 4B 模型 +- 在训练完成后,可以使用 `./v1_5/convert_to_official.py` 脚本将 XTuner 训练的模型转换为官方格式,从而复用官方所支持的所有工具链 +- 目前所有配置都是以 8xA100 80G 显卡为基准,2B/4B 可以使用 ZERO1 训练,26B 模型必须要 ZERO3 运行,并且没有对参数进行过多的调整,你可以按照你自己的需求进行修改 +- 目前是以 LLaVA SFT 数据进行验证,无法充分反应微调性能,你可以根据自己的需求进行数据自定义,后续我们会提供一个相对公平的微调数据集 + +### 数据准备 + +如果你也想使用 LLaVA SFT 数据集进行训练,请参考[文档](../../../docs/zh_cn/user_guides/dataset_prepare.md#llava-dataset) 准备数据。 + +对于自定义数据,支持多种 json 和 jsonl 格式,内部数据组织可以参考 LLaVA SFT 格式,且支持数据采样操作。 + +**(1) 支持多个 json 或者 jsonl 数据** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + template=prompt_template, + max_length=max_length) +``` + +**(2) 支持自定义采样** + +```text +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=['a.json','b.jsonl','c.json'], + image_folders=['a',None,'c'], + repeat_times=[2,0.5,3.5], + template=prompt_template, + max_length=max_length) +``` + +### 训练流程 + +所提供的配置主要用于基于官方权重继续微调。在准备好数据后,你可以使用以下命令进行训练: + +```bash +NPROC_PER_NODE=8 xtuner train internvl_v1_5_phi3_4b_lora_finetune --deepspeed deepspeed_zero1 +# NPROC_PER_NODE=8 xtuner train internvl_v1_5_internlm2_26b_lora_finetune.py --deepspeed deepspeed_zero3 +``` + +默认保存在 `./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/`。 + +### 模型转换 + +训练后,我们将获得一组权重即 `./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/iter_xxx.pth`,为了方便评测和对话,可以将其转换为官方权重。 + +```bash +python xtuner/configs/internvl/v1_5/convert_to_official.py xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_lora_finetune.py ./work_dirs/iter_xxx.pth ./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/internvl_v1_5_phi3_4b/ +``` + +此时,会在 `./work_dirs/internvl_v1_5_phi3_4b_lora_finetune/internvl_v1_5_phi3_4b/` 下生成一组包括配置的完整官方权重,你可以使用[官方工具链](https://github.com/OpenGVLab/InternVL)进行评测和对话。 + +如果你在使用中碰到任何问题,欢迎联系我们!!! diff --git a/xtuner/configs/internvl/v1_5/convert_to_official.py b/xtuner/configs/internvl/v1_5/convert_to_official.py new file mode 100644 index 000000000..765855daa --- /dev/null +++ b/xtuner/configs/internvl/v1_5/convert_to_official.py @@ -0,0 +1,56 @@ +import argparse +import os.path as osp + +import torch +from mmengine.config import Config +from transformers import AutoTokenizer + +from xtuner.model.utils import LoadWoInit +from xtuner.registry import BUILDER + + +def convert_to_official(config, trained_path, save_path): + cfg = Config.fromfile(config) + cfg.model.pretrained_pth = trained_path + cfg.model.quantization_vit = False + cfg.model.quantization_llm = False + + with LoadWoInit(): + model = BUILDER.build(cfg.model) + model.to(torch.bfloat16) + + if model.use_visual_encoder_lora: + vision_model = model.model.vision_model.merge_and_unload() + model.model.vision_model = vision_model + + if model.use_llm_lora: + language_model = model.model.language_model.merge_and_unload() + model.model.language_model = language_model + + model.model.save_pretrained(save_path) + + tokenizer = AutoTokenizer.from_pretrained( + cfg.model.model_path, trust_remote_code=True) + tokenizer.save_pretrained(save_path) + + print(model) + + +def main(): + parser = argparse.ArgumentParser( + description='Convert the pth model to HuggingFace model') + parser.add_argument('config', help='config file name or path.') + parser.add_argument('trained_model_pth', help='The trained model path.') + parser.add_argument( + 'save_path', help='The path to save the converted model.') + args = parser.parse_args() + + if osp.realpath(args.trained_model_pth) == osp.realpath(args.save_path): + raise ValueError( + 'The trained path and save path should not be the same.') + + convert_to_official(args.config, args.trained_model_pth, args.save_path) + + +if __name__ == '__main__': + main() diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_finetune.py new file mode 100644 index 000000000..d5eec7829 --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL-Chat-V1-5' +prompt_template = PROMPT_TEMPLATE.internlm2_chat + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +max_length = 4096 + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 8 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 2e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.01 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_lora_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_lora_finetune.py new file mode 100644 index 000000000..0fb511d42 --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL-Chat-V1-5' +prompt_template = PROMPT_TEMPLATE.internlm2_chat + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +max_length = 4096 + +# Scheduler & Optimizer +batch_size = 2 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 2e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.01 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_qlora_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_qlora_finetune.py new file mode 100644 index 000000000..8d994c81d --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_26b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL-Chat-V1-5' +prompt_template = PROMPT_TEMPLATE.internlm2_chat + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +max_length = 4096 + +# Scheduler & Optimizer +batch_size = 2 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 2e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.01 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_finetune.py new file mode 100644 index 000000000..09fb01e3f --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5' +prompt_template = PROMPT_TEMPLATE.internlm2_chat + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 4 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_lora_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_lora_finetune.py new file mode 100644 index 000000000..193e2f269 --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5' +prompt_template = PROMPT_TEMPLATE.internlm2_chat + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_qlora_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_qlora_finetune.py new file mode 100644 index 000000000..6bb28e490 --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_internlm2_2b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5' +prompt_template = PROMPT_TEMPLATE.internlm2_chat + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_finetune.py new file mode 100644 index 000000000..5d34a928b --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/Mini-InternVL-Chat-4B-V1-5' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 4 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_lora_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_lora_finetune.py new file mode 100644 index 000000000..19588cb95 --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/Mini-InternVL-Chat-4B-V1-5' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_qlora_finetune.py b/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_qlora_finetune.py new file mode 100644 index 000000000..cb150f0c4 --- /dev/null +++ b/xtuner/configs/internvl/v1_5/internvl_v1_5_phi3_4b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/Mini-InternVL-Chat-4B-V1-5' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_finetune.py new file mode 100644 index 000000000..0916df44a --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-26B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 8 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_lora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_lora_finetune.py new file mode 100644 index 000000000..045fd7055 --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-26B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 2 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_qlora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_qlora_finetune.py new file mode 100644 index 000000000..60717b312 --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_26b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-26B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 2 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_finetune.py new file mode 100644 index 000000000..a921cf0c0 --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-2B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 4 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_lora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_lora_finetune.py new file mode 100644 index 000000000..44b3c3944 --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-2B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_qlora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_qlora_finetune.py new file mode 100644 index 000000000..5840a593f --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_2b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-2B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_finetune.py new file mode 100644 index 000000000..2a92c017f --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-8B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 4 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_lora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_lora_finetune.py new file mode 100644 index 000000000..d9fa7ab3a --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-8B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_qlora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_qlora_finetune.py new file mode 100644 index 000000000..b3d04bb43 --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_internlm2_5_8b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-8B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_finetune.py new file mode 100644 index 000000000..41a712569 --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_finetune.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-4B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 4 # per_device +accumulative_counts = 4 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=False, + freeze_visual_encoder=True # or False +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_lora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_lora_finetune.py new file mode 100644 index 000000000..64a20450f --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_lora_finetune.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-4B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_qlora_finetune.py b/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_qlora_finetune.py new file mode 100644 index 000000000..8302fa5cc --- /dev/null +++ b/xtuner/configs/internvl/v2/internvl_v2_phi3_4b_qlora_finetune.py @@ -0,0 +1,185 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = 'OpenGVLab/InternVL2-4B' + +# Data +data_root = './data/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +# official 1024 -> 4e-5 +lr = 1e-6 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + quantization_llm=True, # or False + quantization_vit=False, # or True and uncomment visual_encoder_lora + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index bcfe0dcc3..2ad3d7bd9 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -6,6 +6,7 @@ from .intern_repo import (build_packed_dataset, load_intern_repo_tokenized_dataset, load_intern_repo_untokenized_dataset) +from .internvl_dataset import InternVL_V1_5_Dataset from .json_dataset import load_json_file from .llava import LLaVADataset from .modelscope import process_ms_dataset @@ -24,5 +25,5 @@ 'load_intern_repo_tokenized_dataset', 'load_intern_repo_untokenized_dataset', 'build_packed_dataset', 'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset', - 'load_json_file' + 'load_json_file', 'InternVL_V1_5_Dataset' ] diff --git a/xtuner/dataset/collate_fns/default_collate_fn.py b/xtuner/dataset/collate_fns/default_collate_fn.py index 0ca9264f0..3d9fe18fb 100644 --- a/xtuner/dataset/collate_fns/default_collate_fn.py +++ b/xtuner/dataset/collate_fns/default_collate_fn.py @@ -89,7 +89,8 @@ def default_collate_fn(instances: Sequence[Dict], } if has_image: - pixel_values = torch.stack(pixel_values) + if all(x.shape == pixel_values[0].shape for x in pixel_values): + pixel_values = torch.stack(pixel_values, dim=0) data_dict['pixel_values'] = pixel_values if return_hf_format: diff --git a/xtuner/dataset/internvl_dataset.py b/xtuner/dataset/internvl_dataset.py new file mode 100644 index 000000000..82904ae87 --- /dev/null +++ b/xtuner/dataset/internvl_dataset.py @@ -0,0 +1,409 @@ +import copy +import io +import json +import os +import random +import warnings + +import numpy as np +import torch +import torchvision.transforms as T +from mmengine import print_log +from mmengine.fileio import get +from PIL import Image +from torch.utils.data import Dataset +from torchvision.transforms.functional import InterpolationMode +from transformers import AutoConfig, AutoTokenizer + +from xtuner.utils import IGNORE_INDEX + + +# Referenced from InternVL +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, + image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, + min_num=1, + max_num=6, + image_size=448, + use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num} + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def total_image_token(orig_size, + min_num=1, + max_num=12, + image_size=448, + use_thumbnail=True): + orig_width, orig_height = orig_size + + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if max_num >= i * j >= min_num} + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + if use_thumbnail: + blocks += 1 + + return blocks + + +def load_json_or_jsonl(json_path): + if json_path.endswith('.json'): + with open(json_path) as f: + data = json.load(f) + elif json_path.endswith('.jsonl'): + with open(json_path) as f: + data = [json.loads(line) for line in f] + else: + raise ValueError(f'Unsupported file format: {json_path}, ' + f'only support .json and .jsonl.') + return data + + +class InternVL_V1_5_Dataset(Dataset): + os.environ['TOKENIZERS_PARALLELISM'] = 'true' + IMG_CONTEXT_TOKEN = '' + IMG_START_TOKEN = '' + IMG_END_TOKEN = '' + + IMAGENET_MEAN = (0.485, 0.456, 0.406) + IMAGENET_STD = (0.229, 0.224, 0.225) + + def __init__(self, + model_path, + template, + data_paths, + image_folders=None, + repeat_times=1, + max_length=8192): + self.template = template + self.max_length = max_length + + self.cfg = AutoConfig.from_pretrained( + model_path, trust_remote_code=True) + + # The following modifications are only to ensure full + # consistency with the official template, + # without investigating the impact on performance. + if self.cfg.llm_config.architectures[0] == 'Phi3ForCausalLM': + self._system = 'You are an AI assistant whose name is Phi-3.' + self.template[ + 'INSTRUCTION'] = '<|user|>\n{input}<|end|><|assistant|>\n' + elif self.cfg.llm_config.architectures[0] == 'InternLM2ForCausalLM': + self._system = 'You are an AI assistant whose name ' \ + 'is InternLM (书生·浦语).' + self.template['SYSTEM'] = '<|im_start|>system\n{system}<|im_end|>' + self.template[ + 'INSTRUCTION'] = '<|im_start|>user\n{input}' \ + '<|im_end|><|im_start|>assistant\n' + else: + raise NotImplementedError + + self.min_dynamic_patch = self.cfg.min_dynamic_patch + self.max_dynamic_patch = self.cfg.max_dynamic_patch + self.downsample_ratio = self.cfg.downsample_ratio + self.image_size = self.cfg.force_image_size + self.use_thumbnail = self.cfg.use_thumbnail + patch_size = self.cfg.vision_config.patch_size + self.patch_token = int( + (self.image_size // patch_size)**2 * (self.downsample_ratio**2)) + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True) + self.transformer = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') + if img.mode != 'RGB' else img), + T.Resize((self.image_size, self.image_size), + interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=self.IMAGENET_MEAN, std=self.IMAGENET_STD) + ]) + + if not isinstance(data_paths, (list, tuple)): + data_paths = [data_paths] + if not isinstance(image_folders, (list, tuple)): + image_folders = [image_folders] + if not isinstance(repeat_times, (list, tuple)): + repeat_times = [repeat_times] + assert len(data_paths) == len(image_folders) == len(repeat_times) + + print_log('Starting to loading data and calc length', logger='current') + self.data = [] + self.image_folder = [] + self.group_length = [] + self.conv2length_text = { + } # using dict to speedup the calculation of token length + + for data_file, image_folder, repeat_time in zip( + data_paths, image_folders, repeat_times): + print_log( + f'=======Starting to process {data_file} =======', + logger='current') + assert repeat_time > 0 + json_data = load_json_or_jsonl(data_file) + if repeat_time < 1: + json_data = random.sample(json_data, + int(len(json_data) * repeat_time)) + elif repeat_time > 1: + int_repeat_time = int(repeat_time) + remaining_repeat_time = repeat_time - repeat_time + if remaining_repeat_time > 0: + remaining_json_data = random.sample( + json_data, int(len(json_data) * remaining_repeat_time)) + json_data = json_data * int_repeat_time + json_data.extend(remaining_json_data) + else: + json_data = json_data * int_repeat_time + + self.data.extend(json_data) + self.image_folder.extend([image_folder] * len(json_data)) + + # TODO: multi process + for data_item in json_data: + if 'length' in data_item: + token_length = data_item['length'] # include image token + else: + conversations = '\n'.join( + [temp['value'] for temp in data_item['conversations']]) + str_length = len(conversations) + + if str_length not in self.conv2length_text: + token_length = self.tokenizer( + conversations, + return_tensors='pt', + padding=False, + truncation=False, + ).input_ids.size(1) + self.conv2length_text[str_length] = token_length + else: + token_length = self.conv2length_text[str_length] + + if 'image' in data_item and data_item['image'] is not None: + if 'image_wh' in data_item and data_item[ + 'image_wh'] is not None: + # more accurate calculation of image token + image_wh = data_item['image_wh'] + if isinstance(image_wh[0], list): + image_wh = image_wh[0] + image_token = total_image_token( + image_wh, self.min_dynamic_patch, + self.max_dynamic_patch, self.image_size, + self.use_thumbnail) + image_token = self.patch_token * image_token + else: + # max_dynamic_patch + use_thumbnail + image_token = self.patch_token * ( + self.max_dynamic_patch + self.use_thumbnail) + + token_length = token_length + image_token + else: + token_length = -token_length + + self.group_length.append(token_length) + print_log( + f'=======total {len(json_data)} samples of {data_file}=======', + logger='current') + + assert len(self.group_length) == len(self.data) + print_log('end loading data and calc length', logger='current') + print_log( + f'=======total {len(self.data)} samples=======', logger='current') + self._max_refetch = 1000 + + def __getitem__(self, index): + for _ in range(self._max_refetch + 1): + data = self.prepare_data(index) + # Broken images may cause the returned data to be None + if data is None: + index = self._rand_another() + continue + return data + + def __len__(self): + return len(self.data) + + @property + def modality_length(self): + return self.group_length + + @property + def length(self): + group_length = np.array(self.group_length) + group_length = np.abs(group_length).tolist() + return group_length + + def prepare_data(self, index): + data_dict: dict = self.data[index] + image_folder = self.image_folder[index] + + out_data_dict = {} + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + if isinstance(image_file, (list, tuple)): + assert len(image_file) == 1 + image_file = image_file[0] + + try: + image = self.get_image(os.path.join(image_folder, image_file)) + except Exception as e: + print(f'Error: {e}', flush=True) + print_log(f'Error: {e}', logger='current') + return None + + images = dynamic_preprocess(image, self.min_dynamic_patch, + self.max_dynamic_patch, + self.image_size, self.use_thumbnail) + pixel_values = [self.transformer(image) for image in images] + pixel_values = torch.stack(pixel_values) + out_data_dict['pixel_values'] = pixel_values + + num_image_tokens = pixel_values.shape[0] * self.patch_token + image_token_str = f'{self.IMG_START_TOKEN}' \ + f'{self.IMG_CONTEXT_TOKEN * num_image_tokens}' \ + f'{self.IMG_END_TOKEN}' + token_dict = self.get_inputid_labels(data_dict['conversations'], + image_token_str) + out_data_dict.update(token_dict) + else: + token_dict = self.get_inputid_labels(data_dict['conversations'], + None) + out_data_dict.update(token_dict) + out_data_dict['pixel_values'] = torch.zeros( + 1, 3, self.image_size, self.image_size) + return out_data_dict + + def _rand_another(self) -> int: + return np.random.randint(0, len(self.data)) + + def get_image(self, path): + if 's3://' in path: + img_bytes = get(path) + with io.BytesIO(img_bytes) as buff: + img = Image.open(buff).convert('RGB') + return img + else: + return Image.open(path).convert('RGB') + + def get_inputid_labels(self, conversations, image_token_str) -> dict: + input = '' + out_conversation = [] + while conversations and conversations[0]['from'] == 'gpt': + # Skip the first one if it is from gpt + conversations = conversations[1:] + for msg in conversations: + if msg['from'] == 'human': + if image_token_str is None and '' in msg['value']: + warnings.warn( + f'The current data << {msg["value"]} >> is ' + f'in plain text mode, but ' + 'there are tags present in the data. ' + 'We need to remove the tags.') + msg['value'] = msg['value'].replace('', '') + if '' in msg['value']: + msg['value'] = msg['value'].replace('', '').strip() + msg['value'] = image_token_str + '\n' + msg['value'] + msg['value'] = msg['value'].strip() + input += msg['value'].strip() + elif msg['from'] == 'gpt': + out_conversation.append({ + 'input': input, + 'output': msg['value'].strip() + }) + input = '' + else: + raise NotImplementedError + + input_ids, labels = [], [] + for i, single_turn_conversation in enumerate(out_conversation): + input = single_turn_conversation.get('input', '') + if input is None: + input = '' + input_text = self.template.INSTRUCTION.format( + input=input, round=i + 1) + + if i == 0: + system = self.template.SYSTEM.format(system=self._system) + input_text = system + input_text + input_encode = self.tokenizer.encode( + input_text, add_special_tokens=True) + else: + input_encode = self.tokenizer.encode( + input_text, add_special_tokens=False) + input_ids += input_encode + labels += [IGNORE_INDEX] * len(input_encode) + + output_text = single_turn_conversation.get('output', '') + if self.template.get('SUFFIX', None): + output_text += self.template.SUFFIX + output_encode = self.tokenizer.encode( + output_text, add_special_tokens=False) + input_ids += output_encode + labels += copy.deepcopy(output_encode) + + if len(input_ids) > self.max_length: + input_ids = input_ids[:self.max_length] + labels = labels[:self.max_length] + print_log( + f'Warning: input_ids length({len(input_ids)}) ' + f'is longer than max_length, cut to {self.max_length}', + logger='current') + return {'input_ids': input_ids, 'labels': labels} diff --git a/xtuner/dataset/samplers/length_grouped.py b/xtuner/dataset/samplers/length_grouped.py index ad37957f2..184827837 100644 --- a/xtuner/dataset/samplers/length_grouped.py +++ b/xtuner/dataset/samplers/length_grouped.py @@ -4,6 +4,7 @@ import torch from mmengine.dist import get_dist_info, sync_random_seed +from mmengine.logging import print_log from torch.utils.data import ConcatDataset as TorchConcatDataset from torch.utils.data import Sampler @@ -78,6 +79,7 @@ def __init__(self, mega_batch_mult: Optional[int] = None, seed: Optional[int] = None, round_up: bool = True) -> None: + print_log('LengthGroupedSampler is used.', logger='current') rank, world_size = get_dist_info() self.rank = rank self.world_size = world_size @@ -120,6 +122,10 @@ def __init__(self, assert isinstance(self.length, (list, tuple)) self.total_batch_size = total_batch_size + print_log( + f'LengthGroupedSampler construction is complete, ' + f'and the selected attribute is {length_property}', + logger='current') def __iter__(self) -> Iterator[int]: """Iterate the indices.""" diff --git a/xtuner/model/__init__.py b/xtuner/model/__init__.py index 39547b2d7..1b3a501d4 100644 --- a/xtuner/model/__init__.py +++ b/xtuner/model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .internvl import InternVL_V1_5 from .llava import LLaVAModel from .sft import SupervisedFinetune -__all__ = ['SupervisedFinetune', 'LLaVAModel'] +__all__ = ['SupervisedFinetune', 'LLaVAModel', 'InternVL_V1_5'] diff --git a/xtuner/model/internvl.py b/xtuner/model/internvl.py new file mode 100644 index 000000000..0358266a9 --- /dev/null +++ b/xtuner/model/internvl.py @@ -0,0 +1,320 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import List, Optional, Tuple, Union + +import torch +from mmengine import print_log +from mmengine.config import Config, ConfigDict +from mmengine.model import BaseModel +from peft import get_peft_model, prepare_model_for_kbit_training +from torch.nn import CrossEntropyLoss +from transformers import (AutoConfig, AutoModel, AutoTokenizer, + BitsAndBytesConfig) +from transformers.modeling_outputs import CausalLMOutputWithPast + +from xtuner.registry import BUILDER +from .utils import (find_all_linear_names, get_peft_model_state_dict, + guess_load_checkpoint, make_inputs_require_grad) + + +class InternVL_V1_5(BaseModel): + + def __init__(self, + model_path, + freeze_llm=False, + freeze_visual_encoder=False, + llm_lora=None, + visual_encoder_lora=None, + quantization_vit=False, + quantization_llm=False, + pretrained_pth=None): + print_log('Start to load InternVL_V1_5 model.', logger='current') + super().__init__() + self.freeze_llm = freeze_llm + self.freeze_visual_encoder = freeze_visual_encoder + self.use_llm_lora = llm_lora is not None + self.use_visual_encoder_lora = visual_encoder_lora is not None + self.quantization_vit = quantization_vit + self.quantization_llm = quantization_llm + if quantization_vit: + assert visual_encoder_lora is not None + if quantization_llm: + assert quantization_llm and llm_lora is not None + + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + if config.llm_config.model_type == 'internlm2': + config.llm_config.attn_implementation = 'flash_attention_2' + else: + config.llm_config._attn_implementation = 'flash_attention_2' + + if quantization_vit is False and quantization_llm is False: + quantization = None + else: + llm_int8_skip_modules = ['mlp1'] + if quantization_llm and not quantization_vit: + llm_int8_skip_modules.append('vision_model') + + if quantization_vit and not quantization_llm: + llm_int8_skip_modules.append('language_model') + + quantization_config = dict( + type=BitsAndBytesConfig, + llm_int8_skip_modules=llm_int8_skip_modules, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4') + quantization_clazz = quantization_config.pop('type') + quantization = quantization_clazz(**quantization_config) + + self.model = AutoModel.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + quantization_config=quantization, + config=config, + trust_remote_code=True) + + tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True) + img_context_token_id = tokenizer.convert_tokens_to_ids('') + self.model.img_context_token_id = img_context_token_id + + if self.freeze_llm: + self.model.language_model.requires_grad_(False) + if self.freeze_visual_encoder: + self.model.vision_model.requires_grad_(False) + + if hasattr(self.model.language_model, 'enable_input_require_grads'): + self.model.language_model.enable_input_require_grads() + else: + self.model.language_model.get_input_embeddings( + ).register_forward_hook(make_inputs_require_grad) + + self.gradient_checkpointing_enable() + + if self.use_llm_lora: + self._prepare_llm_for_lora(llm_lora) + + if self.use_visual_encoder_lora: + self._prepare_visual_encoder_for_lora(visual_encoder_lora) + + if pretrained_pth is not None: + pretrained_state_dict = guess_load_checkpoint(pretrained_pth) + + self.load_state_dict(pretrained_state_dict, strict=False) + print(f'Load pretrained weight from {pretrained_pth}') + + self._count = 0 + print_log(self, logger='current') + print_log('InternVL_V1_5 construction is complete', logger='current') + + def _parse_lora_config(self, lora_config): + if isinstance(lora_config, dict) or isinstance( + lora_config, Config) or isinstance(lora_config, ConfigDict): + lora_config = BUILDER.build(lora_config) + return lora_config + + def _prepare_llm_for_lora(self, + lora_config, + use_activation_checkpointing=True): + lora_config = self._parse_lora_config(lora_config) + self.model.language_model = prepare_model_for_kbit_training( + self.model.language_model, use_activation_checkpointing) + if lora_config.target_modules is None: + modules = find_all_linear_names(self.model.language_model) + lora_config.target_modules = modules + self.model.language_model = get_peft_model(self.model.language_model, + lora_config) + + def _prepare_visual_encoder_for_lora(self, lora_config): + lora_config = self._parse_lora_config(lora_config) + if lora_config.target_modules is None: + modules = find_all_linear_names(self.model.vision_model) + lora_config.target_modules = modules + self.model.vision_model = get_peft_model(self.model.vision_model, + lora_config) + + def gradient_checkpointing_enable(self): + self.activation_checkpointing_enable() + + def activation_checkpointing_enable(self): + self.model.language_model.gradient_checkpointing_enable() + + def gradient_checkpointing_disable(self): + self.activation_checkpointing_disable() + + def activation_checkpointing_disable(self): + self.model.language_model.gradient_checkpointing_disable() + + def state_dict(self, *args, **kwargs): + state_dict = super().state_dict(*args, **kwargs) + to_return = OrderedDict() + # Step 1. visual_encoder + if self.use_visual_encoder_lora: + to_return.update( + get_peft_model_state_dict( + self.model.vision_model, state_dict=state_dict)) + elif not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'model.vision_model.' in k + }) + # Step 2. LLM + if self.use_llm_lora: + to_return.update( + get_peft_model_state_dict( + self.model.language_model, state_dict=state_dict)) + elif not self.freeze_llm: + to_return.update({ + k: v + for k, v in state_dict.items() if 'model.language_model.' in k + }) + # Step 3. Projector + to_return.update( + {k: v + for k, v in state_dict.items() if 'model.mlp1.' in k}) + return to_return + + def init_weights(self): + pass + + def forward(self, data, data_samples=None, mode='loss'): + pixel_values = data['pixel_values'] + + if type(pixel_values) is list or pixel_values.ndim == 5: + if type(pixel_values) is list: + pixel_values = [ + x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values + ] + # b*n, c, h, w + concat_images = torch.cat([ + image.to(self.model.vision_model.dtype) + for image in pixel_values + ], + dim=0) + else: + raise NotImplementedError() + + input_ids = data['input_ids'] + position_ids = data['position_ids'] + attention_mask = data['attention_mask'] + # sum is 0 are text + image_flags = torch.sum(concat_images, dim=(1, 2, 3)) != 0 + image_flags = image_flags.long() + + labels = data['labels'] + use_cache = False + + # Directly calling this code in LORA fine-tuning + # will result in an error,so we must rewrite it. + # TODO: Once the official is fixed, we can remove it. + # outputs = self.model(input_ids=input_ids, + # position_ids=position_ids, + # attention_mask=attention_mask, + # image_flags=image_flags, + # pixel_values=concat_images, + # labels=labels, + # use_cache=use_cache) + outputs = self._llm_forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + image_flags=image_flags, + pixel_values=concat_images, + labels=labels, + use_cache=use_cache) + loss_dict = {'loss': outputs.loss} + return loss_dict + + def _llm_forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_flags: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + return_dict = return_dict if return_dict is not None \ + else self.model.config.use_return_dict + + image_flags = image_flags.squeeze(-1) + # We only added the clone code here to avoid the error. + input_embeds = self.model.language_model.get_input_embeddings()( + input_ids).clone() + + vit_embeds = self.model.extract_feature(pixel_values) + vit_embeds = vit_embeds[image_flags == 1] + vit_batch_size = pixel_values.shape[0] + + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + if torch.distributed.get_rank() == 0 and self._count % 100 == 0: + print(f'dynamic ViT batch size: {vit_batch_size}, ' + f'images per sample: {vit_batch_size / B}, ' + f'dynamic token length: {N}') + self._count += 1 + + input_ids = input_ids.reshape(B * N) + selected = (input_ids == self.model.img_context_token_id) + try: + input_embeds[ + selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape( + -1, C) + except Exception as e: + vit_embeds = vit_embeds.reshape(-1, C) + print(f'warning: {e}, input_embeds[selected].shape=' + f'{input_embeds[selected].shape}, ' + f'vit_embeds.shape={vit_embeds.shape}') + n_token = selected.sum() + input_embeds[ + selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token] + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.model.language_model( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view( + -1, self.model.language_model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits, ) + outputs[1:] + return (loss, ) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + )