Initial commit
daihuaiii committed Apr 15, 2024
0 parents commit 850d4d7
Showing 13 changed files with 2,760 additions and 0 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -0,0 +1,10 @@
# ChatGLM2-6B-int4

- Add new functions in lora_utils.py
- Add a new method to the ChatGLMForConditionalGeneration class in modeling_chatglm.py
- Load the model for fine-tuning in a different manner in train_lora.py (see the hedged usage sketch after the references below)

## Ref

- https://github.com/DracoUnion/chatglm2-6b-int4-lora
- https://github.com/shuxueslpi/chatGLM-6B-QLoRA
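
A hedged sketch of how these pieces are likely meant to fit together. train_lora.py and modeling_chatglm.py are not shown in this commit view, so the method name `attach_lora` on the model and the local path `glm2-6b-int4` are assumptions, not confirmed API:

```python
# Hypothetical sketch only: train_lora.py and modeling_chatglm.py are not shown in
# this diff, so the method name `attach_lora` and the local path are assumptions.
from transformers import AutoModel, AutoTokenizer

model_dir = "glm2-6b-int4"  # local copy of the int4 checkpoint (assumed path)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(model_dir, trust_remote_code=True)

# Presumably the new method wraps lora_utils.attach_lora: it swaps every
# QuantizedLinear for a LoraQuantizedLinear and freezes all non-LoRA weights.
model = model.attach_lora(lora_r=32, lora_alpha=32, lora_dropout_rate=0.1)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable LoRA parameters: {trainable}")
```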
19 changes: 19 additions & 0 deletions chatGLM_6B_LoRA.json
@@ -0,0 +1,19 @@
{
"output_dir": "saved_files/chatGLM_6B_int4_LoRA",
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 2,
"learning_rate": 5e-4,
"num_train_epochs": 10.0,
"lr_scheduler_type": "linear",
"warmup_ratio": 0.1,
"logging_steps": 100,
"save_strategy": "epoch",
"load_best_model_at_end": false,
"evaluation_strategy": "epoch",
"optim": "adamw_torch",
"fp16": false,
"remove_unused_columns": false,
"ddp_find_unused_parameters": false,
"seed": 42
}
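
These keys mirror fields of transformers.TrainingArguments, so one plausible way a training script could consume this file is via HfArgumentParser. A minimal sketch, assuming a transformers version around the 4.27 pinned in config.json, where evaluation_strategy is still the accepted field name:

```python
# Sketch, not part of this commit: load chatGLM_6B_LoRA.json into TrainingArguments.
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser(TrainingArguments)
# parse_json_file returns one dataclass instance per type handed to the parser.
(training_args,) = parser.parse_json_file(json_file="chatGLM_6B_LoRA.json")

print(training_args.output_dir)                   # saved_files/chatGLM_6B_int4_LoRA
print(training_args.per_device_train_batch_size)  # 2
print(training_args.learning_rate)                # 0.0005
```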
33 changes: 33 additions & 0 deletions glm2-6b-int4/MODEL_LICENSE
@@ -0,0 +1,33 @@
The ChatGLM2-6B License

1. Definitions

“Licensor” means the ChatGLM2-6B Model Team that distributes its Software.

“Software” means the ChatGLM2-6B model parameters made available under this license.

2. License Grant

Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

3. Restriction

You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.

You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.

4. Disclaimer

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

5. Limitation of Liability

EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.

6. Dispute Resolution

This license shall be governed and construed in accordance with the laws of the People's Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.

Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at [email protected].
41 changes: 41 additions & 0 deletions glm2-6b-int4/config.json
@@ -0,0 +1,41 @@
{
"_name_or_path": "THUDM/chatglm2-6b",
"model_type": "chatglm",
"architectures": [
"ChatGLMModel"
],
"auto_map": {
"AutoConfig": "configuration_chatglm.ChatGLMConfig",
"AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
"AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
},
"add_bias_linear": false,
"add_qkv_bias": true,
"apply_query_key_layer_scaling": true,
"apply_residual_connection_post_layernorm": false,
"attention_dropout": 0.0,
"attention_softmax_in_fp32": true,
"bias_dropout_fusion": true,
"ffn_hidden_size": 13696,
"fp32_residual_connection": false,
"hidden_dropout": 0.0,
"hidden_size": 4096,
"kv_channels": 128,
"layernorm_epsilon": 1e-05,
"multi_query_attention": true,
"multi_query_group_num": 2,
"num_attention_heads": 32,
"num_layers": 28,
"original_rope": true,
"padded_vocab_size": 65024,
"post_layer_norm": true,
"quantization_bit": 4,
"rmsnorm": true,
"seq_length": 32768,
"use_cache": true,
"torch_dtype": "float16",
"transformers_version": "4.27.1",
"tie_word_embeddings": false,
"eos_token_id": 2,
"pad_token_id": 0
}
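
The auto_map block above is what lets the generic Auto classes resolve to the custom code shipped with the checkpoint. A small hedged check of the quantization-related fields (the local path is an assumption):

```python
# Sketch: inspect the checkpoint config; trust_remote_code=True is required so that
# auto_map can resolve AutoConfig to configuration_chatglm.ChatGLMConfig.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("glm2-6b-int4", trust_remote_code=True)  # assumed local path
print(config.quantization_bit)       # 4  -> weights stored as int4, dequantized on the fly
print(config.multi_query_group_num)  # 2  -> multi-query attention with 2 KV groups
print(config.seq_length)             # 32768
```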
59 changes: 59 additions & 0 deletions glm2-6b-int4/configuration_chatglm.py
@@ -0,0 +1,59 @@
from transformers import PretrainedConfig


class ChatGLMConfig(PretrainedConfig):
model_type = "chatglm"
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.kv_channels = kv_channels
self.num_attention_heads = num_attention_heads
self.seq_length = seq_length
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias
self.bias_dropout_fusion = bias_dropout_fusion
self.multi_query_attention = multi_query_attention
self.multi_query_group_num = multi_query_group_num
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.fp32_residual_connection = fp32_residual_connection
self.quantization_bit = quantization_bit
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
super().__init__(**kwargs)
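
For comparison with config.json above: the defaults here (seq_length=2048, quantization_bit=0, add_qkv_bias=False) are only fallbacks, overridden by whatever the checkpoint's config.json serializes. A small sketch, assuming the working directory is glm2-6b-int4 so the module can be imported directly:

```python
# Sketch: build the config by hand with the int4 checkpoint's overrides.
from configuration_chatglm import ChatGLMConfig  # assumes cwd is glm2-6b-int4

config = ChatGLMConfig(
    quantization_bit=4,         # default 0
    add_qkv_bias=True,          # default False
    seq_length=32768,           # default 2048
    multi_query_attention=True,
    multi_query_group_num=2,
)
print(config.to_json_string())  # matches the corresponding keys in config.json
```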
141 changes: 141 additions & 0 deletions glm2-6b-int4/lora_utils.py
@@ -0,0 +1,141 @@
'''
Edited according to https://zhuanlan.zhihu.com/p/662569090
for PEFT-style LoRA fine-tuning on CPU.
'''
import torch
from .quantization import *




class LoraQuantizedLinear(torch.nn.Module):

def __init__(self, q_linear, lora_r=32, lora_alpha=32, lora_dropout_rate=0.0):
super().__init__()

# Save the original quantized parameters and the LoRA configuration
self.lora_r = lora_r
self.lora_alpha = lora_alpha
self.lora_dropout_rate = lora_dropout_rate
self.weight_bit_width = q_linear.weight_bit_width
self.weight = q_linear.weight
self.weight_scale = q_linear.weight_scale
self.bias = q_linear.bias

# Freeze the original parameters
self.weight.requires_grad = False
self.weight_scale.requires_grad = False
if self.bias is not None: self.bias.requires_grad = False

# Create the LoRA parameters in FP16
out_dim, in_dim = self.weight.shape
# With INT4 weights, the stored in_dim is half the real size, so double it
if self.weight_bit_width == 4: in_dim *= 2
# lora_a: Kaiming-normal initialization
self.lora_a = torch.nn.Parameter(torch.empty(
[self.lora_r, in_dim],
device=self.weight.device,
dtype=torch.float16,
))
torch.nn.init.kaiming_normal_(self.lora_a)
# lora_b: zero initialization
self.lora_b = torch.nn.Parameter(torch.zeros(
[out_dim, self.lora_r],
device=self.weight.device,
dtype=torch.float16,
))
self.lora_dropout = torch.nn.Dropout(self.lora_dropout_rate)
self.lora_scale = self.lora_alpha / self.lora_r

def forward(self, input):
ori_output = QuantizedLinear.forward(self, input)
lora_output = (
self.lora_dropout(input.half()) @
self.lora_a.transpose(0, 1) @
self.lora_b.transpose(0, 1) *
self.lora_scale
)
return ori_output + lora_output.to(ori_output.dtype)

def merge(self):
# H = XW + b + XAB * s => H = X(W + AB * s) + b
# Dequantize the original int weights to fp16
weight = extract_weight_to_half(self.weight, self.weight_scale, self.weight_bit_width)
# Fold the LoRA update into the weight
weight += self.lora_b @ self.lora_a * self.lora_scale
# Re-quantize back to int
weight, weight_scale = half_weight_to_int(weight, self.weight_bit_width)
self.weight = torch.nn.Parameter(weight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
# Re-initialize the two LoRA matrices
torch.nn.init.kaiming_normal_(self.lora_a)
torch.nn.init.zeros_(self.lora_b)



# Replace every QuantizedLinear in the model with a LoraQuantizedLinear
def attach_lora(model, lora_r=32, lora_alpha=32, lora_dropout_rate=0.0):
# getattr guards the first call, before the lora_attached flag has been set
if getattr(model, 'lora_attached', False): return model
lora_conf = dict(lora_r=lora_r, lora_alpha=lora_alpha, lora_dropout_rate=lora_dropout_rate)
for mod in model.modules():
for name in dir(mod):
submod = getattr(mod, name, None)
if not isinstance(submod, QuantizedLinear):
continue
new_submod = LoraQuantizedLinear(submod, **lora_conf)
setattr(mod, name, new_submod)

for name, param in model.named_parameters():
if 'lora_' not in name:
param.requires_grad = False
model.lora_attached = True
return model


# Export only the LoRA parameters.
def lora_state_dict(model):
return {
k:v
for k, v in model.state_dict().items()
if 'lora_' in k
}


def base_state_dict(model):
return {
k:v
for k, v in model.state_dict().items()
if 'lora_' not in k
}


# Merge the LoRA weights of every LoraQuantizedLinear into its base quantized weights.
def merge_lora(model):
for mod in model.modules():
if isinstance(mod, LoraQuantizedLinear):
mod.merge()
return model


# Walk every module in the model, check its direct attributes, and swap any LoraQuantizedLinear back to a plain QuantizedLinear.
def detach_lora(model):
if not getattr(model, 'lora_attached', False): return model

for mod in model.modules():
for name in dir(mod):
submod = getattr(mod, name, None)
if not isinstance(submod, LoraQuantizedLinear):
continue
new_submod = QuantizedLinear.from_params(
submod.weight_bit_width,
submod.weight,
submod.weight_scale,
submod.bias,
)
setattr(mod, name, new_submod)

model.lora_attached = False
return model
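
A hedged sketch of the intended life cycle of these helpers (not part of the commit; the training step and the already-loaded int4 model are placeholders):

```python
import torch

def lora_finetune_cycle(model, run_training):
    """Illustrative workflow for the helpers above; assumes `model` is a
    ChatGLM2-6B-int4 instance whose linear layers are QuantizedLinear modules."""
    # 1. Wrap every QuantizedLinear and freeze the base weights.
    model = attach_lora(model, lora_r=32, lora_alpha=32, lora_dropout_rate=0.1)

    # 2. Train; only lora_a / lora_b require grad, so only they are updated.
    run_training(model)  # placeholder for a Trainer or a manual loop

    # 3. Persist just the adapter, which is tiny compared to the full checkpoint.
    torch.save(lora_state_dict(model), "lora_adapter.bin")

    # 4. Optionally fold the adapter into the int4 weights and strip the wrappers.
    model = merge_lora(model)
    model = detach_lora(model)
    return model
```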

