@@ -1,14 +1,8 @@
 from typing import Optional
 import torch
 import bmtrain as bmt
-from bmtrain.nn import (
-    Linear,
-    ColumnParallelLinear,
-    RowParallelLinear,
-)
+from bmtrain.nn import Linear
 import math
-from bmtrain.global_var import config
-from bmtrain.distributed import all_gather
 
 class Attention(bmt.DistributedModule):
     def __init__(self,
@@ -18,21 +12,14 @@ def __init__(self,
         ) -> None:
         super().__init__()
 
-        if config['tp_size'] > 1:
-            self.project_q = ColumnParallelLinear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype, gather_input=False)
-            self.project_k = ColumnParallelLinear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype, gather_input=False)
-            self.project_v = ColumnParallelLinear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype, gather_input=False)
-            self.project_out = RowParallelLinear(dim_head * num_heads, dim_model, bias=bias, dtype=dtype)
-        else:
-            self.project_q = Linear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype)
-            self.project_k = Linear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype)
-            self.project_v = Linear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype)
-            self.project_out = Linear(dim_head * num_heads, dim_model, bias=bias, dtype=dtype)
+        self.project_q = Linear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype)
+        self.project_k = Linear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype)
+        self.project_v = Linear(dim_model, dim_head * num_heads, bias=bias, dtype=dtype)
 
+        self.project_out = Linear(dim_head * num_heads, dim_model, bias=bias, dtype=dtype)
 
         self.softmax = torch.nn.Softmax(dim=-1)
         self.num_heads = num_heads
-        self.num_kv_heads = num_heads
         self.dim_head = dim_head
         self.dim_model = dim_model
 
@@ -45,50 +32,32 @@ def forward(self,
         batch_size, seq_q, dim_model = hidden_q.size()
         seq_kv = hidden_kv.size(1)
 
-        if isinstance(self.project_q, ColumnParallelLinear):
-            assert hidden_q.data_ptr() == hidden_kv.data_ptr()
-            hidden_q = bmt.nn.OpParallelLinear.apply(
-                hidden_q,
-                torch.cat([self.project_q.weight, self.project_k.weight, self.project_v.weight], dim=0),
-                torch.cat([self.project_q.bias, self.project_k.bias, self.project_v.bias], dim=0) if self.project_q.bias is not None else None,
-                True, False,
-                False, None
-            )
-            h_q, h_k, h_v = hidden_q.chunk(3, dim=-1)
-        else:
-            h_q : torch.Tensor = self.project_q(hidden_q)
-            h_k : torch.Tensor = self.project_k(hidden_q)
-            h_v : torch.Tensor = self.project_v(hidden_q)
-        if config['tp_size'] > 1:
-            #batch_size will changed in TensorParallel
-            batch_size = h_v.shape[0]
-
-        h_q = h_q.view(batch_size, seq_q, -1, self.dim_head)
-        h_k = h_k.view(batch_size, seq_kv, -1, self.dim_head)
-        h_v = h_v.view(batch_size, seq_kv, -1, self.dim_head)
+        h_q : torch.Tensor = self.project_q(hidden_q)
+        h_k : torch.Tensor = self.project_k(hidden_kv)
+        h_v : torch.Tensor = self.project_v(hidden_kv)
+
+        h_q = h_q.view(batch_size, seq_q, self.num_heads, self.dim_head)
+        h_k = h_k.view(batch_size, seq_kv, self.num_heads, self.dim_head)
+        h_v = h_v.view(batch_size, seq_kv, self.num_heads, self.dim_head)
 
         h_q = h_q.permute(0, 2, 1, 3).contiguous()
         h_k = h_k.permute(0, 2, 1, 3).contiguous()
         h_v = h_v.permute(0, 2, 1, 3).contiguous()
 
-        h_q = h_q.view(-1, seq_q, self.dim_head)
-        h_k = h_k.view(-1, seq_kv, self.dim_head)
-        h_v = h_v.view(-1, seq_kv, self.dim_head)
+        h_q = h_q.view(batch_size * self.num_heads, seq_q, self.dim_head)
+        h_k = h_k.view(batch_size * self.num_heads, seq_kv, self.dim_head)
+        h_v = h_v.view(batch_size * self.num_heads, seq_kv, self.dim_head)
 
         score = torch.bmm(
             h_q, h_k.transpose(1, 2)
         )
         score = score / math.sqrt(self.dim_head)
 
-        score = score.view(batch_size, -1, seq_q, seq_kv)
+        score = score.view(batch_size, self.num_heads, seq_q, seq_kv)
 
         if position_bias is not None:
-            score = score + position_bias.view(batch_size, -1, seq_q, seq_kv)
-
-        if config['tp_size'] > 1:
-            with torch.no_grad():
-                mask = all_gather(mask, config['tp_comm']).flatten(0,1)
-
+            score = score + position_bias.view(batch_size, self.num_heads, seq_q, seq_kv)
+
         score = torch.where(
             mask.view(batch_size, 1, seq_q, seq_kv),
             score,
@@ -101,14 +70,14 @@ def forward(self,
             torch.scalar_tensor(0, device=score.device, dtype=score.dtype)
         )
 
-        score = score.view(-1, seq_q, seq_kv)
+        score = score.view(batch_size * self.num_heads, seq_q, seq_kv)
 
         h_out = torch.bmm(
             score, h_v
         )
-        h_out = h_out.view(batch_size, -1, seq_q, self.dim_head)
+        h_out = h_out.view(batch_size, self.num_heads, seq_q, self.dim_head)
         h_out = h_out.permute(0, 2, 1, 3).contiguous()
-        h_out = h_out.view(batch_size, seq_q, -1)
+        h_out = h_out.view(batch_size, seq_q, self.num_heads * self.dim_head)
 
         attn_out = self.project_out(h_out)
         return attn_out
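
For reference, here is a minimal usage sketch of the simplified (non-tensor-parallel) Attention layer after this change. The constructor and forward signatures are elided from the hunks above, so the keyword names used below (dim_model, dim_head, num_heads, bias, dtype; hidden_q, hidden_kv, mask, position_bias), as well as the import path of the layer, are assumptions inferred from the visible bodies; BMTrain layers also expect bmt.init_distributed() to have been called first (e.g. under torchrun) before any distributed parameters are created.

import torch
import bmtrain as bmt
from attention import Attention   # hypothetical module path for the layer in this diff

bmt.init_distributed(seed=0)       # must run before constructing bmt.DistributedModule layers

# Assumed constructor keywords, matching the names used in __init__'s body.
attn = Attention(dim_model=512, dim_head=64, num_heads=8, bias=True, dtype=torch.half)

batch_size, seq_q, seq_kv = 2, 16, 16
hidden = torch.randn(batch_size, seq_q, 512, dtype=torch.half, device="cuda")

# Boolean attention mask, True where attending is allowed (causal lower-triangular here).
mask = torch.tril(torch.ones(batch_size, seq_q, seq_kv, device="cuda")).bool()

# Self-attention: the same tensor serves as hidden_q and hidden_kv.
out = attn(hidden, hidden, mask, position_bias=None)
print(out.shape)                   # expected: (batch_size, seq_q, dim_model) == (2, 16, 512)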