Optimize performance: replace ops.pad with ops.concat in window_partition

Mark-ZhouWX · Mark-ZhouWX · commit 846438408202 · 2023-09-13T11:19:27.000+08:00
diff --git a/research/segment-anything/configs/coco_box_finetune.yaml b/research/segment-anything/configs/coco_box_finetune.yaml
@@ -61,7 +61,7 @@ train_loader:
 
   shuffle: True
   batch_size: 1
-  epoch_size: 10
+  epoch_size: 20
   drop_remainder: True
   num_workers: 2
   max_rowsize: 24  # 24M space for dataloader
diff --git a/research/segment-anything/segment_anything/modeling/image_encoder.py b/research/segment-anything/segment_anything/modeling/image_encoder.py
@@ -264,7 +264,10 @@ def window_partition(x: ms.Tensor, window_size: int) -> Tuple[ms.Tensor, Tuple[i
     pad_h = (window_size - H % window_size) % window_size
     pad_w = (window_size - W % window_size) % window_size
     if pad_h > 0 or pad_w > 0:
-        x = ops.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+        # zero-pad the right (W) and bottom (H) edges via ops.concat instead of ops.pad for better performance
+        pad_mat1 = ops.zeros((B, H, pad_w, C), x.dtype)
+        pad_mat2 = ops.zeros((B, pad_h, W + pad_w, C), x.dtype)
+        x = ops.concat([ops.concat([x, pad_mat1], axis=2), pad_mat2], axis=1)
     Hp, Wp = H + pad_h, W + pad_w
 
     x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
@@ -401,7 +404,8 @@ def __init__(
         )
 
     def construct(self, x: ms.Tensor) -> ms.Tensor:
-        x = ops.pad(x, (self.padding[0], self.padding[0], self.padding[1], self.padding[1]))  # to align with torch
+        if sum(self.padding) > 0:
+            x = ops.pad(x, (self.padding[0], self.padding[0], self.padding[1], self.padding[1]))  # to align with torch
         x = self.proj(x)
         # B C H W -> B H W C
         x = x.permute(0, 2, 3, 1)