Optimize performance: run attention bmm in fp16 instead of fp32

Mark-ZhouWX · Mark-ZhouWX · commit f08bbdd3830c · 2023-09-13T10:17:08.000+08:00
diff --git a/research/segment-anything/segment_anything/modeling/transformer.py b/research/segment-anything/segment_anything/modeling/transformer.py
@@ -228,13 +228,13 @@ def construct(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
 
         # Attention
         _, _, _, c_per_head = q.shape
+        dtype = q.dtype
         attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
-        attn = attn / Tensor(math.sqrt(c_per_head), ms.float32)
+        attn = attn / Tensor(math.sqrt(c_per_head), dtype)
         attn = ops.softmax(attn, axis=-1)
 
         # Get output
-        dtype = attn.dtype
-        out = attn @ v.astype(dtype)
+        out = attn @ v
         out = self._recombine_heads(out)
         out = self.out_proj(out)