Skip to content

Commit c54d4c8

Browse files
committed
use einsum
1 parent 9391866 commit c54d4c8

File tree

3 files changed

+5
-7
lines changed

3 files changed

+5
-7
lines changed

src/kernels/attention/attention_cpu.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ inline void mha(torch::Tensor query,
9191
}
9292
// apply causal mask
9393
if (kv_idx_base + j > q_idx_base + q_idx) {
94-
s(j) = -INFINITY;
94+
s(j) = -5e4;
9595
}
9696
max = std::max(max, s(j));
9797
}

src/kernels/attention/attention_cpu_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ torch::Tensor masked_self_attention(
3535
torch::Tensor mask = torch::ones({1, q_seq_len, seq_len}, torch::kBool);
3636
// returns the lower triangular part of a matrix
3737
mask = torch::tril(mask, /*diagonal=*/seq_len - q_seq_len).to(query);
38-
scores = scores.masked_fill(mask == 0, -INFINITY);
38+
scores = scores.masked_fill(mask == 0, -5e4);
3939

4040
// safe softmax
4141
scores = torch::softmax(scores, /*dim=*/-1);

src/kernels/attention/attention_kernel_sm80_test.cu

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,15 @@ torch::Tensor attention_ref(
2121

2222
const float sm_scale = 1.0 / sqrt(head_dim);
2323
// query * key => [n_heads, q_seq_len, seq_len]
24-
// auto scores = torch::einsum("bhqd,bhkd->bhqk", {query, key});
25-
auto scores = torch::matmul(query, key.transpose(-2, -1));
24+
auto scores = torch::einsum("bhqd,bhkd->bhqk", {query, key});
2625
// apply scale
2726
scores *= sm_scale;
2827

2928
// safe softmax
3029
scores = torch::softmax(scores, /*dim=*/-1);
3130

32-
// score * value => [batch_size, q_seq_len, n_heads, head_dim]
33-
// return torch::einsum("bhqk,bkhd->bhqd", {scores, value});
34-
return torch::matmul(scores, value);
31+
// score * value => [batch_size, n_heads, q_seq_len, head_dim]
32+
return torch::einsum("bhqk,bhkd->bhqd", {scores, value});
3533
}
3634

3735
torch::Tensor attention_sm80(

0 commit comments

Comments (0)