
Commit aacbdfd

kernel: added logits soft cap support for attention
1 parent b78389c commit aacbdfd

File tree

8 files changed (+172 / -50 lines)


.devcontainer/Dockerfile

Lines changed: 5 additions & 1 deletion

@@ -2,8 +2,12 @@ ARG BASE_IMAGE=vectorchai/scalellm_devel:cuda12.4
 FROM ${BASE_IMAGE}
 
 ARG USER=vscode
+ARG UID=1000
+ARG GID=1000
+
 # Run as non-root user
-RUN useradd -m ${USER}
+RUN groupadd --gid ${GID} ${USER} \
+    && useradd --uid ${UID} --gid ${GID} -m ${USER} --shell /bin/bash
 RUN echo ${USER} ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/${USER} \
     && chmod 0440 /etc/sudoers.d/${USER}
 
.devcontainer/devcontainer.json

Lines changed: 8 additions & 4 deletions

@@ -5,7 +5,9 @@
     "dockerfile": "Dockerfile",
     "args": {
       "BASE_IMAGE": "vectorchai/scalellm_devel:cuda12.4",
-      "USER": "${localEnv:USER}"
+      "USER": "${localEnv:USER:vscode}",
+      "UID": "${localEnv:UID:1000}",
+      "GID": "${localEnv:GID:1000}"
     }
   },
   // Access GPUs from inside the container
@@ -17,8 +19,8 @@
     "HUGGING_FACE_HUB_TOKEN": "${localEnv:HUGGING_FACE_HUB_TOKEN}"
   },
   // Run as the current user
-  "remoteUser": "${localEnv:USER}",
-  "containerUser": "${localEnv:USER}",
+  "remoteUser": "${localEnv:USER:vscode}",
+  "containerUser": "${localEnv:USER:vscode}",
   "updateRemoteUserUID": true,
   // Ports should be forwarded from inside container to the local machine
   "forwardPorts": [],
@@ -35,7 +37,9 @@
       "ms-vscode.cpptools-extension-pack",
       "llvm-vs-code-extensions.vscode-clangd",
       "ms-python.python",
-      "ms-azuretools.vscode-docker"
+      "ms-azuretools.vscode-docker",
+      "ziruiwang.nvidia-monitor",
+      "mutantdino.resourcemonitor"
     ],
     "settings": {
       "extensions.verifySignature": false,

src/kernels/attention/CMakeLists.txt

Lines changed: 6 additions & 0 deletions

@@ -7,6 +7,10 @@ cc_library(
     attention.kernel
   HDRS
     attention_cpu.h
+    ptx.cuh
+    fast_cast.cuh
+    online_softmax.cuh
+    attention_traits_sm80.h
     attention_kernel_sm80.cuh
   SRCS
     # attention.cu
@@ -37,6 +41,8 @@ cc_binary(
     nvbench::nvbench
     nvbench::main
     :attention.kernel
+  COPTS
+    -lineinfo
 )
 
 add_subdirectory(flash_attn)

src/kernels/attention/attention_bench_sm80.cu

Lines changed: 11 additions & 8 deletions

@@ -20,6 +20,7 @@ void attention_bench_sm80(nvbench::state& state) {
   const auto n_heads = state.get_int64("n_heads");
   const auto n_kv_heads = state.get_int64("n_kv_heads");
   const auto head_dim = state.get_int64("head_dim");
+  const float logits_soft_cap = state.get_float64("logits_soft_cap");
 
   const auto options = torch::dtype(torch::kHalf).device(torch::kCUDA);
   const auto query =
@@ -31,7 +32,7 @@ void attention_bench_sm80(nvbench::state& state) {
 
   auto out = torch::empty_like(query);
 
-  const float sm_scale = 1.0 / sqrt(head_dim) * M_LOG2E;
+  const float sm_scale = 1.0 / sqrt(head_dim);
   const auto h_stride = query.stride(1);
   const auto kv_h_stride = key.stride(1);
 
@@ -43,7 +44,7 @@ void attention_bench_sm80(nvbench::state& state) {
       AttentionTraitsSM80<cute::half_t, kHeadDim, kBlockM, kBlockN>;
 
   dim3 block = AttentionTraits::kThreadNum;
-  dim3 grid((q_len + kBlockM - 1) / kBlockM, batch_size * head_dim);
+  dim3 grid(q_len / kBlockM, batch_size * n_heads);
 
   const auto smem_size = AttentionTraits::kSmemSize;
   auto attention_kernel = mha_kernel_sm80<AttentionTraits>;
@@ -61,14 +62,16 @@ void attention_bench_sm80(nvbench::state& state) {
         kv_h_stride,
         q_len,
         kv_len,
-        sm_scale);
+        sm_scale,
+        logits_soft_cap);
   });
 }
 
 NVBENCH_BENCH(attention_bench_sm80)
     .add_int64_axis("batch_size", {1})
-    .add_int64_axis("q_len", {64})
-    .add_int64_axis("kv_len", {64, 128})
-    .add_int64_axis("n_heads", {2})
-    .add_int64_axis("n_kv_heads", {2})
-    .add_int64_axis("head_dim", {64});
+    .add_int64_axis("q_len", {1024})
+    .add_int64_axis("kv_len", {1024})
+    .add_int64_axis("n_heads", {32})
+    .add_int64_axis("n_kv_heads", {32})
+    .add_int64_axis("head_dim", {64})
+    .add_float64_axis("logits_soft_cap", {0.0});

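For reference (not part of the commit), the corrected grid above assigns one thread block per kBlockM-row query block along x and one per (batch, head) pair along y. The standalone sketch below just reproduces that arithmetic for the new default benchmark axes, assuming the same kBlockM = 64 tile size used in the unit test and that q_len is a multiple of kBlockM, as the bench now requires:

#include <cstdio>

int main() {
  // New default benchmark axes (see the NVBENCH_BENCH axes above).
  const int batch_size = 1;
  const int q_len = 1024;
  const int n_heads = 32;
  const int kBlockM = 64;  // assumed query rows per thread block
  // grid.x: one block per kBlockM-row query block (q_len % kBlockM == 0).
  // grid.y: one block per (batch, head) pair, i.e. batch_size * n_heads.
  const int grid_x = q_len / kBlockM;
  const int grid_y = batch_size * n_heads;
  std::printf("grid = (%d, %d) -> %d thread blocks\n", grid_x, grid_y,
              grid_x * grid_y);  // prints: grid = (16, 32) -> 512 thread blocks
  return 0;
}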

src/kernels/attention/attention_kernel_sm80.cuh

Lines changed: 56 additions & 5 deletions

@@ -7,6 +7,7 @@
 
 #include "fast_cast.cuh"
 #include "online_softmax.cuh"
+#include "ptx.cuh"
 
 namespace llm {
 
@@ -19,7 +20,8 @@ __global__ void mha_kernel_sm80(void* o,
                                 int64_t kv_h_stride,
                                 int64_t q_len,
                                 int64_t kv_len,
-                                float sm_scale) {
+                                float sm_scale,
+                                float logits_soft_cap) {
   using namespace cute;
 
   // type alias
@@ -49,6 +51,30 @@ __global__ void mha_kernel_sm80(void* o,
   const auto base_id = blockIdx.y;
   const auto tidx = threadIdx.x;
 
+  // preprocess input parameters
+  // TODO: Move following logic to the host side?
+  if (logits_soft_cap != 0.0) {
+    // Softmax(x * sm_scale) + apply_logits_soft_cap
+    //   => Softmax(Tanh(x * sm_scale / soft_cap) * soft_cap)
+    //   => Softmax(S' * sm_scale') where
+    //      S' = Tanh(x * sm_scale / soft_cap)
+    //         = Tanh(x * soft_cap')
+    //      soft_cap' = sm_scale / soft_cap
+    //      sm_scale' = soft_cap
+    const auto sm_scale_hat = logits_soft_cap;
+    logits_soft_cap = sm_scale * ptx::rcp(logits_soft_cap);
+    sm_scale = sm_scale_hat;
+  }
+  auto apply_logits_soft_cap = [&](auto& tSrAccS) {
+    CUTE_UNROLL
+    for (int i = 0; i < size(tSrAccS); ++i) {
+      tSrAccS(i) = ptx::tanh(tSrAccS(i) * logits_soft_cap);
+    }
+  };
+
+  // use exp2f instead of expf for better performance
+  sm_scale *= M_LOG2E;
+
   // ProblemShape
   // TODO: support non-contiguous layout
   // (q_len, head_dim)
@@ -136,10 +162,22 @@ __global__ void mha_kernel_sm80(void* o,
   // S = Q@K.T
   // tSrAccS: (MMA,MMA_M,MMA_N)
   auto compute_qk = [&](auto& tSrAccS) {
+    // prefetch kv
+    cute::copy(smem_tiled_copy_Q, tSsQ(_, _, _0{}), tSrQ_copy_view(_, _, _0{}));
+    cute::copy(smem_tiled_copy_K, tSsK(_, _, _0{}), tSrK_copy_view(_, _, _0{}));
+
     CUTE_UNROLL
     for (int ki = 0; ki < size<2>(tSrQ); ++ki) {
-      cute::copy(smem_tiled_copy_Q, tSsQ(_, _, ki), tSrQ_copy_view(_, _, ki));
-      cute::copy(smem_tiled_copy_K, tSsK(_, _, ki), tSrK_copy_view(_, _, ki));
+      // prefetch next kv
+      if (ki != size<2>(tSrQ) - 1) {
+        const auto next_ki = ki + 1;
+        cute::copy(smem_tiled_copy_Q,
+                   tSsQ(_, _, next_ki),
+                   tSrQ_copy_view(_, _, next_ki));
+        cute::copy(smem_tiled_copy_K,
+                   tSsK(_, _, next_ki),
+                   tSrK_copy_view(_, _, next_ki));
+      }
       cute::gemm(tiled_mma, tSrQ(_, _, ki), tSrK(_, _, ki), tSrAccS);
     }
   };
@@ -163,10 +201,18 @@ __global__ void mha_kernel_sm80(void* o,
     // convert layout from gemm-I C to gemm-II A
     auto tOrS = make_tensor(tSrS.data(), Layout::to_mma_a(tSrS.layout()));
 
+    // prefetch V^t
+    cute::copy(
+        smem_tiled_copy_Vt, tOsVt(_, _, _0{}), tOrVt_copy_view(_, _, _0{}));
     CUTE_UNROLL
     for (int ki = 0; ki < size<2>(tOrS); ++ki) {
-      cute::copy(
-          smem_tiled_copy_Vt, tOsVt(_, _, ki), tOrVt_copy_view(_, _, ki));
+      // prefetch next V^t
+      if (ki != size<2>(tOrS) - 1) {
+        const auto next_ki = ki + 1;
+        cute::copy(smem_tiled_copy_Vt,
+                   tOsVt(_, _, next_ki),
+                   tOrVt_copy_view(_, _, next_ki));
+      }
      cute::gemm(tiled_mma, tOrS(_, _, ki), tOrVt(_, _, ki), tOrAccO);
    }
  };
@@ -246,6 +292,11 @@ __global__ void mha_kernel_sm80(void* o,
     // 1> S = Q@K.T
     compute_qk(tSrAccS);
 
+    // apply soft cap if needed
+    if (logits_soft_cap != 0.0) {
+      apply_logits_soft_cap(tSrAccS);
+    }
+
     // apply softmax and rescale
     softmax.rescale(tSrAccS_rc_view, tOrAccO_rc_view);
 
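The comment block in the kernel above folds the soft cap into the existing scales so the hot loop only needs one multiply and one tanh per element: soft_cap' = sm_scale / soft_cap is applied inside the tanh, and the original cap value takes over the role of the softmax scale. A minimal host-side sketch (not from the commit; it uses exact std::tanh and division in place of the kernel's ptx::tanh and ptx::rcp approximations) that checks the folded form matches the straightforward one:

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const float head_dim = 64.0f;
  const float sm_scale = 1.0f / std::sqrt(head_dim);
  const float soft_cap = 50.0f;

  // Folded parameters, mirroring the kernel's preprocessing step.
  const float soft_cap_hat = sm_scale / soft_cap;  // goes inside the tanh
  const float sm_scale_hat = soft_cap;             // reused as the softmax scale

  for (float x : {-500.0f, -3.0f, 0.0f, 7.5f, 1200.0f}) {
    // Straightforward form: scale the logit, then soft-cap it.
    const float reference = std::tanh(x * sm_scale / soft_cap) * soft_cap;
    // Folded form used by the kernel: tanh with soft_cap', then sm_scale'.
    const float folded = std::tanh(x * soft_cap_hat) * sm_scale_hat;
    assert(std::fabs(reference - folded) < 1e-5f);
    std::printf("x = %8.1f  capped logit = %8.4f\n", x, reference);
  }
  return 0;
}
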
src/kernels/attention/attention_kernel_sm80_test.cu

Lines changed: 46 additions & 24 deletions

@@ -11,32 +11,40 @@ namespace {
 torch::Tensor attention_ref(
     torch::Tensor query,  // [batch_size, n_heads, q_len, head_dim]
     torch::Tensor key,    // [batch_size, n_kv_heads, kv_len, head_dim]
-    torch::Tensor value  // [batch_size, n_kv_heads, kv_len, head_dim]
-) {
+    torch::Tensor value,  // [batch_size, n_kv_heads, kv_len, head_dim]
+    float logits_soft_cap) {
   const auto n_heads = query.size(1);
   const auto n_kv_heads = key.size(1);
   const auto head_dim = query.size(3);
   assert(n_heads == n_kv_heads);
 
   const float sm_scale = 1.0 / sqrt(head_dim);
   // query * key => [n_heads, q_seq_len, seq_len]
-  auto scores = torch::einsum("bhqd,bhkd->bhqk", {query, key});
+  auto scores = torch::einsum("bhqd,bhkd->bhqk",
+                              {query.to(torch::kFloat), key.to(torch::kFloat)});
   // apply scale
   scores *= sm_scale;
 
+  // apply softcap if needed
+  if (logits_soft_cap != 0.0) {
+    scores = torch::tanh(scores / logits_soft_cap) * logits_soft_cap;
+  }
+
   // safe softmax
   scores = torch::softmax(scores, /*dim=*/-1);
 
   // score * value => [batch_size, n_heads, q_seq_len, head_dim]
-  return torch::einsum("bhqk,bhkd->bhqd", {scores, value});
+  return torch::einsum("bhqk,bhkd->bhqd", {scores, value.to(torch::kFloat)})
+      .type_as(query);
 }
 
 torch::Tensor attention_sm80(
     torch::Tensor query,  // [batch_size, n_heads, q_len, head_dim]
     torch::Tensor key,    // [batch_size, n_kv_heads, kv_len, head_dim]
-    torch::Tensor value  // [batch_size, n_kv_heads, kv_len, head_dim]
-) {
+    torch::Tensor value,  // [batch_size, n_kv_heads, kv_len, head_dim]
+    float logits_soft_cap) {
   const auto batch_size = query.size(0);
+  const auto n_heads = query.size(1);
   const auto q_len = query.size(2);
   const auto kv_len = key.size(2);
   const auto head_dim = query.size(3);
@@ -50,13 +58,13 @@ torch::Tensor attention_sm80(
   constexpr int32_t kBlockM = 64;
   constexpr int32_t kBlockN = 64;
 
-  const float sm_scale = 1.0 / sqrt(head_dim) * M_LOG2E;
+  const float sm_scale = 1.0 / sqrt(head_dim);
 
   using AttentionTraits =
       AttentionTraitsSM80<cute::half_t, kHeadDim, kBlockM, kBlockN>;
 
   dim3 block = AttentionTraits::kThreadNum;
-  dim3 grid((q_len + kBlockM - 1) / kBlockM, batch_size * head_dim);
+  dim3 grid((q_len + kBlockM - 1) / kBlockM, batch_size * n_heads);
 
   const auto smem_size = AttentionTraits::kSmemSize;
   auto attention_kernel = mha_kernel_sm80<AttentionTraits>;
@@ -72,7 +80,8 @@ torch::Tensor attention_sm80(
       kv_h_stride,
       q_len,
       kv_len,
-      sm_scale);
+      sm_scale,
+      logits_soft_cap);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return out;
 }
@@ -85,11 +94,23 @@ class AttentionKernelTest
                      int64_t /*kv_len*/,
                      int64_t /*n_heads*/,
                      int64_t /*n_kv_heads*/,
-                     int64_t /*head_dim*/>> {};
+                     int64_t /*head_dim*/,
+                     float /*logits_soft_cap*/>> {
+ public:
+  void SetUp() override {
+    // Set random seed for test stability
+    torch::manual_seed(0);
+  }
+};
 
 TEST_P(AttentionKernelTest, MHA) {
-  const auto [batch_size, q_len, kv_len, n_heads, n_kv_heads, head_dim] =
-      GetParam();
+  const auto [batch_size,
+              q_len,
+              kv_len,
+              n_heads,
+              n_kv_heads,
+              head_dim,
+              logits_soft_cap] = GetParam();
 
   const auto options = torch::dtype(torch::kHalf).device(torch::kCUDA);
 
@@ -100,21 +121,22 @@ TEST_P(AttentionKernelTest, MHA) {
   const auto value =
       torch::randn({batch_size, n_kv_heads, kv_len, head_dim}, options);
 
-  auto ref_out = attention_ref(query, key, value);
-  auto out = attention_sm80(query, key, value);
+  auto ref_out = attention_ref(query, key, value, logits_soft_cap);
+  auto out = attention_sm80(query, key, value, logits_soft_cap);
 
   EXPECT_TRUE(torch::allclose(out, ref_out, /*rtol=*/1e-3, /*atol=*/1e-3));
 }
 
-INSTANTIATE_TEST_SUITE_P(MHA,
-                         AttentionKernelTest,
-                         ::testing::Combine(::testing::Values(1),  // batch_size
-                                            ::testing::Values(64),  // q_len
-                                            ::testing::Values(64,
-                                                              256),  // kv_len
-                                            ::testing::Values(2),  // n_heads
-                                            ::testing::Values(2),  // n_kv_heads
-                                            ::testing::Values(64)  // head_dim
-                                            ));
+INSTANTIATE_TEST_SUITE_P(
+    MHA,
+    AttentionKernelTest,
+    ::testing::Combine(::testing::Values(1, 2, 4),         // batch_size
+                       ::testing::Values(128, 256, 1024),  // q_len
+                       ::testing::Values(128, 256, 1024),  // kv_len
+                       ::testing::Values(16),              // n_heads
+                       ::testing::Values(16),              // n_kv_heads
+                       ::testing::Values(64),              // head_dim
+                       ::testing::Values(0.0, 50.0)        // logits_soft_cap
+                       ));
 
 }  // namespace llm

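As a quick numeric illustration (not part of the commit) of what the reference's soft-cap transform tanh(s / cap) * cap does for the cap value of 50.0 exercised by the test suite: it is close to the identity for small scores and a smooth clamp toward +/-cap for large ones, while cap = 0.0 skips the transform entirely.

#include <cmath>
#include <cstdio>

int main() {
  const float cap = 50.0f;  // matches the non-zero logits_soft_cap test value
  for (float s : {1.0f, 10.0f, 40.0f, 100.0f, 1000.0f}) {
    const float capped = std::tanh(s / cap) * cap;
    // Near-linear for |s| much smaller than cap, saturating toward +/-cap
    // as |s| grows, which keeps the post-scale logits bounded.
    std::printf("s = %7.1f  ->  capped = %7.3f\n", s, capped);
  }
  return 0;
}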
