Skip to content

Commit 28afee6

Browse files
committed
Feat: Clip dot products
Ampere FNA only for now. Adds optional clipping of dot products to a user-specified floating-point range (min, max). - Issue: #249
1 parent aa73afc commit 28afee6

File tree

25 files changed

+1529
-337
lines changed

25 files changed

+1529
-337
lines changed

csrc/autogen/include/natten_autogen/cuda/reference/kernels.h

Lines changed: 252 additions & 84 deletions
Large diffs are not rendered by default.

csrc/autogen/src/cuda/reference/source_0.cu

Lines changed: 126 additions & 42 deletions
Large diffs are not rendered by default.

csrc/autogen/src/cuda/reference/source_1.cu

Lines changed: 126 additions & 42 deletions
Large diffs are not rendered by default.

csrc/autogen/src/cuda/reference/source_2.cu

Lines changed: 126 additions & 42 deletions
Large diffs are not rendered by default.

csrc/autogen/src/cuda/reference/source_3.cu

Lines changed: 126 additions & 42 deletions
Large diffs are not rendered by default.

csrc/include/natten/cuda/fna/fna_backward.cuh

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,11 @@ void fna_backward_generic(
8989
float attn_scale,
9090
IntTuple query_tile_shape,
9191
IntTuple key_tile_shape,
92-
IntTuple num_splits_key) {
92+
IntTuple num_splits_key,
93+
bool has_dot_product_min,
94+
bool has_dot_product_max,
95+
float dot_product_min,
96+
float dot_product_max) {
9397
static constexpr auto kRank =
9498
std::tuple_size<decltype(spatial_extent)>::value;
9599
using Dim = typename GetDim<kRank>::type;
@@ -157,6 +161,18 @@ void fna_backward_generic(
157161

158162
p.num_splits_key = tuple_to_na_dim<Dim>(num_splits_key);
159163

164+
// Optional dot product clipping
165+
p.has_dot_product_clip = has_dot_product_min || has_dot_product_max;
166+
p.has_dot_product_min = has_dot_product_min;
167+
p.has_dot_product_max = has_dot_product_max;
168+
if (has_dot_product_min) {
169+
p.dot_product_min = dot_product_min;
170+
}
171+
if (has_dot_product_max) {
172+
p.dot_product_max = dot_product_max;
173+
}
174+
//
175+
160176
int64_t size_bytes = p.workspace_size();
161177
if (size_bytes) {
162178
void* workspace_ptr = nullptr;

csrc/include/natten/cuda/fna/fna_forward.cuh

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,11 @@ void fna_forward_generic(
7979
float attn_scale,
8080
void* logsumexp_ptr,
8181
IntTuple query_tile_shape,
82-
IntTuple key_tile_shape) {
82+
IntTuple key_tile_shape,
83+
bool has_dot_product_min,
84+
bool has_dot_product_max,
85+
float dot_product_min,
86+
float dot_product_max) {
8387
static constexpr auto kRank =
8488
std::tuple_size<decltype(spatial_extent)>::value;
8589
using Dim = typename GetDim<kRank>::type;
@@ -167,6 +171,18 @@ void fna_forward_generic(
167171
p.query_tile_shape = tuple_to_na_dim<Dim>(query_tile_shape);
168172
p.key_tile_shape = tuple_to_na_dim<Dim>(key_tile_shape);
169173

174+
// Optional dot product clipping
175+
p.has_dot_product_clip = has_dot_product_min || has_dot_product_max;
176+
p.has_dot_product_min = has_dot_product_min;
177+
p.has_dot_product_max = has_dot_product_max;
178+
if (has_dot_product_min) {
179+
p.dot_product_min = dot_product_min;
180+
}
181+
if (has_dot_product_max) {
182+
p.dot_product_max = dot_product_max;
183+
}
184+
//
185+
170186
if (smem_bytes > 0xc000) {
171187
auto err = cudaFuncSetAttribute(
172188
kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);

csrc/include/natten/cuda/fna/kernel_backward.h

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,17 @@ struct FusedNeighborhoodAttentionBackwardKernel {
692692
bool is_fully_block_sparse = false;
693693
bool has_q_padding = false;
694694

695+
// Optional dot product clipping -- all must be set explicitly for avoiding
696+
// comparisons.
697+
bool has_dot_product_clip = false;
698+
bool has_dot_product_min = false;
699+
bool has_dot_product_max = false;
700+
accum_t dot_product_min =
701+
-cutlass::platform::numeric_limits<accum_t>::infinity();
702+
accum_t dot_product_max =
703+
cutlass::platform::numeric_limits<accum_t>::infinity();
704+
//
705+
695706
// Dimensions/strides
696707
int32_t head_dim = -1;
697708
int32_t head_dim_value = -1;
@@ -1615,7 +1626,6 @@ struct FusedNeighborhoodAttentionBackwardKernel {
16151626
mma.set_prologue_done(kPrologueQK);
16161627
mma.set_zero_outside_bounds(/*!skipBoundsChecks*/ true);
16171628
mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
1618-
accum = cutlass::multiplies<typename Mma::FragmentC>()(scale, accum);
16191629

16201630
// Epilogue: add LSE + exp and store that to our shared memory buffer
16211631
// shmem <- (matmul_result -
@@ -1629,6 +1639,30 @@ struct FusedNeighborhoodAttentionBackwardKernel {
16291639
auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset(
16301640
lane_id, warp_id, output_tile_coords);
16311641

1642+
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
1643+
// SCALING.
1644+
if (p.has_dot_product_clip) {
1645+
if (not p.has_dot_product_max) {
1646+
for (int i = 0; i < Mma::FragmentC::kElements; ++i) {
1647+
accum[i] = cutlass::fast_max(accum[i], p.dot_product_min);
1648+
}
1649+
} else if (not p.has_dot_product_min) {
1650+
for (int i = 0; i < Mma::FragmentC::kElements; ++i) {
1651+
accum[i] = cutlass::fast_min(accum[i], p.dot_product_max);
1652+
}
1653+
} else {
1654+
// assert(p.has_dot_product_min && p.has_dot_product_max);
1655+
for (int i = 0; i < Mma::FragmentC::kElements; ++i) {
1656+
accum[i] = cutlass::fast_max(
1657+
cutlass::fast_min(accum[i], p.dot_product_max),
1658+
p.dot_product_min);
1659+
}
1660+
}
1661+
}
1662+
1663+
// Dot product scale
1664+
accum = cutlass::multiplies<typename Mma::FragmentC>()(scale, accum);
1665+
16321666
if (not p.is_fully_block_sparse) {
16331667
// Neighborhood Attention masking
16341668
Dim first_col, query_bound, row_idx;

csrc/include/natten/cuda/fna/kernel_forward.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,17 @@ struct FusedNeighborhoodAttentionKernel {
200200
bool is_fully_block_sparse = false;
201201
bool has_kv_padding = false;
202202

203+
// Optional dot product clipping -- all must be set explicitly for avoiding
204+
// comparisons.
205+
bool has_dot_product_clip = false;
206+
bool has_dot_product_min = false;
207+
bool has_dot_product_max = false;
208+
accum_t dot_product_min =
209+
-cutlass::platform::numeric_limits<accum_t>::infinity();
210+
accum_t dot_product_max =
211+
cutlass::platform::numeric_limits<accum_t>::infinity();
212+
//
213+
203214
// Moves pointers to what we should process
204215
// Returns "false" if there is no work to do
205216
CUTLASS_DEVICE bool advance_to_block() {
@@ -734,6 +745,27 @@ struct FusedNeighborhoodAttentionKernel {
734745
MM1::Mma::drain_cp_asyncs();
735746
}
736747

748+
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
749+
// SCALING.
750+
if (p.has_dot_product_clip) {
751+
if (not p.has_dot_product_max) {
752+
for (int i = 0; i < MM0::Mma::FragmentC::kElements; ++i) {
753+
accum[i] = cutlass::fast_max(accum[i], p.dot_product_min);
754+
}
755+
} else if (not p.has_dot_product_min) {
756+
for (int i = 0; i < MM0::Mma::FragmentC::kElements; ++i) {
757+
accum[i] = cutlass::fast_min(accum[i], p.dot_product_max);
758+
}
759+
} else {
760+
// assert(p.has_dot_product_min && p.has_dot_product_max);
761+
for (int i = 0; i < MM0::Mma::FragmentC::kElements; ++i) {
762+
accum[i] = cutlass::fast_max(
763+
cutlass::fast_min(accum[i], p.dot_product_max),
764+
p.dot_product_min);
765+
}
766+
}
767+
}
768+
737769
if (not p.is_fully_block_sparse) {
738770
// Neighborhood Attention masking
739771
Dim first_col, key_bound, row_idx;

csrc/include/natten/cuda/reference/fna_reference_backward.hpp

Lines changed: 107 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,11 @@ void __global__ fna_bwd_reference_dQ_kernel(
7474
Causal is_causal,
7575
QKVLayout qkv_layout,
7676
float attn_scale,
77-
int num_additional_kv) {
77+
int num_additional_kv,
78+
bool has_dot_product_min,
79+
bool has_dot_product_max,
80+
float dot_product_min,
81+
float dot_product_max) {
7882
using namespace cute;
7983

8084
auto attention_mask =
@@ -111,6 +115,20 @@ void __global__ fna_bwd_reference_dQ_kernel(
111115
acc_dov += mDO(idx_Q, idx_D1, idx_L) * mV(idx_K, idx_D1, idx_L);
112116
acc_doo += mDO(idx_Q, idx_D1, idx_L) * mO(idx_Q, idx_D1, idx_L);
113117
} // for idx_D1
118+
119+
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
120+
// SCALING.
121+
if (has_dot_product_min || has_dot_product_max) {
122+
if (not has_dot_product_max) {
123+
acc_qk = cutlass::fast_max(acc_qk, dot_product_min);
124+
} else if (not has_dot_product_min) {
125+
acc_qk = cutlass::fast_min(acc_qk, dot_product_max);
126+
} else {
127+
acc_qk = cutlass::fast_max(
128+
cutlass::fast_min(acc_qk, dot_product_max), dot_product_min);
129+
}
130+
}
131+
114132
acc_qk *= attn_scale;
115133
acc_dov *= attn_scale;
116134
acc_doo *= attn_scale;
@@ -186,7 +204,11 @@ void __global__ fna_bwd_reference_dK_kernel(
186204
Causal is_causal,
187205
QKVLayout qkv_layout,
188206
float attn_scale,
189-
int num_additional_kv) {
207+
int num_additional_kv,
208+
bool has_dot_product_min,
209+
bool has_dot_product_max,
210+
float dot_product_min,
211+
float dot_product_max) {
190212
using namespace cute;
191213

192214
auto attention_mask =
@@ -223,6 +245,20 @@ void __global__ fna_bwd_reference_dK_kernel(
223245
acc_dov += mDO(idx_Q, idx_D1, idx_L) * mV(idx_K, idx_D1, idx_L);
224246
acc_doo += mDO(idx_Q, idx_D1, idx_L) * mO(idx_Q, idx_D1, idx_L);
225247
} // for idx_D1
248+
249+
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
250+
// SCALING.
251+
if (has_dot_product_min || has_dot_product_max) {
252+
if (not has_dot_product_max) {
253+
acc_qk = cutlass::fast_max(acc_qk, dot_product_min);
254+
} else if (not has_dot_product_min) {
255+
acc_qk = cutlass::fast_min(acc_qk, dot_product_max);
256+
} else {
257+
acc_qk = cutlass::fast_max(
258+
cutlass::fast_min(acc_qk, dot_product_max), dot_product_min);
259+
}
260+
}
261+
226262
acc_qk *= attn_scale;
227263
acc_dov *= attn_scale;
228264
acc_doo *= attn_scale;
@@ -299,7 +335,11 @@ void __global__ fna_bwd_reference_dV_kernel(
299335
Causal is_causal,
300336
QKVLayout qkv_layout,
301337
float attn_scale,
302-
int num_additional_kv) {
338+
int num_additional_kv,
339+
bool has_dot_product_min,
340+
bool has_dot_product_max,
341+
float dot_product_min,
342+
float dot_product_max) {
303343
using namespace cute;
304344

305345
auto attention_mask =
@@ -333,6 +373,20 @@ void __global__ fna_bwd_reference_dV_kernel(
333373
ElementAccumulator rK = mK(idx_K, idx_D0, idx_L);
334374
acc_qk += rQ * rK;
335375
} // for idx_D0
376+
377+
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
378+
// SCALING.
379+
if (has_dot_product_min || has_dot_product_max) {
380+
if (not has_dot_product_max) {
381+
acc_qk = cutlass::fast_max(acc_qk, dot_product_min);
382+
} else if (not has_dot_product_min) {
383+
acc_qk = cutlass::fast_min(acc_qk, dot_product_max);
384+
} else {
385+
acc_qk = cutlass::fast_max(
386+
cutlass::fast_min(acc_qk, dot_product_max), dot_product_min);
387+
}
388+
}
389+
336390
acc_qk *= attn_scale;
337391

338392
auto id = make_identity_tensor(make_shape(1, 1));
@@ -408,7 +462,11 @@ void fna_bwd_reference_dQ(
408462
QKVLayout qkv_layout,
409463
float attn_scale,
410464
int num_additional_kv,
411-
cudaStream_t stream) {
465+
cudaStream_t stream,
466+
bool has_dot_product_min,
467+
bool has_dot_product_max,
468+
float dot_product_min,
469+
float dot_product_max) {
412470
using namespace cute;
413471

414472
// Only so that we don't oversubscribe shmem when seqlen is large.
@@ -447,7 +505,11 @@ void fna_bwd_reference_dQ(
447505
Causal{},
448506
qkv_layout,
449507
attn_scale,
450-
num_additional_kv);
508+
num_additional_kv,
509+
has_dot_product_min,
510+
has_dot_product_max,
511+
dot_product_min,
512+
dot_product_max);
451513
}
452514

453515
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -480,7 +542,11 @@ void fna_bwd_reference_dK(
480542
QKVLayout qkv_layout,
481543
float attn_scale,
482544
int num_additional_kv,
483-
cudaStream_t stream) {
545+
cudaStream_t stream,
546+
bool has_dot_product_min,
547+
bool has_dot_product_max,
548+
float dot_product_min,
549+
float dot_product_max) {
484550
using namespace cute;
485551

486552
// Only so that we don't oversubscribe shmem when seqlen is large.
@@ -519,7 +585,11 @@ void fna_bwd_reference_dK(
519585
Causal{},
520586
qkv_layout,
521587
attn_scale,
522-
num_additional_kv);
588+
num_additional_kv,
589+
has_dot_product_min,
590+
has_dot_product_max,
591+
dot_product_min,
592+
dot_product_max);
523593
}
524594

525595
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -552,7 +622,11 @@ void fna_bwd_reference_dV(
552622
QKVLayout qkv_layout,
553623
float attn_scale,
554624
int num_additional_kv,
555-
cudaStream_t stream) {
625+
cudaStream_t stream,
626+
bool has_dot_product_min,
627+
bool has_dot_product_max,
628+
float dot_product_min,
629+
float dot_product_max) {
556630
using namespace cute;
557631

558632
// Only so that we don't oversubscribe shmem when seqlen is large.
@@ -591,7 +665,11 @@ void fna_bwd_reference_dV(
591665
Causal{},
592666
qkv_layout,
593667
attn_scale,
594-
num_additional_kv);
668+
num_additional_kv,
669+
has_dot_product_min,
670+
has_dot_product_max,
671+
dot_product_min,
672+
dot_product_max);
595673
}
596674

597675
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -619,7 +697,11 @@ void fna_reference_backward(
619697
NADim dilation,
620698
Causal is_causal,
621699
float attn_scale,
622-
cudaStream_t stream) {
700+
cudaStream_t stream,
701+
bool has_dot_product_min,
702+
bool has_dot_product_max,
703+
float dot_product_min,
704+
float dot_product_max) {
623705
using namespace cute;
624706

625707
// No GQA/MQA for now
@@ -694,7 +776,11 @@ void fna_reference_backward(
694776
qkv_layout,
695777
attn_scale,
696778
num_additional_kv,
697-
stream);
779+
stream,
780+
has_dot_product_min,
781+
has_dot_product_max,
782+
dot_product_min,
783+
dot_product_max);
698784
fna_bwd_reference_dK(
699785
problem_shape,
700786
mQ,
@@ -711,7 +797,11 @@ void fna_reference_backward(
711797
qkv_layout,
712798
attn_scale,
713799
num_additional_kv,
714-
stream);
800+
stream,
801+
has_dot_product_min,
802+
has_dot_product_max,
803+
dot_product_min,
804+
dot_product_max);
715805
fna_bwd_reference_dV(
716806
problem_shape,
717807
mQ,
@@ -728,7 +818,11 @@ void fna_reference_backward(
728818
qkv_layout,
729819
attn_scale,
730820
num_additional_kv,
731-
stream);
821+
stream,
822+
has_dot_product_min,
823+
has_dot_product_max,
824+
dot_product_min,
825+
dot_product_max);
732826
}
733827

734828
} // namespace reference

0 commit comments

Comments (0)