We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 8c719e6, commit fece864 (Copy full SHA for fece864)
csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
@@ -894,7 +894,7 @@ __device__ inline int64_t findTotalEltsLessThanTarget_v2(T const* sorted_indices
894
const int lane_id = threadIdx.x & (WARP_SZ - 1);
895
896
int local_count = 0;
897
-#pragma unroll
+#pragma unroll(4)
898
for (int k = 0; k < arr_length / WARP_SZ; ++k) {
899
const int idx = lane_id + k * WARP_SZ;
900
T v = sorted_indices[idx];
0 commit comments