
Commit 0e690c1

jburnim authored and Google-ML-Automation committed
Use additional semaphores to avoid data races in TPU paged_attention_kernel.
Also prevents an out-of-bounds read of SMEM, and re-enables tests for the TPU paged_attention_kernel.

@apaszke confirmed the presence of data races using the race detector in the new TPU interpret mode. With the additional semaphores, the race detector no longer detects any races in this kernel, and I no longer see any test failures in 20+ test runs on a TPU.

Details on the data races:

- In each iteration, the kernel: (a) starts copying data for `k` and `v` for the next iteration, (b) waits for the copy of `k` for the current iteration to finish, and (c) waits for the copy of `v` for the current iteration to finish.
- It is possible for these copies to complete out of order -- that is: (a) the copies for the next iteration can finish before the copies for the current iteration, and (b) the copies of `v` for the current iteration can finish before the copies of `k` for the current iteration.
- If the same DMA semaphore is used for everything, then out-of-order copies can lead to: (a) `k = async_copy_k.wait_and_get_loaded()` returning before all of its data is available, because the underlying semaphore was signaled by the completion of copies of `v` for the current iteration or copies of `k` or `v` for the next iteration; or (b) `v = async_copy_v.wait_and_get_loaded()` returning before all of its data is available, because the underlying semaphore was signaled by the completion of copies of `k` or `v` for the next iteration.

PiperOrigin-RevId: 762136079
1 parent fc68336 commit 0e690c1
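
For readers unfamiliar with the pattern, the sketch below illustrates the approach the fix takes: give each of `k` and `v` its own array of two DMA semaphores (one per double-buffer slot), so that a wait can only be satisfied by the copy it actually belongs to. This is a minimal, hypothetical sketch, not the kernel's code: the function and argument names (`start_kv_copies`, `k_hbm_ref`, `block`, and so on) are made up, and the real kernel goes through its own async-copy descriptor helper (`create_kv_async_copy_descriptors` in the diff below) rather than calling `pltpu.make_async_copy` directly.

# Hypothetical sketch of per-operand, per-buffer-slot DMA semaphores inside a
# Pallas TPU kernel body. All names here are made up for illustration.
from jax.experimental.pallas import tpu as pltpu


def start_kv_copies(k_hbm_ref, v_hbm_ref, k_vmem_buffer, v_vmem_buffer,
                    k_sems, v_sems, buffer_index, block):
  """Starts async copies of one k/v block into the given double-buffer slot."""
  copy_k = pltpu.make_async_copy(
      k_hbm_ref.at[block],             # source block in HBM
      k_vmem_buffer.at[buffer_index],  # destination VMEM slot (0 or 1)
      k_sems.at[buffer_index],         # semaphore owned by this k copy
  )
  copy_v = pltpu.make_async_copy(
      v_hbm_ref.at[block],
      v_vmem_buffer.at[buffer_index],
      v_sems.at[buffer_index],         # a different semaphore for the v copy
  )
  copy_k.start()
  copy_v.start()
  return copy_k, copy_v

The caller holds on to the returned descriptors and later calls `copy_k.wait()` and `copy_v.wait()`. Because each wait targets its own semaphore, a finished `v` copy, or a prefetch into the other buffer slot, can no longer unblock a wait for `k` whose data has not yet landed, which is exactly the failure mode described in the commit message.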

2 files changed: +15 −10 lines


jax/experimental/pallas/ops/tpu/paged_attention/paged_attention_kernel.py

Lines changed: 15 additions & 8 deletions
@@ -127,7 +127,8 @@ def paged_flash_attention_kernel(
     k_scales_vmem_buffer,
     v_vmem_buffer,
     v_scales_vmem_buffer,
-    sem,
+    k_sems,
+    v_sems,
     *,
     batch_size: int,
     pages_per_compute_block: int,
@@ -176,7 +177,9 @@ def advance_to_next_non_zero_length():
 
     return (
         lax.cond(
-            jnp.logical_and(next_b < batch_size, lengths_ref[next_b] == 0),
+            jnp.logical_and(
+                next_b < batch_size,
+                lengths_ref[lax.clamp(0, next_b, batch_size - 1)] == 0),
             advance_to_next_non_zero_length,
             lambda: next_b,
         ),
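
Why the clamp above matters: `jnp.logical_and` evaluates both of its operands, with no short-circuiting, so `lengths_ref[next_b]` was read from SMEM even when `next_b == batch_size`; clamping the index keeps that read in bounds, while the `next_b < batch_size` term still decides which branch `lax.cond` takes. A standalone sketch of the same guard, with made-up names (`lengths`, `safe_next_is_empty`):

import jax.numpy as jnp
from jax import lax


def safe_next_is_empty(lengths, next_b, batch_size):
  # Clamp the index into [0, batch_size - 1] so the load can never go past the
  # end of `lengths`; the next_b < batch_size term keeps the predicate correct.
  idx = lax.clamp(0, next_b, batch_size - 1)
  return jnp.logical_and(next_b < batch_size, lengths[idx] == 0)

(On plain `jnp` arrays an out-of-range index is clamped automatically, but for a scalar-memory ref inside the kernel it is a genuine out-of-bounds read, which is what the commit message refers to.)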
@@ -200,7 +203,7 @@ def create_kv_async_copy_descriptors(b, h, i, buffer_index):
         k_scales_vmem_buffer.at[buffer_index]
         if k_scales_vmem_buffer is not None
         else None,
-        sem,
+        k_sems.at[buffer_index],
         page_indices_ref,
         page_offset,
         pages_to_load,
@@ -213,7 +216,7 @@ def create_kv_async_copy_descriptors(b, h, i, buffer_index):
         v_scales_vmem_buffer.at[buffer_index]
         if v_scales_vmem_buffer is not None
         else None,
-        sem,
+        v_sems.at[buffer_index],
         page_indices_ref,
         page_offset,
         pages_to_load,
@@ -301,7 +304,8 @@ def paged_flash_attention_kernel_inline_seq_dim(
     k_scales_vmem_buffer,
     v_vmem_buffer,
     v_scales_vmem_buffer,
-    sem,
+    k_sems,
+    v_sems,
     *,
     batch_size: int,
     pages_per_compute_block: int,
@@ -336,7 +340,8 @@ def body(i, _):
         k_scales_vmem_buffer,
         v_vmem_buffer,
         v_scales_vmem_buffer,
-        sem,
+        k_sems,
+        v_sems,
         batch_size=batch_size,
         pages_per_compute_block=pages_per_compute_block,
         pages_per_sequence=pages_per_sequence,
@@ -584,7 +589,8 @@ def paged_attention(
             ),
             v_scales_pages.dtype,  # pytype: disable=attribute-error
         ),  # v_scales_pages buffer
-        pltpu.SemaphoreType.DMA,
+        pltpu.SemaphoreType.DMA((2,)),
+        pltpu.SemaphoreType.DMA((2,)),
     )
   else:
     in_specs = [
@@ -615,7 +621,8 @@ def paged_attention(
             v_pages.dtype,
         ),  # v_pages buffer
         None,
-        pltpu.SemaphoreType.DMA,
+        pltpu.SemaphoreType.DMA((2,)),
+        pltpu.SemaphoreType.DMA((2,)),
     )
 
   out, _, _ = pl.pallas_call(
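
How the `scratch_shapes` entries above reach the kernel: each scratch entry is passed to the kernel as an additional ref argument, so the two `pltpu.SemaphoreType.DMA((2,))` entries arrive as the new `k_sems` and `v_sems` parameters and are sliced per double-buffer slot with `.at[buffer_index]`. The snippet below is a sketch only; the VMEM shapes and dtype are placeholders, not the kernel's actual buffers.

import jax.numpy as jnp
from jax.experimental.pallas import tpu as pltpu

# Illustrative scratch tuple (shapes/dtype are placeholders): the two DMA
# entries become arrays of two semaphores each, one per double-buffer slot.
scratch_shapes = (
    pltpu.VMEM((2, 8, 128, 128), jnp.float32),  # double-buffered k scratch
    pltpu.VMEM((2, 8, 128, 128), jnp.float32),  # double-buffered v scratch
    pltpu.SemaphoreType.DMA((2,)),              # k_sems: one per buffer slot
    pltpu.SemaphoreType.DMA((2,)),              # v_sems: one per buffer slot
)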

tests/pallas/tpu_paged_attention_kernel_test.py

Lines changed: 0 additions & 2 deletions
@@ -265,8 +265,6 @@ def test_paged_attention(
       attn_logits_soft_cap,
       are_kv_quantized,
   ):
-    # TODO(mvoz, skyewm): Re-enable this test once the data race is fixed.
-    self.skipTest("This kernel has data races that need to be fixed.")
     if not jtu.is_device_tpu_at_least(4):
       self.skipTest("Only supports TPU generation 4 or above")
     if jtu.is_device_tpu(version=4) and are_kv_quantized:
