Address review comment: tighten assert_close atol to 1e-2 and disable the failing Llama-3.1-8B decode shape

whitneywhtsang · whitneywhtsang · commit 9d64d49512cb · 2025-04-23T22:19:53.000Z
diff --git a/benchmarks/triton_kernels_benchmark/flex_attention_benchmark_causal_mask.py b/benchmarks/triton_kernels_benchmark/flex_attention_benchmark_causal_mask.py
@@ -67,7 +67,10 @@ def causal_mask(_, __, q_idx, kv_idx):
 
         # FlexDecoding configuration. N_CTX_q equals 1. N_CTX_kv >= 1k
         # Decode shapes of Llama-3.1-8B
-        [[z, 32, 8, 1, 1024 + 64, 128, 128, 'fwd'] for z in batch_sizes] +
+        [
+            # Disabled: fails with AssertionError: elements mismatched
+            # [z, 32, 8, 1, 1024 + 64, 128, 128, 'fwd'] for z in batch_sizes
+        ] +
         # Decode shapes of Phi3-mini-3.8B
         [
             # acc = acc.reshape(G, BLOCK_M_PER_HQ, V_HEAD_DIM)
@@ -116,8 +119,7 @@ def benchmark(Z, H_q, H_kv, N_CTX_q, N_CTX_kv, D_HEAD_qk, D_HEAD_v, MODE, provid
             triton_do = torch.randn_like(triton_o)
             triton_fn = lambda: triton_o.backward(triton_do, retain_graph=True)
 
-        atol = 1e-1
-        benchmark_suit.assert_close(triton_fn, torch_fn, atol=atol, rtol=1e-3, err_msg='triton to torch')
+        benchmark_suit.assert_close(triton_fn, torch_fn, atol=1e-2, rtol=1e-3, err_msg='triton to torch')
         _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)
 
     elif provider == 'onednn':