
Commit af7c4e2

Merge OpenAI Triton commit 9451f8f (#4013)
This PR changes the Triton base from 981e987 to 9451f8f (Apr 23). Pass rate: 88.73%.
2 parents 3ddb3e3 + 98a8bfe commit af7c4e2


63 files changed: +1528 / -1153 lines

.github/workflows/integration-tests.yml (7 additions & 7 deletions)

@@ -114,7 +114,7 @@ jobs:
   if: env.enable_integration == 'true'
   run: |
     if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
-      echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"], ["gb200-runner-set"]]'
+      echo '::set-output name=matrix-CUDA::[["nvidia-a100"], ["nvidia-h100"], ["nvidia-gb200"]]'
       echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["amd-gfx942"]]'
       echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
     else
@@ -232,7 +232,7 @@ jobs:
   env:
     CUDA_HOME: "/usr/local/cuda"
   run: |
-    if [ "${{ matrix.runner[0] }}" == "gb200-runner-set" ]; then
+    if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
       source /venv/bin/activate
     fi
     echo "PATH is '$PATH'"
@@ -244,23 +244,23 @@ jobs:
   run: make test-lit
 - name: Run python tests on CUDA
   run: |
-    if [ "${{ matrix.runner[0] }}" == "gb200-runner-set" ]; then
+    if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
       source /venv/bin/activate
     fi
     make test-unit
 - name: Run interpreter tests
-  if: ${{ matrix.runner[0] == 'h100-runner-set' }}
+  if: ${{ matrix.runner[0] == 'nvidia-h100' }}
   run: make test-interpret
 - name: Run regression tests
   run: |
-    if [ "${{ matrix.runner[0] }}" == "gb200-runner-set" ]; then
+    if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
       source /venv/bin/activate
     fi
     make test-regression
 - name: Run C++ unittests
   run: make test-cpp
 - name: Run Proton tests
-  if: ${{ matrix.runner[0] != 'gb200-runner-set' }}
+  if: ${{ matrix.runner[0] != 'nvidia-gb200' }}
   run: make test-proton
 - name: Inspect cache directories
   run: |
@@ -409,7 +409,7 @@ jobs:
     cd python/test/regression
     python3 -m pytest -s -n 8 ./test_cast_matmul.py
 - name: Run Proton tests
-  if: ${{ matrix.runner[0] != 'gb200-runner-set' }}
+  if: ${{ matrix.runner[0] != 'nvidia-gb200' }}
   run: make test-proton
 - name: Run C++ unittests
   run: make test-cpp

.github/workflows/integration-tests.yml.in (6 additions & 6 deletions)

@@ -123,7 +123,7 @@ jobs:
   if: env.enable_integration == 'true'
   run: |
     if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
-      echo '::set-output name=matrix-CUDA::[["a100-runner-set"], ["h100-runner-set"], ["gb200-runner-set"]]'
+      echo '::set-output name=matrix-CUDA::[["nvidia-a100"], ["nvidia-h100"], ["nvidia-gb200"]]'
       echo '::set-output name=matrix-HIP::[["self-hosted", "gfx90a"], ["amd-gfx942"]]'
       echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
     else
@@ -264,7 +264,7 @@ jobs:
   env:
     CUDA_HOME: "/usr/local/cuda"
   run: |
-    if [ "${{ matrix.runner[0] }}" == "gb200-runner-set" ]; then
+    if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
       source /venv/bin/activate
     fi
     echo "PATH is '$PATH'"
@@ -281,18 +281,18 @@ jobs:

 - name: Run python tests on CUDA
   run: |
-    if [ "${{ matrix.runner[0] }}" == "gb200-runner-set" ]; then
+    if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
       source /venv/bin/activate
     fi
     make test-unit

 - name: Run interpreter tests
-  if: ${{ matrix.runner[0] == 'h100-runner-set' }}
+  if: ${{ matrix.runner[0] == 'nvidia-h100' }}
   run: make test-interpret

 - name: Run regression tests
   run: |
-    if [ "${{ matrix.runner[0] }}" == "gb200-runner-set" ]; then
+    if [ "${{ matrix.runner[0] }}" == "nvidia-gb200" ]; then
       source /venv/bin/activate
     fi
     make test-regression
@@ -303,7 +303,7 @@ jobs:

 - &run-proton-tests-step
   name: Run Proton tests
-  if: ${{ matrix.runner[0] != 'gb200-runner-set' }}
+  if: ${{ matrix.runner[0] != 'nvidia-gb200' }}
   run: make test-proton

 - *inspect-cache-directories-step

bench/bench/bench_mlp.py (4 additions & 4 deletions)

@@ -3,11 +3,11 @@
 import triton.profiler as proton
 import torch
 import triton_bench.swiglu
-from triton_bench.mxfp import downcast_to_mxfp
+from triton_bench.numerics_details.mxfp import downcast_to_mxfp
 from triton_bench.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx
 from triton_bench.numerics import InFlexData
 from triton_bench.routing import routing
-from triton_bench.meta import cuda_capability_geq, is_hip, get_cdna_version
+from triton_bench.target_info import is_hip, get_cdna_version

 if torch.cuda.is_available() and not is_hip():
     from triton._C.libtriton import nvidia
@@ -152,5 +152,5 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype,
     qxdtype = "fp8" if has_native_mx4 else "bf16"
     print(bench_mlp(8192, 8192, 8192, 1, 1, "fp8", "fp8", TP=1, EP=1, name="dense"))
     print(bench_mlp(8192, 8192, 8192, 1, 1, qxdtype, "mx4", TP=1, EP=1, name="dense"))
-    print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=2, name="llama4"))
-    print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=2, name="llama4"))
+    print(bench_mlp(2048, 5120, 8192, 128, 4, "fp8", "fp8", TP=4, EP=1, name="llama4"))
+    print(bench_mlp(2048, 5120, 8192, 128, 4, qxdtype, "mx4", TP=4, EP=1, name="llama4"))
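
The same module moves appear again in bench/tests/test_matmul.py below. For orientation, a minimal sketch of the new triton_bench import layout, using only paths shown in this commit's diffs:

# Sketch of the relocated triton_bench modules (paths taken from the diffs in this commit):
# mxfp helpers now live under numerics_details, and the target helpers formerly in
# triton_bench.meta are now imported from triton_bench.target_info.
from triton_bench.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
from triton_bench.target_info import is_hip, get_cdna_version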

bench/tests/test_compact.py renamed to bench/tests/test_compaction.py (4 additions & 4 deletions)

@@ -1,6 +1,6 @@
 import pytest
 import torch
-from triton_bench.compact import masked_compact, masked_compact_torch
+from triton_bench.compaction import compaction, compaction_torch


 @pytest.mark.parametrize("n_tokens, n_cols, k, p", [
@@ -9,7 +9,7 @@
     (131, 128, 16, 0.6),
     (496, 128, 16, 0.),
 ])
-def test_masked_compact(n_tokens, n_cols, k, p):
+def test_compaction(n_tokens, n_cols, k, p):
     device = "cuda"
     yi = torch.rand((n_tokens, n_cols), device=device).argsort(dim=-1)
     yi = yi[:, :k].to(torch.int32)
@@ -23,7 +23,7 @@ def test_masked_compact(n_tokens, n_cols, k, p):
     chunks = mask.view(*mask.shape[:-1], -1, 32)
     weights = (1 << torch.arange(32, dtype=torch.int32, device=device))
     bitmask = (chunks.int() * weights).sum(dim=-1)
-    yv_ref, yi_ref = masked_compact_torch(yv, yi, bitmask)
-    yv_tri, yi_tri = masked_compact(yv, yi, bitmask)
+    yv_ref, yi_ref = compaction_torch(yv, yi, bitmask)
+    yv_tri, yi_tri = compaction(yv, yi, bitmask)
     assert torch.all(yi_ref == yi_tri)
     assert torch.all(yv_ref == yv_tri)
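
The test above packs a boolean keep-mask into 32-bit words before calling compaction. A standalone sketch of that packing step (CPU tensors and shapes chosen here for illustration, not taken from the diff):

import torch

def pack_bitmask(mask: torch.Tensor) -> torch.Tensor:
    # Pack a boolean mask of shape (n_rows, n_cols) into one integer word per
    # 32 columns, mirroring the packing done in test_compaction.
    assert mask.shape[-1] % 32 == 0
    chunks = mask.view(*mask.shape[:-1], -1, 32)
    weights = 1 << torch.arange(32, dtype=torch.int32, device=mask.device)
    return (chunks.int() * weights).sum(dim=-1)

mask = torch.rand(4, 128) < 0.5      # keep roughly half of the columns
bitmask = pack_bitmask(mask)         # one word per 32 columns
row, col = 0, 5                      # spot-check one bit against the mask
assert bool((bitmask[row, col // 32] >> (col % 32)) & 1) == bool(mask[row, col])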

bench/tests/test_matmul.py (2 additions & 2 deletions)

@@ -11,11 +11,11 @@
 from triton_bench.matmul_ogs import matmul_ogs, matmul_ogs_torch
 # numerics utilities
 from triton_bench.numerics import InFlexData, OutFlexData
-from triton_bench.mxfp import downcast_to_mxfp, upcast_from_mxfp
+from triton_bench.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp
 # testing utilities
 from triton_bench.testing import assert_close, compute_actual_scale
 # target-specific utilities
-from triton_bench.meta import is_hip
+from triton_bench.target_info import is_hip

 # ---------------
 # initialize data

bench/tests/test_swiglu.py (14 additions & 6 deletions)

@@ -1,8 +1,12 @@
+from triton_bench.routing import routing_torch
 from triton_bench.swiglu import swiglu, swiglu_torch, PrecisionConfig
 from triton_bench.testing import assert_close
 import torch
 import pytest

+from .test_routing import init_data as init_routing_data
+from .test_routing import ref_expt_data
+
 # ---------------
 # initialize data
 # ---------------
@@ -15,10 +19,6 @@ def alloc_rand(shape, device, dtype, requires_grad=True):
     return torch.randn(shape, device=device, dtype=dtype, requires_grad=requires_grad)


-def alloc_rand_like(x):
-    return alloc_rand(x.shape, x.device, x.dtype, x.requires_grad)
-
-
 # ---------------
 # unit tests
 # ---------------
@@ -30,9 +30,17 @@ def test_op(M, N, limit, alpha=0.5):
     torch.manual_seed(2)
     dev = "cuda"
     dtype = torch.bfloat16
+    # initialize expert data
+    n_expts_tot = 6
+    n_expts_act = 2
+    logits = init_routing_data(M, n_expts_tot).detach()
+    routing_data, _, _ = routing_torch(logits, n_expts_act)
+    expt_data = ref_expt_data(routing_data, M * n_expts_act, block_m=128)
+    n_tokens = expt_data[2 * n_expts_tot].sum()
+
     # initialize data
-    x = alloc_rand([M, N], device=dev, dtype=torch.bfloat16)
+    x = alloc_rand([n_tokens, N], device=dev, dtype=dtype)
     precision_config = PrecisionConfig(limit=limit)
-    tri_y = swiglu(x, alpha, precision_config)
+    tri_y = swiglu(x, alpha, precision_config, expt_data, n_expts_tot)
     ref_y = swiglu_torch(x, alpha, precision_config)
     assert_close(tri_y, ref_y)

bench/triton_bench/compact.py renamed to bench/triton_bench/compaction.py (4 additions & 22 deletions)

@@ -1,27 +1,9 @@
 import torch
-import triton
-import triton.language as tl
+from .compaction_details._masked_compaction import _masked_compaction
 from triton_bench import Bitmatrix


-@triton.jit
-def _masked_compact(Yv, Yi, BitMask, stride_bm, RetYv, RetYi, sentinel, K: tl.constexpr):
-    pid_m = tl.program_id(0)
-    yv = tl.load(Yv + pid_m * K + tl.arange(0, K))
-    yi = tl.load(Yi + pid_m * K + tl.arange(0, K))
-    div = yi // 32
-    rem = yi % 32
-    active_bits = (tl.load(BitMask + pid_m * stride_bm + div) >> rem) & 1
-    exc_cumsum = tl.cumsum(active_bits, 0) - active_bits
-    rev_arange = tl.where(active_bits, 0, K - 1 - tl.arange(0, K))
-    write_indx = exc_cumsum + rev_arange
-    yv = tl.where(active_bits, yv, sentinel)
-    yi = tl.where(active_bits, yi, sentinel)
-    tl.store(RetYv + pid_m * K + write_indx, yv)
-    tl.store(RetYi + pid_m * K + write_indx, yi)
-
-
-def masked_compact(yv, yi, bitmask, sentinel=-1):
+def compaction(yv, yi, bitmask, sentinel=-1):
     """
     Return compacted copies of *yv* and *yi* based on a per-row bitmask.

@@ -53,7 +35,7 @@ def masked_compact(yv, yi, bitmask, sentinel=-1):
     if isinstance(bitmask, Bitmatrix):
         bitmask = bitmask.data

-    _masked_compact[(n_rows, )](
+    _masked_compaction[(n_rows, )](
         yv, yi, bitmask, bitmask.stride(0),  # inputs
         ret_yv, ret_yi,  # outputs
         sentinel,  # sentinel
@@ -62,7 +44,7 @@ def masked_compact(yv, yi, bitmask, sentinel=-1):
     return ret_yv, ret_yi


-def masked_compact_torch(yv: torch.Tensor, yi: torch.Tensor, bitmask: torch.Tensor, sentinel=-1):
+def compaction_torch(yv: torch.Tensor, yi: torch.Tensor, bitmask: torch.Tensor, sentinel=-1):
     """
     reference implementation of `masked_compact`
     """
bench/triton_bench/compaction_details/_masked_compaction.py (new file, 19 additions & 0 deletions; path as imported by compaction.py above)

@@ -0,0 +1,19 @@
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _masked_compaction(Yv, Yi, BitMask, stride_bm, RetYv, RetYi, sentinel, K: tl.constexpr):
+    pid_m = tl.program_id(0)
+    yv = tl.load(Yv + pid_m * K + tl.arange(0, K))
+    yi = tl.load(Yi + pid_m * K + tl.arange(0, K))
+    div = yi // 32
+    rem = yi % 32
+    active_bits = (tl.load(BitMask + pid_m * stride_bm + div) >> rem) & 1
+    exc_cumsum = tl.cumsum(active_bits, 0) - active_bits
+    rev_arange = tl.where(active_bits, 0, K - 1 - tl.arange(0, K))
+    write_indx = exc_cumsum + rev_arange
+    yv = tl.where(active_bits, yv, sentinel)
+    yi = tl.where(active_bits, yi, sentinel)
+    tl.store(RetYv + pid_m * K + write_indx, yv)
+    tl.store(RetYi + pid_m * K + write_indx, yi)
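
The kernel derives each element's destination slot from an exclusive cumulative sum over the active bits (for kept entries) plus a reversed arange (for dropped ones). A small plain-Python walk-through of that index math for a single row, using an illustrative K = 8 and an activity pattern not taken from the diff:

# One row, K = 8; active_bits[j] = 1 means index j's bit is set in the bitmask.
K = 8
active_bits = [1, 0, 1, 1, 0, 0, 1, 0]

exc_cumsum = []          # exclusive prefix sum of active_bits
running = 0
for a in active_bits:
    exc_cumsum.append(running)
    running += a

write_indx = [
    exc_cumsum[j] if a else exc_cumsum[j] + (K - 1 - j)
    for j, a in enumerate(active_bits)
]
print(write_indx)        # [0, 7, 1, 2, 6, 5, 3, 4]
# Kept entries (j = 0, 2, 3, 6) land in slots 0..3 in their original order;
# dropped entries are replaced by the sentinel and fill the trailing slots.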
