intel
diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml
Lines changed: 22 additions & 37 deletions
diff --git a/benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py b/benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py
Lines changed: 2 additions & 2 deletions
diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
Lines changed: 5 additions & 12 deletions
diff --git a/benchmarks/triton_kernels_benchmark/gemm_tensor_desc_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_tensor_desc_benchmark.py
Lines changed: 0 additions & 3 deletions
diff --git a/benchmarks/triton_kernels_benchmark/gemm_tensor_of_ptr_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_tensor_of_ptr_benchmark.py
Lines changed: 0 additions & 3 deletions
diff --git a/python/triton/_utils.py b/python/triton/_utils.py
Lines changed: 0 additions & 23 deletions
diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py
Lines changed: 0 additions & 19 deletions
diff --git a/python/triton/runtime/jit.py b/python/triton/runtime/jit.py
Lines changed: 1 addition & 19 deletions
@@ -76,6 +76,7 @@ env:
 jobs:
   build:
     name: Build
+    timeout-minutes: 720
     runs-on: ${{ fromJson(inputs.runner_label && format('["linux", "{0}"]', inputs.runner_label) || format('["linux", "{0}", "{1}", "{2}"]', inputs.device, inputs.driver_version, inputs.runner_version)) }}
     defaults:
       run:
 
@@ -140,20 +140,6 @@ jobs:
           python build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
 
       - name: Run Triton GEMM kernel benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
-        run: |
-          cd benchmarks/triton_kernels_benchmark
-          NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
-          source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-report.csv --benchmark gemm-legacy --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-          python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm-legacy --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
-          python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-onednn-report.csv --benchmark gemm-legacy --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
-          if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then
-            # FIXME: enable cuttlass on bmg
-            python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-cutlass-report.csv --benchmark gemm-legacy --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
-          fi
-
-      - name: Run Triton GEMM kernel benchmark - new shapes
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }}
         run: |
           cd benchmarks/triton_kernels_benchmark
@@ -274,8 +260,8 @@ jobs:
           python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
 
           source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-          python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
+          python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark flash-attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark flash-attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
 
       - name: Run Triton FA bwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
@@ -286,8 +272,8 @@ jobs:
           mv $REPORTS/attn-performance.csv $REPORTS/attn-bwd-performance.csv
 
           source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-triton-report.csv --benchmark attn-bwd --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-          python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-xetla-report.csv --benchmark attn-bwd --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
+          python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-triton-report.csv --benchmark flash-attn-bwd --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python build_report.py $REPORTS/attn-bwd-performance.csv $REPORTS/attn-bwd-xetla-report.csv --benchmark flash-attn-bwd --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
 
       - name: Run Triton FA fwd kernel benchmark - with tensor descriptors
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
@@ -297,22 +283,8 @@ jobs:
           mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
 
           source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-triton-report.csv --benchmark attn-tensor-desc --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-          python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-xetla-report.csv --benchmark attn-tensor-desc --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
-
-      - name: Run Prefix Sums kernel benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
-        run: |
-          cd benchmarks/triton_kernels_benchmark
-          python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
-          source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
-
-      - name: Run micro benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
-        run: |
-          cd benchmarks/micro_benchmarks
-          python run_benchmarks.py --reports $REPORTS
+          python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-triton-report.csv --benchmark flash-attn-tensor-desc --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python build_report.py $REPORTS/attn-tensor-desc-performance.csv $REPORTS/attn-tensor-desc-xetla-report.csv --benchmark flash-attn-tensor-desc --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
 
       - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
@@ -321,7 +293,7 @@ jobs:
           python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
 
           source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-triton-report.csv --benchmark flexAttnCausal --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python build_report.py $REPORTS/flexAttnCausal-performance.csv $REPORTS/flexAttnCausal-triton-report.csv --benchmark flex-attn-causal --compiler triton --param_cols "Z,H_q,H_kv,N_CTX_q,N_CTX_kv,D_HEAD_qk,D_HEAD_v" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
 
       - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
@@ -330,9 +302,22 @@ jobs:
           python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
 
           source ../../scripts/capture-hw-details.sh
-          python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-triton-report.csv --benchmark flexAttnMasks --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG --mask
-          python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-onednn-report.csv --benchmark flexAttnMasks --compiler onednn --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG --mask
+          python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-triton-report.csv --benchmark flex-attn-masks --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG --mask
+          python build_report.py $REPORTS/flexAttnMasks-performance.csv $REPORTS/flexAttnMasks-onednn-report.csv --benchmark flex-attn-masks --compiler onednn --param_cols "Z,H,N_CTX,D_HEAD,MASK" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG --mask
 
+      - name: Run Prefix Sums kernel benchmark
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
+        run: |
+          cd benchmarks/triton_kernels_benchmark
+          python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
+          source ../../scripts/capture-hw-details.sh
+          python build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+
+      - name: Run micro benchmark
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
+        run: |
+          cd benchmarks/micro_benchmarks
+          python run_benchmarks.py --reports $REPORTS
 
       - name: Upload benchmark reports
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
 
@@ -160,12 +160,12 @@ def _attn_fwd_with_block_pointers(Q, K, V, sm_scale, M, Out,  #
 configs = [
     triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large', 'one_matrix_per_load_for_bt': True}, num_stages=s, num_warps=w) \
     for BM in [128, 256] \
-    for BN in [32, 64, 128] \
+    for BN in [32, 64] \
     for s in [2, 3, 4] \
     for w in [8, 16, 32] \
     ]
 
-tuner = triton.autotune(configs, key=['N_CTX', 'BLOCK_DMODEL'])
+tuner = triton.autotune(configs, key=['N_CTX', 'BLOCK_DMODEL', 'STAGE'])
 
 
 @triton.jit
 
@@ -232,15 +232,13 @@ def get_shapes(B, M, N, K, transpose_a, transpose_b):
     return a_shape, b_shape
 
 
-NEW_X_VALS = [  #
+X_VALS = [  #
+    [1, 1024 * i, 1024 * i, 1024 * i] for i in [1, 2, 4, 8]
+] + [  #
     [1, m, n, 4096] for m in [1, 8] for n in [1024, 4096, 6144, 14336, 28672, 128256]
 ] + [  #
     [1, m, 4096, 14336] for m in [1, 8]
 ] + [  #
-    [1, 8192, 4096, 4096]  #
-]
-
-X_VALS = [[1, 1024 * i, 1024 * i, 1024 * i] for i in [1, 2, 4, 8]] + [
     [1, 1, 13824, 5120],
     [1, 4, 12288, 4096],
     [1, 512, 8192, 8192],
@@ -261,6 +259,7 @@ def get_shapes(B, M, N, K, transpose_a, transpose_b):
     [32, 4096, 128, 4096],
     [4096, 8, 128, 16384],
     [4096, 8, 16384, 128],
+    [1, 8192, 4096, 4096],
 ]
 
 DEVICE_NAME = torch.xpu.get_device_name()
@@ -281,16 +280,13 @@ def is_enough_memory(x_val):
     return enough_memory
 
 
-if os.getenv('NEW_SHAPES', '1') == '1':
-    X_VALS += NEW_X_VALS
 X_VALS = [x_val for x_val in X_VALS if is_enough_memory(x_val)]
 
 
 def get_benchmark(
     providers_filter: Optional[list[str]] = None,
     transpose_a=False,
     transpose_b=False,
-    new_shapes=False,
     matmul_kernel=matmul_kernel_with_block_pointers,
     matmul_kernel_batched=matmul_kernel_with_block_pointers_batched,
     plot_name='matmul-performance',
@@ -303,10 +299,8 @@ def get_benchmark(
         'triton': 'Triton',
         'onednn': 'OneDNN',
     }
-    # use_xetla and use_cutlass
+    # use_cutlass
     if not (transpose_a or transpose_b):
-        if not new_shapes:
-            supported_providers['xetla'] = 'XeTLA'
         if '580' not in torch.xpu.get_device_name():
             # FIXME: enable cutlass on bmg
             supported_providers['cutlass'] = 'CUTLASS'
@@ -459,6 +453,5 @@ def cutlass_invoker():
     _benchmark = get_benchmark(
         transpose_a=(os.getenv('TRANSPOSE_A', '0') == '1'),
         transpose_b=(os.getenv('TRANSPOSE_B', '0') == '1'),
-        new_shapes=(os.getenv('NEW_SHAPES', '1') == '1'),
     )
     _benchmark.run(show_plots=False, print_data=True)
@@ -117,7 +117,6 @@ def get_benchmark(
     providers_filter: Optional[List[str]] = None,
     transpose_a=False,
     transpose_b=False,
-    new_shapes=True,
 ):
     return gemm_benchmark.get_benchmark(
         providers_filter=providers_filter,
@@ -126,14 +125,12 @@ def get_benchmark(
         plot_name='matmul-tensor-desc-performance',
         transpose_a=transpose_a,
         transpose_b=transpose_b,
-        new_shapes=new_shapes,
     )
 
 
 if __name__ == '__main__':
     _benchmark = get_benchmark(
         transpose_a=(os.getenv('TRANSPOSE_A', '0') == '1'),
         transpose_b=(os.getenv('TRANSPOSE_B', '0') == '1'),
-        new_shapes=(os.getenv('NEW_SHAPES', '1') == '1'),
     )
     _benchmark.run(show_plots=False, print_data=True)
@@ -124,7 +124,6 @@ def get_benchmark(
     providers_filter: Optional[List[str]] = None,
     transpose_a=False,
     transpose_b=False,
-    new_shapes=True,
 ):
     return gemm_benchmark.get_benchmark(
         providers_filter=providers_filter,
@@ -133,14 +132,12 @@ def get_benchmark(
         plot_name='matmul-tensor-of-ptr-performance',
         transpose_a=transpose_a,
         transpose_b=transpose_b,
-        new_shapes=new_shapes,
     )
 
 
 if __name__ == '__main__':
     _benchmark = get_benchmark(
         transpose_a=(os.getenv('TRANSPOSE_A', '0') == '1'),
         transpose_b=(os.getenv('TRANSPOSE_B', '0') == '1'),
-        new_shapes=(os.getenv('NEW_SHAPES', '1') == '1'),
     )
     _benchmark.run(show_plots=False, print_data=True)
@@ -35,26 +35,3 @@ def _impl(path: tuple[int, ...], current: Any):
     _impl((), iterable)
 
     return list(ret.keys())
-
-
-class ClassPropertyDescriptor:
-
-    def __init__(self, fget, fset=None):
-        self.fget = fget
-        self.fset = fset
-
-    def __get__(self, obj, cls):
-        return self.fget(cls)
-
-    def __set__(self, obj, value):
-        if self.fset is None:
-            raise AttributeError("can't set attribute")
-        self.fset(obj.__class__, value)
-
-    def setter(self, fset):
-        self.fset = fset
-        return self
-
-
-def classproperty(func):
-    return ClassPropertyDescriptor(func)
@@ -9,7 +9,6 @@
 from ..runtime.cache import get_cache_manager, get_dump_manager, get_override_manager
 from ..runtime.driver import driver
 from ..tools.disasm import get_sass, get_spvdis
-from .._utils import classproperty
 # TODO: this shouldn't be here
 from .code_generator import ast_to_ttir
 from pathlib import Path
@@ -432,24 +431,6 @@ def __missing__(self, key):
 
 class CompiledKernel:
 
-    # FIXME: remove launch_enter_hook/launch_exit_hook properties
-    # when pytorch has a compatible layer for the new API.
-    @classproperty
-    def launch_enter_hook(cls):
-        return knobs.runtime.launch_enter_hook
-
-    @launch_enter_hook.setter
-    def launch_enter_hook(cls, value):
-        knobs.runtime.launch_enter_hook = value
-
-    @classproperty
-    def launch_exit_hook(cls):
-        return knobs.runtime.launch_exit_hook
-
-    @launch_exit_hook.setter
-    def launch_exit_hook(cls, value):
-        knobs.runtime.launch_exit_hook = value
-
     def __init__(self, src, metadata_group, hash):
         from collections import namedtuple
         metadata_path = next((Path(p) for c, p in metadata_group.items() if c.endswith(".json")))
 
@@ -14,7 +14,7 @@
 from types import ModuleType
 from .. import knobs
 from ..runtime.driver import driver
-from .._utils import find_paths_if, get_iterable_path, classproperty
+from .._utils import find_paths_if, get_iterable_path
 
 TRITON_MODULE = __name__[:-len(".runtime.jit")]
 
@@ -494,24 +494,6 @@ class JitFunctionInfo:
 
 class JITFunction(KernelInterface[T]):
 
-    # FIXME: remove cache_hook/compiled_hook properties
-    # when pytorch has a compatible layer for the new API.
-    @classproperty
-    def cache_hook(cls):
-        return knobs.runtime.jit_cache_hook
-
-    @cache_hook.setter
-    def cache_hook(cls, value):
-        knobs.runtime.jit_cache_hook = value
-
-    @classproperty
-    def compiled_hook(cls):
-        return knobs.runtime.jit_post_compile_hook
-
-    @compiled_hook.setter
-    def compiled_hook(cls, value):
-        knobs.runtime.jit_post_compile_hook = value
-
     def _call_hook(
         self,
         hook,