From 81f79d29d220ac039544b732e17479c0e4b8d5f7 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 19 May 2025 20:50:18 +0200 Subject: [PATCH 01/17] Try intel-pti 0.12.2 for benchmarks Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index c81d2167ac..e2939f24d8 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -121,11 +121,15 @@ jobs: id: install run: | cd benchmarks + pip install intel-pti==0.12.2 pip install . - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + ls $PTI_LIBS_DIR + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -135,6 +139,8 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -146,6 +152,8 @@ jobs: - name: Run Triton GEMM kernel benchmark - new shapes if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -157,6 +165,8 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -167,6 +177,8 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -177,6 +189,8 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -188,6 +202,8 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -199,6 +215,8 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -208,6 +226,8 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -217,6 +237,8 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -225,6 +247,8 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -233,6 +257,8 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -242,6 +268,8 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -251,6 +279,8 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -261,6 +291,8 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -273,6 +305,8 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -284,6 +318,8 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -292,12 +328,16 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -307,6 +347,8 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From a8f0d093cdbba589d9c49fda85db49ef7772ad2f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 19 May 2025 20:51:19 +0200 Subject: [PATCH 02/17] REVERTME Signed-off-by: Anatoly Myachev --- .github/workflows/build-test.yml | 4 ---- .github/workflows/build-windows.yml | 4 ---- 2 files changed, 8 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 37902bc06b..c73f3993c1 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -44,10 +44,6 @@ on: type: boolean default: false - pull_request: - branches: - - main - - release/** push: branches: - main diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index 87e2e561dc..c204e52af4 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -3,10 +3,6 @@ name: Build on Windows on: workflow_dispatch: - pull_request: - branches: - - main - - release/** push: branches: - main From 4c3bbb3085fb11bb3743d16118c58267c50e9d9d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 19 May 2025 21:18:39 +0200 Subject: [PATCH 03/17] Specify: PTI_DEVICE_SYNC_DELTA=1 Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index e2939f24d8..30c4daa8bb 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -130,6 +130,7 @@ jobs: PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") ls $PTI_LIBS_DIR export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -141,6 +142,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -154,6 +156,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -167,6 +170,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -179,6 +183,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -191,6 +196,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -204,6 +210,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -217,6 +224,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -228,6 +236,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -239,6 +248,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -249,6 +259,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -259,6 +270,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -270,6 +282,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -281,6 +294,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -293,6 +307,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -307,6 +322,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -320,6 +336,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -330,6 +347,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS @@ -338,6 +356,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -349,6 +368,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From fd8b689a588a3d9afca9f62d77aaa3294609966d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 19 May 2025 22:03:44 +0200 Subject: [PATCH 04/17] specify intel_gpu_bmg_g21 Signed-off-by: Anatoly Myachev --- benchmarks/cutlass_kernel/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/cutlass_kernel/CMakeLists.txt b/benchmarks/cutlass_kernel/CMakeLists.txt index e6ecb516ea..704d778618 100644 --- a/benchmarks/cutlass_kernel/CMakeLists.txt +++ b/benchmarks/cutlass_kernel/CMakeLists.txt @@ -1,13 +1,13 @@ set(CUTLASS_KERNEL_FLAGS ${CUTLASS_KERNEL_FLAGS} -fsycl - -fsycl-targets=intel_gpu_pvc + -fsycl-targets=intel_gpu_pvc,intel_gpu_bmg_g21 -fsycl-device-code-split=per_kernel -Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier ) Python3_add_library(cutlass_kernel MODULE WITH_SOABI python_main.cpp) -target_compile_options(cutlass_kernel PRIVATE "-fsycl" "-fsycl-targets=intel_gpu_pvc" "-fpreview-breaking-changes") +target_compile_options(cutlass_kernel PRIVATE "-fsycl" "-fsycl-targets=intel_gpu_pvc,intel_gpu_bmg_g21" "-fpreview-breaking-changes") target_compile_options(cutlass_kernel PRIVATE "-DCUTLASS_ENABLE_SYCL") target_compile_options(cutlass_kernel PRIVATE "-DSYCL_INTEL_TARGET") From 2cb0d4199c3c66abfe556aee805c66bb9c4ce31c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 19 May 2025 22:18:36 +0200 Subject: [PATCH 05/17] Revert "specify intel_gpu_bmg_g21" This reverts commit fd8b689a588a3d9afca9f62d77aaa3294609966d. --- benchmarks/cutlass_kernel/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/cutlass_kernel/CMakeLists.txt b/benchmarks/cutlass_kernel/CMakeLists.txt index 704d778618..e6ecb516ea 100644 --- a/benchmarks/cutlass_kernel/CMakeLists.txt +++ b/benchmarks/cutlass_kernel/CMakeLists.txt @@ -1,13 +1,13 @@ set(CUTLASS_KERNEL_FLAGS ${CUTLASS_KERNEL_FLAGS} -fsycl - -fsycl-targets=intel_gpu_pvc,intel_gpu_bmg_g21 + -fsycl-targets=intel_gpu_pvc -fsycl-device-code-split=per_kernel -Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier ) Python3_add_library(cutlass_kernel MODULE WITH_SOABI python_main.cpp) -target_compile_options(cutlass_kernel PRIVATE "-fsycl" "-fsycl-targets=intel_gpu_pvc,intel_gpu_bmg_g21" "-fpreview-breaking-changes") +target_compile_options(cutlass_kernel PRIVATE "-fsycl" "-fsycl-targets=intel_gpu_pvc" "-fpreview-breaking-changes") target_compile_options(cutlass_kernel PRIVATE "-DCUTLASS_ENABLE_SYCL") target_compile_options(cutlass_kernel PRIVATE "-DSYCL_INTEL_TARGET") From 3d80f0d5ff34535ca2e0a7fe3c202733f730e55e Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 19 May 2025 22:53:17 +0200 Subject: [PATCH 06/17] don't run cutlass on bmg Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 20 +++++++++++++++---- .../gemm_benchmark.py | 4 +++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 30c4daa8bb..f3061c8aad 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -149,7 +149,10 @@ jobs: python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-triton-report.csv --benchmark gemm-legacy --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm-legacy --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-onednn-report.csv --benchmark gemm-legacy --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-cutlass-report.csv --benchmark gemm-legacy --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + # FIXME: enable cuttlass on bmg + python build_report.py $REPORTS/matmul-performance.csv $REPORTS/gemm-cutlass-report.csv --benchmark gemm-legacy --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + fi - name: Run Triton GEMM kernel benchmark - new shapes if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} @@ -163,7 +166,10 @@ jobs: source ../../scripts/capture-hw-details.sh python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + # FIXME: enable cuttlass on bmg + python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + fi - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} @@ -176,7 +182,10 @@ jobs: source ../../scripts/capture-hw-details.sh python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + # FIXME: enable cuttlass on bmg + python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + fi - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} @@ -189,7 +198,10 @@ jobs: source ../../scripts/capture-hw-details.sh python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + # FIXME: enable cuttlass on bmg + python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG + fi - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py index 331228c381..4dc1f0ecba 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py @@ -307,7 +307,9 @@ def get_benchmark( if not (transpose_a or transpose_b): if not new_shapes: supported_providers['xetla'] = 'XeTLA' - supported_providers['cutlass'] = 'CUTLASS' + if "580" not in torch.xpu.get_device_name(): + # FIXME: enable cutlass on bmg + supported_providers['cutlass'] = 'CUTLASS' providers = benchmark_suite.filter_providers(supported_providers, providers_filter) # Benchmark Performance From 17d2a5d0bd21f800a7c7fee5c96d989028cf6231 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 13:21:53 +0200 Subject: [PATCH 07/17] try PTI_DEVICE_SYNC_DELTA=10000 with pti 0.12.0 Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 60 +++++++++---------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index f3061c8aad..021882cfe1 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -129,8 +129,7 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") ls $PTI_LIBS_DIR - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -141,8 +140,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -158,8 +156,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -175,8 +172,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -191,8 +187,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -207,8 +202,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -221,8 +215,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -235,8 +228,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -247,8 +239,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -259,8 +250,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -270,8 +260,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -281,8 +270,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -293,8 +281,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -305,8 +292,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -318,8 +304,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -333,8 +318,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -347,8 +331,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -358,8 +341,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS @@ -367,8 +349,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -379,8 +360,7 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From a2c22f5736d0a537dbfccd769e8b952a6ab1adb2 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 18:06:06 +0200 Subject: [PATCH 08/17] Revert "REVERTME" This reverts commit a8f0d093cdbba589d9c49fda85db49ef7772ad2f. --- .github/workflows/build-test.yml | 4 ++++ .github/workflows/build-windows.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index c73f3993c1..37902bc06b 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -44,6 +44,10 @@ on: type: boolean default: false + pull_request: + branches: + - main + - release/** push: branches: - main diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index c204e52af4..87e2e561dc 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -3,6 +3,10 @@ name: Build on Windows on: workflow_dispatch: + pull_request: + branches: + - main + - release/** push: branches: - main From 2a6ca23e594d483ff0a0c41d6b5eb1bf53b29ba7 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 18:10:46 +0200 Subject: [PATCH 09/17] cleanup Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 43 +------------------------ 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 021882cfe1..541465fddf 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -60,6 +60,7 @@ permissions: read-all env: PYTHON_VERSION: "3.10" + PTI_DEVICE_SYNC_DELTA: "1" BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }} VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }} TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }} @@ -121,15 +122,11 @@ jobs: id: install run: | cd benchmarks - pip install intel-pti==0.12.2 pip install . - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - ls $PTI_LIBS_DIR - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -139,8 +136,6 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -155,8 +150,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - new shapes if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -171,8 +164,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -186,8 +177,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -201,8 +190,6 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -214,8 +201,6 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -227,8 +212,6 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -238,8 +221,6 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -249,8 +230,6 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -259,8 +238,6 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -269,8 +246,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -280,8 +255,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -291,8 +264,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -303,8 +274,6 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -317,8 +286,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -330,8 +297,6 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -340,16 +305,12 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -359,8 +320,6 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From 933eb33e8729c278728cd779947c2bc2e724845c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 19:49:51 +0200 Subject: [PATCH 10/17] fix lint Signed-off-by: Anatoly Myachev --- benchmarks/triton_kernels_benchmark/gemm_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py index 4dc1f0ecba..d05e08930b 100644 --- a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py +++ b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py @@ -307,7 +307,7 @@ def get_benchmark( if not (transpose_a or transpose_b): if not new_shapes: supported_providers['xetla'] = 'XeTLA' - if "580" not in torch.xpu.get_device_name(): + if '580' not in torch.xpu.get_device_name(): # FIXME: enable cutlass on bmg supported_providers['cutlass'] = 'CUTLASS' providers = benchmark_suite.filter_providers(supported_providers, providers_filter) From f72e4f086fa14a1b377f6caf1008c0aaad41a179 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 21:19:45 +0200 Subject: [PATCH 11/17] Revert "cleanup" This reverts commit 2a6ca23e594d483ff0a0c41d6b5eb1bf53b29ba7. --- .github/workflows/triton-benchmarks.yml | 43 ++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 541465fddf..021882cfe1 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -60,7 +60,6 @@ permissions: read-all env: PYTHON_VERSION: "3.10" - PTI_DEVICE_SYNC_DELTA: "1" BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }} VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }} TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }} @@ -122,11 +121,15 @@ jobs: id: install run: | cd benchmarks + pip install intel-pti==0.12.2 pip install . - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + ls $PTI_LIBS_DIR + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -136,6 +139,8 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -150,6 +155,8 @@ jobs: - name: Run Triton GEMM kernel benchmark - new shapes if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -164,6 +171,8 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -177,6 +186,8 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -190,6 +201,8 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -201,6 +214,8 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -212,6 +227,8 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -221,6 +238,8 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -230,6 +249,8 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -238,6 +259,8 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -246,6 +269,8 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -255,6 +280,8 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -264,6 +291,8 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -274,6 +303,8 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -286,6 +317,8 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -297,6 +330,8 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -305,12 +340,16 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -320,6 +359,8 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + export PTI_DEVICE_SYNC_DELTA=10000 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From 5bcdf3df411225c7a5dd5a3c897fb0378a8f0195 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 21:21:40 +0200 Subject: [PATCH 12/17] Revert "try PTI_DEVICE_SYNC_DELTA=10000 with pti 0.12.0" This reverts commit 17d2a5d0bd21f800a7c7fee5c96d989028cf6231. --- .github/workflows/triton-benchmarks.yml | 60 ++++++++++++++++--------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 021882cfe1..f3061c8aad 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -129,7 +129,8 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") ls $PTI_LIBS_DIR - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -140,7 +141,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -156,7 +158,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -172,7 +175,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -187,7 +191,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -202,7 +207,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -215,7 +221,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -228,7 +235,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -239,7 +247,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -250,7 +259,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -260,7 +270,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -270,7 +281,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -281,7 +293,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -292,7 +305,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -304,7 +318,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -318,7 +333,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -331,7 +347,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -341,7 +358,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS @@ -349,7 +367,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -360,7 +379,8 @@ jobs: if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - export PTI_DEVICE_SYNC_DELTA=10000 + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH + export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From 5f96ac704e02c9d1e7c7a5f9ee0a01d1dbff1b93 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 20 May 2025 21:24:08 +0200 Subject: [PATCH 13/17] cleanup Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index f3061c8aad..f50143378c 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -60,6 +60,7 @@ permissions: read-all env: PYTHON_VERSION: "3.10" + PTI_DEVICE_SYNC_DELTA: "1" BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }} VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }} TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }} @@ -130,7 +131,6 @@ jobs: PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") ls $PTI_LIBS_DIR export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -142,7 +142,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -159,7 +158,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -176,7 +174,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -192,7 +189,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -208,7 +204,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -222,7 +217,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -236,7 +230,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -248,7 +241,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -260,7 +252,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -271,7 +262,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -282,7 +272,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -294,7 +283,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -306,7 +294,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -319,7 +306,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -334,7 +320,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -348,7 +333,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -359,7 +343,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS @@ -368,7 +351,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -380,7 +362,6 @@ jobs: run: | PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH - export PTI_DEVICE_SYNC_DELTA=1 cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From 88b808bf2adb750b1c609e3f003b290af83b711c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 21 May 2025 15:13:49 +0200 Subject: [PATCH 14/17] address review comments Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 28 ++++++------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index f50143378c..576cd45934 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -60,6 +60,7 @@ permissions: read-all env: PYTHON_VERSION: "3.10" + # FIXME: in the next versions of pti (most likely 0.12.3) this will not need to be done PTI_DEVICE_SYNC_DELTA: "1" BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }} VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }} @@ -122,14 +123,16 @@ jobs: id: install run: | cd benchmarks - pip install intel-pti==0.12.2 pip install . + pip install intel-pti==0.12.2 + PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") + # the output should contain: `libpti.so`, `libpti_metrics.so.0.12.2` and `libpti_view.so.0.12.2` + ls $PTI_LIBS_DIR + echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") - ls $PTI_LIBS_DIR export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS @@ -140,7 +143,6 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -156,7 +158,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - new shapes if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -172,7 +173,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -187,7 +187,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -202,7 +201,6 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -215,7 +213,6 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -228,7 +225,6 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -239,7 +235,6 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -250,7 +245,6 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -260,7 +254,6 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -270,7 +263,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -281,7 +273,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -292,7 +283,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -304,7 +294,6 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ @@ -318,7 +307,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -331,7 +319,6 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS @@ -341,7 +328,6 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS @@ -349,7 +335,6 @@ jobs: - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -360,7 +345,6 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | - PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')") export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From 61f0107e22903ab7e0e86e789888a6ca71e85946 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 21 May 2025 15:21:53 +0200 Subject: [PATCH 15/17] try setup 'LD_LIBRARY_PATH' in 'defaults:' section Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 576cd45934..166aa6538c 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -76,7 +76,7 @@ jobs: timeout-minutes: 720 defaults: run: - shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" + shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; export LD_LIBRARY_PATH=\$PTI_LIBS_DIR:\$LD_LIBRARY_PATH; source {0}" steps: - name: Print inputs run: | @@ -133,7 +133,6 @@ jobs: - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -143,7 +142,6 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark NEW_SHAPES=0 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -158,7 +156,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - new shapes if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv @@ -173,7 +170,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -187,7 +183,6 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -201,7 +196,6 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -213,7 +207,6 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -225,7 +218,6 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -235,7 +227,6 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -245,7 +236,6 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -254,7 +244,6 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -263,7 +252,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -273,7 +261,6 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -283,7 +270,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -294,7 +280,6 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -307,7 +292,6 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -319,7 +303,6 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -328,14 +311,12 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -345,7 +326,6 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | - export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS From f4ff3f7380a9c5a6ab1e6386d2bbc0d053cf7d5d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 21 May 2025 15:51:16 +0200 Subject: [PATCH 16/17] fix Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 18dc59e74c..8c629a3ea4 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -76,7 +76,7 @@ jobs: timeout-minutes: 720 defaults: run: - shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; export LD_LIBRARY_PATH=\$PTI_LIBS_DIR:\$LD_LIBRARY_PATH; source {0}" + shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH; source {0}" steps: - name: Print inputs run: | From 2ec88c49f4e99518cad1db17a8529f6d89c1a4c8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 21 May 2025 16:30:43 +0200 Subject: [PATCH 17/17] address review comments and return export LD_LIBRARY_PATH into each benchmark section Signed-off-by: Anatoly Myachev --- .github/workflows/triton-benchmarks.yml | 27 +++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 8c629a3ea4..64865a1214 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -76,7 +76,7 @@ jobs: timeout-minutes: 720 defaults: run: - shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH; source {0}" + shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" steps: - name: Print inputs run: | @@ -133,6 +133,7 @@ jobs: - name: Run Triton Softmax kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -142,13 +143,14 @@ jobs: - name: Run Triton GEMM kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv source ../../scripts/capture-hw-details.sh python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then # FIXME: enable cuttlass on bmg python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG fi @@ -156,12 +158,13 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor of pointer if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then # FIXME: enable cuttlass on bmg python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG fi @@ -169,12 +172,13 @@ jobs: - name: Run Triton GEMM kernel benchmark - with tensor descriptor if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG - if [[ "${{ inputs.runner_label }}" = "max1550" ]]; then + if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then # FIXME: enable cuttlass on bmg python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG fi @@ -182,6 +186,7 @@ jobs: - name: Run Triton GEMM (A@B^t) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv @@ -193,6 +198,7 @@ jobs: - name: Run Triton GEMM (A^t@B) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv @@ -204,6 +210,7 @@ jobs: - name: Run Triton GEMM (stream-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -213,6 +220,7 @@ jobs: - name: Run Triton GEMM (split-k) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -222,6 +230,7 @@ jobs: - name: Run Triton GEMM + PreOp (exp) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -230,6 +239,7 @@ jobs: - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -238,6 +248,7 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -247,6 +258,7 @@ jobs: - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8 if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -256,6 +268,7 @@ jobs: - name: Run Triton FA fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -266,6 +279,7 @@ jobs: - name: Run Triton FA bwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark FA_KERNEL_MODE="bwd" \ python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS @@ -278,6 +292,7 @@ jobs: - name: Run Triton FA fwd kernel benchmark - with tensor descriptors if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv @@ -289,6 +304,7 @@ jobs: - name: Run Triton FlexAttention Causal Mask fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS @@ -298,6 +314,7 @@ jobs: - name: Run Triton FlexAttention Custom Masks fwd kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS @@ -308,6 +325,7 @@ jobs: - name: Run Prefix Sums kernel benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS source ../../scripts/capture-hw-details.sh @@ -316,6 +334,7 @@ jobs: - name: Run micro benchmark if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | + export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS