Skip to content

Fix Kineto+PTI profiling on BMG #4244

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions .github/workflows/triton-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ permissions: read-all

env:
PYTHON_VERSION: "3.10"
# FIXME: in the next versions of pti (most likely 0.12.3) this will not need to be done
PTI_DEVICE_SYNC_DELTA: "1"
BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
Expand Down Expand Up @@ -122,10 +124,16 @@ jobs:
run: |
cd benchmarks
pip install .
pip install intel-pti==0.12.2
PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')")
# the output should contain: `libpti.so`, `libpti_metrics.so.0.12.2` and `libpti_view.so.0.12.2`
ls $PTI_LIBS_DIR
echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV

- name: Run Triton Softmax kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -135,37 +143,50 @@ jobs:
- name: Run Triton GEMM kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
source ../../scripts/capture-hw-details.sh
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
# FIXME: enable cuttlass on bmg
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
fi

- name: Run Triton GEMM kernel benchmark - with tensor of pointer
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
# FIXME: enable cuttlass on bmg
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
fi

- name: Run Triton GEMM kernel benchmark - with tensor descriptor
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
# FIXME: enable cuttlass on bmg
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
fi

- name: Run Triton GEMM (A@B^t) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
Expand All @@ -177,6 +198,7 @@ jobs:
- name: Run Triton GEMM (A^t@B) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
Expand All @@ -188,6 +210,7 @@ jobs:
- name: Run Triton GEMM (stream-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -197,6 +220,7 @@ jobs:
- name: Run Triton GEMM (split-k) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -206,6 +230,7 @@ jobs:
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -214,6 +239,7 @@ jobs:
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -222,6 +248,7 @@ jobs:
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -231,6 +258,7 @@ jobs:
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -240,6 +268,7 @@ jobs:
- name: Run Triton FA fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS

Expand All @@ -250,6 +279,7 @@ jobs:
- name: Run Triton FA bwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
FA_KERNEL_MODE="bwd" \
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
Expand All @@ -262,6 +292,7 @@ jobs:
- name: Run Triton FA fwd kernel benchmark - with tensor descriptors
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
Expand All @@ -273,6 +304,7 @@ jobs:
- name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS

Expand All @@ -282,6 +314,7 @@ jobs:
- name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS

Expand All @@ -292,6 +325,7 @@ jobs:
- name: Run Prefix Sums kernel benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/triton_kernels_benchmark
python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
source ../../scripts/capture-hw-details.sh
Expand All @@ -300,6 +334,7 @@ jobs:
- name: Run micro benchmark
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
run: |
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
cd benchmarks/micro_benchmarks
python run_benchmarks.py --reports $REPORTS

Expand Down
4 changes: 3 additions & 1 deletion benchmarks/triton_kernels_benchmark/gemm_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,9 @@ def get_benchmark(
}
# use_cutlass
if not (transpose_a or transpose_b):
supported_providers['cutlass'] = 'CUTLASS'
if '580' not in torch.xpu.get_device_name():
# FIXME: enable cutlass on bmg
supported_providers['cutlass'] = 'CUTLASS'
providers = benchmark_suite.filter_providers(supported_providers, providers_filter)

# Benchmark Performance
Expand Down
Loading