Skip to content

Commit 3f3bcf3

Browse files
authored
Fix Kineto+PTI profiling on BMG (#4244)
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 11cab64 commit 3f3bcf3

File tree

2 files changed

+41
-4
lines changed

2 files changed

+41
-4
lines changed

.github/workflows/triton-benchmarks.yml

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ permissions: read-all
6060

6161
env:
6262
PYTHON_VERSION: "3.10"
63+
# FIXME: setting this should no longer be necessary in upcoming versions of pti (most likely 0.12.3)
64+
PTI_DEVICE_SYNC_DELTA: "1"
6365
BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
6466
VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
6567
TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
@@ -122,10 +124,16 @@ jobs:
122124
run: |
123125
cd benchmarks
124126
pip install .
127+
pip install intel-pti==0.12.2
128+
PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')")
129+
# the output should contain: `libpti.so`, `libpti_metrics.so.0.12.2` and `libpti_view.so.0.12.2`
130+
ls $PTI_LIBS_DIR
131+
echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
125132
126133
- name: Run Triton Softmax kernel benchmark
127134
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
128135
run: |
136+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
129137
cd benchmarks/triton_kernels_benchmark
130138
python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
131139
source ../../scripts/capture-hw-details.sh
@@ -135,37 +143,50 @@ jobs:
135143
- name: Run Triton GEMM kernel benchmark
136144
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }}
137145
run: |
146+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
138147
cd benchmarks/triton_kernels_benchmark
139148
python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
140149
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
141150
source ../../scripts/capture-hw-details.sh
142151
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
143152
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
144-
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
153+
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
154+
# FIXME: enable cutlass on bmg
155+
python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
156+
fi
145157
146158
- name: Run Triton GEMM kernel benchmark - with tensor of pointer
147159
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
148160
run: |
161+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
149162
cd benchmarks/triton_kernels_benchmark
150163
python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
151164
source ../../scripts/capture-hw-details.sh
152165
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
153166
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
154-
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
167+
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
168+
# FIXME: enable cutlass on bmg
169+
python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
170+
fi
155171
156172
- name: Run Triton GEMM kernel benchmark - with tensor descriptor
157173
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
158174
run: |
175+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
159176
cd benchmarks/triton_kernels_benchmark
160177
python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
161178
source ../../scripts/capture-hw-details.sh
162179
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
163180
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
164-
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
181+
if [[ "${{ inputs.runner_label || 'max1550' }}" = "max1550" ]]; then
182+
# FIXME: enable cutlass on bmg
183+
python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
184+
fi
165185
166186
- name: Run Triton GEMM (A@B^t) kernel benchmark
167187
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
168188
run: |
189+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
169190
cd benchmarks/triton_kernels_benchmark
170191
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
171192
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -177,6 +198,7 @@ jobs:
177198
- name: Run Triton GEMM (A^t@B) kernel benchmark
178199
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
179200
run: |
201+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
180202
cd benchmarks/triton_kernels_benchmark
181203
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
182204
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -188,6 +210,7 @@ jobs:
188210
- name: Run Triton GEMM (stream-k) kernel benchmark
189211
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
190212
run: |
213+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
191214
cd benchmarks/triton_kernels_benchmark
192215
python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
193216
source ../../scripts/capture-hw-details.sh
@@ -197,6 +220,7 @@ jobs:
197220
- name: Run Triton GEMM (split-k) kernel benchmark
198221
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
199222
run: |
223+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
200224
cd benchmarks/triton_kernels_benchmark
201225
python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
202226
source ../../scripts/capture-hw-details.sh
@@ -206,6 +230,7 @@ jobs:
206230
- name: Run Triton GEMM + PreOp (exp) kernel benchmark
207231
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
208232
run: |
233+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
209234
cd benchmarks/triton_kernels_benchmark
210235
python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
211236
source ../../scripts/capture-hw-details.sh
@@ -214,6 +239,7 @@ jobs:
214239
- name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
215240
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
216241
run: |
242+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
217243
cd benchmarks/triton_kernels_benchmark
218244
python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
219245
source ../../scripts/capture-hw-details.sh
@@ -222,6 +248,7 @@ jobs:
222248
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
223249
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
224250
run: |
251+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
225252
cd benchmarks/triton_kernels_benchmark
226253
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
227254
source ../../scripts/capture-hw-details.sh
@@ -231,6 +258,7 @@ jobs:
231258
- name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
232259
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
233260
run: |
261+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
234262
cd benchmarks/triton_kernels_benchmark
235263
INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
236264
source ../../scripts/capture-hw-details.sh
@@ -240,6 +268,7 @@ jobs:
240268
- name: Run Triton FA fwd kernel benchmark
241269
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
242270
run: |
271+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
243272
cd benchmarks/triton_kernels_benchmark
244273
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
245274
@@ -250,6 +279,7 @@ jobs:
250279
- name: Run Triton FA bwd kernel benchmark
251280
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
252281
run: |
282+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
253283
cd benchmarks/triton_kernels_benchmark
254284
FA_KERNEL_MODE="bwd" \
255285
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -262,6 +292,7 @@ jobs:
262292
- name: Run Triton FA fwd kernel benchmark - with tensor descriptors
263293
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
264294
run: |
295+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
265296
cd benchmarks/triton_kernels_benchmark
266297
python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
267298
mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -273,6 +304,7 @@ jobs:
273304
- name: Run Triton FlexAttention Causal Mask fwd kernel benchmark
274305
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
275306
run: |
307+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
276308
cd benchmarks/triton_kernels_benchmark
277309
python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
278310
@@ -283,6 +315,7 @@ jobs:
283315
- name: Run Triton FlexAttention Custom Masks fwd kernel benchmark
284316
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
285317
run: |
318+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
286319
cd benchmarks/triton_kernels_benchmark
287320
python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
288321
@@ -293,6 +326,7 @@ jobs:
293326
- name: Run Prefix Sums kernel benchmark
294327
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
295328
run: |
329+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
296330
cd benchmarks/triton_kernels_benchmark
297331
python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
298332
source ../../scripts/capture-hw-details.sh
@@ -301,6 +335,7 @@ jobs:
301335
- name: Run micro benchmark
302336
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
303337
run: |
338+
export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
304339
cd benchmarks/micro_benchmarks
305340
python run_benchmarks.py --reports $REPORTS
306341

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,9 @@ def get_benchmark(
301301
}
302302
# use_cutlass
303303
if not (transpose_a or transpose_b):
304-
supported_providers['cutlass'] = 'CUTLASS'
304+
if '580' not in torch.xpu.get_device_name():
305+
# FIXME: enable cutlass on bmg
306+
supported_providers['cutlass'] = 'CUTLASS'
305307
providers = benchmark_suite.filter_providers(supported_providers, providers_filter)
306308

307309
# Benchmark Performance

0 commit comments

Comments
 (0)