@@ -60,6 +60,8 @@ permissions: read-all
60
60
61
61
env :
62
62
PYTHON_VERSION : " 3.10"
63
+ # FIXME: remove this workaround once a newer pti release (most likely 0.12.3) no longer requires it
64
+ PTI_DEVICE_SYNC_DELTA : " 1"
63
65
BENCHMARKING_METHOD : ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
64
66
VERIFY : ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
65
67
TAG : ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
@@ -122,10 +124,16 @@ jobs:
122
124
run : |
123
125
cd benchmarks
124
126
pip install .
127
+ pip install intel-pti==0.12.2
128
+ PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')")
129
+ # the output should contain: `libpti.so`, `libpti_metrics.so.0.12.2` and `libpti_view.so.0.12.2`
130
+ ls $PTI_LIBS_DIR
131
+ echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
125
132
126
133
- name : Run Triton Softmax kernel benchmark
127
134
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'fused_softmax.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
128
135
run : |
136
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
129
137
cd benchmarks/triton_kernels_benchmark
130
138
python fused_softmax.py --reports $REPORTS --n_runs $N_RUNS
131
139
source ../../scripts/capture-hw-details.sh
@@ -135,37 +143,50 @@ jobs:
135
143
# Runs gemm_benchmark.py and builds one report per compiler (triton, onednn and,
# on max1550 runners only, cutlass) from the renamed base CSV.
- name: Run Triton GEMM kernel benchmark
  if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_newshapes')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_newshapes') }}
  env:
    # Pass the runner label through the environment instead of splicing the
    # expression into the shell source, so its value can never be parsed as code.
    RUNNER_LABEL: ${{ inputs.runner_label || 'max1550' }}
  run: |
    # PTI_LIBS_DIR is published through GITHUB_ENV by an earlier step.
    # Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing
    # empty entry would make the loader search the current directory.
    export LD_LIBRARY_PATH="$PTI_LIBS_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
    cd benchmarks/triton_kernels_benchmark
    python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
    # Rename the raw CSV so the report builds below read a stable base file.
    mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-base.csv
    source ../../scripts/capture-hw-details.sh
    python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-triton-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
    python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-onednn-report.csv --benchmark gemm --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
    if [[ "$RUNNER_LABEL" = "max1550" ]]; then
      # FIXME: enable cutlass on bmg
      python build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-newshapes-cutlass-report.csv --benchmark gemm --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
    fi
145
157
146
158
# Runs gemm_tensor_of_ptr_benchmark.py and builds one report per compiler
# (triton, onednn and, on max1550 runners only, cutlass).
- name: Run Triton GEMM kernel benchmark - with tensor of pointer
  if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_of_ptr_benchmark.py') }}
  env:
    # Pass the runner label through the environment instead of splicing the
    # expression into the shell source, so its value can never be parsed as code.
    RUNNER_LABEL: ${{ inputs.runner_label || 'max1550' }}
  run: |
    # PTI_LIBS_DIR is published through GITHUB_ENV by an earlier step.
    # Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing
    # empty entry would make the loader search the current directory.
    export LD_LIBRARY_PATH="$PTI_LIBS_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
    cd benchmarks/triton_kernels_benchmark
    python gemm_tensor_of_ptr_benchmark.py --reports $REPORTS --n_runs $N_RUNS
    source ../../scripts/capture-hw-details.sh
    python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-triton-report.csv --benchmark gemm-tensor-of-ptr --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
    python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-onednn-report.csv --benchmark gemm-tensor-of-ptr --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
    if [[ "$RUNNER_LABEL" = "max1550" ]]; then
      # FIXME: enable cutlass on bmg
      python build_report.py $REPORTS/matmul-tensor-of-ptr-performance.csv $REPORTS/gemm-tensor-of-ptr-cutlass-report.csv --benchmark gemm-tensor-of-ptr --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
    fi
155
171
156
172
# Runs gemm_tensor_desc_benchmark.py and builds one report per compiler
# (triton, onednn and, on max1550 runners only, cutlass).
- name: Run Triton GEMM kernel benchmark - with tensor descriptor
  if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_tensor_desc_benchmark.py') }}
  env:
    # Pass the runner label through the environment instead of splicing the
    # expression into the shell source, so its value can never be parsed as code.
    RUNNER_LABEL: ${{ inputs.runner_label || 'max1550' }}
  run: |
    # PTI_LIBS_DIR is published through GITHUB_ENV by an earlier step.
    # Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing
    # empty entry would make the loader search the current directory.
    export LD_LIBRARY_PATH="$PTI_LIBS_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
    cd benchmarks/triton_kernels_benchmark
    python gemm_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
    source ../../scripts/capture-hw-details.sh
    python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-triton-report.csv --benchmark gemm-tensor-desc --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
    python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-onednn-report.csv --benchmark gemm-tensor-desc --compiler onednn --param_cols "B,M,K,N" --tflops_col OneDNN-TFlops --hbm_col "OneDNN-GB/s" --tag $TAG
    if [[ "$RUNNER_LABEL" = "max1550" ]]; then
      # FIXME: enable cutlass on bmg
      python build_report.py $REPORTS/matmul-tensor-desc-performance.csv $REPORTS/gemm-tensor-desc-cutlass-report.csv --benchmark gemm-tensor-desc --compiler cutlass --param_cols "B,M,K,N" --tflops_col CUTLASS-TFlops --hbm_col "CUTLASS-GB/s" --tag $TAG
    fi
165
185
166
186
- name : Run Triton GEMM (A@B^t) kernel benchmark
167
187
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_abt')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
168
188
run : |
189
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
169
190
cd benchmarks/triton_kernels_benchmark
170
191
TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
171
192
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-bt.csv
@@ -177,6 +198,7 @@ jobs:
177
198
- name : Run Triton GEMM (A^t@B) kernel benchmark
178
199
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_benchmark.py_atb')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
179
200
run : |
201
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
180
202
cd benchmarks/triton_kernels_benchmark
181
203
TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS --n_runs $N_RUNS
182
204
mv $REPORTS/matmul-performance.csv $REPORTS/matmul-performance-at.csv
@@ -188,6 +210,7 @@ jobs:
188
210
- name : Run Triton GEMM (stream-k) kernel benchmark
189
211
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_streamk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
190
212
run : |
213
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
191
214
cd benchmarks/triton_kernels_benchmark
192
215
python gemm_streamk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
193
216
source ../../scripts/capture-hw-details.sh
@@ -197,6 +220,7 @@ jobs:
197
220
- name : Run Triton GEMM (split-k) kernel benchmark
198
221
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_splitk_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
199
222
run : |
223
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
200
224
cd benchmarks/triton_kernels_benchmark
201
225
python gemm_splitk_benchmark.py --reports $REPORTS --n_runs $N_RUNS
202
226
source ../../scripts/capture-hw-details.sh
@@ -206,6 +230,7 @@ jobs:
206
230
- name : Run Triton GEMM + PreOp (exp) kernel benchmark
207
231
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_preop_exp_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
208
232
run : |
233
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
209
234
cd benchmarks/triton_kernels_benchmark
210
235
python gemm_preop_exp_benchmark.py --reports $REPORTS --n_runs $N_RUNS
211
236
source ../../scripts/capture-hw-details.sh
@@ -214,6 +239,7 @@ jobs:
214
239
- name : Run Triton GEMM + PostOp (Gelu) kernel benchmark
215
240
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
216
241
run : |
242
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
217
243
cd benchmarks/triton_kernels_benchmark
218
244
python gemm_postop_gelu_benchmark.py --reports $REPORTS --n_runs $N_RUNS
219
245
source ../../scripts/capture-hw-details.sh
@@ -222,6 +248,7 @@ jobs:
222
248
- name : Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
223
249
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_bfloat16.py') }}
224
250
run : |
251
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
225
252
cd benchmarks/triton_kernels_benchmark
226
253
python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
227
254
source ../../scripts/capture-hw-details.sh
@@ -231,6 +258,7 @@ jobs:
231
258
- name : Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
232
259
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark_int8.py') }}
233
260
run : |
261
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
234
262
cd benchmarks/triton_kernels_benchmark
235
263
INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports $REPORTS --n_runs $N_RUNS
236
264
source ../../scripts/capture-hw-details.sh
@@ -240,6 +268,7 @@ jobs:
240
268
- name : Run Triton FA fwd kernel benchmark
241
269
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
242
270
run : |
271
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
243
272
cd benchmarks/triton_kernels_benchmark
244
273
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
245
274
@@ -250,6 +279,7 @@ jobs:
250
279
- name : Run Triton FA bwd kernel benchmark
251
280
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
252
281
run : |
282
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
253
283
cd benchmarks/triton_kernels_benchmark
254
284
FA_KERNEL_MODE="bwd" \
255
285
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
@@ -262,6 +292,7 @@ jobs:
262
292
- name : Run Triton FA fwd kernel benchmark - with tensor descriptors
263
293
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_tensor_desc_benchmark.py') }}
264
294
run : |
295
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
265
296
cd benchmarks/triton_kernels_benchmark
266
297
python flash_attention_tensor_desc_benchmark.py --reports $REPORTS --n_runs $N_RUNS
267
298
mv $REPORTS/attn-performance.csv $REPORTS/attn-tensor-desc-performance.csv
@@ -273,6 +304,7 @@ jobs:
273
304
- name : Run Triton FlexAttention Causal Mask fwd kernel benchmark
274
305
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_causal_mask.py') }}
275
306
run : |
307
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
276
308
cd benchmarks/triton_kernels_benchmark
277
309
python flex_attention_benchmark_causal_mask.py --reports $REPORTS --n_runs $N_RUNS
278
310
@@ -283,6 +315,7 @@ jobs:
283
315
- name : Run Triton FlexAttention Custom Masks fwd kernel benchmark
284
316
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flex_attention_benchmark_custom_masks.py') }}
285
317
run : |
318
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
286
319
cd benchmarks/triton_kernels_benchmark
287
320
python flex_attention_benchmark_custom_masks.py --reports $REPORTS --n_runs $N_RUNS
288
321
@@ -293,6 +326,7 @@ jobs:
293
326
- name : Run Prefix Sums kernel benchmark
294
327
if : ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'prefix_sums.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
295
328
run : |
329
+ export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
296
330
cd benchmarks/triton_kernels_benchmark
297
331
python prefix_sums.py --reports $REPORTS --n_runs $N_RUNS
298
332
source ../../scripts/capture-hw-details.sh
@@ -301,6 +335,7 @@ jobs:
301
335
# Runs the micro benchmark suite via run_benchmarks.py.
# The skip filter accepts both 'micro_benchmarks.py' (the name the include
# filter uses) and the legacy 'micro_benchmarks' spelling, so either entry in
# inputs.skip_benchmarks disables this step.
- name: Run micro benchmark
  if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'micro_benchmarks.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks.py') && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
  run: |
    # PTI_LIBS_DIR is published through GITHUB_ENV by an earlier step.
    # Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing
    # empty entry would make the loader search the current directory.
    export LD_LIBRARY_PATH="$PTI_LIBS_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
    cd benchmarks/micro_benchmarks
    python run_benchmarks.py --reports $REPORTS
306
341
0 commit comments