Skip to content

Commit

Permalink
add FP8 metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
feizheng10 committed Feb 14, 2025
1 parent 8c3895e commit fc31174
Show file tree
Hide file tree
Showing 21 changed files with 214 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ Panel Config:
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
pop: None # No perf counter
tips:
MFMA FLOPs (F8):
value: None # No HW module
unit: GFLOP
peak: None # No HW module
pop: None # No HW module
tips:
MFMA FLOPs (BF16):
value: None # No perf counter
unit: GFLOPs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ Panel Config:
max: None # No HW module
unit: (instr + $normUnit)
tips:
MFMA-F8:
avg: None # No HW module
min: None # No HW module
max: None # No HW module
unit: (instr + $normUnit)
tips:
MFMA-F16:
avg: None # No HW module
min: None # No HW module
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ Panel Config:
peak: None
pop: None
tips:
MFMA FLOPs (F8):
value: None # No perf counter
unit: GFLOP
peak: None # No perf counter
pop: None # No perf counter
tips:
MFMA FLOPs (BF16):
value: None # No perf counter
Unit: None
Expand Down Expand Up @@ -174,6 +179,10 @@ Panel Config:
max: None # No perf counter
unit: (OPs + $normUnit)
tips:
F8 OPs:
avg: None # No HW module
min: None # No HW module
max: None # No HW module
unit: (OPs + $normUnit)
tips:
F16 OPs:
avg: None # No perf counter
min: None # No perf counter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ Panel Config:
peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
pop: None # No perf counter
tips:
MFMA FLOPs (F8):
value: None # No HW module
unit: GFLOP
peak: None # No HW module
pop: None # No HW module
tips:
MFMA FLOPs (BF16):
value: None # No perf counter
unit: GFLOPs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,13 @@ Panel Config:
unit: Unit
tips: Tips
metric:
INT-32:
INT32:
avg: None # No perf counter
min: None # No perf counter
max: None # No perf counter
unit: (instr + $normUnit)
tips:
INT-64:
INT64:
avg: None # No perf counter
min: None # No perf counter
max: None # No perf counter
Expand Down Expand Up @@ -241,6 +241,12 @@ Panel Config:
max: None # No HW module
unit: (instr + $normUnit)
tips:
MFMA-F8:
avg: None # No HW module
min: None # No HW module
max: None # No HW module
unit: (instr + $normUnit)
tips:
MFMA-F16:
avg: None # No HW module
min: None # No HW module
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ Panel Config:
peak: None
pop: None
tips:
MFMA FLOPs (F8):
value: None # No perf counter
unit: GFLOP
peak: None # No perf counter
pop: None # No perf counter
tips:
MFMA FLOPs (BF16):
value: None # No perf counter
Unit: None
Expand Down Expand Up @@ -174,6 +179,10 @@ Panel Config:
max: None # No perf counter
unit: (OPs + $normUnit)
tips:
F8 OPs:
avg: None # No HW module
min: None # No HW module
max: None # No HW module
unit: (OPs + $normUnit)
tips:
F16 OPs:
avg: None # No perf counter
min: None # No perf counter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ Panel Config:
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
tips:
MFMA FLOPs (F8):
value: None
unit: GFLOP
peak: None
pop: None
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,12 @@ Panel Config:
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
unit: (instr + $normUnit)
tips:
MFMA-F8:
avg: None
min: None
max: None
unit: (instr + $normUnit)
tips:
MFMA-F16:
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ Panel Config:
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
tips:
MFMA FLOPs (F8):
value: None
unit: GFLOP
peak: None
pop: None
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
Expand Down Expand Up @@ -216,6 +222,12 @@ Panel Config:
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom)
unit: (OPs + $normUnit)
tips:
F8 OPs:
avg: None
min: None
max: None
unit: (OPs + $normUnit)
tips:
F16 OPs:
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) +
(64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 *
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ Panel Config:
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
tips:
MFMA FLOPs (F8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
Expand Down Expand Up @@ -187,12 +194,14 @@ Panel Config:
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
tips:
L2-Fabric Read BW:
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / (End_Timestamp - Start_Timestamp)))
value: AVG((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
unit: GB/s
peak: $hbm_bw
pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw)
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw)
tips:
L2-Fabric Write BW:
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,12 @@ Panel Config:
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
unit: (instr + $normUnit)
tips:
MFMA-F8:
avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
unit: (instr + $normUnit)
tips:
MFMA-F16:
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ Panel Config:
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
tips:
MFMA FLOPs (F8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
Expand Down Expand Up @@ -188,21 +195,21 @@ Panel Config:
metric:
FLOPs (Total):
avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16)
+ (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512
+ (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512
* SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32)
+ SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
+ (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) /
$denom))
min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16)
+ (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512
+ (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512
* SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32)
+ SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
+ (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) /
$denom))
max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16)
+ (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512
+ (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512
* SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32)
+ SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
+ (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
Expand All @@ -216,6 +223,12 @@ Panel Config:
max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom)
unit: (OPs + $normUnit)
tips:
F8 OPs:
avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
unit: (OPs + $normUnit)
tips:
F16 OPs:
avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) +
(64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 *
Expand Down
38 changes: 24 additions & 14 deletions src/rocprof_compute_soc/analysis_configs/gfx940/1700_L2_cache.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ Panel Config:
unit: pct
tips:
L2-Fabric Read BW:
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / (End_Timestamp - Start_Timestamp)))
value: AVG((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
unit: GB/s
tips:
L2-Fabric Write and Atomic BW:
Expand All @@ -53,12 +54,15 @@ Panel Config:
tips: Tips
metric:
Read BW:
avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / $denom))
min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / $denom))
max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / $denom))
avg: AVG(((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / $denom))
min: MIN(((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / $denom))
max: MAX(((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / $denom))
unit: (Bytes + $normUnit)
tips:
HBM Read Traffic:
Expand All @@ -68,9 +72,9 @@ Panel Config:
unit: pct
tips:
Remote Read Traffic:
avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
unit: pct
tips:
Uncached Read Traffic:
Expand Down Expand Up @@ -380,9 +384,15 @@ Panel Config:
unit: (Req + $normUnit)
tips:
Read (64B):
avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
avg: AVG(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
min: MIN(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
max: MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
unit: (Req + $normUnit)
tips:
Read (128B):
avg: AVG(((TCC_BUBBLE_sum) / $denom))
min: MIN(((TCC_BUBBLE_sum) / $denom))
max: MAX(((TCC_BUBBLE_sum) / $denom))
unit: (Req + $normUnit)
tips:
HBM Read:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ Panel Config:
pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
- Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
tips:
MFMA FLOPs (F8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
Expand Down Expand Up @@ -187,12 +194,14 @@ Panel Config:
/ ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
tips:
L2-Fabric Read BW:
value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / (End_Timestamp - Start_Timestamp)))
value: AVG((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
unit: GB/s
peak: $hbm_bw
pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
* 64)) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw)
pop: ((100 * (AVG((128 * TCC_BUBBLE_sum +
64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) +
32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbm_bw)
tips:
L2-Fabric Write BW:
value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,12 @@ Panel Config:
max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
unit: (instr + $normUnit)
tips:
MFMA-F8:
avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
unit: (instr + $normUnit)
tips:
MFMA-F16:
avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
Expand Down
Loading

0 comments on commit fc31174

Please sign in to comment.