# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Example 9: Advanced Custom Metrics from Multiple Metrics
=========================================================

This example shows how to derive a single custom metric from several
measured metrics.

New concepts:
- Using `derive_metric` to compute custom values from multiple metrics
"""
| 13 | + |
import torch

import nsight
| 17 | + |
| 18 | +sizes = [(2**i,) for i in range(10, 13)] |
| 19 | + |
| 20 | + |
def compute_avg_insts(
    ld_insts: int, st_insts: int, launch_sm_count: int, n: int
) -> float:
    """
    Compute average shared memory load/store instructions per SM.

    Custom metric function signature:
    - First several arguments: the measured metrics, which must match the
      order of the ``metrics`` list in the @kernel decorator
    - Remaining arguments: must match the decorated function's signature

    In this example:
    - ld_insts: Total shared memory load instructions
      (from the smsp__sass_inst_executed_op_shared_ld.sum metric)
    - st_insts: Total shared memory store instructions
      (from the smsp__sass_inst_executed_op_shared_st.sum metric)
    - launch_sm_count: Number of SMs
      (from the launch__sm_count metric)
    - n: Matches the 'n' parameter from benchmark_avg_insts(n)

    Args:
        ld_insts: Total shared memory load instructions
        st_insts: Total shared memory store instructions
        launch_sm_count: Number of SMs reported by the launch__sm_count metric
        n: Matrix size (n x n) - parameter from the decorated benchmark function

    Returns:
        Average shared memory load/store instructions per SM
    """
    # n is unused in the formula but must be accepted so the signature
    # mirrors benchmark_avg_insts(n), as nsight requires.
    insts_per_sm = (ld_insts + st_insts) / launch_sm_count
    return insts_per_sm
| 52 | + |
| 53 | + |
@nsight.analyze.plot(
    filename="09_advanced_metric_custom.png",
    ylabel="Average Shared Memory Load/Store Instructions per SM",  # Custom y-axis label
    annotate_points=True,  # Show values on the plot
)
@nsight.analyze.kernel(
    configs=sizes,
    runs=10,
    derive_metric=compute_avg_insts,  # Use custom metric
    metrics=[
        "smsp__sass_inst_executed_op_shared_ld.sum",
        "smsp__sass_inst_executed_op_shared_st.sum",
        "launch__sm_count",
    ],
)
def benchmark_avg_insts(n: int) -> None:
    """
    Run an n x n matmul on the GPU so nsight can collect the metrics above.
    """
    lhs = torch.randn(n, n, device="cuda")
    rhs = torch.randn(n, n, device="cuda")

    # Only the annotated region is attributed to the "matmul" label.
    with nsight.annotate("matmul"):
        _ = lhs @ rhs
| 78 | + |
| 79 | + |
def main() -> None:
    """Run the benchmark, print the collected data, and report completion."""
    outcome = benchmark_avg_insts()
    print(outcome.to_dataframe())
    print("✓ Avg Insts benchmark complete! Check '09_advanced_metric_custom.png'")
| 84 | + |
| 85 | + |
# Script entry point: run the example when executed directly.
if __name__ == "__main__":
    main()