forked from NVIDIA/nsight-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path10_combine_kernel_metrics.py
More file actions
63 lines (48 loc) · 1.64 KB
/
10_combine_kernel_metrics.py
File metadata and controls
63 lines (48 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Example 10: Multiple Kernels per Run with Combined Metrics
===========================================================
This example shows how to profile multiple kernels in a single run and combine their metrics.
New concepts:
- Using `combine_kernel_metrics` to aggregate metrics from multiple kernels
- Summing metrics from consecutive kernel executions
"""
import torch
import nsight
# Benchmark configurations: one single-element tuple per matrix size,
# covering the powers of two 2**10, 2**11 and 2**12.
sizes = [(1 << exponent,) for exponent in range(10, 13)]
@nsight.analyze.plot(
    filename="10_combine_kernel_metrics.png",
    ylabel="Total Cycles (Sum of 3 Kernels)",
    annotate_points=True,
)
@nsight.analyze.kernel(
    configs=sizes,
    runs=7,
    # Two-argument reducer: adds the metric values of the profiled kernels.
    combine_kernel_metrics=lambda x, y: x + y,
    metrics=["sm__cycles_elapsed.avg"],
)
def benchmark_multiple_kernels(n: int) -> None:
    """
    Issue three back-to-back matrix products inside one annotated region.

    Two random n x n operands are created on the CUDA device, then the same
    product is computed three times within a single ``nsight.annotate``
    context so that the metrics of the individual kernels can be combined
    by the ``combine_kernel_metrics`` callable passed to the decorator.

    Args:
        n: Side length of the square operand matrices (n x n).
    """
    lhs = torch.randn(n, n, device="cuda")
    rhs = torch.randn(n, n, device="cuda")

    with nsight.annotate("test"):
        # Same product launched three times in a row; only the launches
        # matter here, so each result is simply overwritten.
        for _launch in range(3):
            _ = torch.matmul(lhs, rhs)
def main() -> None:
    """Run the benchmark, print its results table, then a completion note."""
    frame = benchmark_multiple_kernels().to_dataframe()
    print(frame)
    print("\n✓ Total Cycles benchmark complete! Check '10_combine_kernel_metrics.png'")


# Only run the benchmark when this file is executed as a script.
if __name__ == "__main__":
    main()