Skip to content

Commit 1c36385

Browse files
Merge branch 'main' into multiple-func-single-script
Signed-off-by: ConvolutedDog <yangjianchao16@nudt.edu.cn>
2 parents 0aed45f + 65a73f6 commit 1c36385

22 files changed

+767
-136
lines changed

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,10 @@ Thumbs.db
6464
html/
6565

6666
# Test report
67-
report.xml
67+
report.xml
68+
69+
# Logs / NCU Reps / PNGs / CSVs
70+
*.log
71+
*.ncu-rep
72+
*.png
73+
*.csv

docs/source/overview/architecture.rst

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,23 @@ Advanced Options
2121
----------------
2222

2323
**Metric Selection**
24-
Nsight Python collects `gpu__time_duration.sum` by default. To collect another NVIDIA Nsight Compute metric:
24+
Nsight Python collects `gpu__time_duration.sum` by default. To collect other NVIDIA Nsight Compute metrics:
2525

2626
.. code-block:: python
2727
28-
@nsight.analyze.kernel(metric="sm__throughput.avg.pct_of_peak_sustained_elapsed")
29-
def benchmark(...):
28+
@nsight.analyze.kernel(metrics=["sm__throughput.avg.pct_of_peak_sustained_elapsed"])
29+
def benchmark1(...):
30+
...
31+
32+
# or
33+
@nsight.analyze.kernel(
34+
metrics=[
35+
"smsp__sass_inst_executed_op_shared_ld.sum",
36+
"smsp__sass_inst_executed_op_shared_st.sum",
37+
"launch__sm_count",
38+
],
39+
)
40+
def benchmark2(...):
3041
...
3142
3243
**Derived Metrics**

examples/01_compare_throughput.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
New concepts:
1111
- Multiple `nsight.annotate()` blocks to profile different kernels
1212
- Using `@nsight.annotate()` as a function decorator (alternative to context manager)
13-
- Using the `metric` parameter to collect a specific Nsight Compute metric (DRAM throughput instead of execution time)
13+
- Using the `metrics` parameter to collect a specific Nsight Compute metric (DRAM throughput instead of execution time)
1414
- Using `print_data=True` to print the collected dataframe to the terminal
1515
"""
1616

@@ -33,7 +33,7 @@ def einsum_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
3333
@nsight.analyze.kernel(
3434
runs=10,
3535
# Collect DRAM throughput as percentage of peak instead of time
36-
metric="dram__throughput.avg.pct_of_peak_sustained_elapsed",
36+
metrics=["dram__throughput.avg.pct_of_peak_sustained_elapsed"],
3737
)
3838
def benchmark_matmul_throughput(n: int) -> None:
3939
"""

examples/02_parameter_sweep.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ def benchmark_matmul_sizes(n: int) -> None:
3636

3737

3838
def main() -> None:
39-
benchmark_matmul_sizes() # notice no n parameter is passed, it is passed in the configs list instead
39+
# Note: no `n` argument is passed here; it is supplied through the configs list instead
40+
result = benchmark_matmul_sizes()
41+
print(result.to_dataframe())
4042
print("✓ Benchmark complete! Check '02_parameter_sweep.png'")
4143

4244

examples/03_custom_metrics.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ def benchmark_tflops(n: int) -> None:
6969

7070

7171
def main() -> None:
72-
benchmark_tflops()
72+
result = benchmark_tflops()
73+
print(result.to_dataframe())
7374
print("✓ TFLOPs benchmark complete! Check '03_custom_metrics.png'")
7475

7576

examples/08_multiple_metrics.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Example 8: Collecting Multiple Metrics
6+
=======================================
7+
8+
This example shows how to collect multiple metrics in a single profiling run.
9+
10+
New concepts:
11+
- Using the `metrics` parameter to collect multiple metrics
12+
- The `@nsight.analyze.plot` decorator does NOT currently support multiple metrics
13+
"""
14+
15+
import torch
16+
17+
import nsight
18+
19+
sizes = [(2**i,) for i in range(11, 13)]
20+
21+
22+
@nsight.analyze.kernel(
23+
configs=sizes,
24+
runs=5,
25+
# Collect both shared memory load and store SASS instructions
26+
metrics=[
27+
"smsp__sass_inst_executed_op_shared_ld.sum",
28+
"smsp__sass_inst_executed_op_shared_st.sum",
29+
],
30+
)
31+
def analyze_shared_memory_ops(n: int) -> None:
32+
"""Analyze both shared memory load and store SASS instructions
33+
for different kernels.
34+
35+
Note: To evaluate multiple metrics, pass them as a sequence
36+
(list/tuple). All results are merged into one ProfileResults
37+
object, with the 'Metric' column indicating each specific metric.
38+
"""
39+
40+
a = torch.randn(n, n, device="cuda")
41+
b = torch.randn(n, n, device="cuda")
42+
c = torch.randn(2 * n, 2 * n, device="cuda")
43+
d = torch.randn(2 * n, 2 * n, device="cuda")
44+
45+
with nsight.annotate("@-operator"):
46+
_ = a @ b
47+
48+
with nsight.annotate("torch.matmul"):
49+
_ = torch.matmul(c, d)
50+
51+
52+
def main() -> None:
53+
# Run analysis with multiple metrics
54+
results = analyze_shared_memory_ops()
55+
56+
df = results.to_dataframe()
57+
print(df)
58+
59+
unique_metrics = df["Metric"].unique()
60+
print(f"\n✓ Collected {len(unique_metrics)} metrics:")
61+
for metric in unique_metrics:
62+
print(f" - {metric}")
63+
64+
print("\n✓ Sample data:")
65+
print(df[["Annotation", "n", "Metric", "AvgValue"]].to_string(index=False))
66+
67+
print("\n" + "=" * 60)
68+
print("IMPORTANT: @plot decorator limitation")
69+
print("=" * 60)
70+
print("When multiple metrics are collected:")
71+
print(" ✓ All metrics are collected in a single ProfileResults object")
72+
print(" ✓ DataFrame has 'Metric' column to distinguish them")
73+
print(" ✗ @nsight.analyze.plot decorator will RAISE AN ERROR")
74+
print(" Why? @plot can only visualize one metric at a time.")
75+
print(" Tip: Use separate @kernel functions for each metric or use")
76+
print(" 'derive_metric' to compute custom values.")
77+
78+
79+
if __name__ == "__main__":
80+
main()
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Example 9: Advanced Custom Metrics from Multiple Metrics
6+
=========================================================
7+
8+
This example shows how to compute custom metrics from multiple metrics.
9+
10+
New concepts:
11+
- Using `derive_metric` to compute custom values from multiple metrics
12+
"""
13+
14+
import torch
15+
16+
import nsight
17+
18+
sizes = [(2**i,) for i in range(10, 13)]
19+
20+
21+
def compute_avg_insts(
22+
ld_insts: int, st_insts: int, launch_sm_count: int, n: int
23+
) -> float:
24+
"""
25+
Compute average shared memory load/store instructions per SM.
26+
27+
Custom metric function signature:
28+
- First several arguments: the measured metrics, must match the order
29+
of metrics in @kernel decorator
30+
- Remaining arguments: must match the decorated function's signature
31+
32+
In this example:
33+
- ld_insts: Total shared memory load instructions
34+
(from smsp__sass_inst_executed_op_shared_ld.sum metric)
35+
- st_insts: Total shared memory store instructions
36+
(from smsp__sass_inst_executed_op_shared_st.sum metric)
37+
- launch_sm_count: Number of SMs that launched blocks
38+
(from launch__sm_count metric)
39+
- n: Matches the 'n' parameter from benchmark_avg_insts(n)
40+
41+
Args:
42+
ld_insts: Total shared memory load instructions
43+
st_insts: Total shared memory store instructions
44+
launch_sm_count: Number of SMs that launched blocks
45+
n: Matrix size (n x n) - parameter from the decorated benchmark function
46+
47+
Returns:
48+
Average shared memory load/store instructions per SM
49+
"""
50+
insts_per_sm = (ld_insts + st_insts) / launch_sm_count
51+
return insts_per_sm
52+
53+
54+
@nsight.analyze.plot(
55+
filename="09_advanced_metric_custom.png",
56+
ylabel="Average Shared Memory Load/Store Instructions per SM", # Custom y-axis label
57+
annotate_points=True, # Show values on the plot
58+
)
59+
@nsight.analyze.kernel(
60+
configs=sizes,
61+
runs=10,
62+
derive_metric=compute_avg_insts, # Use custom metric
63+
metrics=[
64+
"smsp__sass_inst_executed_op_shared_ld.sum",
65+
"smsp__sass_inst_executed_op_shared_st.sum",
66+
"launch__sm_count",
67+
],
68+
)
69+
def benchmark_avg_insts(n: int) -> None:
70+
"""
71+
Benchmark matmul and display results.
72+
"""
73+
a = torch.randn(n, n, device="cuda")
74+
b = torch.randn(n, n, device="cuda")
75+
76+
with nsight.annotate("matmul"):
77+
_ = a @ b
78+
79+
80+
def main() -> None:
81+
result = benchmark_avg_insts()
82+
print(result.to_dataframe())
83+
print("✓ Avg Insts benchmark complete! Check '09_advanced_metric_custom.png'")
84+
85+
86+
if __name__ == "__main__":
87+
main()
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""
5+
Example 10: Multiple Kernels per Run with Combined Metrics
6+
===========================================================
7+
8+
This example shows how to profile multiple kernels in a single run and combine their metrics.
9+
10+
New concepts:
11+
- Using `combine_kernel_metrics` to aggregate metrics from multiple kernels
12+
- Summing metrics from consecutive kernel executions
13+
"""
14+
15+
import torch
16+
17+
import nsight
18+
19+
# Define configuration sizes
20+
sizes = [(2**i,) for i in range(10, 13)]
21+
22+
23+
@nsight.analyze.plot(
24+
filename="10_combine_kernel_metrics.png",
25+
ylabel="Total Cycles (Sum of 3 Kernels)",
26+
annotate_points=True,
27+
)
28+
@nsight.analyze.kernel(
29+
configs=sizes,
30+
runs=7,
31+
combine_kernel_metrics=lambda x, y: x + y, # Sum metrics from multiple kernels
32+
metrics=[
33+
"sm__cycles_elapsed.avg",
34+
],
35+
)
36+
def benchmark_multiple_kernels(n: int) -> None:
37+
"""
38+
Benchmark three matrix multiplications in a single run.
39+
40+
Executes three matmul operations within one profiled context,
41+
demonstrating metric combination across kernels.
42+
43+
Args:
44+
n: Matrix size (n x n)
45+
"""
46+
a = torch.randn(n, n, device="cuda")
47+
b = torch.randn(n, n, device="cuda")
48+
49+
with nsight.annotate("test"):
50+
# Three consecutive kernel executions
51+
_ = a @ b # Kernel 1
52+
_ = a @ b # Kernel 2
53+
_ = a @ b # Kernel 3
54+
55+
56+
def main() -> None:
57+
result = benchmark_multiple_kernels()
58+
print(result.to_dataframe())
59+
print("\n✓ Total Cycles benchmark complete! Check '10_combine_kernel_metrics.png'")
60+
61+
62+
if __name__ == "__main__":
63+
main()

0 commit comments

Comments
 (0)