8 changes: 7 additions & 1 deletion .gitignore
@@ -64,4 +64,10 @@ Thumbs.db
html/

# Test report
report.xml
report.xml

# Logs / NCU Reps / PNGs / CSVs
*.log
*.ncu-rep
*.png
*.csv
17 changes: 14 additions & 3 deletions docs/source/overview/architecture.rst
@@ -21,12 +21,23 @@ Advanced Options
----------------

**Metric Selection**
Nsight Python collects `gpu__time_duration.sum` by default. To collect another NVIDIA Nsight Compute metric:
Nsight Python collects `gpu__time_duration.sum` by default. To collect other NVIDIA Nsight Compute metrics:

.. code-block:: python

@nsight.analyze.kernel(metric="sm__throughput.avg.pct_of_peak_sustained_elapsed")
def benchmark(...):
@nsight.analyze.kernel(metrics=["sm__throughput.avg.pct_of_peak_sustained_elapsed"])
def benchmark1(...):
...

# or
@nsight.analyze.kernel(
metrics=[
"smsp__sass_inst_executed_op_shared_ld.sum",
"smsp__sass_inst_executed_op_shared_st.sum",
"launch__sm_count",
],
)
def benchmark2(...):
...
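
When multiple metrics are requested, the results are merged into a single
`ProfileResults` object whose dataframe carries a `Metric` column. A minimal
sketch of reading them back (mirroring `examples/08_multiple_metrics.py`, and
assuming `benchmark2` needs no arguments beyond its configs):

.. code-block:: python

    results = benchmark2()
    df = results.to_dataframe()
    # One row per metric per run; the 'Metric' column distinguishes them.
    print(df["Metric"].unique())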

**Derived Metrics**
4 changes: 2 additions & 2 deletions examples/01_compare_throughput.py
@@ -10,7 +10,7 @@
New concepts:
- Multiple `nsight.annotate()` blocks to profile different kernels
- Using `@nsight.annotate()` as a function decorator (alternative to context manager)
- Using the `metric` parameter to collect a specific Nsight Compute metric (DRAM throughput instead of execution time)
- Using the `metrics` parameter to collect a specific Nsight Compute metric (DRAM throughput instead of execution time)
- Using `print_data=True` to print the collected dataframe to the terminal
"""

@@ -33,7 +33,7 @@ def einsum_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
@nsight.analyze.kernel(
runs=10,
# Collect DRAM throughput as percentage of peak instead of time
metric="dram__throughput.avg.pct_of_peak_sustained_elapsed",
metrics=["dram__throughput.avg.pct_of_peak_sustained_elapsed"],
)
def benchmark_matmul_throughput(n: int) -> None:
"""
4 changes: 3 additions & 1 deletion examples/02_parameter_sweep.py
@@ -36,7 +36,9 @@ def benchmark_matmul_sizes(n: int) -> None:


def main() -> None:
benchmark_matmul_sizes() # notice no n parameter is passed, it is passed in the configs list instead
    # Note: no n parameter is passed here; it comes from the configs list instead
result = benchmark_matmul_sizes()
print(result.to_dataframe())
print("✓ Benchmark complete! Check '02_parameter_sweep.png'")


3 changes: 2 additions & 1 deletion examples/03_custom_metrics.py
@@ -69,7 +69,8 @@ def benchmark_tflops(n: int) -> None:


def main() -> None:
benchmark_tflops()
result = benchmark_tflops()
print(result.to_dataframe())
print("✓ TFLOPs benchmark complete! Check '03_custom_metrics.png'")


80 changes: 80 additions & 0 deletions examples/08_multiple_metrics.py
@@ -0,0 +1,80 @@
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Example 8: Collecting Multiple Metrics
=======================================

This example shows how to collect multiple metrics in a single profiling run.

New concepts:
- Using the `metrics` parameter to collect multiple metrics
- The `@nsight.analyze.plot` decorator does NOT currently support multiple metrics
"""

import torch

import nsight

sizes = [(2**i,) for i in range(11, 13)]


@nsight.analyze.kernel(
configs=sizes,
runs=5,
# Collect both shared memory load and store SASS instructions
metrics=[
"smsp__sass_inst_executed_op_shared_ld.sum",
"smsp__sass_inst_executed_op_shared_st.sum",
],
)
def analyze_shared_memory_ops(n: int) -> None:
"""Analyze both shared memory load and store SASS instructions
for different kernels.

    Note: To collect multiple metrics, pass them as a sequence
    (list/tuple). All results are merged into one ProfileResults
    object, with the 'Metric' column identifying the metric on each row.
"""

a = torch.randn(n, n, device="cuda")
b = torch.randn(n, n, device="cuda")
c = torch.randn(2 * n, 2 * n, device="cuda")
d = torch.randn(2 * n, 2 * n, device="cuda")

with nsight.annotate("@-operator"):
_ = a @ b

with nsight.annotate("torch.matmul"):
_ = torch.matmul(c, d)


def main() -> None:
# Run analysis with multiple metrics
results = analyze_shared_memory_ops()

df = results.to_dataframe()
print(df)

unique_metrics = df["Metric"].unique()
print(f"\n✓ Collected {len(unique_metrics)} metrics:")
for metric in unique_metrics:
print(f" - {metric}")

print("\n✓ Sample data:")
print(df[["Annotation", "n", "Metric", "AvgValue"]].to_string(index=False))

print("\n" + "=" * 60)
print("IMPORTANT: @plot decorator limitation")
print("=" * 60)
print("When multiple metrics are collected:")
print(" ✓ All metrics are collected in a single ProfileResults object")
print(" ✓ DataFrame has 'Metric' column to distinguish them")
print(" ✗ @nsight.analyze.plot decorator will RAISE AN ERROR")
print(" Why? @plot can only visualize one metric at a time.")
print(" Tip: Use separate @kernel functions for each metric or use")
print(" 'derive_metric' to compute custom values.")


if __name__ == "__main__":
main()
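
A minimal follow-up sketch with a hypothetical helper `plot_each_metric`:
since `@nsight.analyze.plot` raises on multiple metrics, one workaround is to
split the dataframe by metric and plot each slice manually. This assumes
matplotlib is available and uses the column names printed above
('Annotation', 'n', 'Metric', 'AvgValue'):

import matplotlib.pyplot as plt

def plot_each_metric(df) -> None:
    # One figure per metric, one line per annotated region.
    for metric, group in df.groupby("Metric"):
        fig, ax = plt.subplots()
        for annotation, sub in group.groupby("Annotation"):
            ax.plot(sub["n"], sub["AvgValue"], marker="o", label=annotation)
        ax.set(xlabel="n", ylabel=metric)
        ax.legend()
        fig.savefig(f"08_{metric}.png")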
87 changes: 87 additions & 0 deletions examples/09_advanced_metric_custom.py
@@ -0,0 +1,87 @@
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Example 9: Advanced Custom Metrics from Multiple Metrics
=========================================================

This example shows how to compute custom metrics from multiple metrics.

New concepts:
- Using `derive_metric` to compute custom values from multiple metrics
"""

import torch

import nsight

sizes = [(2**i,) for i in range(10, 13)]


def compute_avg_insts(
ld_insts: int, st_insts: int, launch_sm_count: int, n: int
) -> float:
"""
Compute average shared memory load/store instructions per SM.

Custom metric function signature:
    - Leading arguments: the measured metric values, in the same order as
      the metrics list in the @kernel decorator
    - Remaining arguments: must match the decorated function's signature

    In this example:
    - ld_insts: Total shared memory load instructions
      (from the smsp__sass_inst_executed_op_shared_ld.sum metric)
    - st_insts: Total shared memory store instructions
      (from the smsp__sass_inst_executed_op_shared_st.sum metric)
    - launch_sm_count: Number of SMs that launched blocks
      (from the launch__sm_count metric)
- n: Matches the 'n' parameter from benchmark_avg_insts(n)

Args:
ld_insts: Total shared memory load instructions
st_insts: Total shared memory store instructions
launch_sm_count: Number of SMs that launched blocks
n: Matrix size (n x n) - parameter from the decorated benchmark function

Returns:
Average shared memory load/store instructions per SM
"""
insts_per_sm = (ld_insts + st_insts) / launch_sm_count
return insts_per_sm


@nsight.analyze.plot(
filename="09_advanced_metric_custom.png",
ylabel="Average Shared Memory Load/Store Instructions per SM", # Custom y-axis label
annotate_points=True, # Show values on the plot
)
@nsight.analyze.kernel(
configs=sizes,
runs=10,
derive_metric=compute_avg_insts, # Use custom metric
metrics=[
"smsp__sass_inst_executed_op_shared_ld.sum",
"smsp__sass_inst_executed_op_shared_st.sum",
"launch__sm_count",
],
)
def benchmark_avg_insts(n: int) -> None:
"""
Benchmark matmul and display results.
"""
a = torch.randn(n, n, device="cuda")
b = torch.randn(n, n, device="cuda")

with nsight.annotate("matmul"):
_ = a @ b


def main() -> None:
result = benchmark_avg_insts()
print(result.to_dataframe())
print("✓ Avg Insts benchmark complete! Check '09_advanced_metric_custom.png'")


if __name__ == "__main__":
main()
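
To make the argument-order contract concrete, a minimal sketch with
hypothetical stand-in values (not real profiler output): the measured values
arrive positionally in the order of the `metrics` list, followed by the
benchmark's own parameters.

# Hypothetical stand-in values for the three metrics above, in order:
ld, st, sms = 1.2e6, 6.0e5, 108
# The profiler would invoke the derive function roughly like this:
value = compute_avg_insts(ld, st, sms, n=1024)
print(value)  # (1.2e6 + 6.0e5) / 108 ≈ 16666.7 instructions per SM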
63 changes: 63 additions & 0 deletions examples/10_combine_kernel_metrics.py
@@ -0,0 +1,63 @@
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Example 10: Multiple Kernels per Run with Combined Metrics
===========================================================

This example shows how to profile multiple kernels in a single run and combine their metrics.

New concepts:
- Using `combine_kernel_metrics` to aggregate metrics from multiple kernels
- Summing metrics from consecutive kernel executions
"""

import torch

import nsight

# Define configuration sizes
sizes = [(2**i,) for i in range(10, 13)]


@nsight.analyze.plot(
filename="10_combine_kernel_metrics.png",
ylabel="Total Cycles (Sum of 3 Kernels)",
annotate_points=True,
)
@nsight.analyze.kernel(
configs=sizes,
runs=7,
combine_kernel_metrics=lambda x, y: x + y, # Sum metrics from multiple kernels
metrics=[
"sm__cycles_elapsed.avg",
],
)
def benchmark_multiple_kernels(n: int) -> None:
"""
Benchmark three matrix multiplications in a single run.

Executes three matmul operations within one profiled context,
demonstrating metric combination across kernels.

Args:
n: Matrix size (n x n)
"""
a = torch.randn(n, n, device="cuda")
b = torch.randn(n, n, device="cuda")

with nsight.annotate("test"):
# Three consecutive kernel executions
_ = a @ b # Kernel 1
_ = a @ b # Kernel 2
_ = a @ b # Kernel 3


def main() -> None:
result = benchmark_multiple_kernels()
print(result.to_dataframe())
print("\n✓ Total Cycles benchmark complete! Check '10_combine_kernel_metrics.png'")


if __name__ == "__main__":
main()
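
A minimal sketch of how a pairwise combiner folds per-kernel values, assuming
it is applied left to right across the kernels in the annotated region
(hypothetical numbers, not real measurements):

from functools import reduce

# Three hypothetical per-kernel cycle counts from the annotated region:
per_kernel_cycles = [1.0e7, 1.2e7, 0.8e7]
total = reduce(lambda x, y: x + y, per_kernel_cycles)
print(total)  # 3.0e7, i.e. the "Total Cycles (Sum of 3 Kernels)" value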