NVIDIA
diff --git a/.gitignore b/.gitignore
Lines changed: 7 additions & 1 deletion
diff --git a/docs/source/overview/architecture.rst b/docs/source/overview/architecture.rst
Lines changed: 14 additions & 3 deletions
diff --git a/examples/01_compare_throughput.py b/examples/01_compare_throughput.py
Lines changed: 2 additions & 2 deletions
diff --git a/examples/02_parameter_sweep.py b/examples/02_parameter_sweep.py
Lines changed: 3 additions & 1 deletion
diff --git a/examples/03_custom_metrics.py b/examples/03_custom_metrics.py
Lines changed: 2 additions & 1 deletion
diff --git a/examples/08_multiple_metrics.py b/examples/08_multiple_metrics.py
Lines changed: 80 additions & 0 deletions
diff --git a/examples/09_advanced_metric_custom.py b/examples/09_advanced_metric_custom.py
Lines changed: 87 additions & 0 deletions
diff --git a/examples/10_combine_kernel_metrics.py b/examples/10_combine_kernel_metrics.py
Lines changed: 63 additions & 0 deletions
@@ -64,4 +64,10 @@ Thumbs.db
 html/
 
 # Test report
-report.xml
+report.xml
+
+# Logs / NCU Reps / PNGs / CSVs
+*.log
+*.ncu-rep
+*.png
+*.csv
@@ -21,12 +21,23 @@ Advanced Options
 ----------------
 
 **Metric Selection**  
-Nsight Python collects `gpu__time_duration.sum` by default. To collect another NVIDIA Nsight Compute metric:
+Nsight Python collects `gpu__time_duration.sum` by default. To collect other NVIDIA Nsight Compute metrics:
 
 .. code-block:: python
 
-   @nsight.analyze.kernel(metric="sm__throughput.avg.pct_of_peak_sustained_elapsed")
-   def benchmark(...):
+   @nsight.analyze.kernel(metrics=["sm__throughput.avg.pct_of_peak_sustained_elapsed"])
+   def benchmark1(...):
+       ...
+
+   # or
+   @nsight.analyze.kernel(
+       metrics=[
+           "smsp__sass_inst_executed_op_shared_ld.sum",
+           "smsp__sass_inst_executed_op_shared_st.sum",
+           "launch__sm_count",
+       ],
+   )
+   def benchmark2(...):
        ...
 
 **Derived Metrics**  
 
@@ -10,7 +10,7 @@
 New concepts:
 - Multiple `nsight.annotate()` blocks to profile different kernels
 - Using `@nsight.annotate()` as a function decorator (alternative to context manager)
-- Using the `metric` parameter to collect a specific Nsight Compute metric (DRAM throughput instead of execution time)
+- Using the `metrics` parameter to collect a specific Nsight Compute metric (DRAM throughput instead of execution time)
 - Using `print_data=True` to print the collected dataframe to the terminal
 """
 
@@ -33,7 +33,7 @@ def einsum_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
 @nsight.analyze.kernel(
     runs=10,
     # Collect DRAM throughput as percentage of peak instead of time
-    metric="dram__throughput.avg.pct_of_peak_sustained_elapsed",
+    metrics=["dram__throughput.avg.pct_of_peak_sustained_elapsed"],
 )
 def benchmark_matmul_throughput(n: int) -> None:
     """
 
@@ -36,7 +36,9 @@ def benchmark_matmul_sizes(n: int) -> None:
 
 
 def main() -> None:
-    benchmark_matmul_sizes()  # notice no n parameter is passed, it is passed in the configs list instead
+    # notice no n parameter is passed, it is passed in the configs list instead
+    result = benchmark_matmul_sizes()
+    print(result.to_dataframe())
     print("✓ Benchmark complete! Check '02_parameter_sweep.png'")
 
 
 
@@ -69,7 +69,8 @@ def benchmark_tflops(n: int) -> None:
 
 
 def main() -> None:
-    benchmark_tflops()
+    result = benchmark_tflops()
+    print(result.to_dataframe())
     print("✓ TFLOPs benchmark complete! Check '03_custom_metrics.png'")
 
 
 
@@ -0,0 +1,80 @@
+# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Example 8: Collecting Multiple Metrics
+=======================================
+
+This example shows how to collect multiple metrics in a single profiling run.
+
+New concepts:
+- Using the `metrics` parameter to collect multiple metrics
+- The `@nsight.analyze.plot` decorator does NOT yet support multiple metrics
+"""
+
+import torch
+
+import nsight
+
+sizes = [(2**i,) for i in range(11, 13)]
+
+
+@nsight.analyze.kernel(
+    configs=sizes,
+    runs=5,
+    # Collect both shared memory load and store SASS instructions
+    metrics=[
+        "smsp__sass_inst_executed_op_shared_ld.sum",
+        "smsp__sass_inst_executed_op_shared_st.sum",
+    ],
+)
+def analyze_shared_memory_ops(n: int) -> None:
+    """Analyze both shared memory load and store SASS instructions
+    for different kernels.
+
+    Note: To evaluate multiple metrics, pass them as a sequence
+    (list/tuple). All results are merged into one ProfileResults
+    object, with the 'Metric' column indicating each specific metric.
+    """
+
+    a = torch.randn(n, n, device="cuda")
+    b = torch.randn(n, n, device="cuda")
+    c = torch.randn(2 * n, 2 * n, device="cuda")
+    d = torch.randn(2 * n, 2 * n, device="cuda")
+
+    with nsight.annotate("@-operator"):
+        _ = a @ b
+
+    with nsight.annotate("torch.matmul"):
+        _ = torch.matmul(c, d)
+
+
+def main() -> None:
+    # Run analysis with multiple metrics
+    results = analyze_shared_memory_ops()
+
+    df = results.to_dataframe()
+    print(df)
+
+    unique_metrics = df["Metric"].unique()
+    print(f"\n✓ Collected {len(unique_metrics)} metrics:")
+    for metric in unique_metrics:
+        print(f"  - {metric}")
+
+    print("\n✓ Sample data:")
+    print(df[["Annotation", "n", "Metric", "AvgValue"]].to_string(index=False))
+
+    print("\n" + "=" * 60)
+    print("IMPORTANT: @plot decorator limitation")
+    print("=" * 60)
+    print("When multiple metrics are collected:")
+    print("  ✓ All metrics are collected in a single ProfileResults object")
+    print("  ✓ DataFrame has 'Metric' column to distinguish them")
+    print("  ✗ @nsight.analyze.plot decorator will RAISE AN ERROR")
+    print("    Why? @plot can only visualize one metric at a time.")
+    print("    Tip: Use separate @kernel functions for each metric or use")
+    print("         'derive_metric' to compute custom values.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,87 @@
+# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Example 9: Advanced Custom Metrics from Multiple Metrics
+=========================================================
+
+This example shows how to compute custom metrics from multiple metrics.
+
+New concepts:
+- Using `derive_metric` to compute custom values from multiple metrics
+"""
+
+import torch
+
+import nsight
+
+sizes = [(2**i,) for i in range(10, 13)]
+
+
+def compute_avg_insts(
+    ld_insts: int, st_insts: int, launch_sm_count: int, n: int
+) -> float:
+    """
+    Compute average shared memory load/store instructions per SM.
+
+    Custom metric function signature:
+    - Leading arguments: one per measured metric, in the same order as
+      the `metrics` list passed to the @kernel decorator
+    - Remaining arguments: must match the decorated function's signature
+
+    In this example:
+    - ld_insts: Total shared memory load instructions
+                (from smsp__sass_inst_executed_op_shared_ld.sum metric)
+    - st_insts: Total shared memory store instructions
+                (from smsp__sass_inst_executed_op_shared_st.sum metric)
+    - launch_sm_count: Number of SMs that launched blocks
+                (from launch__sm_count metric)
+    - n: Matches the 'n' parameter from benchmark_avg_insts(n)
+
+    Args:
+        ld_insts: Total shared memory load instructions
+        st_insts: Total shared memory store instructions
+        launch_sm_count: Number of SMs that launched blocks
+        n: Matrix size (n x n) - parameter from the decorated benchmark function
+
+    Returns:
+        Average shared memory load/store instructions per SM
+    """
+    insts_per_sm = (ld_insts + st_insts) / launch_sm_count
+    return insts_per_sm
+
+
+@nsight.analyze.plot(
+    filename="09_advanced_metric_custom.png",
+    ylabel="Average Shared Memory Load/Store Instructions per SM",  # Custom y-axis label
+    annotate_points=True,  # Show values on the plot
+)
+@nsight.analyze.kernel(
+    configs=sizes,
+    runs=10,
+    derive_metric=compute_avg_insts,  # Use custom metric
+    metrics=[
+        "smsp__sass_inst_executed_op_shared_ld.sum",
+        "smsp__sass_inst_executed_op_shared_st.sum",
+        "launch__sm_count",
+    ],
+)
+def benchmark_avg_insts(n: int) -> None:
+    """
+    Benchmark matmul and display results.
+    """
+    a = torch.randn(n, n, device="cuda")
+    b = torch.randn(n, n, device="cuda")
+
+    with nsight.annotate("matmul"):
+        _ = a @ b
+
+
+def main() -> None:
+    result = benchmark_avg_insts()
+    print(result.to_dataframe())
+    print("✓ Avg Insts benchmark complete! Check '09_advanced_metric_custom.png'")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,63 @@
+# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Example 10: Multiple Kernels per Run with Combined Metrics
+===========================================================
+
+This example shows how to profile multiple kernels in a single run and combine their metrics.
+
+New concepts:
+- Using `combine_kernel_metrics` to aggregate metrics from multiple kernels
+- Summing metrics from consecutive kernel executions
+"""
+
+import torch
+
+import nsight
+
+# Define configuration sizes
+sizes = [(2**i,) for i in range(10, 13)]
+
+
+@nsight.analyze.plot(
+    filename="10_combine_kernel_metrics.png",
+    ylabel="Total Cycles (Sum of 3 Kernels)",
+    annotate_points=True,
+)
+@nsight.analyze.kernel(
+    configs=sizes,
+    runs=7,
+    combine_kernel_metrics=lambda x, y: x + y,  # Sum metrics from multiple kernels
+    metrics=[
+        "sm__cycles_elapsed.avg",
+    ],
+)
+def benchmark_multiple_kernels(n: int) -> None:
+    """
+    Benchmark three matrix multiplications in a single run.
+
+    Executes three matmul operations within one profiled context,
+    demonstrating metric combination across kernels.
+
+    Args:
+        n: Matrix size (n x n)
+    """
+    a = torch.randn(n, n, device="cuda")
+    b = torch.randn(n, n, device="cuda")
+
+    with nsight.annotate("test"):
+        # Three consecutive kernel executions
+        _ = a @ b  # Kernel 1
+        _ = a @ b  # Kernel 2
+        _ = a @ b  # Kernel 3
+
+
+def main() -> None:
+    result = benchmark_multiple_kernels()
+    print(result.to_dataframe())
+    print("\n✓ Total Cycles benchmark complete! Check '10_combine_kernel_metrics.png'")
+
+
+if __name__ == "__main__":
+    main()