From 4b30cea96b9f5b084c9d06ee6a829bb36ccac1d4 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Sun, 27 Jul 2025 19:51:08 -0400 Subject: [PATCH] Add target_feature support for compute_* This lets us gate code to virtual architectures at compile time using `cfg()`. --- crates/cuda_builder/src/lib.rs | 22 +++ crates/nvvm/Cargo.toml | 1 + crates/nvvm/src/lib.rs | 247 +++++++++++++++++++++++- crates/rustc_codegen_nvvm/src/lib.rs | 54 +++++- guide/src/SUMMARY.md | 1 + guide/src/guide/compute_capabilities.md | 207 ++++++++++++++++++++ guide/src/guide/getting_started.md | 12 ++ 7 files changed, 538 insertions(+), 6 deletions(-) create mode 100644 guide/src/guide/compute_capabilities.md diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index adab4a52..766413d1 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -93,6 +93,23 @@ pub struct CudaBuilder { /// the GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because /// Maxwell (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, /// `6.x` contains support for things like f64 atomic add and half precision float ops. + /// + /// ## Target Features for Conditional Compilation + /// + /// The chosen architecture enables a target feature that can be used for + /// conditional compilation with `#[cfg(target_feature = "compute_XX")]`. + /// This feature means "at least this capability", matching NVIDIA's semantics. + /// + /// For other patterns (exact ranges, maximum capabilities), use boolean `cfg` logic. + /// See the compute capabilities guide for examples. + /// + /// For example, with `.arch(NvvmArch::Compute61)`: + /// ```ignore + /// #[cfg(target_feature = "compute_61")] + /// { + /// // Code that requires compute capability 6.1+ + /// } + /// ``` pub arch: NvvmArch, /// Flush denormal values to zero when performing single-precision floating point operations. /// `false` by default. 
@@ -229,6 +246,11 @@ impl CudaBuilder { /// NOTE that this does not necessarily mean that code using a certain capability /// will not work on older capabilities. It means that if it uses certain /// features it may not work. + /// + /// ## Target Features for Conditional Compilation + /// + /// The chosen architecture enables target features for conditional compilation. + /// See the documentation on the `arch` field for more details. pub fn arch(mut self, arch: NvvmArch) -> Self { self.arch = arch; self diff --git a/crates/nvvm/Cargo.toml b/crates/nvvm/Cargo.toml index f45fd131..b8e2d741 100644 --- a/crates/nvvm/Cargo.toml +++ b/crates/nvvm/Cargo.toml @@ -10,3 +10,4 @@ readme = "../../README.md" [dependencies] cust_raw = { path = "../cust_raw", default-features = false, features = ["nvvm"] } +strum = { version = "0.27", features = ["derive"] } diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index f8f4c9f8..ac8b8e50 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -8,6 +8,8 @@ use std::{ str::FromStr, }; +use strum::IntoEnumIterator; + use cust_raw::nvvm_sys; pub use cust_raw::nvvm_sys::LIBDEVICE_BITCODE; @@ -255,6 +257,10 @@ impl FromStr for NvvmOption { "72" => NvvmArch::Compute72, "75" => NvvmArch::Compute75, "80" => NvvmArch::Compute80, + "86" => NvvmArch::Compute86, + "87" => NvvmArch::Compute87, + "89" => NvvmArch::Compute89, + "90" => NvvmArch::Compute90, _ => return Err("unknown arch"), }; Self::Arch(arch) @@ -265,7 +271,7 @@ impl FromStr for NvvmOption { } /// Nvvm architecture, default is `Compute52` -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::EnumIter)] pub enum NvvmArch { Compute35, Compute37, @@ -279,6 +285,10 @@ pub enum NvvmArch { Compute72, Compute75, Compute80, + Compute86, + Compute87, + Compute89, + Compute90, } impl Display for NvvmArch { @@ -295,6 +305,53 @@ impl Default for NvvmArch { } } +impl NvvmArch { + /// Get the numeric capability value (e.g., 35 for 
Compute35) + pub fn capability_value(&self) -> u32 { + match self { + Self::Compute35 => 35, + Self::Compute37 => 37, + Self::Compute50 => 50, + Self::Compute52 => 52, + Self::Compute53 => 53, + Self::Compute60 => 60, + Self::Compute61 => 61, + Self::Compute62 => 62, + Self::Compute70 => 70, + Self::Compute72 => 72, + Self::Compute75 => 75, + Self::Compute80 => 80, + Self::Compute86 => 86, + Self::Compute87 => 87, + Self::Compute89 => 89, + Self::Compute90 => 90, + } + } + + /// Get the target feature string (e.g., "compute_35" for Compute35) + pub fn target_feature(&self) -> String { + let cap = self.capability_value(); + format!("compute_{cap}") + } + + /// Get all target features up to and including this architecture. + /// This ensures that `cfg(target_feature = "compute_50")` works on compute_60+ devices. + pub fn all_target_features(&self) -> Vec<String> { + let current = self.capability_value(); + + NvvmArch::iter() + .filter(|arch| arch.capability_value() <= current) + .map(|arch| arch.target_feature()) + .collect() + } + + /// Create an iterator over all architectures from Compute35 up to and including this one + pub fn iter_up_to(&self) -> impl Iterator<Item = NvvmArch> { + let current = self.capability_value(); + NvvmArch::iter().filter(move |arch| arch.capability_value() <= current) + } +} + pub struct NvvmProgram { raw: nvvm_sys::nvvmProgram, } @@ -409,6 +466,194 @@ impl NvvmProgram { mod tests { use std::str::FromStr; + #[test] + fn nvvm_arch_capability_value() { + use crate::NvvmArch; + + assert_eq!(NvvmArch::Compute35.capability_value(), 35); + assert_eq!(NvvmArch::Compute37.capability_value(), 37); + assert_eq!(NvvmArch::Compute50.capability_value(), 50); + assert_eq!(NvvmArch::Compute52.capability_value(), 52); + assert_eq!(NvvmArch::Compute53.capability_value(), 53); + assert_eq!(NvvmArch::Compute60.capability_value(), 60); + assert_eq!(NvvmArch::Compute61.capability_value(), 61); + assert_eq!(NvvmArch::Compute62.capability_value(), 62); + 
assert_eq!(NvvmArch::Compute70.capability_value(), 70); + assert_eq!(NvvmArch::Compute72.capability_value(), 72); + assert_eq!(NvvmArch::Compute75.capability_value(), 75); + assert_eq!(NvvmArch::Compute80.capability_value(), 80); + assert_eq!(NvvmArch::Compute86.capability_value(), 86); + assert_eq!(NvvmArch::Compute87.capability_value(), 87); + assert_eq!(NvvmArch::Compute89.capability_value(), 89); + assert_eq!(NvvmArch::Compute90.capability_value(), 90); + } + + #[test] + fn nvvm_arch_target_feature_format() { + use crate::NvvmArch; + + assert_eq!(NvvmArch::Compute35.target_feature(), "compute_35"); + assert_eq!(NvvmArch::Compute61.target_feature(), "compute_61"); + assert_eq!(NvvmArch::Compute90.target_feature(), "compute_90"); + } + + #[test] + fn nvvm_arch_all_target_features_includes_lower_capabilities() { + use crate::NvvmArch; + + // Compute35 only includes itself + let compute35_features = NvvmArch::Compute35.all_target_features(); + assert_eq!(compute35_features, vec!["compute_35"]); + + // Compute50 includes all lower capabilities + let compute50_features = NvvmArch::Compute50.all_target_features(); + assert_eq!( + compute50_features, + vec!["compute_35", "compute_37", "compute_50"] + ); + + // Compute61 includes all lower capabilities + let compute61_features = NvvmArch::Compute61.all_target_features(); + assert_eq!( + compute61_features, + vec![ + "compute_35", + "compute_37", + "compute_50", + "compute_52", + "compute_53", + "compute_60", + "compute_61" + ] + ); + + // Compute90 includes all capabilities + let compute90_features = NvvmArch::Compute90.all_target_features(); + assert_eq!( + compute90_features, + vec![ + "compute_35", + "compute_37", + "compute_50", + "compute_52", + "compute_53", + "compute_60", + "compute_61", + "compute_62", + "compute_70", + "compute_72", + "compute_75", + "compute_80", + "compute_86", + "compute_87", + "compute_89", + "compute_90" + ] + ); + } + + #[test] + fn 
target_feature_synthesis_supports_conditional_compilation_patterns() { + use crate::NvvmArch; + + // When targeting Compute61, should enable all lower capabilities + let features = NvvmArch::Compute61.all_target_features(); + + // Should enable compute_60 (for f64 atomics) + assert!(features.contains(&"compute_60".to_string())); + + // Should enable compute_50 (for 64-bit integer atomics) + assert!(features.contains(&"compute_50".to_string())); + + // Should enable compute_35 (baseline) + assert!(features.contains(&"compute_35".to_string())); + + // Should enable the target itself + assert!(features.contains(&"compute_61".to_string())); + + // Should NOT enable higher capabilities + assert!(!features.contains(&"compute_62".to_string())); + assert!(!features.contains(&"compute_70".to_string())); + } + + #[test] + fn target_feature_synthesis_enables_correct_cfg_patterns() { + use crate::NvvmArch; + + // Test that targeting Compute70 enables appropriate cfg patterns + let features = NvvmArch::Compute70.all_target_features(); + + // These should all be true for compute_70 target + let expected_enabled = [ + "compute_35", + "compute_37", + "compute_50", + "compute_52", + "compute_53", + "compute_60", + "compute_61", + "compute_62", + "compute_70", + ]; + + for feature in expected_enabled { + assert!( + features.contains(&feature.to_string()), + "Compute70 should enable {feature} for cfg(target_feature = \"{feature}\")" + ); + } + + // These should NOT be enabled for compute_70 target + let expected_disabled = ["compute_72", "compute_75", "compute_80", "compute_90"]; + + for feature in expected_disabled { + assert!( + !features.contains(&feature.to_string()), + "Compute70 should NOT enable {feature}" + ); + } + } + + #[test] + fn nvvm_arch_iter_up_to_includes_only_lower_or_equal() { + use crate::NvvmArch; + + // Compute35 only includes itself + let archs: Vec<_> = NvvmArch::Compute35.iter_up_to().collect(); + assert_eq!(archs, vec![NvvmArch::Compute35]); + + // Compute52 
includes all up to 52 + let archs: Vec<_> = NvvmArch::Compute52.iter_up_to().collect(); + assert_eq!( + archs, + vec![ + NvvmArch::Compute35, + NvvmArch::Compute37, + NvvmArch::Compute50, + NvvmArch::Compute52, + ] + ); + + // Compute75 includes all up to 75 + let archs: Vec<_> = NvvmArch::Compute75.iter_up_to().collect(); + assert_eq!( + archs, + vec![ + NvvmArch::Compute35, + NvvmArch::Compute37, + NvvmArch::Compute50, + NvvmArch::Compute52, + NvvmArch::Compute53, + NvvmArch::Compute60, + NvvmArch::Compute61, + NvvmArch::Compute62, + NvvmArch::Compute70, + NvvmArch::Compute72, + NvvmArch::Compute75, + ] + ); + } + #[test] fn options_parse_correctly() { use crate::NvvmArch::*; diff --git a/crates/rustc_codegen_nvvm/src/lib.rs b/crates/rustc_codegen_nvvm/src/lib.rs index b0c2da34..a0656a69 100644 --- a/crates/rustc_codegen_nvvm/src/lib.rs +++ b/crates/rustc_codegen_nvvm/src/lib.rs @@ -61,7 +61,7 @@ use lto::ThinBuffer; use rustc_ast::expand::allocator::AllocatorKind; use rustc_ast::expand::autodiff_attrs::AutoDiffItem; use rustc_codegen_ssa::{ - CodegenResults, CompiledModule, ModuleCodegen, + CodegenResults, CompiledModule, ModuleCodegen, TargetConfig, back::{ lto::{LtoModuleCodegen, SerializedModule, ThinModule}, write::{CodegenContext, FatLtoInput, ModuleConfig, OngoingCodegen}, @@ -131,10 +131,28 @@ impl CodegenBackend for NvvmCodegenBackend { } fn provide(&self, providers: &mut Providers) { - // FIXME(eddyb) this is currently only passed back to us, specifically - // into `target_machine_factory` (which is a noop), but it might make - // sense to move some of the target feature parsing into here. - providers.global_backend_features = |_tcx, ()| vec![]; + // Synthesize compute capability target features from the architecture specified in llvm-args. + // This enables code to use `#[cfg(target_feature = "compute_60")]` etc. for conditional compilation. 
+ // Following NVIDIA semantics, we enable "at least this capability" matching - for example, + // when targeting compute_70, we also enable compute_60, compute_50, and all lower capabilities. + // This allows libraries to gate features based on minimum required compute capability. + providers.global_backend_features = |tcx, ()| { + let mut features = vec![]; + + // Parse CodegenArgs to get the architecture from llvm-args (e.g., "-arch=compute_70") + let args = context::CodegenArgs::from_session(tcx.sess); + + // Find the architecture option and synthesize all implied features + for opt in &args.nvvm_options { + if let ::nvvm::NvvmOption::Arch(arch) = opt { + // Add all features up to and including the current architecture + features.extend(arch.all_target_features()); + break; + } + } + + features + }; providers.fn_abi_of_fn_ptr = |tcx, key| { let result = (rustc_interface::DEFAULT_QUERY_PROVIDERS.fn_abi_of_fn_ptr)(tcx, key); @@ -192,6 +210,32 @@ impl CodegenBackend for NvvmCodegenBackend { metadata, ); } + + fn target_config(&self, sess: &Session) -> TargetConfig { + // Parse target features from command line + let cmdline = sess.opts.cg.target_feature.split(','); + let cfg = sess.target.options.features.split(','); + + let target_features: Vec<_> = cfg + .chain(cmdline) + .filter(|l| l.starts_with('+')) + .map(|l| &l[1..]) + .filter(|l| !l.is_empty()) + .map(rustc_span::Symbol::intern) + .collect(); + + // For NVPTX, all target features are stable + let unstable_target_features = target_features.clone(); + + TargetConfig { + target_features, + unstable_target_features, + has_reliable_f16: false, + has_reliable_f16_math: false, + has_reliable_f128: false, + has_reliable_f128_math: false, + } + } } impl WriteBackendMethods for NvvmCodegenBackend { diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md index 9d5b24f9..6a6e420c 100644 --- a/guide/src/SUMMARY.md +++ b/guide/src/SUMMARY.md @@ -5,6 +5,7 @@ - [Frequently Asked Questions](faq.md) - 
[Guide](guide/README.md) - [Getting Started](guide/getting_started.md) + - [Compute Capability Gating](guide/compute_capabilities.md) - [Tips](guide/tips.md) - [Kernel ABI](guide/kernel_abi.md) - [Safety](guide/safety.md) diff --git a/guide/src/guide/compute_capabilities.md b/guide/src/guide/compute_capabilities.md new file mode 100644 index 00000000..cf7ea293 --- /dev/null +++ b/guide/src/guide/compute_capabilities.md @@ -0,0 +1,207 @@ +# Compute Capability Gating + +This section covers how to write code that adapts to different CUDA compute capabilities +using conditional compilation. + +## What are Compute Capabilities? + +CUDA GPUs have different "compute capabilities" that determine which features they +support. Each capability is identified by a version number like `3.5`, `5.0`, `6.1`, +`7.5`, etc. Higher numbers generally mean more features are available. + +For example: + +- Compute capability 5.0+ supports 64-bit integer min/max and bitwise atomic operations +- Compute capability 6.0+ supports double-precision (f64) atomic operations +- Compute capability 7.0+ supports tensor core operations + +For comprehensive details, see [NVIDIA's CUDA documentation on GPU architectures](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-compilation). + +## Virtual vs Real Architectures + +In CUDA terminology: + +- **Virtual architectures** (`compute_XX`) define the PTX instruction set and available + features +- **Real architectures** (`sm_XX`) represent actual GPU hardware + +rust-cuda works exclusively with virtual architectures since it only generates PTX. The +`NvvmArch::ComputeXX` enum values correspond to CUDA's virtual architectures. + +## Using Target Features + +When you build a CUDA kernel with `cuda_builder`, the architecture you choose (e.g., +`NvvmArch::Compute61`) enables target features that you can use for conditional compilation. + +These features follow the pattern `compute_XX` where XX is the capability number without +the decimal point. 
The enabled feature means "at least this capability", matching +NVIDIA's semantics. + +### Example: Basic Usage + +```rust +use cuda_builder::CudaBuilder; + +fn main() { + CudaBuilder::new("kernels") + .arch(NvvmArch::Compute61) // Target compute capability 6.1+ + .build() + .unwrap(); +} +``` + +This enables the `compute_61` target feature along with every lower capability feature, meaning the code requires +at least compute capability 6.1. + +For other targeting patterns (exact ranges, maximum capabilities), use boolean +`cfg` logic as shown in the examples below. + +### Manual Compilation (Without CudaBuilder) + +If you're invoking `rustc` directly instead of using `cuda_builder`, you only need to specify the architecture through LLVM args: + +```bash +rustc --target nvptx64-nvidia-cuda \ + -C llvm-args=-arch=compute_61 \ + -Z codegen-backend=/path/to/librustc_codegen_nvvm.so \ + ... +``` + +Or with cargo: + +```bash +export RUSTFLAGS="-C llvm-args=-arch=compute_61 -Z codegen-backend=/path/to/librustc_codegen_nvvm.so" +cargo build --target nvptx64-nvidia-cuda +``` + +The codegen backend automatically synthesizes all appropriate target features based on the architecture. For example, targeting `compute_61` will enable `compute_35`, `compute_37`, `compute_50`, `compute_52`, `compute_53`, `compute_60`, and `compute_61` features for conditional compilation. + +## Conditional Compilation in Kernels + +You can use `#[cfg(target_feature = "compute_XX")]` to conditionally compile code based on the available compute capabilities. With boolean logic, you can express any capability range you need.
+ +### Common Patterns + +#### At Least a Capability (Default) +```rust,no_run +// Code that requires compute 6.0 or higher +#[cfg(target_feature = "compute_60")] +{ + cuda_std::atomic::atomic_add(data, 1.0); // f64 atomics need 6.0+ +} +``` + +#### Exactly One Capability +```rust,no_run +// Code that targets exactly compute 5.0 (not 5.2+) +#[cfg(all(target_feature = "compute_50", not(target_feature = "compute_52")))] +{ + // Optimizations specific to compute 5.0 +} + +// Code that targets exactly compute 6.1 (not 6.2+) +#[cfg(all(target_feature = "compute_61", not(target_feature = "compute_62")))] +{ + // Features specific to compute 6.1 +} +``` + +#### Up To a Maximum Capability +```rust,no_run +// Code that works on compute 5.0 and below (not 5.2+) +#[cfg(all(target_feature = "compute_35", not(target_feature = "compute_52")))] +{ + // Fallback implementation for older GPUs +} + +// Code that works up to compute 6.0 (not 6.1+) +#[cfg(all(target_feature = "compute_35", not(target_feature = "compute_61")))] +{ + // Maximum compatibility implementation +} +``` + +#### Capability Ranges +```rust,no_run +// Code that works on compute 5.0 through 7.0 (not 7.2+) +#[cfg(all(target_feature = "compute_50", not(target_feature = "compute_72")))] +{ + // Features available in this range +} +``` + +### Complete Example + +```rust,no_run +use cuda_std::*; + +#[kernel] +pub unsafe fn adaptive_kernel(data: *mut f64) { + // This code only compiles when targeting compute 6.0 or higher + #[cfg(target_feature = "compute_60")] + { + // f64 atomics are only available on compute 6.0+ + cuda_std::atomic::atomic_add(data, 1.0); + } + + // Fallback for older GPUs + #[cfg(not(target_feature = "compute_60"))] + { + // Manual implementation or alternative approach + } +} +``` + +## Best Practices + +### 1. Choose the Lowest Viable Architecture + +Select the lowest compute capability that provides the features you need. 
This maximizes GPU compatibility: + +```rust,no_run +// If you only need basic atomics +.arch(NvvmArch::Compute35) + +// If you need 64-bit integer atomics +.arch(NvvmArch::Compute50) + +// If you need f64 atomics +.arch(NvvmArch::Compute60) +``` + +### 2. Provide Fallbacks When Possible + +For maximum compatibility, provide alternative implementations for older GPUs: + +```rust,no_run +#[cfg(target_feature = "compute_50")] +fn fast_path(data: *mut u64) { + // Use hardware atomic + atomic_min(data, 100); +} + +#[cfg(not(target_feature = "compute_50"))] +fn fast_path(data: *mut u64) { + // Software fallback +} +``` + +## Debugging Capability Issues + +If you encounter errors about missing functions or features: + +1. Check the compute capability you're targeting in `cuda_builder` +2. Verify your GPU supports the features you're using +3. Use `nvidia-smi` to check your GPU's compute capability +4. Add appropriate `#[cfg]` guards or increase the target architecture + +## Runtime Behavior + +Again, rust-cuda **only generates PTX**, not pre-compiled GPU binaries +("[fatbinaries](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#fatbinaries)"). +This PTX is then JIT-compiled by the CUDA driver at _runtime_. + +For more details, see [NVIDIA's documentation on GPU +compilation](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-compilation) +and [JIT +compilation](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#just-in-time-compilation). diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md index 241f9b00..e61cff4f 100644 --- a/guide/src/guide/getting_started.md +++ b/guide/src/guide/getting_started.md @@ -196,6 +196,18 @@ inside of `target/cuda-builder/nvptx64-nvidia-cuda/release/crate_name.ptx`, but what such method does. Finally, `build()` actually runs rustc to compile the crate. 
This may take a while since it needs to build things like core from scratch, but after the first compile, incremental will make it much faster. +You can also specify a different compute capability with `.arch()`: + +```rs +CudaBuilder::new("path/to/gpu/crate") + .arch(cuda_builder::NvvmArch::Compute75) // Target compute 7.5 GPUs + .copy_to("kernel.ptx") + .build() + .unwrap(); +``` + +The architecture you choose affects which GPU features are available. See the [Compute Capability Gating](./compute_capabilities.md) guide for details on writing code that adapts to different GPU capabilities. + Finally, you can include the PTX as a static string in your program: ```rs