From 22cf729bfc3b4b511bb4544b26192ff580c4db39 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Sun, 27 Jul 2025 22:59:18 -0400 Subject: [PATCH] Implement CUDA 12.9 architecture suffixes Add support for CUDA 12.9's family-specific ('f') and architecture-specific ('a') suffixes to NvvmArch enum. These suffixes provide different PTX compatibility modes as specified by NVIDIA. --- crates/cuda_builder/src/lib.rs | 79 ++-- crates/nvvm/src/lib.rs | 537 +++++++++++++++++++++++- guide/src/guide/compute_capabilities.md | 212 +++++----- 3 files changed, 686 insertions(+), 142 deletions(-) diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index 766413d1..3977b7a2 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -71,45 +71,72 @@ pub struct CudaBuilder { /// Whether to run libnvvm optimizations. This defaults to `false` /// but will be set to `true` if release is specified. pub nvvm_opts: bool, - /// The virtual compute architecture to target for PTX generation. This - /// dictates how certain things are codegenned and may affect performance - /// and/or which gpus the code can run on. + /// The virtual compute architecture to target for PTX generation. This dictates how + /// certain things are codegenned and may affect performance and/or which gpus the + /// code can run on. /// - /// You should generally try to pick an arch that will work with most - /// GPUs you want your program to work with. Make sure to also - /// use an appropriate compute arch if you are using recent features - /// such as tensor cores (which need at least 7.x). + /// You should generally try to pick an arch that will work with most GPUs you want + /// your program to work with. Make sure to also use an appropriate compute arch if + /// you are using recent features such as tensor cores (which need at least 7.x). /// - /// If you are unsure, either leave this option to default, or pick something around 5.2 to 7.x. 
+ /// If you are unsure, either leave this option to default, or pick something around + /// 5.2 to 7.x. /// - /// You can find a list of features supported on each arch and a list of GPUs for every - /// arch [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications). + /// You can find a list of features supported on each arch and a list of GPUs for + /// every arch + /// [`here`](https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications). /// /// NOTE that this does not necessarily mean that code using a certain capability - /// will not work on older capabilities. It means that if it uses certain - /// features it may not work. + /// will not work on older capabilities. It means that if it uses certain features + /// it may not work. + /// + /// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as the + /// GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because Maxwell + /// (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, `6.x` + /// contains support for things like f64 atomic add and half precision float ops. /// - /// This currently defaults to `6.1`. Which corresponds to Pascal, GPUs such as - /// the GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because - /// Maxwell (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, - /// `6.x` contains support for things like f64 atomic add and half precision float ops. + /// Starting with CUDA 12.9, architectures can have suffixes: /// - /// ## Target Features for Conditional Compilation + /// - **No suffix** (e.g., `Compute70`): Forward-compatible across all future GPUs. + /// Best for general compatibility. + /// - **'f' suffix** (e.g., `Compute100f`): Family-specific features, + /// forward-compatible within same major version (10.0, 10.3, etc.) but NOT across + /// major versions. 
+ /// - **'a' suffix** (e.g., `Compute100a`): Architecture-specific features (mainly + /// Tensor Cores). Code ONLY runs on that exact compute capability, no + /// compatibility with any other GPU. /// - /// The chosen architecture enables a target feature that can be used for - /// conditional compilation with `#[cfg(target_feature = "compute_XX")]`. - /// This feature means "at least this capability", matching NVIDIA's semantics. + /// Most applications should use base architectures (no suffix). Only use 'f' or 'a' + /// if you need specific features and understand the compatibility trade-offs. /// - /// For other patterns (exact ranges, maximum capabilities), use boolean `cfg` logic. - /// See the compute capabilities guide for examples. + /// The chosen architecture enables target features for conditional compilation: + /// - Base arch: `#[cfg(target_feature = "compute_70")]` - enabled on 7.0+ + /// - Family variant: `#[cfg(target_feature = "compute_100f")]` - enabled on 10.x family + /// with same or higher minor version + /// - Arch variant: `#[cfg(target_feature = "compute_100a")]` - enabled when building for + /// exactly 10.0 (includes all base and family features during compilation) /// /// For example, with `.arch(NvvmArch::Compute61)`: /// ```ignore /// #[cfg(target_feature = "compute_61")] /// { - /// // Code that requires compute capability 6.1+ + /// // Code that requires compute capability 6.1+ will be emitted because it matches + /// // the target architecture. + /// } + /// #[cfg(target_feature = "compute_51")] + /// { + /// // Code that requires compute capability 5.1 will be emitted + /// // because 6.1 is a superset of 5.1. + /// } + /// #[cfg(target_feature = "compute_71")] + /// { + /// // Code that requires compute capability 7.1 will NOT be emitted + /// // because the chosen arch (6.1) is not a superset of 7.1. 
/// } /// ``` + /// + /// See: + /// pub arch: NvvmArch, /// Flush denormal values to zero when performing single-precision floating point operations. /// `false` by default. @@ -234,9 +261,7 @@ impl CudaBuilder { /// and/or which gpus the code can run on. /// /// You should generally try to pick an arch that will work with most - /// GPUs you want your program to work with. Make sure to also - /// use an appropriate compute arch if you are using recent features - /// such as tensor cores (which need at least 7.x). + /// GPUs you want your program to work with. /// /// If you are unsure, either leave this option to default, or pick something around 5.2 to 7.x. /// @@ -247,8 +272,6 @@ impl CudaBuilder { /// will not work on older capabilities. It means that if it uses certain /// features it may not work. /// - /// ## Target Features for Conditional Compilation - /// /// The chosen architecture enables target features for conditional compilation. /// See the documentation on the `arch` field for more details. 
pub fn arch(mut self, arch: NvvmArch) -> Self { diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index ac8b8e50..70f81740 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -261,6 +261,22 @@ impl FromStr for NvvmOption { "87" => NvvmArch::Compute87, "89" => NvvmArch::Compute89, "90" => NvvmArch::Compute90, + "90a" => NvvmArch::Compute90a, + "100" => NvvmArch::Compute100, + "100f" => NvvmArch::Compute100f, + "100a" => NvvmArch::Compute100a, + "101" => NvvmArch::Compute101, + "101f" => NvvmArch::Compute101f, + "101a" => NvvmArch::Compute101a, + "103" => NvvmArch::Compute103, + "103f" => NvvmArch::Compute103f, + "103a" => NvvmArch::Compute103a, + "120" => NvvmArch::Compute120, + "120f" => NvvmArch::Compute120f, + "120a" => NvvmArch::Compute120a, + "121" => NvvmArch::Compute121, + "121f" => NvvmArch::Compute121f, + "121a" => NvvmArch::Compute121a, _ => return Err("unknown arch"), }; Self::Arch(arch) @@ -289,13 +305,35 @@ pub enum NvvmArch { Compute87, Compute89, Compute90, + Compute90a, + Compute100, + Compute100f, + Compute100a, + Compute101, + Compute101f, + Compute101a, + Compute103, + Compute103f, + Compute103a, + Compute120, + Compute120f, + Compute120a, + Compute121, + Compute121f, + Compute121a, } impl Display for NvvmArch { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut raw = format!("{self:?}").to_ascii_lowercase(); - raw.insert(7, '_'); - f.write_str(&raw) + let raw = format!("{self:?}").to_ascii_lowercase(); + // Handle architectures with suffixes (e.g., Compute90a -> compute_90a) + if let Some(pos) = raw.find(|c: char| c.is_ascii_digit()) { + let (prefix, rest) = raw.split_at(pos); + write!(f, "{prefix}_{rest}") + } else { + // Fallback for unexpected format + f.write_str(&raw) + } } } @@ -325,24 +363,140 @@ impl NvvmArch { Self::Compute87 => 87, Self::Compute89 => 89, Self::Compute90 => 90, + Self::Compute90a => 90, + Self::Compute100 => 100, + Self::Compute100f => 100, + Self::Compute100a => 
100, + Self::Compute101 => 101, + Self::Compute101f => 101, + Self::Compute101a => 101, + Self::Compute103 => 103, + Self::Compute103f => 103, + Self::Compute103a => 103, + Self::Compute120 => 120, + Self::Compute120f => 120, + Self::Compute120a => 120, + Self::Compute121 => 121, + Self::Compute121f => 121, + Self::Compute121a => 121, } } - /// Get the target feature string (e.g., "compute_35" for Compute35) + /// Get the major version number (e.g., 7 for Compute70) + pub fn major_version(&self) -> u32 { + self.capability_value() / 10 + } + + /// Get the minor version number (e.g., 5 for Compute75) + pub fn minor_version(&self) -> u32 { + self.capability_value() % 10 + } + + /// Get the target feature string (e.g., "compute_35" for Compute35, "compute_90a" for Compute90a) pub fn target_feature(&self) -> String { - let cap = self.capability_value(); - format!("compute_{cap}") + match self { + Self::Compute35 => "compute_35".to_string(), + Self::Compute37 => "compute_37".to_string(), + Self::Compute50 => "compute_50".to_string(), + Self::Compute52 => "compute_52".to_string(), + Self::Compute53 => "compute_53".to_string(), + Self::Compute60 => "compute_60".to_string(), + Self::Compute61 => "compute_61".to_string(), + Self::Compute62 => "compute_62".to_string(), + Self::Compute70 => "compute_70".to_string(), + Self::Compute72 => "compute_72".to_string(), + Self::Compute75 => "compute_75".to_string(), + Self::Compute80 => "compute_80".to_string(), + Self::Compute86 => "compute_86".to_string(), + Self::Compute87 => "compute_87".to_string(), + Self::Compute89 => "compute_89".to_string(), + Self::Compute90 => "compute_90".to_string(), + Self::Compute90a => "compute_90a".to_string(), + Self::Compute100 => "compute_100".to_string(), + Self::Compute100f => "compute_100f".to_string(), + Self::Compute100a => "compute_100a".to_string(), + Self::Compute101 => "compute_101".to_string(), + Self::Compute101f => "compute_101f".to_string(), + Self::Compute101a => 
"compute_101a".to_string(), + Self::Compute103 => "compute_103".to_string(), + Self::Compute103f => "compute_103f".to_string(), + Self::Compute103a => "compute_103a".to_string(), + Self::Compute120 => "compute_120".to_string(), + Self::Compute120f => "compute_120f".to_string(), + Self::Compute120a => "compute_120a".to_string(), + Self::Compute121 => "compute_121".to_string(), + Self::Compute121f => "compute_121f".to_string(), + Self::Compute121a => "compute_121a".to_string(), + } } /// Get all target features up to and including this architecture. - /// This ensures that `cfg(target_feature = "compute_50")` works on compute_60+ devices. + /// + /// # PTX Forward-Compatibility Rules (per NVIDIA documentation): + /// + /// - **No suffix** (compute_XX): PTX is forward-compatible across all future architectures. + /// Example: compute_70 runs on CC 7.0, 8.x, 9.x, 10.x, 12.x, and all future GPUs. + /// + /// - **Family-specific 'f' suffix** (compute_XXf): Forward-compatible within the same major + /// version family. Supports devices with same major CC and equal or higher minor CC. + /// Example: compute_100f runs on CC 10.0, 10.3, and future 10.x devices, but NOT on 11.x. + /// + /// - **Architecture-specific 'a' suffix** (compute_XXa): The code only runs on GPUs of that + /// specific CC and no others. No forward or backward compatibility whatsoever. + /// These features are primarily related to Tensor Core programming. + /// Example: compute_100a ONLY runs on CC 10.0, not on 10.3, 10.1, 9.0, or any other version. 
+ /// + /// For more details on family and architecture-specific features, see: + /// pub fn all_target_features(&self) -> Vec { - let current = self.capability_value(); + let mut features: Vec = if self.is_architecture_variant() { + // 'a' variants: include all available instructions for the architecture + // This means: all base variants up to same version, all 'f' variants with same major and <= minor, plus itself + let base_features: Vec = NvvmArch::iter() + .filter(|arch| { + arch.is_base_variant() && arch.capability_value() <= self.capability_value() + }) + .map(|arch| arch.target_feature()) + .collect(); + + let family_features: Vec = NvvmArch::iter() + .filter(|arch| { + arch.is_family_variant() + && arch.major_version() == self.major_version() + && arch.minor_version() <= self.minor_version() + }) + .map(|arch| arch.target_feature()) + .collect(); + + base_features + .into_iter() + .chain(family_features) + .chain(std::iter::once(self.target_feature())) + .collect() + } else if self.is_family_variant() { + // 'f' variants: same major version with equal or higher minor version + NvvmArch::iter() + .filter(|arch| { + // Include base variants with same major and >= minor version + arch.is_base_variant() + && arch.major_version() == self.major_version() + && arch.minor_version() >= self.minor_version() + }) + .map(|arch| arch.target_feature()) + .chain(std::iter::once(self.target_feature())) // Add the 'f' variant itself + .collect() + } else { + // Base variants: all base architectures from lower or equal versions + NvvmArch::iter() + .filter(|arch| { + arch.is_base_variant() && arch.capability_value() <= self.capability_value() + }) + .map(|arch| arch.target_feature()) + .collect() + }; - NvvmArch::iter() - .filter(|arch| arch.capability_value() <= current) - .map(|arch| arch.target_feature()) - .collect() + features.sort(); + features } /// Create an iterator over all architectures from Compute35 up to and including this one @@ -350,6 +504,88 @@ impl 
NvvmArch { let current = self.capability_value(); NvvmArch::iter().filter(move |arch| arch.capability_value() <= current) } + + /// Check if this architecture is a base variant (no suffix) + pub fn is_base_variant(&self) -> bool { + let feature = self.target_feature(); + // A base variant doesn't end with any letter suffix + !feature + .chars() + .last() + .is_some_and(|c| c.is_ascii_alphabetic()) + } + + /// Check if this architecture is a family-specific variant (f suffix) + /// Family-specific features are supported across devices within the same major compute capability + pub fn is_family_variant(&self) -> bool { + self.target_feature().ends_with('f') + } + + /// Check if this architecture is an architecture-specific variant (a suffix) + /// Architecture-specific features are locked to that exact compute capability only + pub fn is_architecture_variant(&self) -> bool { + self.target_feature().ends_with('a') + } + + /// Get the base architecture for this variant (strips f/a suffix if present) + pub fn base_architecture(&self) -> Self { + match self { + // Already base variants + Self::Compute35 + | Self::Compute37 + | Self::Compute50 + | Self::Compute52 + | Self::Compute53 + | Self::Compute60 + | Self::Compute61 + | Self::Compute62 + | Self::Compute70 + | Self::Compute72 + | Self::Compute75 + | Self::Compute80 + | Self::Compute86 + | Self::Compute87 + | Self::Compute89 + | Self::Compute90 + | Self::Compute100 + | Self::Compute101 + | Self::Compute103 + | Self::Compute120 + | Self::Compute121 => *self, + + // Family-specific variants + Self::Compute100f => Self::Compute100, + Self::Compute101f => Self::Compute101, + Self::Compute103f => Self::Compute103, + Self::Compute120f => Self::Compute120, + Self::Compute121f => Self::Compute121, + + // Architecture-specific variants + Self::Compute90a => Self::Compute90, + Self::Compute100a => Self::Compute100, + Self::Compute101a => Self::Compute101, + Self::Compute103a => Self::Compute103, + Self::Compute120a => 
Self::Compute120, + Self::Compute121a => Self::Compute121, + } + } + + /// Get all available variants for the same base architecture (including the base) + pub fn get_variants(&self) -> Vec { + let base = self.base_architecture(); + let base_value = base.capability_value(); + + NvvmArch::iter() + .filter(|arch| arch.capability_value() == base_value) + .collect() + } + + /// Get all available variants for a given capability value + pub fn variants_for_capability(capability: u32) -> Vec { + NvvmArch::iter() + .filter(|arch| arch.capability_value() == capability) + .collect() + } } pub struct NvvmProgram { @@ -489,12 +725,62 @@ mod tests { } #[test] - fn nvvm_arch_target_feature_format() { + fn nvvm_arch_major_minor_version() { + use crate::NvvmArch; + + // Test major/minor version extraction + assert_eq!(NvvmArch::Compute35.major_version(), 3); + assert_eq!(NvvmArch::Compute35.minor_version(), 5); + + assert_eq!(NvvmArch::Compute70.major_version(), 7); + assert_eq!(NvvmArch::Compute70.minor_version(), 0); + + assert_eq!(NvvmArch::Compute121.major_version(), 12); + assert_eq!(NvvmArch::Compute121.minor_version(), 1); + + // Suffixes don't affect version numbers + assert_eq!(NvvmArch::Compute100f.major_version(), 10); + assert_eq!(NvvmArch::Compute100f.minor_version(), 0); + + assert_eq!(NvvmArch::Compute90a.major_version(), 9); + assert_eq!(NvvmArch::Compute90a.minor_version(), 0); + } + + #[test] + fn nvvm_arch_target_feature_format_base_variants() { use crate::NvvmArch; + // Test base variants format assert_eq!(NvvmArch::Compute35.target_feature(), "compute_35"); assert_eq!(NvvmArch::Compute61.target_feature(), "compute_61"); assert_eq!(NvvmArch::Compute90.target_feature(), "compute_90"); + assert_eq!(NvvmArch::Compute100.target_feature(), "compute_100"); + assert_eq!(NvvmArch::Compute120.target_feature(), "compute_120"); + } + + #[test] + fn nvvm_arch_target_feature_format_family_variants() { + use crate::NvvmArch; + + // Test family ('f') variants format + 
assert_eq!(NvvmArch::Compute100f.target_feature(), "compute_100f"); + assert_eq!(NvvmArch::Compute101f.target_feature(), "compute_101f"); + assert_eq!(NvvmArch::Compute103f.target_feature(), "compute_103f"); + assert_eq!(NvvmArch::Compute120f.target_feature(), "compute_120f"); + assert_eq!(NvvmArch::Compute121f.target_feature(), "compute_121f"); + } + + #[test] + fn nvvm_arch_target_feature_format_architecture_variants() { + use crate::NvvmArch; + + // Test architecture ('a') variants format + assert_eq!(NvvmArch::Compute90a.target_feature(), "compute_90a"); + assert_eq!(NvvmArch::Compute100a.target_feature(), "compute_100a"); + assert_eq!(NvvmArch::Compute101a.target_feature(), "compute_101a"); + assert_eq!(NvvmArch::Compute103a.target_feature(), "compute_103a"); + assert_eq!(NvvmArch::Compute120a.target_feature(), "compute_120a"); + assert_eq!(NvvmArch::Compute121a.target_feature(), "compute_121a"); } #[test] @@ -505,14 +791,14 @@ mod tests { let compute35_features = NvvmArch::Compute35.all_target_features(); assert_eq!(compute35_features, vec!["compute_35"]); - // Compute50 includes all lower capabilities + // Compute50 includes all lower base capabilities let compute50_features = NvvmArch::Compute50.all_target_features(); assert_eq!( compute50_features, vec!["compute_35", "compute_37", "compute_50"] ); - // Compute61 includes all lower capabilities + // Compute61 includes all lower base capabilities let compute61_features = NvvmArch::Compute61.all_target_features(); assert_eq!( compute61_features, @@ -527,7 +813,72 @@ mod tests { ] ); - // Compute90 includes all capabilities + // Test 'a' variant - includes all available instructions for the architecture + // This means: all base variants up to same version, all 'f' variants with same major and <= minor, plus itself + let compute90a_features = NvvmArch::Compute90a.all_target_features(); + // Should include all base up to 90 + assert!(compute90a_features.contains(&"compute_35".to_string())); + 
assert!(compute90a_features.contains(&"compute_90".to_string())); + // Should include the 'a' variant itself + assert!(compute90a_features.contains(&"compute_90a".to_string())); + // Should NOT include any 'f' variants (90 has no 'f' variants) + + // Test compute100a - should include base variants, and 100f + let compute100a_features = NvvmArch::Compute100a.all_target_features(); + // Should include all base up to 100 + assert!(compute100a_features.contains(&"compute_90".to_string())); + assert!(compute100a_features.contains(&"compute_100".to_string())); + // Should include 100f (same major, <= minor) + assert!(compute100a_features.contains(&"compute_100f".to_string())); + // Should NOT include 101f or 103f (higher minor) + assert!(!compute100a_features.contains(&"compute_101f".to_string())); + assert!(!compute100a_features.contains(&"compute_103f".to_string())); + // Should include itself + assert!(compute100a_features.contains(&"compute_100a".to_string())); + + // Test compute101a + let compute101a_features = NvvmArch::Compute101a.all_target_features(); + // Should include all base up to 101 + assert!(compute101a_features.contains(&"compute_100".to_string())); + assert!(compute101a_features.contains(&"compute_101".to_string())); + // Should include 100f and 101f (same major, <= minor) + assert!(compute101a_features.contains(&"compute_100f".to_string())); + assert!(compute101a_features.contains(&"compute_101f".to_string())); + // Should NOT include 103f (higher minor) + assert!(!compute101a_features.contains(&"compute_103f".to_string())); + // Should include itself + assert!(compute101a_features.contains(&"compute_101a".to_string())); + + // Test 'f' variant - includes same major version with >= minor + let compute120f_features = NvvmArch::Compute120f.all_target_features(); + assert!(compute120f_features.contains(&"compute_120".to_string())); + assert!(compute120f_features.contains(&"compute_121".to_string())); // Higher minor included + 
assert!(compute120f_features.contains(&"compute_120f".to_string())); // Self included + assert!(!compute120f_features.contains(&"compute_120a".to_string())); // No 'a' variants + assert!(!compute120f_features.contains(&"compute_121f".to_string())); // No other 'f' variants + assert!(!compute120f_features.contains(&"compute_121a".to_string())); // No 'a' variants + // Should NOT include different major versions + assert!(!compute120f_features.contains(&"compute_100".to_string())); + assert!(!compute120f_features.contains(&"compute_90".to_string())); + + // Test 'f' variant with 100f + let compute100f_features = NvvmArch::Compute100f.all_target_features(); + assert!(compute100f_features.contains(&"compute_100".to_string())); // Same version base + assert!(compute100f_features.contains(&"compute_101".to_string())); // Higher minor + assert!(compute100f_features.contains(&"compute_103".to_string())); // Higher minor + assert!(compute100f_features.contains(&"compute_100f".to_string())); // Self + assert!(!compute100f_features.contains(&"compute_101f".to_string())); // No other 'f' variants + assert!(!compute100f_features.contains(&"compute_90".to_string())); // Different major + + // Test 'f' variant with 101f + let compute101f_features = NvvmArch::Compute101f.all_target_features(); + assert!(!compute101f_features.contains(&"compute_100".to_string())); // Lower minor NOT included + assert!(compute101f_features.contains(&"compute_101".to_string())); // Same version base + assert!(compute101f_features.contains(&"compute_103".to_string())); // Higher minor included + assert!(compute101f_features.contains(&"compute_101f".to_string())); // Self + assert!(!compute101f_features.contains(&"compute_101a".to_string())); // No 'a' variants + + // Compute90 includes lower base capabilities let compute90_features = NvvmArch::Compute90.all_target_features(); assert_eq!( compute90_features, @@ -599,7 +950,9 @@ mod tests { for feature in expected_enabled { assert!( 
features.contains(&feature.to_string()), - "Compute70 should enable {feature} for cfg(target_feature = \"{feature}\")" + "Compute70 should enable {} for cfg(target_feature = \"{}\")", + feature, + feature ); } @@ -609,7 +962,8 @@ mod tests { for feature in expected_disabled { assert!( !features.contains(&feature.to_string()), - "Compute70 should NOT enable {feature}" + "Compute70 should NOT enable {}", + feature ); } } @@ -709,4 +1063,151 @@ mod tests { assert_eq!(found, expected); } + + #[test] + fn nvvm_arch_variant_checks() { + use crate::NvvmArch; + + // Base variants + assert!(NvvmArch::Compute90.is_base_variant()); + assert!(NvvmArch::Compute120.is_base_variant()); + assert!(!NvvmArch::Compute90.is_family_variant()); + assert!(!NvvmArch::Compute90.is_architecture_variant()); + + // Family-specific variants + assert!(NvvmArch::Compute120f.is_family_variant()); + assert!(!NvvmArch::Compute120f.is_base_variant()); + assert!(!NvvmArch::Compute120f.is_architecture_variant()); + + // Architecture-specific variants + assert!(NvvmArch::Compute90a.is_architecture_variant()); + assert!(NvvmArch::Compute120a.is_architecture_variant()); + assert!(!NvvmArch::Compute90a.is_base_variant()); + assert!(!NvvmArch::Compute90a.is_family_variant()); + } + + #[test] + fn nvvm_arch_base_architecture() { + use crate::NvvmArch; + + // Base variants return themselves + assert_eq!(NvvmArch::Compute90.base_architecture(), NvvmArch::Compute90); + assert_eq!( + NvvmArch::Compute120.base_architecture(), + NvvmArch::Compute120 + ); + + // Floating-point variants return base + assert_eq!( + NvvmArch::Compute120f.base_architecture(), + NvvmArch::Compute120 + ); + assert_eq!( + NvvmArch::Compute101f.base_architecture(), + NvvmArch::Compute101 + ); + + // Architecture variants return base + assert_eq!( + NvvmArch::Compute90a.base_architecture(), + NvvmArch::Compute90 + ); + assert_eq!( + NvvmArch::Compute120a.base_architecture(), + NvvmArch::Compute120 + ); + } + + #[test] + fn 
nvvm_arch_get_variants() { + use crate::NvvmArch; + + // Architecture with only base variant + let compute80_variants = NvvmArch::Compute80.get_variants(); + assert_eq!(compute80_variants, vec![NvvmArch::Compute80]); + + // Architecture with architecture and base variants + let mut compute90_variants = NvvmArch::Compute90.get_variants(); + compute90_variants.sort_by_key(|v| format!("{:?}", v)); + assert_eq!( + compute90_variants, + vec![NvvmArch::Compute90, NvvmArch::Compute90a] + ); + + // Architecture with all three variants + let mut compute120_variants = NvvmArch::Compute120.get_variants(); + compute120_variants.sort_by_key(|v| format!("{:?}", v)); + assert_eq!( + compute120_variants, + vec![ + NvvmArch::Compute120, + NvvmArch::Compute120a, + NvvmArch::Compute120f + ] + ); + + // Getting variants from a variant returns all variants + let compute120f_variants = NvvmArch::Compute120f.get_variants(); + assert_eq!(compute120f_variants.len(), 3); + assert!(compute120f_variants.contains(&NvvmArch::Compute120)); + assert!(compute120f_variants.contains(&NvvmArch::Compute120f)); + assert!(compute120f_variants.contains(&NvvmArch::Compute120a)); + } + + #[test] + fn nvvm_arch_a_suffix_includes_all_available_instructions() { + use crate::NvvmArch; + + // Test that 'a' suffix variants include all available instructions for the architecture + // While they only RUN on exact CC, they enable all base and family features during compilation + + // Test Compute90a + let features = NvvmArch::Compute90a.all_target_features(); + assert!(features.contains(&"compute_90a".to_string())); // Includes itself + assert!(features.contains(&"compute_90".to_string())); // Includes base + assert!(features.contains(&"compute_80".to_string())); // Includes lower versions + assert!(!features.contains(&"compute_100".to_string())); // Does NOT include higher versions + + // Test Compute100a + let features = NvvmArch::Compute100a.all_target_features(); + 
assert!(features.contains(&"compute_100a".to_string())); // Includes itself + assert!(features.contains(&"compute_100".to_string())); // Includes base + assert!(features.contains(&"compute_100f".to_string())); // Includes family variant + assert!(features.contains(&"compute_90".to_string())); // Includes lower base versions + assert!(!features.contains(&"compute_90a".to_string())); // Does NOT include other 'a' variants + assert!(!features.contains(&"compute_101f".to_string())); // Does NOT include higher minor family variants + + // Test Compute120a + let features = NvvmArch::Compute120a.all_target_features(); + assert!(features.contains(&"compute_120a".to_string())); // Includes itself + assert!(features.contains(&"compute_120".to_string())); // Includes base + assert!(features.contains(&"compute_120f".to_string())); // Includes family variant (same minor) + assert!(features.contains(&"compute_100".to_string())); // Includes lower base versions + assert!(!features.contains(&"compute_121f".to_string())); // Does NOT include higher minor family variants + } + + #[test] + fn nvvm_arch_variants_for_capability() { + use crate::NvvmArch; + + // Capability with single variant + let compute75_variants = NvvmArch::variants_for_capability(75); + assert_eq!(compute75_variants, vec![NvvmArch::Compute75]); + + // Capability with multiple variants + let mut compute101_variants = NvvmArch::variants_for_capability(101); + compute101_variants.sort_by_key(|v| format!("{:?}", v)); + assert_eq!( + compute101_variants, + vec![ + NvvmArch::Compute101, + NvvmArch::Compute101a, + NvvmArch::Compute101f + ] + ); + + // Non-existent capability + let compute999_variants = NvvmArch::variants_for_capability(999); + assert!(compute999_variants.is_empty()); + } } diff --git a/guide/src/guide/compute_capabilities.md b/guide/src/guide/compute_capabilities.md index cf7ea293..432522c7 100644 --- a/guide/src/guide/compute_capabilities.md +++ b/guide/src/guide/compute_capabilities.md @@ -30,31 
+30,117 @@ rust-cuda works exclusively with virtual architectures since it only generates P ## Using Target Features -When you build a CUDA kernel with `cuda_builder`, the architecture you choose (e.g., -`NvvmArch::Compute61`) enables target features that you can use for conditional compilation. +When building your kernel, the `NvvmArch::ComputeXX` variant you choose enables specific +`target_feature` flags. These can be used with `#[cfg(...)]` to conditionally compile +code based on the capabilities of the target GPU. -These features follow the pattern `compute_XX` where XX is the capability number without -the decimal point. The enabled feature means "at least this capability", matching -NVIDIA's semantics. +For example, this checks whether the target architecture supports running compute 6.0 +code or newer: -### Example: Basic Usage +```rust +#[cfg(target_feature = "compute_60")] +``` + +Think of it as asking: “Is the GPU I’m building for at least compute 6.0?” Depending on +which `NvvmArch::ComputeXX` is used to build the kernel, there is a different answer: + +- Building for `Compute60` → ✓ Yes (exact match) +- Building for `Compute70` → ✓ Yes (7.0 GPUs support 6.0 code) +- Building for `Compute50` → ✗ No (5.0 GPUs can't run 6.0 code) + +These features let you write optimized code paths for specific GPU generations while +still supporting older ones. + +## Specifying Compute Capabilities + +Starting with CUDA 12.9, NVIDIA introduced architecture suffixes that affect +compatibility. + +### Base Architecture (No Suffix) + +Example: `NvvmArch::Compute70` + +This is everything mentioned above, and was the only option in CUDA 12.8 and lower. + +**When to use**: Default choice for maximum compatibility. 
+ +Example usage: ```rust -use cuda_builder::CudaBuilder; +// In build.rs +CudaBuilder::new("kernels") + .arch(NvvmArch::Compute70) + .build() + .unwrap(); + +// In your kernel code: +#[cfg(target_feature = "compute_60")] // ✓ Pass (older compute capability) +#[cfg(target_feature = "compute_70")] // ✓ Pass (current compute capability) +#[cfg(target_feature = "compute_80")] // ✗ Fail (newer compute capability) +``` -fn main() { - CudaBuilder::new("kernels") - .arch(NvvmArch::Compute61) // Target compute capability 6.1+ - .build() - .unwrap(); -} +### Family Suffix ('f') + +Example: `NvvmArch::Compute101f` + +Specifies code compatible with the same major compute capability version and with an +equal or higher minor compute capability version. + +**When to use**: When you need features from a specific minor version but want forward +compatibility within the family. + +Example usage: + +```rust +// In build.rs +CudaBuilder::new("kernels") + .arch(NvvmArch::Compute101f) + .build() + .unwrap(); + +// In your kernel code: +#[cfg(target_feature = "compute_100")] // ✗ Fail (10.0 < 10.1) +#[cfg(target_feature = "compute_101")] // ✓ Pass (equal major, equal minor) +#[cfg(target_feature = "compute_103")] // ✓ Pass (equal major, greater minor) +#[cfg(target_feature = "compute_101f")] // ✓ Pass (the 'f' variant itself) +#[cfg(target_feature = "compute_100f")] // ✗ Fail (other 'f' variant) +#[cfg(target_feature = "compute_90")] // ✗ Fail (different major) +#[cfg(target_feature = "compute_110")] // ✗ Fail (different major) ``` -This enables only the `compute_61` target feature, meaning the code requires -at least compute capability 6.1. +### Architecture Suffix ('a') + +Example: `NvvmArch::Compute100a` + +Specifies code that only runs on GPUs of that specific compute capability and no others. 
+However, during compilation, it enables all available instructions for the architecture, +including all base variants up to the same version and all family variants with the same +major version and equal or lower minor version. + +**When to use**: When you need to use architecture-specific features (like certain +Tensor Core operations) that are only available on that exact GPU model. -For other targeting patterns (exact ranges, maximum capabilities), use boolean -`cfg` logic as shown in the examples below. +Example usage: + +```rust +// In build.rs +CudaBuilder::new("kernels") + .arch(NvvmArch::Compute100a) + .build() + .unwrap(); + +// In your kernel code: +#[cfg(target_feature = "compute_100a")] // ✓ Pass (the 'a' variant itself) +#[cfg(target_feature = "compute_100")] // ✓ Pass (base variant) +#[cfg(target_feature = "compute_90")] // ✓ Pass (lower base variant) +#[cfg(target_feature = "compute_100f")] // ✓ Pass (family variant with same major/minor) +#[cfg(target_feature = "compute_101f")] // ✗ Fail (family variant with higher minor) +#[cfg(target_feature = "compute_110")] // ✗ Fail (higher major version) +``` + +Note: While the 'a' variant enables all these features during compilation (allowing you to use all available instructions), the generated PTX code will still only run on the exact GPU architecture specified. + +For more details on suffixes, see [NVIDIA's blog post on family-specific architecture features](https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/). ### Manual Compilation (Without CudaBuilder) @@ -74,15 +160,14 @@ export RUSTFLAGS="-C llvm-args=-arch=compute_61 -Z codegen-backend=/path/to/libr cargo build --target nvptx64-nvidia-cuda ``` -The codegen backend automatically synthesizes all appropriate target features based on the architecture. 
For example, targeting `compute_61` will enable `compute_35`, `compute_37`, `compute_50`, `compute_52`, `compute_53`, `compute_60`, and `compute_61` features for conditional compilation. - -## Conditional Compilation in Kernels +The codegen backend automatically synthesizes target features based on the architecture type as described above. -You can use `#[cfg(target_feature = "compute_XX")]` to conditionally compile code based on the available compute capabilities. With boolean logic, you can express any capability range you need. +### Common Patterns for Base Architectures -### Common Patterns +These patterns work when using base architectures (no suffix), which enable all lower capabilities: #### At Least a Capability (Default) + ```rust,no_run // Code that requires compute 6.0 or higher #[cfg(target_feature = "compute_60")] @@ -91,14 +176,9 @@ You can use `#[cfg(target_feature = "compute_XX")]` to conditionally compile cod } ``` -#### Exactly One Capability -```rust,no_run -// Code that targets exactly compute 5.0 (not 5.2+) -#[cfg(all(target_feature = "compute_50", not(target_feature = "compute_52")))] -{ - // Optimizations specific to compute 5.0 -} +#### Exactly One Capability +```rust,no_run // Code that targets exactly compute 6.1 (not 6.2+) #[cfg(all(target_feature = "compute_61", not(target_feature = "compute_62")))] { @@ -107,82 +187,22 @@ You can use `#[cfg(target_feature = "compute_XX")]` to conditionally compile cod ``` #### Up To a Maximum Capability -```rust,no_run -// Code that works on compute 5.0 and below (not 5.2+) -#[cfg(all(target_feature = "compute_35", not(target_feature = "compute_52")))] -{ - // Fallback implementation for older GPUs -} -// Code that works up to compute 6.0 (not 6.1+) +```rust,no_run +// Code that works up to compute 6.0 (not 6.1+) #[cfg(all(target_feature = "compute_35", not(target_feature = "compute_61")))] { // Maximum compatibility implementation } ``` -#### Capability Ranges -```rust,no_run -// Code that works on 
compute 5.0 through 7.0 (not 7.2+) -#[cfg(all(target_feature = "compute_50", not(target_feature = "compute_72")))] -{ - // Features available in this range -} -``` - -### Complete Example - -```rust,no_run -use cuda_std::*; - -#[kernel] -pub unsafe fn adaptive_kernel(data: *mut f64) { - // This code only compiles when targeting compute 6.0 or higher - #[cfg(target_feature = "compute_60")] - { - // f64 atomics are only available on compute 6.0+ - cuda_std::atomic::atomic_add(data, 1.0); - } - - // Fallback for older GPUs - #[cfg(not(target_feature = "compute_60"))] - { - // Manual implementation or alternative approach - } -} -``` - -## Best Practices - -### 1. Choose the Lowest Viable Architecture - -Select the lowest compute capability that provides the features you need. This maximizes GPU compatibility: +#### Targeting Specific Architecture Ranges ```rust,no_run -// If you only need basic atomics -.arch(NvvmArch::Compute35) - -// If you need 64-bit integer atomics -.arch(NvvmArch::Compute50) - -// If you need f64 atomics -.arch(NvvmArch::Compute60) -``` - -### 2. Provide Fallbacks When Possible - -For maximum compatibility, provide alternative implementations for older GPUs: - -```rust,no_run -#[cfg(target_feature = "compute_50")] -fn fast_path(data: *mut u64) { - // Use hardware atomic - atomic_min(data, 100); -} - -#[cfg(not(target_feature = "compute_50"))] -fn fast_path(data: *mut u64) { - // Software fallback +// This block compiles when building for architectures >= 6.0 but < 8.0 +#[cfg(all(target_feature = "compute_60", not(target_feature = "compute_80")))] +{ + // Code here can use features from 6.0+ but must not use 8.0+ features } ```