From 4b30cea96b9f5b084c9d06ee6a829bb36ccac1d4 Mon Sep 17 00:00:00 2001 From: Christian Legnitto Date: Sun, 27 Jul 2025 19:51:08 -0400 Subject: [PATCH] Add target_feature support for compute_* This lets us gate code to virtual architectures at compile time using `cfg()`. --- crates/cuda_builder/src/lib.rs | 22 +++ crates/nvvm/Cargo.toml | 1 + crates/nvvm/src/lib.rs | 247 +++++++++++++++++++++++- crates/rustc_codegen_nvvm/src/lib.rs | 54 +++++- guide/src/SUMMARY.md | 1 + guide/src/guide/compute_capabilities.md | 207 ++++++++++++++++++++ guide/src/guide/getting_started.md | 12 ++ 7 files changed, 538 insertions(+), 6 deletions(-) create mode 100644 guide/src/guide/compute_capabilities.md diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index adab4a52..766413d1 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -93,6 +93,23 @@ pub struct CudaBuilder { /// the GTX 1030, GTX 1050, GTX 1080, Tesla P40, etc. We default to this because /// Maxwell (5.x) will be deprecated in CUDA 12 and we anticipate for that. Moreover, /// `6.x` contains support for things like f64 atomic add and half precision float ops. + /// + /// ## Target Features for Conditional Compilation + /// + /// The chosen architecture enables a target feature that can be used for + /// conditional compilation with `#[cfg(target_feature = "compute_XX")]`. + /// This feature means "at least this capability", matching NVIDIA's semantics. + /// + /// For other patterns (exact ranges, maximum capabilities), use boolean `cfg` logic. + /// See the compute capabilities guide for examples. + /// + /// For example, with `.arch(NvvmArch::Compute61)`: + /// ```ignore + /// #[cfg(target_feature = "compute_61")] + /// { + /// // Code that requires compute capability 6.1+ + /// } + /// ``` pub arch: NvvmArch, /// Flush denormal values to zero when performing single-precision floating point operations. /// `false` by default. 
@@ -229,6 +246,11 @@ impl CudaBuilder { /// NOTE that this does not necessarily mean that code using a certain capability /// will not work on older capabilities. It means that if it uses certain /// features it may not work. + /// + /// ## Target Features for Conditional Compilation + /// + /// The chosen architecture enables target features for conditional compilation. + /// See the documentation on the `arch` field for more details. pub fn arch(mut self, arch: NvvmArch) -> Self { self.arch = arch; self diff --git a/crates/nvvm/Cargo.toml b/crates/nvvm/Cargo.toml index f45fd131..b8e2d741 100644 --- a/crates/nvvm/Cargo.toml +++ b/crates/nvvm/Cargo.toml @@ -10,3 +10,4 @@ readme = "../../README.md" [dependencies] cust_raw = { path = "../cust_raw", default-features = false, features = ["nvvm"] } +strum = { version = "0.27", features = ["derive"] } diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index f8f4c9f8..ac8b8e50 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -8,6 +8,8 @@ use std::{ str::FromStr, }; +use strum::IntoEnumIterator; + use cust_raw::nvvm_sys; pub use cust_raw::nvvm_sys::LIBDEVICE_BITCODE; @@ -255,6 +257,10 @@ impl FromStr for NvvmOption { "72" => NvvmArch::Compute72, "75" => NvvmArch::Compute75, "80" => NvvmArch::Compute80, + "86" => NvvmArch::Compute86, + "87" => NvvmArch::Compute87, + "89" => NvvmArch::Compute89, + "90" => NvvmArch::Compute90, _ => return Err("unknown arch"), }; Self::Arch(arch) @@ -265,7 +271,7 @@ impl FromStr for NvvmOption { } /// Nvvm architecture, default is `Compute52` -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, strum::EnumIter)] pub enum NvvmArch { Compute35, Compute37, @@ -279,6 +285,10 @@ pub enum NvvmArch { Compute72, Compute75, Compute80, + Compute86, + Compute87, + Compute89, + Compute90, } impl Display for NvvmArch { @@ -295,6 +305,53 @@ impl Default for NvvmArch { } } +impl NvvmArch { + /// Get the numeric capability value (e.g., 35 for 
Compute35) + pub fn capability_value(&self) -> u32 { + match self { + Self::Compute35 => 35, + Self::Compute37 => 37, + Self::Compute50 => 50, + Self::Compute52 => 52, + Self::Compute53 => 53, + Self::Compute60 => 60, + Self::Compute61 => 61, + Self::Compute62 => 62, + Self::Compute70 => 70, + Self::Compute72 => 72, + Self::Compute75 => 75, + Self::Compute80 => 80, + Self::Compute86 => 86, + Self::Compute87 => 87, + Self::Compute89 => 89, + Self::Compute90 => 90, + } + } + + /// Get the target feature string (e.g., "compute_35" for Compute35) + pub fn target_feature(&self) -> String { + let cap = self.capability_value(); + format!("compute_{cap}") + } + + /// Get all target features up to and including this architecture. + /// This ensures that `cfg(target_feature = "compute_50")` works on compute_60+ devices. + pub fn all_target_features(&self) -> Vec<String> { + let current = self.capability_value(); + + NvvmArch::iter() + .filter(|arch| arch.capability_value() <= current) + .map(|arch| arch.target_feature()) + .collect() + } + + /// Create an iterator over all architectures from Compute35 up to and including this one + pub fn iter_up_to(&self) -> impl Iterator<Item = NvvmArch> { + let current = self.capability_value(); + NvvmArch::iter().filter(move |arch| arch.capability_value() <= current) + } +} + pub struct NvvmProgram { raw: nvvm_sys::nvvmProgram, } @@ -409,6 +466,194 @@ impl NvvmProgram { mod tests { use std::str::FromStr; + #[test] + fn nvvm_arch_capability_value() { + use crate::NvvmArch; + + assert_eq!(NvvmArch::Compute35.capability_value(), 35); + assert_eq!(NvvmArch::Compute37.capability_value(), 37); + assert_eq!(NvvmArch::Compute50.capability_value(), 50); + assert_eq!(NvvmArch::Compute52.capability_value(), 52); + assert_eq!(NvvmArch::Compute53.capability_value(), 53); + assert_eq!(NvvmArch::Compute60.capability_value(), 60); + assert_eq!(NvvmArch::Compute61.capability_value(), 61); + assert_eq!(NvvmArch::Compute62.capability_value(), 62); + 
assert_eq!(NvvmArch::Compute70.capability_value(), 70); + assert_eq!(NvvmArch::Compute72.capability_value(), 72); + assert_eq!(NvvmArch::Compute75.capability_value(), 75); + assert_eq!(NvvmArch::Compute80.capability_value(), 80); + assert_eq!(NvvmArch::Compute86.capability_value(), 86); + assert_eq!(NvvmArch::Compute87.capability_value(), 87); + assert_eq!(NvvmArch::Compute89.capability_value(), 89); + assert_eq!(NvvmArch::Compute90.capability_value(), 90); + } + + #[test] + fn nvvm_arch_target_feature_format() { + use crate::NvvmArch; + + assert_eq!(NvvmArch::Compute35.target_feature(), "compute_35"); + assert_eq!(NvvmArch::Compute61.target_feature(), "compute_61"); + assert_eq!(NvvmArch::Compute90.target_feature(), "compute_90"); + } + + #[test] + fn nvvm_arch_all_target_features_includes_lower_capabilities() { + use crate::NvvmArch; + + // Compute35 only includes itself + let compute35_features = NvvmArch::Compute35.all_target_features(); + assert_eq!(compute35_features, vec!["compute_35"]); + + // Compute50 includes all lower capabilities + let compute50_features = NvvmArch::Compute50.all_target_features(); + assert_eq!( + compute50_features, + vec!["compute_35", "compute_37", "compute_50"] + ); + + // Compute61 includes all lower capabilities + let compute61_features = NvvmArch::Compute61.all_target_features(); + assert_eq!( + compute61_features, + vec![ + "compute_35", + "compute_37", + "compute_50", + "compute_52", + "compute_53", + "compute_60", + "compute_61" + ] + ); + + // Compute90 includes all capabilities + let compute90_features = NvvmArch::Compute90.all_target_features(); + assert_eq!( + compute90_features, + vec![ + "compute_35", + "compute_37", + "compute_50", + "compute_52", + "compute_53", + "compute_60", + "compute_61", + "compute_62", + "compute_70", + "compute_72", + "compute_75", + "compute_80", + "compute_86", + "compute_87", + "compute_89", + "compute_90" + ] + ); + } + + #[test] + fn 
target_feature_synthesis_supports_conditional_compilation_patterns() { + use crate::NvvmArch; + + // When targeting Compute61, should enable all lower capabilities + let features = NvvmArch::Compute61.all_target_features(); + + // Should enable compute_60 (for f64 atomics) + assert!(features.contains(&"compute_60".to_string())); + + // Should enable compute_50 (for 64-bit integer atomics) + assert!(features.contains(&"compute_50".to_string())); + + // Should enable compute_35 (baseline) + assert!(features.contains(&"compute_35".to_string())); + + // Should enable the target itself + assert!(features.contains(&"compute_61".to_string())); + + // Should NOT enable higher capabilities + assert!(!features.contains(&"compute_62".to_string())); + assert!(!features.contains(&"compute_70".to_string())); + } + + #[test] + fn target_feature_synthesis_enables_correct_cfg_patterns() { + use crate::NvvmArch; + + // Test that targeting Compute70 enables appropriate cfg patterns + let features = NvvmArch::Compute70.all_target_features(); + + // These should all be true for compute_70 target + let expected_enabled = [ + "compute_35", + "compute_37", + "compute_50", + "compute_52", + "compute_53", + "compute_60", + "compute_61", + "compute_62", + "compute_70", + ]; + + for feature in expected_enabled { + assert!( + features.contains(&feature.to_string()), + "Compute70 should enable {feature} for cfg(target_feature = \"{feature}\")" + ); + } + + // These should NOT be enabled for compute_70 target + let expected_disabled = ["compute_72", "compute_75", "compute_80", "compute_90"]; + + for feature in expected_disabled { + assert!( + !features.contains(&feature.to_string()), + "Compute70 should NOT enable {feature}" + ); + } + } + + #[test] + fn nvvm_arch_iter_up_to_includes_only_lower_or_equal() { + use crate::NvvmArch; + + // Compute35 only includes itself + let archs: Vec<_> = NvvmArch::Compute35.iter_up_to().collect(); + assert_eq!(archs, vec![NvvmArch::Compute35]); + + // Compute52 
includes all up to 52 + let archs: Vec<_> = NvvmArch::Compute52.iter_up_to().collect(); + assert_eq!( + archs, + vec![ + NvvmArch::Compute35, + NvvmArch::Compute37, + NvvmArch::Compute50, + NvvmArch::Compute52, + ] + ); + + // Compute75 includes all up to 75 + let archs: Vec<_> = NvvmArch::Compute75.iter_up_to().collect(); + assert_eq!( + archs, + vec![ + NvvmArch::Compute35, + NvvmArch::Compute37, + NvvmArch::Compute50, + NvvmArch::Compute52, + NvvmArch::Compute53, + NvvmArch::Compute60, + NvvmArch::Compute61, + NvvmArch::Compute62, + NvvmArch::Compute70, + NvvmArch::Compute72, + NvvmArch::Compute75, + ] + ); + } + #[test] fn options_parse_correctly() { use crate::NvvmArch::*; diff --git a/crates/rustc_codegen_nvvm/src/lib.rs b/crates/rustc_codegen_nvvm/src/lib.rs index b0c2da34..a0656a69 100644 --- a/crates/rustc_codegen_nvvm/src/lib.rs +++ b/crates/rustc_codegen_nvvm/src/lib.rs @@ -61,7 +61,7 @@ use lto::ThinBuffer; use rustc_ast::expand::allocator::AllocatorKind; use rustc_ast::expand::autodiff_attrs::AutoDiffItem; use rustc_codegen_ssa::{ - CodegenResults, CompiledModule, ModuleCodegen, + CodegenResults, CompiledModule, ModuleCodegen, TargetConfig, back::{ lto::{LtoModuleCodegen, SerializedModule, ThinModule}, write::{CodegenContext, FatLtoInput, ModuleConfig, OngoingCodegen}, @@ -131,10 +131,28 @@ impl CodegenBackend for NvvmCodegenBackend { } fn provide(&self, providers: &mut Providers) { - // FIXME(eddyb) this is currently only passed back to us, specifically - // into `target_machine_factory` (which is a noop), but it might make - // sense to move some of the target feature parsing into here. - providers.global_backend_features = |_tcx, ()| vec![]; + // Synthesize compute capability target features from the architecture specified in llvm-args. + // This enables code to use `#[cfg(target_feature = "compute_60")]` etc. for conditional compilation. 
+ // Following NVIDIA semantics, we enable "at least this capability" matching - for example, + // when targeting compute_70, we also enable compute_60, compute_50, and all lower capabilities. + // This allows libraries to gate features based on minimum required compute capability. + providers.global_backend_features = |tcx, ()| { + let mut features = vec![]; + + // Parse CodegenArgs to get the architecture from llvm-args (e.g., "-arch=compute_70") + let args = context::CodegenArgs::from_session(tcx.sess); + + // Find the architecture option and synthesize all implied features + for opt in &args.nvvm_options { + if let ::nvvm::NvvmOption::Arch(arch) = opt { + // Add all features up to and including the current architecture + features.extend(arch.all_target_features()); + break; + } + } + + features + }; providers.fn_abi_of_fn_ptr = |tcx, key| { let result = (rustc_interface::DEFAULT_QUERY_PROVIDERS.fn_abi_of_fn_ptr)(tcx, key); @@ -192,6 +210,32 @@ impl CodegenBackend for NvvmCodegenBackend { metadata, ); } + + fn target_config(&self, sess: &Session) -> TargetConfig { + // Parse target features from command line + let cmdline = sess.opts.cg.target_feature.split(','); + let cfg = sess.target.options.features.split(','); + + let target_features: Vec<_> = cfg + .chain(cmdline) + .filter(|l| l.starts_with('+')) + .map(|l| &l[1..]) + .filter(|l| !l.is_empty()) + .map(rustc_span::Symbol::intern) + .collect(); + + // For NVPTX, all target features are stable + let unstable_target_features = target_features.clone(); + + TargetConfig { + target_features, + unstable_target_features, + has_reliable_f16: false, + has_reliable_f16_math: false, + has_reliable_f128: false, + has_reliable_f128_math: false, + } + } } impl WriteBackendMethods for NvvmCodegenBackend { diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md index 9d5b24f9..6a6e420c 100644 --- a/guide/src/SUMMARY.md +++ b/guide/src/SUMMARY.md @@ -5,6 +5,7 @@ - [Frequently Asked Questions](faq.md) - 
[Guide](guide/README.md) - [Getting Started](guide/getting_started.md) + - [Compute Capability Gating](guide/compute_capabilities.md) - [Tips](guide/tips.md) - [Kernel ABI](guide/kernel_abi.md) - [Safety](guide/safety.md) diff --git a/guide/src/guide/compute_capabilities.md b/guide/src/guide/compute_capabilities.md new file mode 100644 index 00000000..cf7ea293 --- /dev/null +++ b/guide/src/guide/compute_capabilities.md @@ -0,0 +1,207 @@ +# Compute Capability Gating + +This section covers how to write code that adapts to different CUDA compute capabilities +using conditional compilation. + +## What are Compute Capabilities? + +CUDA GPUs have different "compute capabilities" that determine which features they +support. Each capability is identified by a version number like `3.5`, `5.0`, `6.1`, +`7.5`, etc. Higher numbers generally mean more features are available. + +For example: + +- Compute capability 5.0+ supports 64-bit integer min/max and bitwise atomic operations +- Compute capability 6.0+ supports double-precision (f64) atomic operations +- Compute capability 7.0+ supports tensor core operations + +For comprehensive details, see [NVIDIA's CUDA documentation on GPU architectures](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-compilation). + +## Virtual vs Real Architectures + +In CUDA terminology: + +- **Virtual architectures** (`compute_XX`) define the PTX instruction set and available + features +- **Real architectures** (`sm_XX`) represent actual GPU hardware + +rust-cuda works exclusively with virtual architectures since it only generates PTX. The +`NvvmArch::ComputeXX` enum values correspond to CUDA's virtual architectures. + +## Using Target Features + +When you build a CUDA kernel with `cuda_builder`, the architecture you choose (e.g., +`NvvmArch::Compute61`) enables target features that you can use for conditional compilation. + +These features follow the pattern `compute_XX` where XX is the capability number without +the decimal point. 
The enabled feature means "at least this capability", matching +NVIDIA's semantics. + +### Example: Basic Usage + +```rust +use cuda_builder::CudaBuilder; + +fn main() { + CudaBuilder::new("kernels") + .arch(NvvmArch::Compute61) // Target compute capability 6.1+ + .build() + .unwrap(); +} +``` + +This enables the `compute_61` target feature along with every lower capability feature, meaning the code requires +at least compute capability 6.1. + +For other targeting patterns (exact ranges, maximum capabilities), use boolean +`cfg` logic as shown in the examples below. + +### Manual Compilation (Without CudaBuilder) + +If you're invoking `rustc` directly instead of using `cuda_builder`, you only need to specify the architecture through LLVM args: + +```bash +rustc --target nvptx64-nvidia-cuda \ + -C llvm-args=-arch=compute_61 \ + -Z codegen-backend=/path/to/librustc_codegen_nvvm.so \ + ... +``` + +Or with cargo: + +```bash +export RUSTFLAGS="-C llvm-args=-arch=compute_61 -Z codegen-backend=/path/to/librustc_codegen_nvvm.so" +cargo build --target nvptx64-nvidia-cuda +``` + +The codegen backend automatically synthesizes all appropriate target features based on the architecture. For example, targeting `compute_61` will enable `compute_35`, `compute_37`, `compute_50`, `compute_52`, `compute_53`, `compute_60`, and `compute_61` features for conditional compilation. + +## Conditional Compilation in Kernels + +You can use `#[cfg(target_feature = "compute_XX")]` to conditionally compile code based on the available compute capabilities. With boolean logic, you can express any capability range you need.
+ +### Common Patterns + +#### At Least a Capability (Default) +```rust,no_run +// Code that requires compute 6.0 or higher +#[cfg(target_feature = "compute_60")] +{ + cuda_std::atomic::atomic_add(data, 1.0); // f64 atomics need 6.0+ +} +``` + +#### Exactly One Capability +```rust,no_run +// Code that targets exactly compute 5.0 (not 5.2+) +#[cfg(all(target_feature = "compute_50", not(target_feature = "compute_52")))] +{ + // Optimizations specific to compute 5.0 +} + +// Code that targets exactly compute 6.1 (not 6.2+) +#[cfg(all(target_feature = "compute_61", not(target_feature = "compute_62")))] +{ + // Features specific to compute 6.1 +} +``` + +#### Up To a Maximum Capability +```rust,no_run +// Code that works on compute 5.0 and below (not 5.2+) +#[cfg(all(target_feature = "compute_35", not(target_feature = "compute_52")))] +{ + // Fallback implementation for older GPUs +} + +// Code that works up to compute 6.0 (not 6.1+) +#[cfg(all(target_feature = "compute_35", not(target_feature = "compute_61")))] +{ + // Maximum compatibility implementation +} +``` + +#### Capability Ranges +```rust,no_run +// Code that works on compute 5.0 through 7.0 (not 7.2+) +#[cfg(all(target_feature = "compute_50", not(target_feature = "compute_72")))] +{ + // Features available in this range +} +``` + +### Complete Example + +```rust,no_run +use cuda_std::*; + +#[kernel] +pub unsafe fn adaptive_kernel(data: *mut f64) { + // This code only compiles when targeting compute 6.0 or higher + #[cfg(target_feature = "compute_60")] + { + // f64 atomics are only available on compute 6.0+ + cuda_std::atomic::atomic_add(data, 1.0); + } + + // Fallback for older GPUs + #[cfg(not(target_feature = "compute_60"))] + { + // Manual implementation or alternative approach + } +} +``` + +## Best Practices + +### 1. Choose the Lowest Viable Architecture + +Select the lowest compute capability that provides the features you need. 
This maximizes GPU compatibility: + +```rust,no_run +// If you only need basic atomics +.arch(NvvmArch::Compute35) + +// If you need 64-bit integer atomics +.arch(NvvmArch::Compute50) + +// If you need f64 atomics +.arch(NvvmArch::Compute60) +``` + +### 2. Provide Fallbacks When Possible + +For maximum compatibility, provide alternative implementations for older GPUs: + +```rust,no_run +#[cfg(target_feature = "compute_50")] +fn fast_path(data: *mut u64) { + // Use hardware atomic + atomic_min(data, 100); +} + +#[cfg(not(target_feature = "compute_50"))] +fn fast_path(data: *mut u64) { + // Software fallback +} +``` + +## Debugging Capability Issues + +If you encounter errors about missing functions or features: + +1. Check the compute capability you're targeting in `cuda_builder` +2. Verify your GPU supports the features you're using +3. Use `nvidia-smi` to check your GPU's compute capability +4. Add appropriate `#[cfg]` guards or increase the target architecture + +## Runtime Behavior + +Again, rust-cuda **only generates PTX**, not pre-compiled GPU binaries +("[fatbinaries](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#fatbinaries)"). +This PTX is then JIT-compiled by the CUDA driver at _runtime_. + +For more details, see [NVIDIA's documentation on GPU +compilation](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-compilation) +and [JIT +compilation](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#just-in-time-compilation). diff --git a/guide/src/guide/getting_started.md b/guide/src/guide/getting_started.md index 241f9b00..e61cff4f 100644 --- a/guide/src/guide/getting_started.md +++ b/guide/src/guide/getting_started.md @@ -196,6 +196,18 @@ inside of `target/cuda-builder/nvptx64-nvidia-cuda/release/crate_name.ptx`, but what such method does. Finally, `build()` actually runs rustc to compile the crate. 
This may take a while since it needs to build things like core from scratch, but after the first compile, incremental will make it much faster. +You can also specify a different compute capability with `.arch()`: + +```rs +CudaBuilder::new("path/to/gpu/crate") + .arch(cuda_builder::NvvmArch::Compute75) // Target compute 7.5 GPUs + .copy_to("kernel.ptx") + .build() + .unwrap(); +``` + +The architecture you choose affects which GPU features are available. See the [Compute Capability Gating](./compute_capabilities.md) guide for details on writing code that adapts to different GPU capabilities. + Finally, you can include the PTX as a static string in your program: ```rs