Skip to content

Make constant memory opt-in, spill large statics to global memory #217

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions crates/cuda_builder/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,21 @@ pub struct CudaBuilder {
///
/// `true` by default.
pub override_libm: bool,
/// If `true`, the codegen will attempt to place `static` variables in CUDA's
/// constant memory, which is fast but limited in size (~64KB total across all
/// statics). The codegen avoids placing any single item too large, but it does not
/// track cumulative size. Exceeding the limit may cause `IllegalAddress` runtime
/// errors (CUDA error code: `700`).
///
/// The default is `false`, which places all statics in global memory. This avoids
/// such errors but may reduce performance and use more global memory. When set to
/// `false`, you can still annotate `static` variables with
/// `#[cuda_std::address_space(constant)]` to place them in constant memory
/// manually. This option only affects automatic placement.
///
/// Future versions may support smarter placement and user-controlled
/// packing/spilling strategies.
pub use_constant_memory_space: bool,
/// Whether to generate any debug info and what level of info to generate.
pub debug: DebugInfo,
/// Additional arguments passed to cargo during `cargo build`.
Expand All @@ -155,6 +170,7 @@ impl CudaBuilder {
emit: None,
optix: false,
override_libm: true,
use_constant_memory_space: false,
debug: DebugInfo::None,
build_args: vec![],
final_module_path: None,
Expand Down Expand Up @@ -284,6 +300,24 @@ impl CudaBuilder {
self
}

/// Controls whether the codegen automatically places `static` variables in CUDA's
/// constant memory.
///
/// Passing `true` opts in to automatic placement. Constant memory is fast but small
/// (~64KB total across all statics). The codegen keeps any single item that is too
/// large out of it, but it does not track cumulative size, so exceeding the limit may
/// cause `IllegalAddress` runtime errors (CUDA error code: `700`).
///
/// Passing `false` places all statics in global memory. This avoids such errors but
/// may reduce performance and use more global memory. Only automatic placement is
/// affected: a `static` annotated with `#[cuda_std::address_space(constant)]` can
/// still be placed in constant memory manually.
///
/// Smarter placement and user-controlled packing/spilling strategies may be supported
/// in future versions.
pub fn use_constant_memory_space(self, use_constant_memory_space: bool) -> Self {
    // Rebind the consumed builder mutably, record the flag, and hand it back so calls chain.
    let mut builder = self;
    builder.use_constant_memory_space = use_constant_memory_space;
    builder
}

/// An optional path where to dump LLVM IR of the final output the codegen will feed to libnvvm. Usually
/// used for debugging.
pub fn final_module_path(mut self, path: impl AsRef<Path>) -> Self {
Expand Down
13 changes: 1 addition & 12 deletions crates/rustc_codegen_nvvm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1154,18 +1154,7 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {

// NOTE(review): this diff hunk is rendered without +/- markers. The hunk summary
// reports 1 addition and 12 deletions, so the `unsafe { ... }` block below is
// presumably the removed body and the trailing `self.cx.get_static(def_id)` line its
// replacement — confirm against the real diff before treating this as compilable code.
impl<'ll> StaticBuilderMethods for Builder<'_, 'll, '_> {
/// Returns the LLVM value for the static item identified by `def_id`.
fn get_static(&mut self, def_id: DefId) -> &'ll Value {
unsafe {
// Fetch the global for this static from the codegen context.
let mut g = self.cx.get_static(def_id);
let llty = self.val_ty(g);
// Read the address space of the global's pointer type.
let addrspace = AddressSpace(llvm::LLVMGetPointerAddressSpace(llty));
if addrspace != AddressSpace::DATA {
// The global is not in `AddressSpace::DATA`: build an address space cast to a
// pointer with the same element type in `AddressSpace::DATA`.
trace!("Remapping global address space of global {:?}", g);
let llty = llvm::LLVMGetElementType(llty);
let ty = self.type_ptr_to_ext(llty, AddressSpace::DATA);
g = llvm::LLVMBuildAddrSpaceCast(self.llbuilder, g, ty, unnamed());
}
g
}
// Delegates straight to the codegen context, with no address space cast.
self.cx.get_static(def_id)
}
}

Expand Down
35 changes: 33 additions & 2 deletions crates/rustc_codegen_nvvm/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use rustc_errors::DiagMessage;
use rustc_hash::FxHashMap;
use rustc_middle::dep_graph::DepContext;
use rustc_middle::ty::layout::{
FnAbiError, FnAbiOf, FnAbiRequest, HasTyCtxt, HasTypingEnv, LayoutError,
FnAbiError, FnAbiOf, FnAbiRequest, HasTyCtxt, HasTypingEnv, LayoutError, LayoutOf,
};
use rustc_middle::ty::layout::{FnAbiOfHelpers, LayoutOfHelpers};
use rustc_middle::ty::{Ty, TypeVisitableExt};
Expand All @@ -40,6 +40,10 @@ use rustc_target::callconv::FnAbi;
use rustc_target::spec::{HasTargetSpec, Target};
use tracing::{debug, trace};

/// Size limit of constant memory in bytes (64 KiB), used as the threshold above which
/// a static is considered too large for constant memory.
///
/// Per NVIDIA: "There is a total of 64 KB constant memory on a device."
/// <https://docs.nvidia.com/cuda/archive/12.8.1/pdf/CUDA_C_Best_Practices_Guide.pdf>
const CONSTANT_MEMORY_SIZE_LIMIT_BYTES: u64 = 64 << 10;

pub(crate) struct CodegenCx<'ll, 'tcx> {
pub tcx: TyCtxt<'tcx>,

Expand Down Expand Up @@ -267,7 +271,31 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
}

if !is_mutable && self.type_is_freeze(ty) {
AddressSpace(4)
if !self.codegen_args.use_constant_memory_space {
// We aren't using constant memory, so put the instance in global memory.
AddressSpace(1)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: could we make consts somewhere that represent this 0 1 2 3 4 stuff better for easier readability?

Copy link
Contributor Author

@LegNeato LegNeato May 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we have one in cuda_std and I didn't want to duplicate it. A followup should add a rustc_codegen_nvvm-types crate that cuda_std and rustc_codegen_nvvm could share (rust-gpu has a "-types" crate for just this reason). rust-gpu also has rspirv for spirv-specific info encoded in rust types, so perhaps it should be something like rcuda? We could move the cuda error code mapping out of cust into it as well 🤔

} else {
// We are using constant memory, see if the instance will fit.
//
// FIXME(@LegNeato) ideally we keep track of what we have put into
// constant memory and when it is filled up spill instead of only
// spilling when a static is big. We'll probably want some packing
// strategy controlled by the user...for example, if you have one large
// static and many small ones, you might want the small ones to all be
// in constant memory or just the big one depending on your workload.
let layout = self.layout_of(ty);
if layout.size.bytes() > CONSTANT_MEMORY_SIZE_LIMIT_BYTES {
self.tcx.sess.dcx().warn(format!(
"static `{}` exceeds the constant memory limit; placing in global memory (performance may be reduced)",
instance
));
// Place instance in global memory if it is too big for constant memory.
AddressSpace(1)
} else {
// Place instance in constant memory if it fits.
AddressSpace(4)
}
}
} else {
AddressSpace::DATA
}
Expand Down Expand Up @@ -519,6 +547,7 @@ impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
/// Options consumed by the NVVM codegen, filled in from the codegen argument list.
pub struct CodegenArgs {
/// NVVM option flags collected while parsing the arguments.
pub nvvm_options: Vec<NvvmOption>,
/// Set to `true` when `--override-libm` is passed.
pub override_libm: bool,
/// Set to `true` when `--use-constant-memory-space` is passed. Consulted when choosing
/// the address space for immutable, freeze statics: when `false` they are placed in
/// address space 1 instead of being considered for address space 4.
pub use_constant_memory_space: bool,
/// Path given after `--final-module-path`, if any.
pub final_module_path: Option<PathBuf>,
}

Expand All @@ -537,6 +566,8 @@ impl CodegenArgs {
cg_args.nvvm_options.push(flag);
} else if arg == "--override-libm" {
cg_args.override_libm = true;
} else if arg == "--use-constant-memory-space" {
cg_args.use_constant_memory_space = true;
} else if arg == "--final-module-path" {
cg_args.final_module_path = Some(PathBuf::from(
args.get(idx + 1).expect("No path for --final-module-path"),
Expand Down
Loading