[NVPTX] Add NVPTXIncreaseAlignmentPass to improve vectorization #144958

Open · wants to merge 2 commits into main
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -26,6 +26,7 @@ set(NVPTXCodeGen_sources
NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
NVPTXLowerAlloca.cpp
NVPTXIncreaseAlignment.cpp
NVPTXLowerArgs.cpp
NVPTXLowerUnreachable.cpp
NVPTXMCExpr.cpp
7 changes: 7 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTX.h
@@ -55,6 +55,7 @@ FunctionPass *createNVPTXTagInvariantLoadsPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();
FunctionPass *createNVPTXIncreaseLocalAlignmentPass();

void initializeNVVMReflectLegacyPassPass(PassRegistry &);
void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -76,6 +77,7 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
void initializeNVPTXPeepholePass(PassRegistry &);
void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
void initializeNVPTXIncreaseLocalAlignmentLegacyPassPass(PassRegistry &);

struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -111,6 +113,11 @@ struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

struct NVPTXIncreaseLocalAlignmentPass
: PassInfoMixin<NVPTXIncreaseLocalAlignmentPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};

namespace NVPTX {
enum DrvInterface {
NVCL,
157 changes: 157 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
@@ -0,0 +1,157 @@
//===-- NVPTXIncreaseAlignment.cpp - Increase alignment for local arrays --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A simple pass that looks at local memory arrays that are statically
// sized and potentially increases their alignment. This enables vectorization
// of loads/stores to these arrays when the client has not explicitly
// specified an alignment.
//
// TODO: Ideally we should do a bin-packing of local arrays to maximize
// alignments while minimizing holes.
Comment on lines +13 to +14

Member:

If someone finds themselves with so much local memory that the holes matter, proper alignment of locals will likely be lost in the noise of their actual performance issues.

If we provide a knob that lets the user explicitly set the alignment for locals, that would be a sufficient escape hatch to let them find an acceptable gaps-vs-alignment trade-off.

Member Author:

That's a good point. I agree that improving most programs is more important than handling these edge cases as well as possible. Do you think I should change the default behavior of the pass to the more aggressive alignment-improvement heuristic?

Member:

For changes that are likely to affect everyone, the typical approach is to introduce them as an optional feature (or enabled only for clear wins), then allow wider testing with more aggressive settings (I can help with that). If the changes are deemed relatively low risk, aggressive defaults plus an escape hatch to disable them also work.

I think in this case we're probably OK with aligning aggressively.

In fact, I think it will accidentally benefit cutlass (NVIDIA/cutlass#2003 (comment)), which has code with known UB: it uses local variables and then performs vector loads/stores on them, assuming they are always aligned. That works in optimized builds where the locals are optimized away, but fails in debug builds.
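
For illustration, a minimal sketch of the kind of pattern being described (hypothetical code, not taken from cutlass): a stack array with only its natural 4-byte alignment that is reinterpreted for a 16-byte vector access, which is only well-defined if the array happens to be 16-byte aligned.

#include <cstdio>

// A 16-byte vector type that requires 16-byte alignment, similar in spirit to
// CUDA's float4.
struct alignas(16) Vec4f {
  float x, y, z, w;
};

float sumFour() {
  // Only 4-byte alignment is guaranteed here; nothing forces 16.
  float local[4] = {1.f, 2.f, 3.f, 4.f};

  // UB unless `local` is actually 16-byte aligned: this access assumes the
  // stricter alignment of Vec4f.
  const Vec4f *v = reinterpret_cast<const Vec4f *>(local);
  return v->x + v->y + v->z + v->w;
}

int main() { std::printf("%f\n", sumFour()); }

Raising the alloca's alignment to 16, as the aggressive heuristic in this pass does, happens to make such code behave as its authors expect.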

//
//===----------------------------------------------------------------------===//

#include "NVPTX.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NVPTXAddrSpace.h"

using namespace llvm;

static cl::opt<bool>
MaxLocalArrayAlignment("nvptx-use-max-local-array-alignment",
cl::init(false), cl::Hidden,
cl::desc("Use maximum alignment for local memory"));
Comment on lines +32 to +34

Member:

We may as well allow the option to specify the exact alignment, instead of a boolean knob with a vague max value.

Member Author:

The way this knob works now, I don't think it can be expressed in terms of an exact alignment. This option controls whether we conservatively use the maximum "safe" alignment (an alignment that evenly divides the aggregate size, to avoid introducing new holes), or the maximum "useful" alignment (as large an alignment as possible without going past the limits of what we can load/store in a single instruction).
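
To make the difference concrete, here is a small standalone sketch (a plain C++ paraphrase of the two heuristics, not the LLVM code itself, assuming a 16-byte maximum vector width) that prints what each one would choose for a few array sizes:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint64_t powerOf2Ceil(uint64_t N) {
  uint64_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

// "Useful" alignment: large enough that the whole array could be moved with a
// single vector load/store, capped at the maximum vector width.
static uint64_t aggressiveAlign(uint64_t Size, uint64_t MaxAlign = 16) {
  return std::min(MaxAlign, powerOf2Ceil(Size));
}

// "Safe" alignment: the largest power of two that divides the array size (and
// is no larger than MaxAlign), so packing arrays from highest to lowest
// alignment leaves no holes.
static uint64_t conservativeAlign(uint64_t Size, uint64_t MaxAlign = 16) {
  uint64_t A = MaxAlign;
  while (Size % A != 0)
    A /= 2;
  return A;
}

int main() {
  for (uint64_t Size : {1, 8, 12, 63, 64})
    std::printf("size %2llu: conservative %2llu, aggressive %2llu\n",
                (unsigned long long)Size,
                (unsigned long long)conservativeAlign(Size),
                (unsigned long long)aggressiveAlign(Size));
}

For a 63-byte array, for example, the conservative heuristic leaves the alignment at 1 while the aggressive one picks 16, matching the DEFAULT and MAX check lines in the new test file below.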

Member:

OK. How about an upper limit on the alignment? E.g., the pass may want to align a float3 by 16, but the user wants to avoid/minimize the gaps and limit the alignment to 4 or 8.
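
Something along these lines, perhaps (a hypothetical sketch of such a cap; the option name and default are invented and not part of this patch):

// Hypothetical: an upper bound the user can set to trade alignment for gaps.
static cl::opt<unsigned> LocalArrayAlignmentLimit(
    "nvptx-local-array-alignment-limit", cl::init(16), cl::Hidden,
    cl::desc("Upper bound, in bytes, on the alignment applied to local arrays"));

// The pass would then clamp whatever it derives from TTI, e.g.:
//   MaxAlign = std::min(getMaxLocalArrayAlignment(TTI),
//                       Align(LocalArrayAlignmentLimit));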


static Align getMaxLocalArrayAlignment(const TargetTransformInfo &TTI) {
const unsigned MaxBitWidth =
TTI.getLoadStoreVecRegBitWidth(NVPTXAS::ADDRESS_SPACE_LOCAL);
return Align(MaxBitWidth / 8);
}

namespace {
struct NVPTXIncreaseLocalAlignment {
const Align MaxAlign;

NVPTXIncreaseLocalAlignment(const TargetTransformInfo &TTI)
: MaxAlign(getMaxLocalArrayAlignment(TTI)) {}

bool run(Function &F);
bool updateAllocaAlignment(AllocaInst *Alloca, const DataLayout &DL);
Align getAggressiveArrayAlignment(unsigned ArraySize);
Align getConservativeArrayAlignment(unsigned ArraySize);
};
} // namespace

/// Get the maximum useful alignment for an array. This is more likely to
/// produce holes in the local memory.
///
/// Choose an alignment large enough that the entire array could be loaded with
/// a single vector load (if possible). Cap the alignment at MaxAlign.
Align NVPTXIncreaseLocalAlignment::getAggressiveArrayAlignment(
const unsigned ArraySize) {
return std::min(MaxAlign, Align(PowerOf2Ceil(ArraySize)));
}

/// Get the alignment of arrays that reduces the chances of leaving holes when
/// arrays are allocated within a contiguous memory buffer (like shared memory
/// and stack). Holes are still possible before and after the array allocation.
///
/// Choose the largest alignment such that the array size is a multiple of the
/// alignment. If all elements of the buffer are allocated in order of
/// alignment (higher to lower) no holes will be left.
Align NVPTXIncreaseLocalAlignment::getConservativeArrayAlignment(
const unsigned ArraySize) {
return commonAlignment(MaxAlign, ArraySize);
}

/// Find a better alignment for local arrays
bool NVPTXIncreaseLocalAlignment::updateAllocaAlignment(AllocaInst *Alloca,
const DataLayout &DL) {
// Looking for statically sized local arrays
if (!Alloca->isStaticAlloca())
return false;

const auto ArraySize = Alloca->getAllocationSize(DL);
if (!(ArraySize && ArraySize->isFixed()))
return false;

const auto ArraySizeValue = ArraySize->getFixedValue();
const Align PreferredAlignment =
MaxLocalArrayAlignment ? getAggressiveArrayAlignment(ArraySizeValue)
: getConservativeArrayAlignment(ArraySizeValue);

if (PreferredAlignment > Alloca->getAlign()) {
Alloca->setAlignment(PreferredAlignment);
return true;
}

return false;
}

bool NVPTXIncreaseLocalAlignment::run(Function &F) {
bool Changed = false;
const auto &DL = F.getParent()->getDataLayout();

BasicBlock &EntryBB = F.getEntryBlock();
for (Instruction &I : EntryBB)
if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I))
Changed |= updateAllocaAlignment(Alloca, DL);

return Changed;
}

namespace {
struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
static char ID;
NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}

bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
StringRef getPassName() const override {
return "NVPTX Increase Local Alignment";
}
};
} // namespace

char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass,
"nvptx-increase-local-alignment",
"Increase alignment for statically sized alloca arrays", false,
false)

FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
return new NVPTXIncreaseLocalAlignmentLegacyPass();
}

bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return NVPTXIncreaseLocalAlignment(TTI).run(F);
}

PreservedAnalyses
NVPTXIncreaseLocalAlignmentPass::run(Function &F,
FunctionAnalysisManager &FAM) {
const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
bool Changed = NVPTXIncreaseLocalAlignment(TTI).run(F);

if (!Changed)
return PreservedAnalyses::all();

PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
return PA;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
FUNCTION_PASS("nvptx-increase-local-alignment", NVPTXIncreaseLocalAlignmentPass())
#undef FUNCTION_PASS
2 changes: 2 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -392,6 +392,8 @@ void NVPTXPassConfig::addIRPasses() {
// but EarlyCSE can do neither of them.
if (getOptLevel() != CodeGenOptLevel::None) {
addEarlyCSEOrGVNPass();
// Increase alignment for local arrays to improve vectorization.
addPass(createNVPTXIncreaseLocalAlignmentPass());
if (!DisableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
addPass(createSROAPass());
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -20,7 +20,7 @@ define ptx_kernel void @kernel_func(ptr %a) {
entry:
%buf = alloca [16 x i8], align 4

; CHECK: .local .align 4 .b8 __local_depot0[16]
; CHECK: .local .align 16 .b8 __local_depot0[16]
; CHECK: mov.b64 %SPL

; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
85 changes: 85 additions & 0 deletions llvm/test/CodeGen/NVPTX/increase-local-align.ll
@@ -0,0 +1,85 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=nvptx-increase-local-alignment < %s | FileCheck %s --check-prefixes=COMMON,DEFAULT
; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-use-max-local-array-alignment < %s | FileCheck %s --check-prefixes=COMMON,MAX
target triple = "nvptx64-nvidia-cuda"

define void @test1() {
; COMMON-LABEL: define void @test1() {
; COMMON-NEXT: [[A:%.*]] = alloca i8, align 1
; COMMON-NEXT: ret void
;
%a = alloca i8, align 1
ret void
}

define void @test2() {
; DEFAULT-LABEL: define void @test2() {
; DEFAULT-NEXT: [[A:%.*]] = alloca [63 x i8], align 1
; DEFAULT-NEXT: ret void
;
; MAX-LABEL: define void @test2() {
; MAX-NEXT: [[A:%.*]] = alloca [63 x i8], align 16
; MAX-NEXT: ret void
;
%a = alloca [63 x i8], align 1
ret void
}

define void @test3() {
; COMMON-LABEL: define void @test3() {
; COMMON-NEXT: [[A:%.*]] = alloca [64 x i8], align 16
; COMMON-NEXT: ret void
;
%a = alloca [64 x i8], align 1
ret void
}

define void @test4() {
; DEFAULT-LABEL: define void @test4() {
; DEFAULT-NEXT: [[A:%.*]] = alloca i8, i32 63, align 1
; DEFAULT-NEXT: ret void
;
; MAX-LABEL: define void @test4() {
; MAX-NEXT: [[A:%.*]] = alloca i8, i32 63, align 16
; MAX-NEXT: ret void
;
%a = alloca i8, i32 63, align 1
ret void
}

define void @test5() {
; COMMON-LABEL: define void @test5() {
; COMMON-NEXT: [[A:%.*]] = alloca i8, i32 64, align 16
; COMMON-NEXT: ret void
;
%a = alloca i8, i32 64, align 1
ret void
}

define void @test6() {
; COMMON-LABEL: define void @test6() {
; COMMON-NEXT: [[A:%.*]] = alloca i8, align 32
; COMMON-NEXT: ret void
;
%a = alloca i8, align 32
ret void
}

define void @test7() {
; COMMON-LABEL: define void @test7() {
; COMMON-NEXT: [[A:%.*]] = alloca i32, align 4
; COMMON-NEXT: ret void
;
%a = alloca i32, align 2
ret void
}

define void @test8() {
; COMMON-LABEL: define void @test8() {
; COMMON-NEXT: [[A:%.*]] = alloca [2 x i32], align 8
; COMMON-NEXT: ret void
;
%a = alloca [2 x i32], align 2
ret void
}

14 changes: 7 additions & 7 deletions llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -135,7 +135,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
;
; PTX-LABEL: escape_ptr(
; PTX: {
; PTX-NEXT: .local .align 4 .b8 __local_depot2[8];
; PTX-NEXT: .local .align 8 .b8 __local_depot2[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
@@ -179,7 +179,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
;
; PTX-LABEL: escape_ptr_gep(
; PTX: {
; PTX-NEXT: .local .align 4 .b8 __local_depot3[8];
; PTX-NEXT: .local .align 8 .b8 __local_depot3[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
@@ -194,7 +194,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
; PTX-NEXT: st.local.b32 [%rd2+4], %r1;
; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_param_1];
; PTX-NEXT: st.local.b32 [%rd2], %r2;
; PTX-NEXT: add.s64 %rd3, %rd1, 4;
; PTX-NEXT: or.b64 %rd3, %rd1, 4;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd3;
@@ -224,7 +224,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
;
; PTX-LABEL: escape_ptr_store(
; PTX: {
; PTX-NEXT: .local .align 4 .b8 __local_depot4[8];
; PTX-NEXT: .local .align 8 .b8 __local_depot4[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
@@ -262,7 +262,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
;
; PTX-LABEL: escape_ptr_gep_store(
; PTX: {
; PTX-NEXT: .local .align 4 .b8 __local_depot5[8];
; PTX-NEXT: .local .align 8 .b8 __local_depot5[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;
@@ -279,7 +279,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
; PTX-NEXT: st.local.b32 [%rd4+4], %r1;
; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_store_param_1];
; PTX-NEXT: st.local.b32 [%rd4], %r2;
; PTX-NEXT: add.s64 %rd5, %rd3, 4;
; PTX-NEXT: or.b64 %rd5, %rd3, 4;
; PTX-NEXT: st.global.b64 [%rd2], %rd5;
; PTX-NEXT: ret;
entry:
@@ -302,7 +302,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
;
; PTX-LABEL: escape_ptrtoint(
; PTX: {
; PTX-NEXT: .local .align 4 .b8 __local_depot6[8];
; PTX-NEXT: .local .align 8 .b8 __local_depot6[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<3>;