[NVPTX] Add NVPTXIncreaseAligmentPass to improve vectorization #144958

AlexMaclean · 2025-06-19T21:24:54Z

This change adds simple pass that looks at local memory arrays that are statically sized and sets an appropriate alignment for them. This enables vectorization of loads/stores to these arrays if not explicitly specified by the client.

llvmbot · 2025-06-19T21:25:22Z

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

This change adds simple pass that looks at local memory arrays that are statically sized and sets an appropriate alignment for them. This enables vectorization of loads/stores to these arrays if not explicitly specified by the client.

Full diff: https://github.com/llvm/llvm-project/pull/144958.diff

6 Files Affected:

(modified) llvm/lib/Target/NVPTX/CMakeLists.txt (+1)
(modified) llvm/lib/Target/NVPTX/NVPTX.h (+7)
(added) llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp (+128)
(modified) llvm/lib/Target/NVPTX/NVPTXPassRegistry.def (+1)
(modified) llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp (+2)
(added) llvm/test/CodeGen/NVPTX/increase-local-align.ll (+85)

diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 693f0d0b35edc..9d91100d35b3a 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -26,6 +26,7 @@ set(NVPTXCodeGen_sources
   NVPTXISelLowering.cpp
   NVPTXLowerAggrCopies.cpp
   NVPTXLowerAlloca.cpp
+  NVPTXIncreaseAlignment.cpp
   NVPTXLowerArgs.cpp
   NVPTXLowerUnreachable.cpp
   NVPTXMCExpr.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b7c5a0a5c9983..a0d5cc5fd8e87 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -55,6 +55,7 @@ FunctionPass *createNVPTXTagInvariantLoadsPass();
 MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 MachineFunctionPass *createNVPTXForwardParamsPass();
+FunctionPass *createNVPTXIncreaseLocalAlignmentPass();
 
 void initializeNVVMReflectLegacyPassPass(PassRegistry &);
 void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -76,6 +77,7 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &);
 void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
 void initializeNVPTXPeepholePass(PassRegistry &);
 void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
+void initializeNVPTXIncreaseLocalAlignmentLegacyPassPass(PassRegistry &);
 
 struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -111,6 +113,11 @@ struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct NVPTXIncreaseLocalAlignmentPass
+    : PassInfoMixin<NVPTXIncreaseLocalAlignmentPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 namespace NVPTX {
 enum DrvInterface {
   NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
new file mode 100644
index 0000000000000..cf6d52e0d9fd9
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
@@ -0,0 +1,128 @@
+//===-- NVPTXIncreaseAlignment.cpp - Increase alignment for local arrays --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple pass that looks at local memory arrays that are statically
+// sized and sets an appropriate alignment for them. This enables vectorization
+// of loads/stores to these arrays if not explicitly specified by the client.
+//
+// TODO: Ideally we should do a bin-packing of local arrays to maximize 
+// alignments while minimizing holes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    MaxLocalArrayAlignment("nvptx-use-max-local-array-alignment",
+                           cl::init(false), cl::Hidden,
+                           cl::desc("Use maximum alignment for local memory"));
+
+static constexpr Align MaxPTXArrayAlignment = Align::Constant<16>();
+
+/// Get the maximum useful alignment for an array. This is more likely to
+/// produce holes in the local memory.
+///
+/// Choose an alignment large enough that the entire array could be loaded with
+/// a single vector load (if possible). Cap the alignment at MaxPTXArrayAlignment.
+static Align getAggressiveArrayAlignment(const unsigned ArraySize) {
+  return std::min(MaxPTXArrayAlignment, Align(PowerOf2Ceil(ArraySize)));
+}
+
+/// Get the alignment of arrays that reduces the chances of leaving holes when
+/// arrays are allocated within a contiguous memory buffer (like shared memory
+/// and stack). Holes are still possible before and after the array allocation.
+///
+/// Choose the largest alignment such that the array size is a multiple of the
+/// alignment. If all elements of the buffer are allocated in order of
+/// alignment (higher to lower) no holes will be left.
+static Align getConservativeArrayAlignment(const unsigned ArraySize) {
+  return commonAlignment(MaxPTXArrayAlignment, ArraySize);
+}
+
+/// Find a better alignment for local arrays
+static bool updateAllocaAlignment(const DataLayout &DL,
+                                         AllocaInst *Alloca) {
+  // Looking for statically sized local arrays
+  if (!Alloca->isStaticAlloca())
+    return false;
+
+  // For now, we only support array allocas
+  if (!(Alloca->isArrayAllocation() || Alloca->getAllocatedType()->isArrayTy()))
+    return false;
+
+  const auto ArraySize = Alloca->getAllocationSize(DL);
+  if (!(ArraySize && ArraySize->isFixed()))
+    return false;
+
+  const auto ArraySizeValue = ArraySize->getFixedValue();
+  const Align PreferredAlignment =
+      MaxLocalArrayAlignment ? getAggressiveArrayAlignment(ArraySizeValue)
+                             : getConservativeArrayAlignment(ArraySizeValue);
+
+  if (PreferredAlignment > Alloca->getAlign()) {
+    Alloca->setAlignment(PreferredAlignment);
+    return true;
+  }
+
+  return false;
+}
+
+static bool runSetLocalArrayAlignment(Function &F) {
+  bool Changed = false;
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (Instruction &I : EntryBB)
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I))
+      Changed |= updateAllocaAlignment(DL, Alloca);
+
+  return Changed;
+}
+
+
+namespace {
+struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
+  static char ID;
+  NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
+INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass, "nvptx-increase-local-alignment",
+                "Increase alignment for statically sized alloca arrays", false,
+                false)
+
+FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
+  return new NVPTXIncreaseLocalAlignmentLegacyPass();
+}
+
+bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
+  return runSetLocalArrayAlignment(F);
+}
+
+PreservedAnalyses
+NVPTXIncreaseLocalAlignmentPass::run(Function &F, FunctionAnalysisManager &AM) {
+  bool Changed = runSetLocalArrayAlignment(F);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index ee37c9826012c..827cb7bba7018 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
 FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
 FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
 FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
+FUNCTION_PASS("nvptx-increase-local-alignment", NVPTXIncreaseLocalAlignmentPass())
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85d28a703a4cb..e7ad39449dc2f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -391,6 +391,8 @@ void NVPTXPassConfig::addIRPasses() {
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOptLevel::None) {
     addEarlyCSEOrGVNPass();
+    // Increase alignment for local arrays to improve vectorization.
+    addPass(createNVPTXIncreaseLocalAlignmentPass());
     if (!DisableLoadStoreVectorizer)
       addPass(createLoadStoreVectorizerPass());
     addPass(createSROAPass());
diff --git a/llvm/test/CodeGen/NVPTX/increase-local-align.ll b/llvm/test/CodeGen/NVPTX/increase-local-align.ll
new file mode 100644
index 0000000000000..605c4b5b2b77d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/increase-local-align.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=nvptx-increase-local-alignment < %s | FileCheck %s --check-prefixes=COMMON,DEFAULT
+; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-use-max-local-array-alignment < %s | FileCheck %s --check-prefixes=COMMON,MAX
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test1() {
+; COMMON-LABEL: define void @test1() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i8, align 1
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i8, align 1
+  ret void
+}
+
+define void @test2() {
+; DEFAULT-LABEL: define void @test2() {
+; DEFAULT-NEXT:    [[A:%.*]] = alloca [63 x i8], align 1
+; DEFAULT-NEXT:    ret void
+;
+; MAX-LABEL: define void @test2() {
+; MAX-NEXT:    [[A:%.*]] = alloca [63 x i8], align 16
+; MAX-NEXT:    ret void
+;
+  %a = alloca [63 x i8], align 1
+  ret void
+}
+
+define void @test3() {
+; COMMON-LABEL: define void @test3() {
+; COMMON-NEXT:    [[A:%.*]] = alloca [64 x i8], align 16
+; COMMON-NEXT:    ret void
+;
+  %a = alloca [64 x i8], align 1
+  ret void
+}
+
+define void @test4() {
+; DEFAULT-LABEL: define void @test4() {
+; DEFAULT-NEXT:    [[A:%.*]] = alloca i8, i32 63, align 1
+; DEFAULT-NEXT:    ret void
+;
+; MAX-LABEL: define void @test4() {
+; MAX-NEXT:    [[A:%.*]] = alloca i8, i32 63, align 16
+; MAX-NEXT:    ret void
+;
+  %a = alloca i8, i32 63, align 1
+  ret void
+}
+
+define void @test5() {
+; COMMON-LABEL: define void @test5() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i8, i32 64, align 16
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i8, i32 64, align 1
+  ret void
+}
+
+define void @test6() {
+; COMMON-LABEL: define void @test6() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i8, align 32
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i8, align 32
+  ret void
+}
+
+define void @test7() {
+; COMMON-LABEL: define void @test7() {
+; COMMON-NEXT:    [[A:%.*]] = alloca i32, align 2
+; COMMON-NEXT:    ret void
+;
+  %a = alloca i32, align 2
+  ret void
+}
+
+define void @test8() {
+; COMMON-LABEL: define void @test8() {
+; COMMON-NEXT:    [[A:%.*]] = alloca [2 x i32], align 8
+; COMMON-NEXT:    ret void
+;
+  %a = alloca [2 x i32], align 2
+  ret void
+}
+

github-actions · 2025-06-19T21:28:21Z

✅ With the latest revision this PR passed the C/C++ code formatter.