diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 693f0d0b35edc..9d91100d35b3a 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -26,6 +26,7 @@ set(NVPTXCodeGen_sources
   NVPTXISelLowering.cpp
   NVPTXLowerAggrCopies.cpp
   NVPTXLowerAlloca.cpp
+  NVPTXIncreaseAlignment.cpp
   NVPTXLowerArgs.cpp
   NVPTXLowerUnreachable.cpp
   NVPTXMCExpr.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b7c5a0a5c9983..a0d5cc5fd8e87 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -55,6 +55,7 @@ FunctionPass *createNVPTXTagInvariantLoadsPass();
 MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 MachineFunctionPass *createNVPTXForwardParamsPass();
+FunctionPass *createNVPTXIncreaseLocalAlignmentPass();
 
 void initializeNVVMReflectLegacyPassPass(PassRegistry &);
 void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -76,6 +77,7 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &);
 void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
 void initializeNVPTXPeepholePass(PassRegistry &);
 void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
+void initializeNVPTXIncreaseLocalAlignmentLegacyPassPass(PassRegistry &);
 
 struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -111,6 +113,11 @@ struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct NVPTXIncreaseLocalAlignmentPass
+    : PassInfoMixin<NVPTXIncreaseLocalAlignmentPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 namespace NVPTX {
 enum DrvInterface {
   NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
new file mode 100644
index 0000000000000..1fb1e578994e9
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXIncreaseAlignment.cpp
@@ -0,0 +1,175 @@
+//===-- NVPTXIncreaseAlignment.cpp - Increase alignment for local arrays --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple pass that looks at local memory arrays that are statically
+// sized and potentially increases their alignment. This enables vectorization
+// of loads/stores to these arrays if not explicitly specified by the client.
+//
+// TODO: Ideally we should do a bin-packing of local arrays to maximize
+// alignments while minimizing holes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+    MaxLocalArrayAlignment("nvptx-use-max-local-array-alignment",
+                           cl::init(false), cl::Hidden,
+                           cl::desc("Use maximum alignment for local memory"));
+
+/// The largest alignment worth applying to a local array: the byte width of
+/// the vector register the target reports for local-memory loads/stores.
+static Align getMaxLocalArrayAlignment(const TargetTransformInfo &TTI) {
+  const unsigned MaxBitWidth =
+      TTI.getLoadStoreVecRegBitWidth(NVPTXAS::ADDRESS_SPACE_LOCAL);
+  return Align(MaxBitWidth / 8);
+}
+
+namespace {
+/// Implementation shared by the legacy and new pass manager wrappers below.
+struct NVPTXIncreaseLocalAlignment {
+  // Upper bound for any alignment chosen by this pass.
+  const Align MaxAlign;
+
+  NVPTXIncreaseLocalAlignment(const TargetTransformInfo &TTI)
+      : MaxAlign(getMaxLocalArrayAlignment(TTI)) {}
+
+  bool run(Function &F);
+  bool updateAllocaAlignment(AllocaInst *Alloca, const DataLayout &DL);
+  Align getAggressiveArrayAlignment(uint64_t ArraySize);
+  Align getConservativeArrayAlignment(uint64_t ArraySize);
+};
+} // namespace
+
+/// Get the maximum useful alignment for an array. This is more likely to
+/// produce holes in the local memory.
+///
+/// Choose an alignment large enough that the entire array could be loaded with
+/// a single vector load (if possible). Cap the alignment at MaxAlign.
+/// \p ArraySize must be non-zero.
+Align NVPTXIncreaseLocalAlignment::getAggressiveArrayAlignment(
+    const uint64_t ArraySize) {
+  // Clamp before rounding up: PowerOf2Ceil returns 0 if the result would
+  // overflow, and 0 is not a valid Align.
+  if (ArraySize >= MaxAlign.value())
+    return MaxAlign;
+  return Align(PowerOf2Ceil(ArraySize));
+}
+
+/// Get the alignment of arrays that reduces the chances of leaving holes when
+/// arrays are allocated within a contiguous memory buffer (like shared memory
+/// and stack). Holes are still possible before and after the array allocation.
+///
+/// Choose the largest alignment such that the array size is a multiple of the
+/// alignment. If all elements of the buffer are allocated in order of
+/// alignment (higher to lower) no holes will be left.
+Align NVPTXIncreaseLocalAlignment::getConservativeArrayAlignment(
+    const uint64_t ArraySize) {
+  return commonAlignment(MaxAlign, ArraySize);
+}
+
+/// Find a better alignment for local arrays. Returns true if the alignment of
+/// \p Alloca was raised.
+bool NVPTXIncreaseLocalAlignment::updateAllocaAlignment(AllocaInst *Alloca,
+                                                        const DataLayout &DL) {
+  // Looking for statically sized local arrays
+  if (!Alloca->isStaticAlloca())
+    return false;
+
+  const auto ArraySize = Alloca->getAllocationSize(DL);
+  if (!(ArraySize && ArraySize->isFixed()))
+    return false;
+
+  const uint64_t ArraySizeValue = ArraySize->getFixedValue();
+  // Nothing can be loaded from or stored to a zero-sized allocation, and
+  // getAggressiveArrayAlignment requires a non-zero size.
+  if (ArraySizeValue == 0)
+    return false;
+
+  const Align PreferredAlignment =
+      MaxLocalArrayAlignment ? getAggressiveArrayAlignment(ArraySizeValue)
+                             : getConservativeArrayAlignment(ArraySizeValue);
+
+  // Only ever raise the alignment; an explicit larger one is kept as is.
+  if (PreferredAlignment > Alloca->getAlign()) {
+    Alloca->setAlignment(PreferredAlignment);
+    return true;
+  }
+
+  return false;
+}
+
+/// Apply updateAllocaAlignment to every alloca in the entry block of \p F.
+/// Returns true if any alignment was changed.
+bool NVPTXIncreaseLocalAlignment::run(Function &F) {
+  bool Changed = false;
+  const auto &DL = F.getParent()->getDataLayout();
+
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (Instruction &I : EntryBB)
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I))
+      Changed |= updateAllocaAlignment(Alloca, DL);
+
+  return Changed;
+}
+
+namespace {
+struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
+  static char ID;
+  NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+  StringRef getPassName() const override {
+    return "NVPTX Increase Local Alignment";
+  }
+};
+} // namespace
+
+char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
+INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass,
+                "nvptx-increase-local-alignment",
+                "Increase alignment for statically sized alloca arrays", false,
+                false)
+
+FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
+  return new NVPTXIncreaseLocalAlignmentLegacyPass();
+}
+
+bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
+  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return NVPTXIncreaseLocalAlignment(TTI).run(F);
+}
+
+PreservedAnalyses
+NVPTXIncreaseLocalAlignmentPass::run(Function &F,
+                                     FunctionAnalysisManager &FAM) {
+  const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+  bool Changed = NVPTXIncreaseLocalAlignment(TTI).run(F);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  // Only alloca alignments were modified, so the CFG is untouched.
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index ee37c9826012c..827cb7bba7018 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ 
b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def @@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass()) FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass()) FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this)) FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass()) +FUNCTION_PASS("nvptx-increase-local-alignment", NVPTXIncreaseLocalAlignmentPass()) #undef FUNCTION_PASS diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index ef310e5828f22..c4b629514087e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -392,6 +392,8 @@ void NVPTXPassConfig::addIRPasses() { // but EarlyCSE can do neither of them. if (getOptLevel() != CodeGenOptLevel::None) { addEarlyCSEOrGVNPass(); + // Increase alignment for local arrays to improve vectorization. + addPass(createNVPTXIncreaseLocalAlignmentPass()); if (!DisableLoadStoreVectorizer) addPass(createLoadStoreVectorizerPass()); addPass(createSROAPass()); diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index a2175dd009f5f..799016390424a 100644 --- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -20,7 +20,7 @@ define ptx_kernel void @kernel_func(ptr %a) { entry: %buf = alloca [16 x i8], align 4 -; CHECK: .local .align 4 .b8 __local_depot0[16] +; CHECK: .local .align 16 .b8 __local_depot0[16] ; CHECK: mov.b64 %SPL ; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] diff --git a/llvm/test/CodeGen/NVPTX/increase-local-align.ll b/llvm/test/CodeGen/NVPTX/increase-local-align.ll new file mode 100644 index 0000000000000..3dddcf384b81c --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/increase-local-align.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S 
-passes=nvptx-increase-local-alignment < %s | FileCheck %s --check-prefixes=COMMON,DEFAULT +; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-use-max-local-array-alignment < %s | FileCheck %s --check-prefixes=COMMON,MAX +target triple = "nvptx64-nvidia-cuda" + +define void @test1() { +; COMMON-LABEL: define void @test1() { +; COMMON-NEXT: [[A:%.*]] = alloca i8, align 1 +; COMMON-NEXT: ret void +; + %a = alloca i8, align 1 + ret void +} + +define void @test2() { +; DEFAULT-LABEL: define void @test2() { +; DEFAULT-NEXT: [[A:%.*]] = alloca [63 x i8], align 1 +; DEFAULT-NEXT: ret void +; +; MAX-LABEL: define void @test2() { +; MAX-NEXT: [[A:%.*]] = alloca [63 x i8], align 16 +; MAX-NEXT: ret void +; + %a = alloca [63 x i8], align 1 + ret void +} + +define void @test3() { +; COMMON-LABEL: define void @test3() { +; COMMON-NEXT: [[A:%.*]] = alloca [64 x i8], align 16 +; COMMON-NEXT: ret void +; + %a = alloca [64 x i8], align 1 + ret void +} + +define void @test4() { +; DEFAULT-LABEL: define void @test4() { +; DEFAULT-NEXT: [[A:%.*]] = alloca i8, i32 63, align 1 +; DEFAULT-NEXT: ret void +; +; MAX-LABEL: define void @test4() { +; MAX-NEXT: [[A:%.*]] = alloca i8, i32 63, align 16 +; MAX-NEXT: ret void +; + %a = alloca i8, i32 63, align 1 + ret void +} + +define void @test5() { +; COMMON-LABEL: define void @test5() { +; COMMON-NEXT: [[A:%.*]] = alloca i8, i32 64, align 16 +; COMMON-NEXT: ret void +; + %a = alloca i8, i32 64, align 1 + ret void +} + +define void @test6() { +; COMMON-LABEL: define void @test6() { +; COMMON-NEXT: [[A:%.*]] = alloca i8, align 32 +; COMMON-NEXT: ret void +; + %a = alloca i8, align 32 + ret void +} + +define void @test7() { +; COMMON-LABEL: define void @test7() { +; COMMON-NEXT: [[A:%.*]] = alloca i32, align 4 +; COMMON-NEXT: ret void +; + %a = alloca i32, align 2 + ret void +} + +define void @test8() { +; COMMON-LABEL: define void @test8() { +; COMMON-NEXT: [[A:%.*]] = alloca [2 x i32], align 8 +; COMMON-NEXT: ret void +; + %a = alloca 
[2 x i32], align 2 + ret void +} + diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 54495cf0d61f3..5e5888aac3ceb 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -135,7 +135,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; ; PTX-LABEL: escape_ptr( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot2[8]; +; PTX-NEXT: .local .align 8 .b8 __local_depot2[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; @@ -179,7 +179,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; ; PTX-LABEL: escape_ptr_gep( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot3[8]; +; PTX-NEXT: .local .align 8 .b8 __local_depot3[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; @@ -194,7 +194,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; PTX-NEXT: st.local.b32 [%rd2+4], %r1; ; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_param_1]; ; PTX-NEXT: st.local.b32 [%rd2], %r2; -; PTX-NEXT: add.s64 %rd3, %rd1, 4; +; PTX-NEXT: or.b64 %rd3, %rd1, 4; ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 param0; ; PTX-NEXT: st.param.b64 [param0], %rd3; @@ -224,7 +224,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon ; ; PTX-LABEL: escape_ptr_store( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot4[8]; +; PTX-NEXT: .local .align 8 .b8 __local_depot4[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; @@ -262,7 +262,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri ; ; PTX-LABEL: escape_ptr_gep_store( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot5[8]; +; PTX-NEXT: .local .align 8 .b8 __local_depot5[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 
%r<3>; @@ -279,7 +279,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri ; PTX-NEXT: st.local.b32 [%rd4+4], %r1; ; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_store_param_1]; ; PTX-NEXT: st.local.b32 [%rd4], %r2; -; PTX-NEXT: add.s64 %rd5, %rd3, 4; +; PTX-NEXT: or.b64 %rd5, %rd3, 4; ; PTX-NEXT: st.global.b64 [%rd2], %rd5; ; PTX-NEXT: ret; entry: @@ -302,7 +302,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl ; ; PTX-LABEL: escape_ptrtoint( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot6[8]; +; PTX-NEXT: .local .align 8 .b8 __local_depot6[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index ddaa9fd831af7..d83b303d1f0db 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -208,7 +208,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias define dso_local i32 @bar() { ; CHECK-PTX-LABEL: bar( ; CHECK-PTX: { -; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24]; +; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot3[32]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; ; CHECK-PTX-NEXT: .reg .b16 %rs<5>; @@ -226,12 +226,12 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5]; ; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; -; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1; +; CHECK-PTX-NEXT: st.b32 [%SP+16], %r1; ; CHECK-PTX-NEXT: mov.b16 %rs4, 1; -; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs4; +; CHECK-PTX-NEXT: st.b8 [%SP+20], %rs4; ; CHECK-PTX-NEXT: mov.b64 %rd3, 1; -; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3; -; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8; +; CHECK-PTX-NEXT: st.b64 [%SP+24], %rd3; +; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16; ; CHECK-PTX-NEXT: { // callseq 1, 0 ; 
CHECK-PTX-NEXT: .param .b32 param0; ; CHECK-PTX-NEXT: st.param.b32 [param0], 1; @@ -371,7 +371,7 @@ entry: define dso_local void @qux() { ; CHECK-PTX-LABEL: qux( ; CHECK-PTX: { -; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24]; +; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot7[32]; ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; ; CHECK-PTX-NEXT: .reg .b32 %r<3>;