Skip to content

Commit 8dd7b01

Browse files
committed
[AMDGPU][Attributor] Infer inreg attribute in AMDGPUAttributor
This patch introduces `AAAMDGPUUniformArgument`, which infers the `inreg` attribute for function arguments. The idea is that if the call-site operands corresponding to a function argument are always uniform, the argument can be marked `inreg` and thus passed in an SGPR. In addition, this AA can propagate the `inreg` attribute from caller arguments to callee arguments where feasible.
1 parent 2dc44b3 commit 8dd7b01

File tree

2 files changed

+189
-1
lines changed

2 files changed

+189
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
1718
#include "llvm/CodeGen/TargetPassConfig.h"
1819
#include "llvm/IR/IntrinsicsAMDGPU.h"
1920
#include "llvm/IR/IntrinsicsR600.h"
@@ -1295,6 +1296,114 @@ struct AAAMDGPUNoAGPR
12951296

12961297
const char AAAMDGPUNoAGPR::ID = 0;
12971298

1299+
/// Abstract attribute wrapping a BooleanState for an IR position. getAsStr()
/// prints the assumed state as "uniform" and the non-assumed state as
/// "divergent". Concrete instances are obtained through createForPosition(),
/// which is only declared here and defined out of line.
struct AAAMDGPUUniform : public StateWrapper<BooleanState, AbstractAttribute> {
1300+
using Base = StateWrapper<BooleanState, AbstractAttribute>;
1301+
/// Forwards \p IRP to the state wrapper; \p A is not used by this constructor.
AAAMDGPUUniform(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1302+
1303+
/// Create an abstract attribute view for the position \p IRP.
1304+
static AAAMDGPUUniform &createForPosition(const IRPosition &IRP,
1305+
Attributor &A);
1306+
1307+
/// See AbstractAttribute::getName()
1308+
StringRef getName() const override { return "AAAMDGPUUniform"; }
1309+
1310+
/// Returns "uniform" while the boolean state is assumed, "divergent"
/// otherwise. \p A is not consulted.
const std::string getAsStr(Attributor *A) const override {
1311+
return getAssumed() ? "uniform" : "divergent";
1312+
}
1313+
1314+
/// Intentionally empty: no statistics are recorded for this attribute.
void trackStatistics() const override {}
1315+
1316+
/// See AbstractAttribute::getIdAddr()
1317+
const char *getIdAddr() const override { return &ID; }
1318+
1319+
/// This function should return true if the type of the \p AA is
1320+
/// AAAMDGPUUniform. The check compares the address of ID, not its value.
1321+
static bool classof(const AbstractAttribute *AA) {
1322+
return (AA->getIdAddr() == &ID);
1323+
}
1324+
1325+
/// Unique ID (due to the unique address)
1326+
static const char ID;
1327+
};
1328+
1329+
// Out-of-class definition of the ID member; getIdAddr()/classof() use only its
// address, so the stored value 0 is not significant to them.
const char AAAMDGPUUniform::ID = 0;
1330+
1331+
/// Infers the inreg attribute for a function argument.
///
/// initialize() fixes the state for arguments that already carry inreg and for
/// arguments of entry-function calling conventions; updateImpl() inspects the
/// operand passed at every call site; manifest() attaches inreg to the
/// argument of non-entry functions.
1332+
struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1333+
AAAMDGPUUniformArgument(const IRPosition &IRP, Attributor &A)
1334+
: AAAMDGPUUniform(IRP, A) {}
1335+
1336+
/// Seeds the state. Arguments already marked inreg reach an optimistic
/// fixpoint immediately. For entry-function calling conventions the state is
/// fixed according to AMDGPU::isArgPassedInSGPR. All other arguments are left
/// undecided here and are resolved by updateImpl().
void initialize(Attributor &A) override {
1337+
// NOTE(review): Arg is dereferenced without a null check; presumably always
// non-null for argument positions -- confirm.
Argument *Arg = getAssociatedArgument();
1338+
CallingConv::ID CC = Arg->getParent()->getCallingConv();
1339+
if (Arg->hasAttribute(Attribute::InReg)) {
1340+
indicateOptimisticFixpoint();
1341+
return;
1342+
}
1343+
1344+
if (AMDGPU::isEntryFunctionCC(CC)) {
1345+
// We only use isArgPassedInSGPR on kernel entry function arguments, so
1346+
// even if we later use an SGPR to pass a non-uniform i1 argument, it will
1347+
// not affect this.
1348+
if (AMDGPU::isArgPassedInSGPR(Arg))
1349+
indicateOptimisticFixpoint();
1350+
else
1351+
indicatePessimisticFixpoint();
1352+
}
1353+
}
1354+
1355+
/// Checks the operand passed for this argument at every call site.
///
/// \returns the result of indicatePessimisticFixpoint() if
/// checkForAllCallSites fails, the result of indicateOptimisticFixpoint() if
/// UsedAssumedInformation is false afterwards, and ChangeStatus::UNCHANGED
/// otherwise.
ChangeStatus updateImpl(Attributor &A) override {
1356+
unsigned ArgNo = getAssociatedArgument()->getArgNo();
1357+
TargetMachine &TM =
1358+
static_cast<AMDGPUInformationCache &>(A.getInfoCache()).TM;
1359+
1360+
// Per-call-site predicate handed to checkForAllCallSites below.
auto isUniform = [&](AbstractCallSite ACS) -> bool {
1361+
CallBase *CB = ACS.getInstruction();
1362+
// NOTE(review): this indexes the operands of the call instruction directly
// by the callee's argument number. For callback call sites the operand
// mapping may differ (AbstractCallSite offers getCallArgOperand) -- confirm
// this is intended.
Value *V = CB->getArgOperand(ArgNo);
1363+
// If the operand is itself an Argument, defer to that argument's
// AAAMDGPUUniform and register a REQUIRED dependence on it.
if (auto *Arg = dyn_cast<Argument>(V)) {
1364+
auto *AA = A.getOrCreateAAFor<AAAMDGPUUniform>(
1365+
IRPosition::argument(*Arg), this, DepClassTy::REQUIRED);
1366+
return AA && AA->isValidState();
1367+
}
1368+
// Any other operand is decided by isAlwaysUniform() on the
// TargetTransformInfo obtained for CB->getFunction().
TargetTransformInfo TTI = TM.getTargetTransformInfo(*CB->getFunction());
1369+
return TTI.isAlwaysUniform(V);
1370+
};
1371+
1372+
// NOTE(review): the flag starts as true; unless checkForAllCallSites can
// reset it to false, the optimistic-fixpoint branch below is never taken --
// confirm this is intended.
bool UsedAssumedInformation = true;
1373+
if (!A.checkForAllCallSites(isUniform, *this, /*RequireAllCallSites=*/true,
1374+
UsedAssumedInformation))
1375+
return indicatePessimisticFixpoint();
1376+
1377+
if (!UsedAssumedInformation)
1378+
return indicateOptimisticFixpoint();
1379+
1380+
return ChangeStatus::UNCHANGED;
1381+
}
1382+
1383+
/// Attaches inreg to the associated argument via Attributor::manifestAttrs.
/// Returns ChangeStatus::UNCHANGED without touching the IR when the argument
/// already has inreg or its function uses an entry-function calling
/// convention.
ChangeStatus manifest(Attributor &A) override {
1384+
Argument *Arg = getAssociatedArgument();
1385+
// If the argument already has inreg attribute, we will not do anything
1386+
// about it.
1387+
if (Arg->hasAttribute(Attribute::InReg))
1388+
return ChangeStatus::UNCHANGED;
1389+
// Entry-function arguments are never annotated here.
if (AMDGPU::isEntryFunctionCC(Arg->getParent()->getCallingConv()))
1390+
return ChangeStatus::UNCHANGED;
1391+
LLVMContext &Ctx = Arg->getContext();
1392+
return A.manifestAttrs(getIRPosition(),
1393+
{Attribute::get(Ctx, Attribute::InReg)});
1394+
}
1395+
};
1396+
1397+
/// Factory for AAAMDGPUUniform. Only IRPosition::IRP_ARGUMENT is supported; it
/// yields an AAAMDGPUUniformArgument placement-constructed in \p A's
/// allocator. Every other position kind hits llvm_unreachable.
AAAMDGPUUniform &AAAMDGPUUniform::createForPosition(const IRPosition &IRP,
1398+
Attributor &A) {
1399+
switch (IRP.getPositionKind()) {
1400+
case IRPosition::IRP_ARGUMENT:
1401+
// Storage comes from A.Allocator, so the object is not freed individually
// here.
return *new (A.Allocator) AAAMDGPUUniformArgument(IRP, A);
1402+
default:
1403+
llvm_unreachable("not a valid position for AAAMDGPUUniform");
1404+
}
1405+
}
1406+
12981407
/// Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
12991408
/// based on the finalized 'amdgpu-flat-work-group-size' attribute.
13001409
/// Both attributes start with narrow ranges that expand during iteration.
@@ -1381,7 +1490,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13811490
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13821491
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13831492
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384-
&AAInstanceInfo::ID});
1493+
&AAInstanceInfo::ID, &AAAMDGPUUniform::ID});
13851494

13861495
AttributorConfig AC(CGUpdater);
13871496
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1433,6 +1542,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14331542
A.getOrCreateAAFor<AAAddressSpace>(
14341543
IRPosition::value(*CmpX->getPointerOperand()));
14351544
}
1545+
1546+
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1547+
for (auto &Arg : F->args())
1548+
A.getOrCreateAAFor<AAAMDGPUUniform>(IRPosition::argument(Arg));
1549+
}
14361550
}
14371551
}
14381552

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s -o - | FileCheck %s
3+
4+
@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
5+
@g2 = protected addrspace(1) externally_initialized global i32 0, align 4
6+
@g3 = protected addrspace(1) externally_initialized global i32 0, align 4
7+
@g4 = protected addrspace(1) externally_initialized global i32 0, align 4
8+
9+
; Positive case: the caller in this file passes a readfirstlane result and a
; kernel argument, and the expected output marks both parameters inreg.
define internal void @callee_with_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
10+
; CHECK-LABEL: define internal void @callee_with_always_uniform_argument(
11+
; CHECK-SAME: ptr addrspace(1) inreg [[X:%.*]], i32 inreg [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
12+
; CHECK-NEXT: [[ENTRY:.*:]]
13+
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
14+
; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
15+
; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
16+
; CHECK-NEXT: ret void
17+
;
18+
entry:
19+
%x.val = load i32, ptr addrspace(1) %x, align 4
20+
store i32 %x.val, ptr addrspace(1) @g3, align 4
21+
store i32 %y, ptr addrspace(1) @g4, align 4
22+
ret void
23+
}
24+
25+
; Kernel caller for the positive case: it forwards a readfirstlane result and
; its own i32 argument to the callee. The kernel's own parameters are expected
; to stay without inreg.
define amdgpu_kernel void @kernel_with_readfirstlane(ptr addrspace(1) %p, i32 %x) {
26+
; CHECK-LABEL: define amdgpu_kernel void @kernel_with_readfirstlane(
27+
; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
28+
; CHECK-NEXT: [[ENTRY:.*:]]
29+
; CHECK-NEXT: [[P0:%.*]] = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) [[P]])
30+
; CHECK-NEXT: call void @callee_with_always_uniform_argument(ptr addrspace(1) [[P0]], i32 [[X]])
31+
; CHECK-NEXT: ret void
32+
;
33+
entry:
34+
%p0 = call ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1) %p)
35+
call void @callee_with_always_uniform_argument(ptr addrspace(1) %p0, i32 %x)
36+
ret void
37+
}
38+
39+
; Negative case: the caller in this file passes values derived from
; workitem.id.x, and the expected output leaves both parameters without inreg.
define internal void @callee_without_always_uniform_argument(ptr addrspace(1) %x, i32 %y) {
40+
; CHECK-LABEL: define internal void @callee_without_always_uniform_argument(
41+
; CHECK-SAME: ptr addrspace(1) [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
42+
; CHECK-NEXT: [[ENTRY:.*:]]
43+
; CHECK-NEXT: [[X_VAL:%.*]] = load i32, ptr addrspace(1) [[X]], align 4
44+
; CHECK-NEXT: store i32 [[X_VAL]], ptr addrspace(1) @g3, align 4
45+
; CHECK-NEXT: store i32 [[Y]], ptr addrspace(1) @g4, align 4
46+
; CHECK-NEXT: ret void
47+
;
48+
entry:
49+
%x.val = load i32, ptr addrspace(1) %x, align 4
50+
store i32 %x.val, ptr addrspace(1) @g3, align 4
51+
store i32 %y, ptr addrspace(1) @g4, align 4
52+
ret void
53+
}
54+
55+
; Kernel caller for the negative case: both call operands depend on
; workitem.id.x (a GEP indexed by it and a load through that GEP).
; NOTE(review): the function name says "without divergent callsite argument",
; yet the operands are derived from the work-item id -- confirm the name.
define amdgpu_kernel void @kernel_without_divergent_callsite_argument(ptr addrspace(1) %p, i32 %x) {
56+
; CHECK-LABEL: define amdgpu_kernel void @kernel_without_divergent_callsite_argument(
57+
; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
58+
; CHECK-NEXT: [[ENTRY:.*:]]
59+
; CHECK-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
60+
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[P]], i32 [[ID_X]]
61+
; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
62+
; CHECK-NEXT: call void @callee_without_always_uniform_argument(ptr addrspace(1) [[GEP]], i32 [[D]])
63+
; CHECK-NEXT: ret void
64+
;
65+
entry:
66+
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
67+
%gep = getelementptr i32, ptr addrspace(1) %p, i32 %id.x
68+
%d = load i32, ptr addrspace(1) %gep
69+
call void @callee_without_always_uniform_argument(ptr addrspace(1) %gep, i32 %d)
70+
ret void
71+
}
72+
73+
declare ptr addrspace(1) @llvm.amdgcn.readfirstlane.p1(ptr addrspace(1))
74+
declare noundef i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)