Skip to content

Commit dbe1c1d

Browse files
committed
[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with basic inline-asm
Finishes adding basic inline-asm callbr support for AMDGPU, started by #149308.
1 parent 691bb41 commit dbe1c1d

File tree

9 files changed

+744
-79
lines changed

9 files changed

+744
-79
lines changed

llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

Lines changed: 54 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
181181
return NewRetBlock;
182182
}
183183

184+
/// Append a new block named "DummyReturnBlock" to \p F and terminate it with
/// a return instruction. For a void function the return carries no value;
/// otherwise it returns poison of the function's return type.
///
/// The new block is also pushed onto \p ReturningBlocks.
///
/// \returns the newly created block.
static BasicBlock *
createDummyReturnBlock(Function &F,
                       SmallVector<BasicBlock *, 4> &ReturningBlocks) {
  auto &Ctx = F.getContext();
  BasicBlock *RetBB = BasicBlock::Create(Ctx, "DummyReturnBlock", &F);

  // A void function returns nothing; any other return type gets poison.
  Value *Result = nullptr;
  if (Type *Ty = F.getReturnType(); !Ty->isVoidTy())
    Result = PoisonValue::get(Ty);

  ReturnInst::Create(Ctx, Result, RetBB);
  ReturningBlocks.push_back(RetBB);
  return RetBB;
}
195+
196+
/// Handle conditional branch instructions (-> 2 targets) and callbr
197+
/// instructions with N targets.
198+
static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
199+
BasicBlock *DummyReturnBB,
200+
std::vector<DominatorTree::UpdateType> &Updates) {
201+
SmallVector<BasicBlock *, 2> Successors(successors(BB));
202+
203+
// Create a new transition block to hold the conditional branch.
204+
BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
205+
206+
Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
207+
208+
// 'Successors' become successors of TransitionBB instead of BB,
209+
// and TransitionBB becomes a single successor of BB.
210+
Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
211+
for (BasicBlock *Successor : Successors) {
212+
Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
213+
Updates.emplace_back(DominatorTree::Delete, BB, Successor);
214+
}
215+
216+
// Create a branch that will always branch to the transition block and
217+
// references DummyReturnBB.
218+
BB->getTerminator()->eraseFromParent();
219+
BranchInst::Create(TransitionBB, DummyReturnBB,
220+
ConstantInt::getTrue(F.getContext()), BB);
221+
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
222+
}
223+
184224
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
185225
const PostDominatorTree &PDT,
186226
const UniformityInfo &UA) {
187-
assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
188-
189227
if (PDT.root_size() == 0 ||
190228
(PDT.root_size() == 1 &&
191-
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
229+
!isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
192230
return false;
193231

194232
// Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,27 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
222260
if (HasDivergentExitBlock)
223261
UnreachableBlocks.push_back(BB);
224262
} else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
225-
226-
ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
227-
if (DummyReturnBB == nullptr) {
228-
DummyReturnBB = BasicBlock::Create(F.getContext(),
229-
"DummyReturnBlock", &F);
230-
Type *RetTy = F.getReturnType();
231-
Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
232-
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
233-
ReturningBlocks.push_back(DummyReturnBB);
234-
}
263+
if (DummyReturnBB == nullptr)
264+
DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
235265

236266
if (BI->isUnconditional()) {
237267
BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
238268
BI->eraseFromParent(); // Delete the unconditional branch.
239269
// Add a new conditional branch with a dummy edge to the return block.
240-
BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
241-
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
242-
} else { // Conditional branch.
243-
SmallVector<BasicBlock *, 2> Successors(successors(BB));
244-
245-
// Create a new transition block to hold the conditional branch.
246-
BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
247-
248-
Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
249-
250-
// 'Successors' become successors of TransitionBB instead of BB,
251-
// and TransitionBB becomes a single successor of BB.
252-
Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
253-
for (BasicBlock *Successor : Successors) {
254-
Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
255-
Updates.emplace_back(DominatorTree::Delete, BB, Successor);
256-
}
257-
258-
// Create a branch that will always branch to the transition block and
259-
// references DummyReturnBB.
260-
BB->getTerminator()->eraseFromParent();
261-
BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
270+
BranchInst::Create(LoopHeaderBB, DummyReturnBB,
271+
ConstantInt::getTrue(F.getContext()), BB);
262272
Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
273+
} else {
274+
handleNBranch(F, BB, BI, DummyReturnBB, Updates);
263275
}
264276
Changed = true;
277+
} else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
278+
if (DummyReturnBB == nullptr)
279+
DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
280+
281+
handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
282+
} else {
283+
llvm_unreachable("unsupported block terminator");
265284
}
266285
}
267286

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -554,11 +554,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
554554
} else {
555555
// Test for successors as back edge
556556
BasicBlock *BB = N->getNodeAs<BasicBlock>();
557-
BranchInst *Term = cast<BranchInst>(BB->getTerminator());
558-
559-
for (BasicBlock *Succ : Term->successors())
560-
if (Visited.count(Succ))
561-
Loops[Succ] = BB;
557+
if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
558+
for (BasicBlock *Succ : Term->successors())
559+
if (Visited.count(Succ))
560+
Loops[Succ] = BB;
562561
}
563562
}
564563

@@ -590,7 +589,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
590589

591590
for (BasicBlock *P : predecessors(BB)) {
592591
// Ignore it if it's a branch from outside into our region entry
593-
if (!ParentRegion->contains(P))
592+
if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
594593
continue;
595594

596595
Region *R = RI->getRegionFor(P);
@@ -1397,13 +1396,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
13971396
/// Run the transformation for each region found
13981397
bool StructurizeCFG::run(Region *R, DominatorTree *DT,
13991398
const TargetTransformInfo *TTI) {
1400-
if (R->isTopLevelRegion())
1399+
// CallBr and its corresponding blocks must not be modified by this pass.
1400+
if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator()))
14011401
return false;
14021402

14031403
this->DT = DT;
14041404
this->TTI = TTI;
14051405
Func = R->getEntry()->getParent();
1406-
assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");
14071406

14081407
ParentRegion = R;
14091408

llvm/test/CodeGen/AMDGPU/callbr.ll

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
3+
4+
; Loads i32 %a from %src, then issues a callbr inline asm that compares %c
; against 42 and may jump to %indirect. The %fallthrough path stores %a to
; %dst1, the %indirect path stores %a to %dst2, and both branch to %ret.
; The assertions below pin the emitted code for both paths, including the
; label for the inline-asm indirect target.
define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
5+
; CHECK-LABEL: callbr_inline_asm:
6+
; CHECK: ; %bb.0:
7+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8+
; CHECK-NEXT: flat_load_dword v0, v[0:1]
9+
; CHECK-NEXT: ;;#ASMSTART
10+
; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
11+
; CHECK-NEXT: ;;#ASMEND
12+
; CHECK-NEXT: ; %bb.1: ; %fallthrough
13+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
14+
; CHECK-NEXT: flat_store_dword v[2:3], v0
15+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16+
; CHECK-NEXT: s_setpc_b64 s[30:31]
17+
; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target
18+
; CHECK-NEXT: ; %indirect
19+
; CHECK-NEXT: ; Label of block must be emitted
20+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
21+
; CHECK-NEXT: flat_store_dword v[4:5], v0
22+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
23+
; CHECK-NEXT: s_setpc_b64 s[30:31]
24+
%a = load i32, ptr %src, align 4
25+
callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect]
26+
fallthrough:
27+
store i32 %a, ptr %dst1, align 4
28+
br label %ret
29+
indirect:
30+
store i32 %a, ptr %dst2, align 4
31+
br label %ret
32+
ret:
33+
ret void
34+
}
35+
36+
; The entry block branches to %callbr, whose callbr (empty asm string) names
; its own block as the fallthrough target and %ret as the indirect target,
; so the callbr block loops on itself. The i1 %c argument is not used.
; NOTE(review): the assertions mention a %callbr.target.ret block that does
; not exist in the input IR - presumably it is inserted somewhere in the llc
; pipeline; confirm which pass creates it.
define void @callbr_self_loop(i1 %c) {
37+
; CHECK-LABEL: callbr_self_loop:
38+
; CHECK: ; %bb.0:
39+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40+
; CHECK-NEXT: .LBB1_1: ; %callbr
41+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
42+
; CHECK-NEXT: ;;#ASMSTART
43+
; CHECK-NEXT: ;;#ASMEND
44+
; CHECK-NEXT: s_branch .LBB1_1
45+
; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target
46+
; CHECK-NEXT: ; %callbr.target.ret
47+
; CHECK-NEXT: ; Label of block must be emitted
48+
; CHECK-NEXT: s_setpc_b64 s[30:31]
49+
br label %callbr
50+
callbr:
51+
callbr void asm "", "!i"() to label %callbr [label %ret]
52+
ret:
53+
ret void
54+
}

llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
declare void @foo(ptr)
55
declare i1 @bar(ptr)
6+
declare i32 @bar32(ptr)
67

78
define void @musttail_call_without_return_value(ptr %p) {
89
; CHECK-LABEL: define void @musttail_call_without_return_value(
@@ -28,6 +29,31 @@ bb.1:
2829
ret void
2930
}
3031

32+
; callbr variant of the void musttail test: the entry block ends in a callbr
; whose fallthrough target %bb.0 performs a musttail call to @foo followed by
; ret void, and whose indirect target %bb.1 returns directly.
; The assertions expect the same shape as the input, with each of the two
; blocks keeping its own ret void.
define void @musttail_call_without_return_value_callbr(ptr %p) {
33+
; CHECK-LABEL: define void @musttail_call_without_return_value_callbr(
34+
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
35+
; CHECK-NEXT: [[ENTRY:.*:]]
36+
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
37+
; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
38+
; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
39+
; CHECK: [[BB_0]]:
40+
; CHECK-NEXT: musttail call void @foo(ptr [[P]])
41+
; CHECK-NEXT: ret void
42+
; CHECK: [[BB_1:.*:]]
43+
; CHECK-NEXT: ret void
44+
;
45+
entry:
46+
%load = load i32, ptr %p, align 1
47+
callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
48+
49+
bb.0:
50+
musttail call void @foo(ptr %p)
51+
ret void
52+
53+
bb.1:
54+
ret void
55+
}
56+
3157
define i1 @musttail_call_with_return_value(ptr %p) {
3258
; CHECK-LABEL: define i1 @musttail_call_with_return_value(
3359
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
@@ -51,3 +77,28 @@ bb.0:
5177
bb.1:
5278
ret i1 %load
5379
}
80+
81+
; callbr variant of the value-returning musttail test: the entry block loads
; an i32 and ends in a callbr whose fallthrough target %bb.0 returns the
; result of a musttail call to @bar32, and whose indirect target %bb.1
; returns the loaded value.
; The assertions expect the same shape as the input, with each of the two
; blocks keeping its own ret i32.
define i32 @musttail_call_with_return_value_callbr(ptr %p) {
82+
; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr(
83+
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
84+
; CHECK-NEXT: [[ENTRY:.*:]]
85+
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
86+
; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
87+
; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
88+
; CHECK: [[BB_0]]:
89+
; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]])
90+
; CHECK-NEXT: ret i32 [[RET]]
91+
; CHECK: [[BB_1:.*:]]
92+
; CHECK-NEXT: ret i32 [[LOAD]]
93+
;
94+
entry:
95+
%load = load i32, ptr %p, align 1
96+
callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
97+
98+
bb.0:
99+
%ret = musttail call i32 @bar32(ptr %p)
100+
ret i32 %ret
101+
102+
bb.1:
103+
ret i32 %load
104+
}

0 commit comments

Comments
 (0)