Skip to content

Commit f11f49a

Browse files
rampitec authored and David Salinas committed
[AMDGPU] Avoid unneeded waitcounts before spill stores (llvm#108303)
Implicit defs and uses on spill stores were accounted for as real defs and uses, while they exist only for liveness accounting. As a result, unneeded waits were generated. Fixes: SWDEV-484177 Change-Id: I2103b4d93e57cfc7be7236980e55da82a39eb09f
1 parent 9e34c05 commit f11f49a

18 files changed

+3531
-2889
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -903,7 +903,16 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
903903
}
904904
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
905905
// Match the score to the destination registers.
906-
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
906+
//
907+
// Check only explicit operands. Stores, especially spill stores, include
908+
// implicit uses and defs of their super registers which would create an
909+
// artificial dependency, while these are there only for register liveness
910+
// accounting purposes.
911+
//
912+
// Special cases where implicit register defs and uses exist, such as
913+
// M0, FLAT_SCR or VCC, but the wait will be generated earlier in the
914+
// generateWaitcntInstBefore() if that was loaded from memory.
915+
for (unsigned I = 0, E = Inst.getNumExplicitOperands(); I != E; ++I) {
907916
auto &Op = Inst.getOperand(I);
908917
if (!Op.isReg() || !Op.isDef())
909918
continue;

llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
268268
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
269269
; GFX906-NEXT: s_waitcnt vmcnt(0)
270270
; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
271-
; GFX906-NEXT: s_waitcnt vmcnt(0)
271+
; GFX906-NEXT: s_nop 0
272272
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
273273
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
274274
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
@@ -294,7 +294,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
294294
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
295295
; GFX906-NEXT: s_waitcnt vmcnt(0)
296296
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
297-
; GFX906-NEXT: s_waitcnt vmcnt(0)
297+
; GFX906-NEXT: s_nop 0
298298
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
299299
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
300300
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
@@ -317,7 +317,7 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(
317317
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
318318
; GFX906-NEXT: s_waitcnt vmcnt(0)
319319
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
320-
; GFX906-NEXT: s_waitcnt vmcnt(0)
320+
; GFX906-NEXT: s_nop 0
321321
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
322322
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
323323
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
185185
; GFX90A-NEXT: s_nop 7
186186
; GFX90A-NEXT: s_nop 2
187187
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
188-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
188+
; GFX90A-NEXT: s_nop 0
189189
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
190190
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
191191
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -215,7 +215,6 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
215215
; GFX90A-NEXT: buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
216216
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
217217
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
218-
; GFX90A-NEXT: s_waitcnt vmcnt(9)
219218
; GFX90A-NEXT: v_accvgpr_write_b32 a10, v39 ; Reload Reuse
220219
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v38 ; Reload Reuse
221220
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v37 ; Reload Reuse
@@ -1094,7 +1093,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
10941093
; GFX90A-NEXT: s_nop 7
10951094
; GFX90A-NEXT: s_nop 2
10961095
; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
1097-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1096+
; GFX90A-NEXT: s_nop 0
10981097
; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
10991098
; GFX90A-NEXT: buffer_store_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
11001099
; GFX90A-NEXT: buffer_store_dword a3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
@@ -1125,7 +1124,6 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
11251124
; GFX90A-NEXT: buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
11261125
; GFX90A-NEXT: buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
11271126
; GFX90A-NEXT: buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
1128-
; GFX90A-NEXT: s_waitcnt vmcnt(10)
11291127
; GFX90A-NEXT: v_accvgpr_write_b32 a11, v39 ; Reload Reuse
11301128
; GFX90A-NEXT: v_accvgpr_write_b32 a12, v38 ; Reload Reuse
11311129
; GFX90A-NEXT: v_accvgpr_write_b32 a13, v37 ; Reload Reuse

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
684684
; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
685685
; GCN-O0-NEXT: v_mov_b32_e32 v5, v1
686686
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
687-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
688687
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
689688
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
690689
; GCN-O0-NEXT: s_mov_b32 s2, 0
@@ -1122,7 +1121,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11221121
; GCN-O0-NEXT: v_mov_b32_e32 v2, s10
11231122
; GCN-O0-NEXT: v_mov_b32_e32 v3, s11
11241123
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1125-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11261124
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
11271125
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
11281126
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1156,7 +1154,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11561154
; GCN-O0-NEXT: v_mov_b32_e32 v2, s10
11571155
; GCN-O0-NEXT: v_mov_b32_e32 v3, s11
11581156
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1159-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11601157
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
11611158
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
11621159
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -1188,7 +1185,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
11881185
; GCN-O0-NEXT: v_mov_b32_e32 v2, s6
11891186
; GCN-O0-NEXT: v_mov_b32_e32 v3, s7
11901187
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
1191-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
11921188
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
11931189
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
11941190
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -1209,7 +1205,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12091205
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
12101206
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12111207
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
1212-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12131208
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
12141209
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
12151210
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -1230,7 +1225,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
12301225
; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
12311226
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12321227
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1233-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
12341228
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
12351229
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
12361230
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
@@ -1303,7 +1297,6 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
13031297
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
13041298
; GCN-O0-NEXT: s_waitcnt vmcnt(1)
13051299
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
1306-
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
13071300
; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
13081301
; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
13091302
; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill

0 commit comments

Comments
 (0)