Skip to content

Commit 6eec809

Browse files
admitric authored and pszymich committed
Fix and enable vector shuffle rescheduling functionality
Fix the incorrect IR produced after a rollback is applied during vector shuffle rescheduling in the CodeLoopSinking pass, and weaken the condition for creating a candidate in order to re-enable vector shuffle scheduling. (cherry picked from commit a486baa)
1 parent c890d2a commit 6eec809

File tree

4 files changed

+153
-4
lines changed

4 files changed

+153
-4
lines changed

IGC/Compiler/CISACodeGen/CodeSinking.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1647,20 +1647,25 @@ namespace IGC {
16471647
{
16481648
PrintDump(VerbosityLevel::Low, ">> Reverting the changes.\n");
16491649

1650+
CandidatePtrSet RevertedCandidates;
1651+
16501652
for (auto CI = SinkedCandidates.rbegin(), CE = SinkedCandidates.rend(); CI != CE; CI++)
16511653
{
16521654
Candidate *C = CI->get();
16531655
Instruction *UndoPos = C->UndoPos;
16541656
IGC_ASSERT(UndoPos);
16551657
while (InstToCandidate.count(UndoPos))
16561658
{
1659+
if (RevertedCandidates.count(InstToCandidate[UndoPos]))
1660+
break;
16571661
UndoPos = InstToCandidate[UndoPos]->UndoPos;
16581662
}
16591663
for (Instruction *I : *C)
16601664
{
16611665
I->moveBefore(UndoPos);
16621666
UndoPos = I;
16631667
}
1668+
RevertedCandidates.insert(C);
16641669
}
16651670

16661671
rerunLiveness();
@@ -1991,7 +1996,7 @@ namespace IGC {
19911996
if (CurrentCandidateInsts.size() > 0 &&
19921997
Id == GenISAIntrinsic::GenISA_LSC2DBlockReadAddrPayload)
19931998
{
1994-
if (!SinkFromPH && !allUsesAreDominatedByUndoPoint(CurrentCandidateInsts, CurrentCandidateInsts[0]->getNextNode()))
1999+
if (!SinkFromPH && !allUsesAreDominatedByUndoPoint(CurrentCandidateInsts, CurrentCandidateInsts[0]))
19952000
{
19962001
PrintDump(VerbosityLevel::High, "Not all the uses are dominated by the UndoPoint, skipping.\n");
19972002
return false;

IGC/Compiler/CISACodeGen/CodeSinking.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ namespace IGC {
200200

201201
typedef llvm::SmallVector<std::unique_ptr<Candidate>, 64> CandidateVec;
202202
typedef llvm::SmallVector<Candidate*, 64> CandidatePtrVec;
203+
typedef llvm::DenseSet<Candidate*> CandidatePtrSet;
203204
typedef llvm::DenseMap<Instruction*, Candidate*> InstToCandidateMap;
204205

205206
/// sinking
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: regkeys, llvm-14-plus
10+
; RUN: igc_opt --opaque-pointers -platformpvc \
11+
; RUN: --regkey LoopSinkEnableVectorShuffle=1,ForceLoopSink=1,LoopSinkForceRollback=1 \
12+
; RUN: --regkey LoopSinkAvoidSplittingDPAS=0,LoopSinkEnable2dBlockReads=1,LoopSinkEnableLoadsRescheduling=1 \
13+
; RUN: --regkey CodeSinkingLoadSchedulingInstr=1,LoopSinkCoarserLoadsRescheduling=0,CodeLoopSinkingMinSize=10 \
14+
; RUN: %enable-basic-aa% --igc-code-loop-sinking --verify -S %s 2>&1 | FileCheck %s
15+
16+
define spir_kernel void @foo(<8 x float> %0, <8 x float> %1, <8 x float> %2, <8 x float> %3, <8 x float> %4, <8 x float> %5) {
17+
18+
; Check nothing is sinked after rollback: first come load and lowered vector shuffle, then DPASes
19+
20+
; CHECK-LABEL: @foo(
21+
22+
; CHECK: [[BLOCK2D_ADDRPAYLOAD1062:%.*]] = call ptr @llvm.genx.GenISA.LSC2DBlockCreateAddrPayload.p0i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
23+
; CHECK: br label [[DOT_CRIT_EDGE:%.*]]
24+
25+
; CHECK: ._crit_edge:
26+
; CHECK: [[BLOCK2D_READADDRPAYLOAD1065:%.*]] = call <16 x i32> @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.v16i32.p0i32(ptr [[BLOCK2D_ADDRPAYLOAD1062]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0)
27+
28+
; CHECK: [[TMP12:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 0
29+
; CHECK: [[TMP13:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0
30+
; CHECK: [[TMP14:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 2
31+
; CHECK: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 1
32+
; CHECK: [[TMP16:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 4
33+
; CHECK: [[TMP17:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[TMP16]], i32 2
34+
; CHECK: [[TMP18:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 6
35+
; CHECK: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP17]], i32 [[TMP18]], i32 3
36+
; CHECK: [[TMP20:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 8
37+
; CHECK: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP20]], i32 4
38+
; CHECK: [[TMP22:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 10
39+
; CHECK: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP22]], i32 5
40+
; CHECK: [[TMP24:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 12
41+
; CHECK: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP24]], i32 6
42+
; CHECK: [[TMP26:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 14
43+
; CHECK: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP25]], i32 [[TMP26]], i32 7
44+
; CHECK: [[TMP28:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 1
45+
; CHECK: [[TMP29:%.*]] = insertelement <8 x i32> undef, i32 [[TMP28]], i32 0
46+
; CHECK: [[TMP30:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 3
47+
; CHECK: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP30]], i32 1
48+
; CHECK: [[TMP32:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 5
49+
; CHECK: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP32]], i32 2
50+
; CHECK: [[TMP34:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 7
51+
; CHECK: [[TMP35:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP34]], i32 3
52+
; CHECK: [[TMP36:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 9
53+
; CHECK: [[TMP37:%.*]] = insertelement <8 x i32> [[TMP35]], i32 [[TMP36]], i32 4
54+
; CHECK: [[TMP38:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 11
55+
; CHECK: [[TMP39:%.*]] = insertelement <8 x i32> [[TMP37]], i32 [[TMP38]], i32 5
56+
; CHECK: [[TMP40:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 13
57+
; CHECK: [[TMP41:%.*]] = insertelement <8 x i32> [[TMP39]], i32 [[TMP40]], i32 6
58+
; CHECK: [[TMP42:%.*]] = extractelement <16 x i32> [[BLOCK2D_READADDRPAYLOAD1065]], i32 15
59+
; CHECK: [[TMP43:%.*]] = insertelement <8 x i32> [[TMP41]], i32 [[TMP42]], i32 7
60+
61+
; CHECK: [[DPAS:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0:%.*]], <8 x i16> [[TMP8:%.*]], <8 x i32> [[TMP10:%.*]], i32 11, i32 11, i32 8, i32 8, i1 false)
62+
; CHECK: [[DPAS36:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP8]], <8 x i32> [[TMP11:%.*]], i32 11, i32 11, i32 8, i32 8, i1 false)
63+
; CHECK: [[DPAS37:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP8]], <8 x i32> [[TMP44:%.*]], i32 11, i32 11, i32 8, i32 8, i1 false)
64+
; CHECK: [[DPAS38:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP8]], <8 x i32> [[TMP45:%.*]], i32 11, i32 11, i32 8, i32 8, i1 false)
65+
; CHECK: [[DPAS39:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP9:%.*]], <8 x i32> [[TMP10]], i32 11, i32 11, i32 8, i32 8, i1 false)
66+
; CHECK: [[DPAS40:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP9]], <8 x i32> [[TMP11]], i32 11, i32 11, i32 8, i32 8, i1 false)
67+
; CHECK: [[DPAS41:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP9]], <8 x i32> [[TMP44]], i32 11, i32 11, i32 8, i32 8, i1 false)
68+
; CHECK: [[DPAS42:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> [[TMP0]], <8 x i16> [[TMP9]], <8 x i32> [[TMP45]], i32 11, i32 11, i32 8, i32 8, i1 false)
69+
; CHECK: @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32
70+
; CHECK: @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32
71+
72+
; CHECK: br
73+
;
74+
precompiled_s32divrem.exit1167:
75+
%Block2D_AddrPayload1062 = call i32* @llvm.genx.GenISA.LSC2DBlockCreateAddrPayload.p0i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
76+
br label %._crit_edge
77+
78+
._crit_edge: ; preds = %._crit_edge.._crit_edge_crit_edge, %precompiled_s32divrem.exit1167
79+
%6 = phi <8 x float> [ zeroinitializer, %precompiled_s32divrem.exit1167 ], [ %dpas52, %._crit_edge.._crit_edge_crit_edge ]
80+
%7 = phi <8 x float> [ zeroinitializer, %precompiled_s32divrem.exit1167 ], [ %dpas51, %._crit_edge.._crit_edge_crit_edge ]
81+
%8 = insertelement <8 x i16> zeroinitializer, i16 0, i32 0
82+
%9 = insertelement <8 x i16> zeroinitializer, i16 0, i32 0
83+
%10 = insertelement <8 x i32> zeroinitializer, i32 0, i32 0
84+
%11 = insertelement <8 x i32> zeroinitializer, i32 0, i32 0
85+
%Block2D_ReadAddrPayload1065 = call <16 x i32> @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.v16i32.p0i32(ptr %Block2D_AddrPayload1062, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0)
86+
%12 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 0
87+
%13 = insertelement <8 x i32> undef, i32 %12, i32 0
88+
%14 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 2
89+
%15 = insertelement <8 x i32> %13, i32 %14, i32 1
90+
%16 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 4
91+
%17 = insertelement <8 x i32> %15, i32 %16, i32 2
92+
%18 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 6
93+
%19 = insertelement <8 x i32> %17, i32 %18, i32 3
94+
%20 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 8
95+
%21 = insertelement <8 x i32> %19, i32 %20, i32 4
96+
%22 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 10
97+
%23 = insertelement <8 x i32> %21, i32 %22, i32 5
98+
%24 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 12
99+
%25 = insertelement <8 x i32> %23, i32 %24, i32 6
100+
%26 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 14
101+
%27 = insertelement <8 x i32> %25, i32 %26, i32 7
102+
%28 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 1
103+
%29 = insertelement <8 x i32> undef, i32 %28, i32 0
104+
%30 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 3
105+
%31 = insertelement <8 x i32> %29, i32 %30, i32 1
106+
%32 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 5
107+
%33 = insertelement <8 x i32> %31, i32 %32, i32 2
108+
%34 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 7
109+
%35 = insertelement <8 x i32> %33, i32 %34, i32 3
110+
%36 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 9
111+
%37 = insertelement <8 x i32> %35, i32 %36, i32 4
112+
%38 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 11
113+
%39 = insertelement <8 x i32> %37, i32 %38, i32 5
114+
%40 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 13
115+
%41 = insertelement <8 x i32> %39, i32 %40, i32 6
116+
%42 = extractelement <16 x i32> %Block2D_ReadAddrPayload1065, i32 15
117+
%43 = insertelement <8 x i32> %41, i32 %42, i32 7
118+
%44 = insertelement <8 x i32> zeroinitializer, i32 0, i32 0
119+
%45 = insertelement <8 x i32> zeroinitializer, i32 0, i32 0
120+
%dpas = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %8, <8 x i32> %10, i32 11, i32 11, i32 8, i32 8, i1 false)
121+
%dpas36 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %8, <8 x i32> %11, i32 11, i32 11, i32 8, i32 8, i1 false)
122+
%dpas37 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %8, <8 x i32> %44, i32 11, i32 11, i32 8, i32 8, i1 false)
123+
%dpas38 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %8, <8 x i32> %45, i32 11, i32 11, i32 8, i32 8, i1 false)
124+
%dpas39 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %9, <8 x i32> %10, i32 11, i32 11, i32 8, i32 8, i1 false)
125+
%dpas40 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %9, <8 x i32> %11, i32 11, i32 11, i32 8, i32 8, i1 false)
126+
%dpas41 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %9, <8 x i32> %44, i32 11, i32 11, i32 8, i32 8, i1 false)
127+
%dpas42 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %0, <8 x i16> %9, <8 x i32> %45, i32 11, i32 11, i32 8, i32 8, i1 false)
128+
%dpas51 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %7, <8 x i16> zeroinitializer, <8 x i32> %27, i32 0, i32 0, i32 0, i32 0, i1 false)
129+
%dpas52 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> %6, <8 x i16> zeroinitializer, <8 x i32> %43, i32 0, i32 0, i32 0, i32 0, i1 false)
130+
br label %._crit_edge.._crit_edge_crit_edge
131+
132+
._crit_edge.._crit_edge_crit_edge: ; preds = %._crit_edge
133+
br label %._crit_edge
134+
}
135+
136+
declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1)
137+
138+
declare ptr @llvm.genx.GenISA.LSC2DBlockCreateAddrPayload.p0i32(i64, i32, i32, i32, i32, i32, i32, i32, i32)
139+
140+
declare void @llvm.genx.GenISA.LSC2DBlockSetAddrPayloadField.p0i32.i32(ptr, i32, i32, i1)
141+
142+
declare <16 x i32> @llvm.genx.GenISA.LSC2DBlockReadAddrPayload.v16i32.p0i32(ptr, i32, i32, i32, i32, i32, i32, i1, i1, i32)
143+
144+
attributes #0 = { nofree nosync nounwind readnone speculatable willreturn }
145+
146+
!igc.functions = !{}

IGC/Compiler/tests/CodeSinking/LoopSinking/2d-blockload-vectorschuffle-sch.ll

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@
99
; REQUIRES: regkeys, llvm-14-plus
1010
; RUN: igc_opt --opaque-pointers -platformpvc --regkey LoopSinkMinSave=1 --regkey LoopSinkAvoidSplittingDPAS=0 --regkey LoopSinkEnable2dBlockReads=1 --regkey LoopSinkEnableLoadsRescheduling=1 --regkey ForceLoopSink=1 --regkey CodeLoopSinkingMinSize=10 --regkey CodeSinkingLoadSchedulingInstr=1 --regkey LoopSinkCoarserLoadsRescheduling=0 --regkey LoopSinkEnableVectorShuffle=1 %enable-basic-aa% --igc-code-loop-sinking -S %s 2>&1 | FileCheck %s
1111

12-
; The functionality is not functional yet, so we expect the test to fail
13-
; XFAIL: *
14-
1512
define spir_kernel void @foo(ptr addrspace(1) %_arg_A, ptr addrspace(1) %_arg_B, i16 %localIdY) {
1613
; Check that the order of the first loads and SetField calls is not changed after rollback
1714
; CHECK-LABEL: @foo(

0 commit comments

Comments
 (0)