WaveShuffleIndex Sinking Optimization

bowenxue-intel · igcbot · commit 137f19ce902c · 2024-11-21T01:57:14.000+01:00
Hoists and combines some identical BinaryOperator instructions that come
after WaveShuffleIndex instructions with a constant lane/channel, and
sinks the WaveShuffleIndex instructions to the point of divergence.
Uses the distributive property of instructions to allow some BinaryOperators
to be hoisted above other unhoistable BinaryOperators.
diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
@@ -101,6 +101,7 @@ SPDX-License-Identifier: MIT
 #include "Compiler/Optimizer/SynchronizationObjectCoalescing.hpp"
 #include "Compiler/Optimizer/BarrierControlFlowOptimization.hpp"
 #include "Compiler/Optimizer/RuntimeValueVectorExtractPass.h"
+#include "Compiler/Optimizer/WaveShuffleIndexSinking.hpp"
 #include "Compiler/MetaDataApi/PurgeMetaDataUtils.hpp"
 #include "Compiler/HandleLoadStoreInstructions.hpp"
 #include "Compiler/CustomSafeOptPass.hpp"
@@ -1424,6 +1425,10 @@ void OptimizeIR(CodeGenContext* const pContext)
         }
 
         mpm.add(createIGCInstructionCombiningPass());
+        if( IGC_IS_FLAG_ENABLED( EnableWaveShuffleIndexSinking ) )
+        {
+            mpm.add( createWaveShuffleIndexSinking() );
+        }
         mpm.add(new FCmpPaternMatch());
         mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation
 
diff --git a/IGC/Compiler/InitializePasses.h b/IGC/Compiler/InitializePasses.h
@@ -273,3 +273,4 @@ void initializeCollectLoopCountPass(llvm::PassRegistry&);
 void initializeRemoveLoopDependencyPass(llvm::PassRegistry&);
 void initializeResourceLoopUnrollPass(llvm::PassRegistry&);
 void initializeInjectPrintfPass(llvm::PassRegistry&);
+void initializeWaveShuffleIndexSinkingPass(llvm::PassRegistry&);
diff --git a/IGC/Compiler/Optimizer/CMakeLists.txt b/IGC/Compiler/Optimizer/CMakeLists.txt
@@ -33,6 +33,7 @@ set(IGC_BUILD__SRC__Optimizer
     "${CMAKE_CURRENT_SOURCE_DIR}/ValueTracker.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/RuntimeValueVectorExtractPass.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/BarrierControlFlowOptimization.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/WaveShuffleIndexSinking.cpp"
   )
 
 set(IGC_BUILD__SRC__Compiler_Optimizer
@@ -59,6 +60,7 @@ set(IGC_BUILD__HDR__Optimizer
     "${CMAKE_CURRENT_SOURCE_DIR}/ValueTracker.h"
     "${CMAKE_CURRENT_SOURCE_DIR}/RuntimeValueVectorExtractPass.h"
     "${CMAKE_CURRENT_SOURCE_DIR}/BarrierControlFlowOptimization.hpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/WaveShuffleIndexSinking.hpp"
   )
 
 set(IGC_BUILD__HDR__Optimizer
diff --git a/IGC/Compiler/Optimizer/WaveShuffleIndexSinking.cpp b/IGC/Compiler/Optimizer/WaveShuffleIndexSinking.cpp
diff --git a/IGC/Compiler/Optimizer/WaveShuffleIndexSinking.hpp b/IGC/Compiler/Optimizer/WaveShuffleIndexSinking.hpp
@@ -0,0 +1,18 @@
+/*========================== begin_copyright_notice ============================
+
+Copyright (C) 2024 Intel Corporation
+
+SPDX-License-Identifier: MIT
+
+============================= end_copyright_notice ===========================*/
+
+#pragma once
+
+#include "common/LLVMWarningsPush.hpp"
+#include <llvm/Pass.h>
+#include "common/LLVMWarningsPop.hpp"
+
+namespace IGC
+{
+    llvm::FunctionPass* createWaveShuffleIndexSinking();
+} // namespace IGC
diff --git a/IGC/Compiler/tests/WaveShuffleIndexSinking/basic.ll b/IGC/Compiler/tests/WaveShuffleIndexSinking/basic.ll
@@ -0,0 +1,58 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+; RUN: igc_opt -igc-wave-shuffle-index-sinking -S < %s | FileCheck %s
+; ------------------------------------------------
+; WaveShuffleIndexSinking
+;
+; Verifies that four WaveShuffleIndex instructions with the same source and a constant channel get subsequent instructions checked and hoisted
+; Each WaveShuffleIndex instruction is in turn fed into an add, and then a shl
+; The second operand of the add is not a constant, so the add is considered an anchor instruction
+; The second operand of the shl is a constant, so the shl is considered a hoistable instruction
+; Due to distributive properties, the shl is allowed to be hoisted above the add, and afterwards, above all the WaveShuffleIndex instructions
+; Since there are 4 WaveShuffleIndex instructions in the ShuffleGroup, we can trade a shl on the source of the WaveShuffleIndex and a shl on the second operand of the add for removing all 4 shl instructions operating on the result of each add
+; This changes the number of instructions from 4 * WSI + 4 * add + 4 * shl to shl(for %a) + shl(for %b) + 4 * WSI + 4 * add, reducing the total number of instructions by 2, while preserving functionality
+; ------------------------------------------------
+
+define void @test_wave_shuffle_index_sinking(i32* %dst0, i32* %dst1, i32* %dst2, i32* %dst3, i32 %a, i32 %b) {
+; CHECK: [[HOISTED:%.*]] = shl i32 %a, 2
+; CHECK: [[WS0:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED]], i32 0, i32 0)
+  %ws0 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 0, i32 0)
+; CHECK: [[ANCHOR_HOISTED:%.*]] = shl i32 %b, 2
+; CHECK-NEXT: [[ANCHOR0:%.*]] = add i32 [[WS0]], [[ANCHOR_HOISTED]]
+  %add0 = add i32 %ws0, %b
+  %shl0 = shl i32 %add0, 2
+; CHECK: store i32 [[ANCHOR0]], i32* %dst0
+  store i32 %shl0, i32* %dst0
+; CHECK: [[WS1:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED]], i32 1, i32 0)
+  %ws1 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 1, i32 0)
+; CHECK: [[ANCHOR1:%.*]] = add i32 [[WS1]], [[ANCHOR_HOISTED]]
+  %add1 = add i32 %ws1, %b
+  %shl1 = shl i32 %add1, 2
+; CHECK: store i32 [[ANCHOR1]], i32* %dst1
+  store i32 %shl1, i32* %dst1
+; CHECK: [[WS2:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED]], i32 2, i32 0)
+  %ws2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 2, i32 0)
+; CHECK: [[ANCHOR2:%.*]] = add i32 [[WS2]], [[ANCHOR_HOISTED]]
+  %add2 = add i32 %ws2, %b
+  %shl2 = shl i32 %add2, 2
+; CHECK: store i32 [[ANCHOR2]], i32* %dst2
+  store i32 %shl2, i32* %dst2
+; CHECK: [[WS3:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED]], i32 3, i32 0)
+  %ws3 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 3, i32 0)
+; CHECK: [[ANCHOR3:%.*]] = add i32 [[WS3]], [[ANCHOR_HOISTED]]
+  %add3 = add i32 %ws3, %b
+  %shl3 = shl i32 %add3, 2
+; CHECK: store i32 [[ANCHOR3]], i32* %dst3
+  store i32 %shl3, i32* %dst3
+  ret void
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32) #0
+
+attributes #0 = { convergent nounwind readnone }
diff --git a/IGC/Compiler/tests/WaveShuffleIndexSinking/split-sink-merge.ll b/IGC/Compiler/tests/WaveShuffleIndexSinking/split-sink-merge.ll
@@ -0,0 +1,72 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+; RUN: igc_opt -igc-wave-shuffle-index-sinking -S < %s | FileCheck %s
+; ------------------------------------------------
+; WaveShuffleIndexSinking
+;
+; This test does not primarily demonstrate the benefits/profitability of the optimization
+; This test focuses on the auxiliary functionality of splitting and merging various WaveShuffleIndex instructions
+; Only the 4 combinations of N/N, N/Y, Y/N, and Y/Y for Split/Merge need to be considered, Sink is for informational purposes only
+;
+; Test Scenarios
+; %ws0: Split: N, Sink: Y, Merge: N
+; %ws1: Split: Y, Sink: Y/Y, Merge: N (profitable to sink both paths)
+; %ws2: Split: Y, Sink: Y/N, Merge: N (profitable to sink one path)
+; %ws3: Split: Y, Sink: N/N, Merge: Y (nothing to group and sink with)
+; %ws4: Split: N, Sink: N, Merge: Y
+; %ws5: Split: N, Sink: N, Merge: Y
+; ------------------------------------------------
+
+define void @test_split_sink_merge(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK: [[USE1_WS0_WS1C1_HOISTED:%.*]] = shl i32 %a, 2
+; CHECK: [[WS0:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[USE1_WS0_WS1C1_HOISTED]], i32 0, i32 0)
+  %ws0 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 0, i32 0)
+  %use1_ws0 = shl i32 %ws0, 2
+; CHECK: add i32 [[WS0]], %c
+  %anchor1_ws0 = add i32 %use1_ws0, %c
+; CHECK: [[WS1C1:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[USE1_WS0_WS1C1_HOISTED]], i32 1, i32 0)
+  %ws1 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 1, i32 0)
+  %use1_ws1 = shl i32 %ws1, 2
+; CHECK: add i32 [[WS1C1]], %c
+  %anchor1_ws1 = add i32 %use1_ws1, %c
+; CHECK: [[USE2_WS1C2_WS2C1_HOISTED:%.*]] = shl i32 %a, 3
+; CHECK: [[WS1C2:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[USE2_WS1C2_WS2C1_HOISTED]], i32 1, i32 0)
+  %use2_ws1 = shl i32 %ws1, 3
+; CHECK: add i32 [[WS1C2]], %d
+  %anchor2_ws1 = add i32 %use2_ws1, %d
+; CHECK: [[WS2C1:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[USE2_WS1C2_WS2C1_HOISTED]], i32 2, i32 0)
+  %ws2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 2, i32 0)
+  %use1_ws2 = shl i32 %ws2, 3
+; CHECK: add i32 [[WS2C1]], %d
+  %anchor1_ws2 = add i32 %use1_ws2, %d
+; CHECK: [[WS2C2:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 2, i32 0)
+; CHECK: [[USE2_WS2C2_NOT_HOISTED:%.*]] = shl i32 [[WS2C2]], 4
+  %use2_ws2 = shl i32 %ws2, 4
+; CHECK: add i32 [[USE2_WS2C2_NOT_HOISTED]], %d
+  %anchor2_ws2 = add i32 %use2_ws2, %d
+; CHECK: [[WS3:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %b, i32 0, i32 0)
+  %ws3 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %b, i32 0, i32 0)
+; CHECK-NEXT: add i32 [[WS3]], %c
+  %use1_ws3 = add i32 %ws3, %c
+; CHECK-NEXT: add i32 [[WS3]], %d
+  %use2_ws3 = add i32 %ws3, %d
+; CHECK: [[WS4:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %b, i32 1, i32 0)
+  %ws4 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %b, i32 1, i32 0)
+; CHECK-NEXT: {{%.*}} = add i32 [[WS4]], %c
+  %use1_ws4 = add i32 %ws4, %c
+; CHECK-NOT: call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %b, i32 1, i32 0)
+  %ws5 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %b, i32 1, i32 0)
+; CHECK-NEXT: {{%.*}} = add i32 [[WS4]], %d
+  %use1_ws5 = add i32 %ws5, %d
+  ret void
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32) #0
+
+attributes #0 = { convergent nounwind readnone }
diff --git a/IGC/Compiler/tests/WaveShuffleIndexSinking/two-iterations-delay.ll b/IGC/Compiler/tests/WaveShuffleIndexSinking/two-iterations-delay.ll
@@ -0,0 +1,71 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+; RUN: igc_opt -igc-wave-shuffle-index-sinking -S < %s | FileCheck %s
+; ------------------------------------------------
+; WaveShuffleIndexSinking
+;
+; A new WaveShuffleIndex instruction may match with more hoistable instructions when compared to some individual WaveShuffleIndex insts in an existing ShuffleGroup
+; However, the group's maximal list of hoistable instructions is already established, and can only be reduced as more WaveShuffleIndex insts attempt to join
+; The partial group with the additional hoistable instructions will be hoisted in the next iteration
+; The pass is configured to run iteratively up to a maximum of the value specified by the WaveShuffleIndexSinkingMaxIterations regkey (default: 3)
+; Rerunning ensures that any potential hoistable instructions that were not added to a ShuffleGroup that are still profitable to merge will get merged eventually
+; ShuffleGroup consisting of %ws0 and %ws1 have established maximal InstChain, even though %ws1 and %ws2 could have formed a ShuffleGroup with a longer InstChain
+; ashr gets hoisted in next iteration when ShuffleGroup containing %ws1 and %ws2 gets constructed since %ws0 has no more suitable instructions in order to join the ShuffleGroup
+; Note: Hoisted instructions are to demonstrate functionality, InstCombine would reduce the shl by 2 and ashr by 1 to a single shl by 1
+; ------------------------------------------------
+
+define void @test_wave_shuffle_index_sinking(i32* %dst0, i32* %dst1, i32* %dst2, i32* %dst3, i32 %a, i32 %b, i32 %c) {
+; CHECK: [[HOISTED_I1:%.*]] = shl i32 %a, 2
+; CHECK: [[WS0:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I1]], i32 0, i32 0)
+  %ws0 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 0, i32 0)
+; CHECK: [[ANCHOR1_HOISTED_I1:%.*]] = shl i32 %b, 2
+; CHECK-NEXT: [[ANCHOR1_WS0:%.*]] = add i32 [[WS0]], [[ANCHOR1_HOISTED_I1]]
+  %add0 = add i32 %ws0, %b
+  %shl0 = shl i32 %add0, 2
+; CHECK: store i32 [[ANCHOR1_WS0]], i32* %dst0
+  store i32 %shl0, i32* %dst0
+; CHECK: [[HOISTED_I2:%.*]] = ashr i32 [[HOISTED_I1]], 1
+; CHECK: [[WS1:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I2]], i32 1, i32 0)
+  %ws1 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 1, i32 0)
+; CHECK: [[ANCHOR1_HOISTED_I2:%.*]] = ashr i32 [[ANCHOR1_HOISTED_I1]], 1
+; CHECK: [[ANCHOR1_WS1:%.*]] = add i32 [[WS1]], [[ANCHOR1_HOISTED_I2]]
+  %add1 = add i32 %ws1, %b
+  %shl1 = shl i32 %add1, 2
+; CHECK: [[ANCHOR2_HOISTED_I2:%.*]] = ashr i32 %c, 1
+; CHECK-NEXT: [[ANCHOR2_WS1:%.*]] = mul i32 [[ANCHOR1_WS1]], [[ANCHOR2_HOISTED_I2]]
+  %mul1 = mul i32 %shl1, %c
+  %ashr1 = ashr i32 %mul1, 1
+; CHECK: store i32 [[ANCHOR2_WS1]], i32* %dst1
+  store i32 %ashr1, i32* %dst1
+; CHECK: [[WS2:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I2]], i32 2, i32 0)
+  %ws2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 2, i32 0)
+; CHECK: [[ANCHOR1_WS2:%.*]] = add i32 [[WS2]], [[ANCHOR1_HOISTED_I2]]
+  %add2 = add i32 %ws2, %b
+  %shl2 = shl i32 %add2, 2
+; CHECK: [[ANCHOR2_WS2:%.*]] = mul i32 [[ANCHOR1_WS2]], [[ANCHOR2_HOISTED_I2]]
+  %mul2 = mul i32 %shl2, %c
+  %ashr2 = ashr i32 %mul2, 1
+; CHECK: store i32 [[ANCHOR2_WS2]], i32* %dst2
+  store i32 %ashr2, i32* %dst2
+; CHECK: [[WS3:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I2]], i32 3, i32 0)
+  %ws3 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 3, i32 0)
+; CHECK: [[ANCHOR1_WS3:%.*]] = add i32 [[WS3]], [[ANCHOR1_HOISTED_I2]]
+  %add3 = add i32 %ws3, %b
+  %shl3 = shl i32 %add3, 2
+; CHECK: [[ANCHOR2_WS3:%.*]] = mul i32 [[ANCHOR1_WS3]], [[ANCHOR2_HOISTED_I2]]
+  %mul3 = mul i32 %shl3, %c
+  %ashr3 = ashr i32 %mul3, 1
+; CHECK: store i32 [[ANCHOR2_WS3]], i32* %dst3
+  store i32 %ashr3, i32* %dst3
+  ret void
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32) #0
+
+attributes #0 = { convergent nounwind readnone }
diff --git a/IGC/Compiler/tests/WaveShuffleIndexSinking/two-iterations-trim.ll b/IGC/Compiler/tests/WaveShuffleIndexSinking/two-iterations-trim.ll
@@ -0,0 +1,69 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+; RUN: igc_opt -igc-wave-shuffle-index-sinking -S < %s | FileCheck %s
+; ------------------------------------------------
+; WaveShuffleIndexSinking
+;
+; An initial ShuffleGroup's InstChain may get reduced to encompass a wider number of WaveShuffleIndex that have fewer similar instructions
+; The pass is configured to run iteratively up to a maximum of the value specified by the WaveShuffleIndexSinkingMaxIterations regkey (default: 3)
+; Rerunning ensures that any potential hoistable instructions that were kicked out of a ShuffleGroup that are still profitable to merge will get merged eventually
+; ShuffleGroup consisting of %ws0, %ws1, and %ws2 gets trimmed in first iteration in order to accommodate %ws3
+; ashr gets hoisted in next iteration when ShuffleGroup (%ws0, %ws1, %ws2) gets reconstructed and %ws3 has no more suitable instructions in order to join the ShuffleGroup
+; Note: Hoisted instructions are to demonstrate functionality, InstCombine would reduce the shl by 2 and ashr by 1 to a single shl by 1
+; ------------------------------------------------
+
+define void @test_wave_shuffle_index_sinking(i32* %dst0, i32* %dst1, i32* %dst2, i32* %dst3, i32 %a, i32 %b, i32 %c) {
+; CHECK: [[HOISTED_I1:%.*]] = shl i32 %a, 2
+; CHECK: [[HOISTED_I2:%.*]] = ashr i32 [[HOISTED_I1]], 1
+; CHECK: [[WS0:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I2]], i32 0, i32 0)
+  %ws0 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 0, i32 0)
+; CHECK: [[ANCHOR1_HOISTED_I1:%.*]] = shl i32 %b, 2
+; CHECK: [[ANCHOR1_HOISTED_I2:%.*]] = ashr i32 [[ANCHOR1_HOISTED_I1]], 1
+; CHECK-NEXT: [[ANCHOR1_WS0:%.*]] = add i32 [[WS0]], [[ANCHOR1_HOISTED_I2]]
+  %add0 = add i32 %ws0, %b
+  %shl0 = shl i32 %add0, 2
+; CHECK: [[ANCHOR2_HOISTED_I2:%.*]] = ashr i32 %c, 1
+; CHECK-NEXT: [[ANCHOR2_WS0:%.*]] = mul i32 [[ANCHOR1_WS0]], [[ANCHOR2_HOISTED_I2]]
+  %mul0 = mul i32 %shl0, %c
+  %ashr0 = ashr i32 %mul0, 1
+; CHECK: store i32 [[ANCHOR2_WS0]], i32* %dst0
+  store i32 %ashr0, i32* %dst0
+; CHECK: [[WS1:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I2]], i32 1, i32 0)
+  %ws1 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 1, i32 0)
+; CHECK: [[ANCHOR1_WS1:%.*]] = add i32 [[WS1]], [[ANCHOR1_HOISTED_I2]]
+  %add1 = add i32 %ws1, %b
+  %shl1 = shl i32 %add1, 2
+; CHECK: [[ANCHOR2_WS1:%.*]] = mul i32 [[ANCHOR1_WS1]], [[ANCHOR2_HOISTED_I2]]
+  %mul1 = mul i32 %shl1, %c
+  %ashr1 = ashr i32 %mul1, 1
+; CHECK: store i32 [[ANCHOR2_WS1]], i32* %dst1
+  store i32 %ashr1, i32* %dst1
+; CHECK: [[WS2:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I2]], i32 2, i32 0)
+  %ws2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 2, i32 0)
+; CHECK: [[ANCHOR1_WS2:%.*]] = add i32 [[WS2]], [[ANCHOR1_HOISTED_I2]]
+  %add2 = add i32 %ws2, %b
+  %shl2 = shl i32 %add2, 2
+; CHECK: [[ANCHOR2_WS2:%.*]] = mul i32 [[ANCHOR1_WS2]], [[ANCHOR2_HOISTED_I2]]
+  %mul2 = mul i32 %shl2, %c
+  %ashr2 = ashr i32 %mul2, 1
+; CHECK: store i32 [[ANCHOR2_WS2]], i32* %dst2
+  store i32 %ashr2, i32* %dst2
+; CHECK: [[WS3:%.*]] = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 [[HOISTED_I1]], i32 3, i32 0)
+  %ws3 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %a, i32 3, i32 0)
+; CHECK: [[ANCHOR1_WS3:%.*]] = add i32 [[WS3]], [[ANCHOR1_HOISTED_I1]]
+  %add3 = add i32 %ws3, %b
+  %shl3 = shl i32 %add3, 2
+; CHECK: store i32 [[ANCHOR1_WS3]], i32* %dst3
+  store i32 %shl3, i32* %dst3
+  ret void
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32) #0
+
+attributes #0 = { convergent nounwind readnone }
diff --git a/IGC/GenISAIntrinsics/GenIntrinsicInst.h b/IGC/GenISAIntrinsics/GenIntrinsicInst.h
@@ -1205,6 +1205,25 @@ class WavePrefixIntrinsic : public GenIntrinsicInst
     }
 };
 
+class WaveShuffleIndexIntrinsic : public GenIntrinsicInst   // typed view over a GenISA_WaveShuffleIndex intrinsic call
+{
+public:
+    Value* getSrc() const { return getOperand( 0 ); }       // operand 0: the source value
+    Value* getChannel() const { return getOperand( 1 ); }   // operand 1: the lane/channel selector
+
+    void setSrc( Value* src ) { setOperand( 0, src ); }     // overwrites operand 0 (the source) in place
+
+    // LLVM-style RTTI hooks so that isa<>, cast<>, and dyn_cast<> work on this wrapper:
+    static inline bool classof( const GenIntrinsicInst* I )
+    {
+        return I->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex; // decided by intrinsic ID alone
+    }
+    static inline bool classof( const Value* V )
+    {
+        return isa<GenIntrinsicInst>( V ) && classof( cast<GenIntrinsicInst>( V ) ); // isa<> check guards the cast<>
+    }
+};
+
 class QuadPrefixIntrinsic : public GenIntrinsicInst
 {
 public:
diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h
@@ -321,6 +321,8 @@ DECLARE_IGC_REGKEY(DWORD, FPRoundingModeCoalescingMaxDistance, 20, "Max distance
 DECLARE_IGC_REGKEY(bool, DisableDotAddToDp4aMerge, false, "Disable Dot and Add ops to Dp4a merge optimization.", false)
 DECLARE_IGC_REGKEY(bool, DisableLoopSplitWidePHIs, false, "Disable splitting of loop PHI values to eliminate subvector extract operations", false)
 DECLARE_IGC_REGKEY(bool, EnableBarrierControlFlowOptimizationPass, false, "Enable barrier control flow optimization pass", false)
+DECLARE_IGC_REGKEY(bool, EnableWaveShuffleIndexSinking, false, "Hoist identical instructions operating on WaveShuffleIndex instructions with the same source and a constant lane/channel", false)
+DECLARE_IGC_REGKEY(DWORD, WaveShuffleIndexSinkingMaxIterations, 3, "Max number of iterations to run iterative WaveShuffleIndexSinking", false)
 
 DECLARE_IGC_GROUP("Shader debugging")
 DECLARE_IGC_REGKEY(bool, CopyA0ToDBG0,                  false, " Copy a0 used for extended msg descriptor to dbg0 to help debug", false)

Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@ set(IGC_BUILD__SRC__Optimizer`
`33`	`33`	`"${CMAKE_CURRENT_SOURCE_DIR}/ValueTracker.cpp"`
`34`	`34`	`"${CMAKE_CURRENT_SOURCE_DIR}/RuntimeValueVectorExtractPass.cpp"`
`35`	`35`	`"${CMAKE_CURRENT_SOURCE_DIR}/BarrierControlFlowOptimization.cpp"`
	`36`	`+ "${CMAKE_CURRENT_SOURCE_DIR}/WaveShuffleIndexSinking.cpp"`
`36`	`37`	`)`
`37`	`38`
`38`	`39`	`set(IGC_BUILD__SRC__Compiler_Optimizer`
`@@ -59,6 +60,7 @@ set(IGC_BUILD__HDR__Optimizer`
`59`	`60`	`"${CMAKE_CURRENT_SOURCE_DIR}/ValueTracker.h"`
`60`	`61`	`"${CMAKE_CURRENT_SOURCE_DIR}/RuntimeValueVectorExtractPass.h"`
`61`	`62`	`"${CMAKE_CURRENT_SOURCE_DIR}/BarrierControlFlowOptimization.hpp"`
	`63`	`+ "${CMAKE_CURRENT_SOURCE_DIR}/WaveShuffleIndexSinking.hpp"`
`62`	`64`	`)`
`63`	`65`
`64`	`66`	`set(IGC_BUILD__HDR__Optimizer`