From fb473a4aa0f13a53259258539a20e56885f032fe Mon Sep 17 00:00:00 2001 From: Qi Zhao Date: Thu, 3 Jul 2025 11:49:55 +0800 Subject: [PATCH] [LoongArch] Optimize inserting element to high part of 256bits vector --- .../LoongArch/LoongArchISelLowering.cpp | 5 +- .../CodeGen/LoongArch/lasx/build-vector.ll | 154 ++++++++---------- .../insert-extract-pair-elements.ll | 22 ++- .../lasx/ir-instruction/insertelement.ll | 6 +- 4 files changed, 79 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 7dae4d30d31be..9ee58fb7f1771 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5934,10 +5934,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register ScratchReg1 = XSrc; if (Idx >= HalfSize) { ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) .addReg(XSrc) - .addReg(XSrc) - .addImm(1); + .addImm(14); } Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index b06f6523e977c..f25e988b52dc9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -250,84 +250,68 @@ define void @buildvector_v32i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 14 ; CHECK-NEXT: ld.b $a1, $sp, 72 ; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 15 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: ld.b $a2, $sp, 80 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0 -; CHECK-NEXT: ld.b $a1, $sp, 80 -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 ; CHECK-NEXT: ld.b $a1, $sp, 88 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 -; CHECK-NEXT: ld.b $a1, $sp, 96 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 1 +; CHECK-NEXT: ld.b $a2, $sp, 96 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 ; CHECK-NEXT: ld.b $a1, $sp, 104 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 -; CHECK-NEXT: ld.b $a1, $sp, 112 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 3 +; CHECK-NEXT: ld.b $a2, $sp, 112 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 ; CHECK-NEXT: ld.b $a1, $sp, 120 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6 -; CHECK-NEXT: ld.b $a1, $sp, 128 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 5 +; CHECK-NEXT: ld.b $a2, $sp, 128 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6 ; CHECK-NEXT: ld.b $a1, $sp, 136 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 -; CHECK-NEXT: ld.b $a1, $sp, 144 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 7 +; CHECK-NEXT: ld.b $a2, $sp, 144 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 ; CHECK-NEXT: ld.b $a1, $sp, 152 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 -; CHECK-NEXT: ld.b $a1, $sp, 160 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 9 +; CHECK-NEXT: ld.b $a2, $sp, 160 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 ; CHECK-NEXT: ld.b $a1, $sp, 168 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 -; CHECK-NEXT: ld.b $a1, $sp, 176 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 11 +; CHECK-NEXT: ld.b $a2, $sp, 176 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 ; CHECK-NEXT: ld.b $a1, $sp, 184 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 13 +; CHECK-NEXT: ld.b $a2, $sp, 192 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14 -; CHECK-NEXT: ld.b $a1, $sp, 192 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 15 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret @@ -371,8 +355,15 @@ entry: define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { ; CHECK-LABEL: buildvector_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.h $t0, $sp, 8 -; CHECK-NEXT: ld.h $t1, $sp, 0 +; CHECK-NEXT: ld.h $t0, $sp, 64 +; CHECK-NEXT: ld.h $t1, $sp, 56 +; CHECK-NEXT: ld.h $t2, $sp, 48 +; CHECK-NEXT: ld.h $t3, $sp, 40 +; CHECK-NEXT: ld.h $t4, $sp, 32 +; CHECK-NEXT: ld.h $t5, $sp, 24 +; CHECK-NEXT: ld.h $t6, $sp, 16 +; CHECK-NEXT: ld.h $t7, $sp, 8 +; CHECK-NEXT: ld.h $t8, $sp, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 2 @@ -380,45 +371,30 @@ define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i1 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 4 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a6, 5 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a7, 6 -; CHECK-NEXT: vinsgr2vr.h $vr0, $t1, 7 -; CHECK-NEXT: ld.h $a1, $sp, 16 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 0 +; CHECK-NEXT: vinsgr2vr.h $vr0, $t8, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t7, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 24 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t6, 1 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 32 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t5, 2 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 40 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t4, 3 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 48 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 4 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t3, 4 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 56 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t2, 5 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 64 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 6 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t1, 6 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 7 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll index 88c3e4367ffa7..a94708e96e896 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll @@ -9,13 +9,12 @@ define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ; CHECK-NEXT: addi.d $fp, $sp, 64 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15 ; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.b $a1, $sp, 31 -; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 +; CHECK-NEXT: ld.b $a0, $sp, 31 +; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15 +; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a0, 1 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: addi.d $sp, $fp, -64 ; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload @@ -38,13 +37,12 @@ define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind { ; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ; CHECK-NEXT: addi.d $fp, $sp, 64 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7 ; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.h $a1, $sp, 30 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: ld.h $a0, $sp, 30 +; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: addi.d $sp, $fp, -64 ; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index 25106b456d2f7..3a4f6efd2c893 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -18,8 +18,7 @@ define void @insert_32xi8_upper(ptr %src, ptr %dst, i8 %in) nounwind { ; CHECK-LABEL: insert_32xi8_upper: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a1, 0 @@ -47,8 +46,7 @@ define void @insert_16xi16_upper(ptr %src, ptr %dst, i16 %in) nounwind { ; CHECK-LABEL: insert_16xi16_upper: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a1, 0