Skip to content

[LoongArch] Optimize inserting extracted elements #146018

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from

Conversation

zhaoqi5
Copy link
Contributor

@zhaoqi5 zhaoqi5 commented Jun 27, 2025

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Jun 27, 2025

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/146018.diff

5 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td (+8-5)
  • (modified) llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td (+4-1)
  • (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll (+2-10)
  • (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll (-4)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll (+2-4)
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ff7b0f2ae3f25..915dc803bdbd7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1593,11 +1593,14 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
           (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
 def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
           (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
-          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
-          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+          (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+          (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+          (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+          (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
 
 // scalar_to_vector
 def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d73d78083ddcd..34c6ffc6727f1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1791,7 +1791,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
           (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
 def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
           (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-
+def : Pat<(vector_insert v4f32:$vd, (f32 (vector_extract v4f32:$vj, uimm2:$imm1)), uimm2:$imm2),
+          (VINSGR2VR_W $vd, (VPICKVE2GR_W v4f32:$vj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v2f64:$vd, (f64 (vector_extract v2f64:$vj, uimm1:$imm1)), uimm1:$imm2),
+          (VINSGR2VR_D $vd, (VPICKVE2GR_D v2f64:$vj, uimm1:$imm1), uimm1:$imm2)>;
 def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
           (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
 def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f3bec11810e9b..f154dd3b8eb3c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,20 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: shufflevector_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT:    movgr2fr.d $fa2, $a0
-; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT:    movgr2fr.d $fa3, $a0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa2
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa3
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 2
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 1
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT:    movgr2fr.d $fa0, $a0
-; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT:    movgr2fr.d $fa1, $a0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa0
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 2
-; CHECK-NEXT:    movfr2gr.d $a0, $fa1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 3
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 3
 ; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index a5d3a0d395b3c..ddbc159ca94ba 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -5,8 +5,6 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT:    movgr2fr.w $fa1, $a0
-; CHECK-NEXT:    movfr2gr.s $a0, $fa1
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
 ; CHECK-NEXT:    ret
 entry:
@@ -19,8 +17,6 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT:    movgr2fr.d $fa1, $a0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa1
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 1
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
index dcf23f0240712..4c34e0f49b8c8 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
@@ -4,8 +4,7 @@
 define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 3
-; CHECK-NEXT:    movfr2gr.s $a0, $fa1
+; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 3
 ; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -17,8 +16,7 @@ entry:
 define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v2f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT:    movfr2gr.d $a0, $fa1
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
 ; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:

@tangaac
Copy link
Contributor

tangaac commented Jun 27, 2025

We could use VEXTRINS instructions instead.

@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Jun 27, 2025

We could use VEXTRINS instructions instead.

Great, using one vextrins instruction is enough. I will modify it later. Thanks.

@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-insert-extract-element branch from 85cb5e9 to aab3fee Compare July 1, 2025 03:12
@zhaoqi5 zhaoqi5 changed the title [LoongArch] Optimize inserting extracted fp elements [LoongArch] Optimize inserting extracted elements Jul 1, 2025
@tangaac
Copy link
Contributor

tangaac commented Jul 2, 2025

  foreach imm1 = 0...1 in {
    foreach imm2 = 0...1 in {
      defvar Imm = !or(!shl(imm2, 4), imm1);
      def : Pat<(vector_insert(vector_insert v4i64:$xd,
                     (GRLenVT(vector_extract v4i64:$xj, imm1)), imm2),
                    (GRLenVT(vector_extract v4i64:$xj, !add(imm1, 2))),
                    !add(imm2, 2)),
                (XVEXTRINS_D $xd, $xj, Imm)>;
    }
  }

We could also support XVEXTRINS.{W/D} instructions.

@tangaac tangaac closed this Jul 2, 2025
@tangaac tangaac reopened this Jul 2, 2025
@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Jul 2, 2025

  foreach imm1 = 0...1 in {
    foreach imm2 = 0...1 in {
      defvar Imm = !or(!shl(imm2, 4), imm1);
      def : Pat<(vector_insert(vector_insert v4i64:$xd,
                     (GRLenVT(vector_extract v4i64:$xj, imm1)), imm2),
                    (GRLenVT(vector_extract v4i64:$xj, !add(imm1, 2))),
                    !add(imm2, 2)),
                (XVEXTRINS_D $xd, $xj, Imm)>;
    }
  }

We could also support XVEXTRINS.{W/D} instructions.

XVEXTRINS operates on two elements separately, one in the front 128 bits and one in the back 128 bits. So two pairs of vector_extract + vector_insert are needed. The current tests cannot be optimized. I will add new tests and support it.

@tangaac
Copy link
Contributor

tangaac commented Jul 2, 2025

Use this patch to support extracting i8/i16-type elements from the hi128 part of a 256-bit vector.
Please update the tests too.
support-extract-i8-i16-type-element-from-hi128-part.txt

@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-insert-extract-element branch from aab3fee to 00a0512 Compare July 2, 2025 10:26
@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Jul 2, 2025

Use this patch to support extracting i8/i16-type elements from the hi128 part of a 256-bit vector. Please update the tests too. support-extract-i8-i16-type-element-from-hi128-part.txt

Done. Thanks for your efforts on this.

@tangaac
Copy link
Contributor

tangaac commented Jul 3, 2025

This change improves vector_insert for lasx

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

-->

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSVE0_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSVE0_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;

@zhaoqi5
Copy link
Contributor Author

zhaoqi5 commented Jul 3, 2025

This change improves vector_insert for lasx

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

-->

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSVE0_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSVE0_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;

The purpose of this pr is mainly to optimize inserting extracted elements from 128 or 256 bits vector. I think we can do this optimization for vector_insert in a later patch.

@tangaac
Copy link
Contributor

tangaac commented Jul 3, 2025

This change improves vector_insert for lasx

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

-->

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSVE0_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSVE0_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;

The purpose of this pr is mainly to optimize inserting extracted elements from 128 or 256 bits vector. I think we can do this optimization for vector_insert in a later patch.

OK

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants