[MLIR][AArch64] Change some tests to ensure SVE vector length is the same throughout the function (#147506)

momchil-velikov · web-flow · commit 962c4217bc68 · 2025-07-09T09:32:25.000+01:00
This change only applies to functions that can be reasonably expected to use SVE registers. Modifying vector length in the middle of a function might cause incorrect stack deallocation if there are callee-saved SVE registers or incorrect access to SVE stack slots. Addresses (non-issue) #143670.
diff --git a/mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp b/mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp
@@ -38,6 +38,17 @@ extern "C" {
 // PR_SVE_VL_LEN_MASK.
 #define PR_VL_LEN_MASK 0xffff
 
+/// Sets the vector length (streaming or not, as indicated by `option`) to
+/// `bits`.
+///
+/// Caveat emptor: If a function has allocated stack slots for SVE registers
+/// (e.g. slots for callee-saved SVE registers or spill slots) changing
+/// the vector length is tricky and error prone - it may cause incorrect stack
+/// deallocation or incorrect access to stack slots.
+///
+/// The recommended strategy is to call `setArmVectorLength` only from functions
+/// that do not access SVE registers, either by themselves or by inlining other
+/// functions.
 static void setArmVectorLength(std::string_view helper_name, int option,
                                uint32_t bits) {
 #if defined(__linux__) && defined(__aarch64__)
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir
@@ -39,20 +39,21 @@ func.func @main() {
     [ 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98, 105, 112]
   ]> : tensor<7x16xi32>
 
+
+  // Set vscale to 2 (vector width = 256). This will have identical effect to:
+  //  * qemu-aarch64 -cpu max,sve-max-vq=2 (...)
+  %c256 = arith.constant 256 : i32
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+
   func.call @pack(%A) : (tensor<7x16xi32>) -> ()
 
   return
 }
 
-func.func private @pack(%A: tensor<7x16xi32>) {
+func.func private @pack(%A: tensor<7x16xi32>) attributes {no_inline} {
   %c1 = arith.constant 1 : index
   %pad_val = arith.constant 123 : i32
 
-  // Set vscale to 2 (vector width = 256). This will have identical effect to:
-  //  * qemu-aarch64 -cpu max,sve-max-vq=2 (...)
-  %c256 = arith.constant 256 : i32
-  func.call @setArmVLBits(%c256) : (i32) -> ()
-
   // Scalable tile size
   %vs = vector.vscale
   %c8 = arith.constant 8 : index
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-scalable-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-scalable-inner-tile.mlir
@@ -10,7 +10,7 @@
 // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s
 
 /// End-to-end test for linalg.pack + linalg.unpack where one of the inner tile sizes is
-/// scalable. 
+/// scalable.
 /// NOTE: Vectorization has not been enabled yet!
 
 
@@ -21,7 +21,12 @@ func.func @main() {
   // (If your platform supports it, you can play with other values as well)
   %c256 = arith.constant 256 : i32
   func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_pack_unpack_scalable_inner_tile() : () -> ()
 
+  return
+}
+
+func.func @test_pack_unpack_scalable_inner_tile() attributes {no_inline} {
   // Dynamic/scalable tile size (vscale x 4)
   %c4 = arith.constant 4 : index
   %vs = vector.vscale
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-deinterleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-deinterleave.mlir
@@ -17,7 +17,7 @@ func.func @entry() {
   return
 }
 
-func.func @test_deinterleave() {
+func.func @test_deinterleave() attributes {no_inline} {
   %step_vector = llvm.intr.stepvector : vector<[4]xi8>
   vector.print %step_vector : vector<[4]xi8>
   // CHECK: ( 0, 1, 2, 3, 4, 5, 6, 7 )
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-setArmVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-setArmVLBits.mlir
@@ -8,7 +8,7 @@
 
 // RUN: %{compile} | %{run} | FileCheck %s
 
-func.func @checkVScale() {
+func.func @checkVScale() attributes {no_inline} {
   %vscale = vector.vscale
   vector.print str "vscale = "
   vector.print %vscale : index
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-non-trailing.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/transfer-read-scalable-non-trailing.mlir
@@ -15,9 +15,7 @@
 // Test the transfer_read with vector type with a non-trailing scalable
 // dimension as transformed by the pattern LegalizeTransferRead.
 
-func.func @transfer_read_scalable_non_trailing(%vs : i32, %M : memref<?x8xi8>) {
-  func.call @setArmVLBits(%vs) : (i32) -> ()
-
+func.func @transfer_read_scalable_non_trailing(%M : memref<?x8xi8>) attributes {no_inline} {
   // Read an LLVM-illegal vector
   %c0 = arith.constant 0 : index
   %c0_i8 = arith.constant 0 : i8
@@ -56,14 +54,16 @@ func.func @main() {
 // CHECK:( 31, 32, 33, 34, 35, 36, 37, 38, 41, 42, 43, 44, 45, 46, 47, 48 )
   vector.print str "Result(VL128):\n"
   %c128 = arith.constant 128 : i32
-  func.call @transfer_read_scalable_non_trailing(%c128, %MM) : (i32, memref<?x8xi8>) -> ()
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @transfer_read_scalable_non_trailing(%MM) : (memref<?x8xi8>) -> ()
 
 // CHECK-LABEL: Result(VL256):
 // CHECK: ( 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 41, 42, 43, 44, 45, 46, 47, 48 )
 // CHECK: ( 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 71, 72, 73, 74, 75, 76, 77, 78, 81, 82, 83, 84, 85, 86, 87, 88 )
   vector.print str "Result(VL256):\n"
   %c256 = arith.constant 256 : i32
-  func.call @transfer_read_scalable_non_trailing(%c256, %MM) : (i32, memref<?x8xi8>) -> ()
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @transfer_read_scalable_non_trailing(%MM) : (memref<?x8xi8>) -> ()
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir
@@ -116,7 +116,7 @@ func.func private @prepareRHSTestData(%in: vector<4x8xi8>) -> memref<?xi8> {
 
 // CHECK-IR-LABEL: llvm.func @test_smmla
 // CHECK-IR-COUNT-4: arm_sve.intr.smmla
-func.func @test_smmla() {
+func.func @test_smmla() attributes {no_inline} {
 
   %c0 = arith.constant 0 : index
   %c0_i32 = arith.constant 0 : i32
@@ -131,10 +131,6 @@ func.func @test_smmla() {
   %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
   %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
 
-  // FIXME: Workaround for a crash, see https://github.com/llvm/llvm-project/issues/143670
-  %acc_cast = memref.cast %acc_mem : memref<4x?xi32> to memref<*xi32>
-  call @printMemrefI32(%acc_cast) : (memref<*xi32>) -> ()
-
   // LHS test data
   %lhs_cst = arith.constant dense<[[-35, -27, -36, -31,  23, -34,  -8, -33],
                                    [-20,  17, -32, -47,  37,  22,  -7, -21],
@@ -186,7 +182,7 @@ func.func @test_smmla() {
 
 // CHECK-IR-LABEL: llvm.func @test_ummla
 // CHECK-IR-COUNT-4: arm_sve.intr.ummla
-func.func @test_ummla() {
+func.func @test_ummla() attributes {no_inline} {
 
   %c0 = arith.constant 0 : index
   %c0_i32 = arith.constant 0 : i32
@@ -253,7 +249,7 @@ func.func @test_ummla() {
 
 // CHECK-IR-LABEL: llvm.func @test_usmmla
 // CHECK-IR-COUNT-4: arm_sve.intr.usmmla
-func.func @test_usmmla() {
+func.func @test_usmmla() attributes {no_inline} {
 
   %c0 = arith.constant 0 : index
   %c0_i32 = arith.constant 0 : i32
@@ -321,7 +317,7 @@ func.func @test_usmmla() {
 
 // CHECK-IR-LABEL: llvm.func @test_summla
 // CHECK-IR-COUNT-4: arm_sve.intr.usmmla
-func.func @test_summla() {
+func.func @test_summla() attributes {no_inline} {
 
   %c0 = arith.constant 0 : index
   %c0_i32 = arith.constant 0 : i32

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ func.func @entry() {`
`17`	`17`	`return`
`18`	`18`	`}`
`19`	`19`
`20`		`-func.func @test_deinterleave() {`
	`20`	`+func.func @test_deinterleave() attributes {no_inline} {`
`21`	`21`	`%step_vector = llvm.intr.stepvector : vector<[4]xi8>`
`22`	`22`	`vector.print %step_vector : vector<[4]xi8>`
`23`	`23`	`// CHECK: ( 0, 1, 2, 3, 4, 5, 6, 7 )`