Skip to content

Commit 23f4988

Browse files
Support Call::dynamic_shuffle for LUT in SVE2
The OptimizeShuffles pass is enabled in CodeGen_ARM for SVE2. - Detects a gather load whose index range is bounded within a certain value, e.g. a look-up table (LUT). - Transforms it into a contiguous load + Call::dynamic_shuffle, which is lowered to a TBL instruction by codegen. This is especially useful for vectorizing with long vectors in SME2 streaming mode, where the general form of gather load is unsupported. OptimizeShuffles is modified so that it can be used commonly between targets (for now, Hexagon and ARM SVE2).
1 parent 9d5081c commit 23f4988

File tree

6 files changed

+169
-52
lines changed

6 files changed

+169
-52
lines changed

src/CodeGen_ARM.cpp

Lines changed: 78 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "IROperator.h"
1515
#include "IRPrinter.h"
1616
#include "LLVM_Headers.h"
17+
#include "OptimizeShuffles.h"
1718
#include "Simplify.h"
1819
#include "Substitute.h"
1920
#include "Util.h"
@@ -227,6 +228,7 @@ class CodeGen_ARM : public CodeGen_Posix {
227228
Value *interleave_vectors(const std::vector<Value *> &) override;
228229
Value *shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) override;
229230
Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices);
231+
Value *shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index);
230232
Value *codegen_shuffle_indices(int bits, const std::vector<int> &indices);
231233
Value *codegen_whilelt(int total_lanes, int start, int end);
232234
void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
@@ -1223,6 +1225,22 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f,
12231225
// and a - (b << c) into umlsl/smlsl.
12241226
func.body = distribute_shifts(func.body, /* multiply_adds */ true);
12251227

1228+
if (target_vscale() > 0) {
1229+
debug(1) << "ARM: Optimizing shuffles...\n";
1230+
const int lut_alignment = 16;
1231+
1232+
auto max_span_query = [&](const Type &lut_type) -> std::vector<int> {
1233+
int vl = natural_vector_size(lut_type);
1234+
// SVE2 has TBL and TBL2 (TBL with two src vectors) LLVM intrinsic.
1235+
// We prioritize TBL with single src vector in favor of performance.
1236+
return {vl, vl * 2};
1237+
};
1238+
1239+
func.body = optimize_shuffles(func.body, lut_alignment, native_vector_bits(), max_span_query, true);
1240+
debug(2) << "ARM: Lowering after optimizing shuffles:\n"
1241+
<< func.body << "\n\n";
1242+
}
1243+
12261244
CodeGen_Posix::compile_func(func, simple_name, extern_name);
12271245
}
12281246

@@ -2250,7 +2268,7 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
22502268
}
22512269

22522270
// Perform vector shuffle by decomposing the operation to multiple native shuffle steps
2253-
// which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 instruction
2271+
// which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 LLVM intrinsic.
22542272
DecomposeVectorShuffle shuffler(*this, a, b, get_vector_num_elements(a->getType()), natural_lanes);
22552273
return shuffler.run(indices);
22562274
}
@@ -2259,41 +2277,50 @@ Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const s
22592277
internal_assert(a) << "Must provide a valid vector operand";
22602278
internal_assert(!indices.empty()) << "Cannot shuffle with empty indices";
22612279

2280+
llvm::Type *elt = get_vector_element_type(a->getType());
2281+
Value *val_indices = codegen_shuffle_indices(elt->getScalarSizeInBits(), indices);
2282+
auto [min_itr, max_itr] = std::minmax_element(indices.begin(), indices.end());
2283+
int highest_lane = *max_itr;
2284+
internal_assert(highest_lane >= 0)
2285+
<< "highest_lane was "
2286+
<< (highest_lane == SliceIndexNone ? "SliceIndexNone" :
2287+
highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" :
2288+
"")
2289+
<< " (" << highest_lane << ")";
2290+
2291+
return shuffle_scalable_vectors_general_llvm(a, b, val_indices, *min_itr, *max_itr);
2292+
}
2293+
2294+
Value *CodeGen_ARM::shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index) {
2295+
internal_assert(a) << "Must provide a valid vector operand";
2296+
internal_assert(indices) << "Must provide a valid indices";
2297+
22622298
llvm::Type *elt = get_vector_element_type(a->getType());
22632299
const int bits = elt->getScalarSizeInBits();
22642300
const int natural_lanes = natural_vector_size(Int(bits));
22652301
const int src_lanes = get_vector_num_elements(a->getType());
2266-
const int dst_lanes = indices.size();
2302+
const int dst_lanes = get_vector_num_elements(indices->getType());
22672303
llvm::Type *dst_type = get_vector_type(elt, dst_lanes);
22682304

22692305
internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n";
22702306
internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes)
22712307
<< "Only deal with vector with natural_lanes\n";
22722308

22732309
// We select TBL or TBL2 intrinsic depending on indices range
2274-
int highest_lane = *std::max_element(indices.begin(), indices.end());
2275-
internal_assert(highest_lane >= 0)
2276-
<< "highest_lane was "
2277-
<< (highest_lane == SliceIndexNone ? "SliceIndexNone" :
2278-
highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" :
2279-
"")
2280-
<< " (" << highest_lane << ")";
2281-
2282-
bool use_tbl = highest_lane < src_lanes;
2310+
const bool use_tbl = max_index < src_lanes;
22832311
internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n";
22842312

22852313
auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? "tbl" : "tbl2", mangle_llvm_type(dst_type));
22862314

2287-
Value *val_indices = codegen_shuffle_indices(bits, indices);
22882315
llvm::Type *vt_natural = get_vector_type(elt, natural_lanes);
22892316
std::vector<llvm::Type *> llvm_arg_types;
22902317
std::vector<llvm::Value *> llvm_arg_vals;
22912318
if (use_tbl) {
2292-
llvm_arg_types = {vt_natural, val_indices->getType()};
2293-
llvm_arg_vals = {a, val_indices};
2319+
llvm_arg_types = {vt_natural, indices->getType()};
2320+
llvm_arg_vals = {a, indices};
22942321
} else {
2295-
llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()};
2296-
llvm_arg_vals = {a, b, val_indices};
2322+
llvm_arg_types = {vt_natural, vt_natural, indices->getType()};
2323+
llvm_arg_vals = {a, b, indices};
22972324
}
22982325
llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false);
22992326
FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
@@ -2383,6 +2410,41 @@ void CodeGen_ARM::visit(const Call *op) {
23832410
value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
23842411
return;
23852412
}
2413+
} else if (op->is_intrinsic(Call::dynamic_shuffle)) {
2414+
internal_assert(target_vscale() > 0);
2415+
internal_assert(op->args.size() == 4);
2416+
const auto min_index = as_const_int(op->args[2]);
2417+
const auto max_index = as_const_int(op->args[3]);
2418+
internal_assert(min_index.has_value() && max_index.has_value());
2419+
2420+
Type lut_type = op->args[0].type();
2421+
const int src_lanes = lut_type.lanes();
2422+
const int dst_lanes = op->args[1].type().lanes();
2423+
const int natural_lanes = natural_vector_size(lut_type);
2424+
2425+
debug(3) << "dynamic_shuffle: [" << *min_index << ", " << *max_index << "]"
2426+
<< ", natural_lanes:" << natural_lanes << ", src_lanes:" << src_lanes << "\n";
2427+
2428+
Value *src = codegen(op->args[0]);
2429+
internal_assert(src_lanes <= natural_lanes * 2) << "src is too long to dynamic_shuffle\n";
2430+
Value *src_a = slice_vector(src, 0, natural_lanes);
2431+
Value *src_b = (src_lanes > natural_lanes) ? slice_vector(src, natural_lanes, natural_lanes) : nullptr;
2432+
2433+
// Cast index to integer with the same bits as LUT data
2434+
Type index_type = UInt(lut_type.bits()).with_lanes(dst_lanes);
2435+
Expr indices = cast(index_type, op->args[1]);
2436+
Value *val_indices = codegen(indices);
2437+
2438+
std::vector<Value *> slices;
2439+
const int num_slices = align_up(dst_lanes, natural_lanes) / natural_lanes;
2440+
slices.reserve(num_slices);
2441+
for (int i = 0; i < num_slices; i++) {
2442+
Value *indices_slice = slice_vector(val_indices, i * natural_lanes, natural_lanes);
2443+
Value *dst_slice = shuffle_scalable_vectors_general_llvm(src_a, src_b, indices_slice, *min_index, *max_index);
2444+
slices.push_back(dst_slice);
2445+
}
2446+
value = slice_vector(concat_vectors(slices), 0, dst_lanes);
2447+
return;
23862448
}
23872449

23882450
if (op->type.is_vector()) {

src/CodeGen_Hexagon.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1935,7 +1935,9 @@ void CodeGen_Hexagon::visit(const Call *op) {
19351935
auto max_index = as_const_int(op->args[3]);
19361936
internal_assert(min_index && max_index);
19371937
Value *lut = codegen(op->args[0]);
1938-
Value *idx = codegen(op->args[1]);
1938+
// Cast the index to 8 bit
1939+
Expr index = cast(UInt(8).with_lanes(op->type.lanes()), op->args[1]);
1940+
Value *idx = codegen(index);
19391941
value = vlut(lut, idx, *min_index, *max_index);
19401942
return;
19411943
} else if (op->is_intrinsic(Call::abs)) {

src/HexagonOptimize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2285,7 +2285,8 @@ class SyncronizationBarriers : public IRMutator {
22852285
Stmt optimize_hexagon_shuffles(const Stmt &s, int lut_alignment) {
22862286
// Replace indirect and other complicated loads with
22872287
// dynamic_shuffle (vlut) calls.
2288-
return optimize_shuffles(s, lut_alignment);
2288+
auto max_span_query = [](const Type &t) -> std::vector<int> { return {256}; };
2289+
return optimize_shuffles(s, lut_alignment, 1024, max_span_query, false);
22892290
}
22902291

22912292
Stmt scatter_gather_generator(Stmt s) {

src/OptimizeShuffles.cpp

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,13 @@ namespace Internal {
2121

2222
namespace {
2323

24+
using SpanQueryType = std::function<std::vector<int>(const Type &)>;
25+
2426
class OptimizeShuffles : public IRMutator {
2527
int lut_alignment;
28+
int native_vector_bits;
29+
SpanQueryType get_max_span_sizes;
30+
bool align_loads_with_native_vector;
2631
Scope<Interval> bounds;
2732
std::vector<std::pair<std::string, Expr>> lets;
2833

@@ -67,7 +72,7 @@ class OptimizeShuffles : public IRMutator {
6772
if (allocations_to_pad.count(op->name)) {
6873
op = s.as<Allocate>();
6974
internal_assert(op);
70-
int padding = 128 / op->type.bytes(); // One native vector
75+
int padding = native_vector_bits / op->type.bits(); // One native vector
7176
return Allocate::make(op->name, op->type, op->memory_type,
7277
op->extents, op->condition,
7378
op->body, op->new_expr, op->free_function,
@@ -99,34 +104,40 @@ class OptimizeShuffles : public IRMutator {
99104
((unaligned_index_bounds.max + align) / align) * align - 1};
100105
ModulusRemainder alignment(align, 0);
101106

102-
for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) {
103-
Expr index_span = span_of_bounds(index_bounds);
104-
index_span = common_subexpression_elimination(index_span);
105-
index_span = simplify(index_span);
106-
107-
if (can_prove(index_span < 256)) {
108-
// This is a lookup within an up to 256 element array. We
109-
// can use dynamic_shuffle for this.
110-
int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : 256;
111-
Expr base = simplify(index_bounds.min);
112-
113-
// Load all of the possible indices loaded from the
114-
// LUT. Note that for clamped ramps, this loads up to 1
115-
// vector past the max, so we will add padding to the
116-
// allocation accordingly (if we're the one that made it).
117-
allocations_to_pad.insert(op->name);
118-
Expr lut = Load::make(op->type.with_lanes(const_extent), op->name,
119-
Ramp::make(base, 1, const_extent),
120-
op->image, op->param, const_true(const_extent), alignment);
121-
122-
// We know the size of the LUT is not more than 256, so we
123-
// can safely cast the index to 8 bit, which
124-
// dynamic_shuffle requires.
125-
index = simplify(cast(UInt(8).with_lanes(op->type.lanes()), index - base));
126-
return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic);
107+
const int native_vector_size = native_vector_bits / op->type.bits();
108+
109+
for (const auto &max_span_size : get_max_span_sizes(op->type)) {
110+
111+
for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) {
112+
Expr index_span = span_of_bounds(index_bounds);
113+
index_span = common_subexpression_elimination(index_span);
114+
index_span = simplify(index_span);
115+
116+
if (can_prove(index_span < max_span_size)) {
117+
// This is a lookup within an up to max_span_size element array. We
118+
// can use dynamic_shuffle for this.
119+
int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : max_span_size;
120+
if (align_loads_with_native_vector) {
121+
const_extent = align_up(const_extent, native_vector_size);
122+
}
123+
Expr base = simplify(index_bounds.min);
124+
125+
// Load all of the possible indices loaded from the
126+
// LUT. Note that for clamped ramps, this loads up to 1
127+
// vector past the max, so we will add padding to the
128+
// allocation accordingly (if we're the one that made it).
129+
allocations_to_pad.insert(op->name);
130+
Expr lut = Load::make(op->type.with_lanes(const_extent), op->name,
131+
Ramp::make(base, 1, const_extent),
132+
op->image, op->param, const_true(const_extent), alignment);
133+
134+
// Target dependent codegen needs to cast the type of index to what it accepts
135+
index = simplify(index - base);
136+
return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic);
137+
}
138+
// Only the first iteration of this loop is aligned.
139+
alignment = ModulusRemainder();
127140
}
128-
// Only the first iteration of this loop is aligned.
129-
alignment = ModulusRemainder();
130141
}
131142
}
132143
if (!index.same_as(op->index)) {
@@ -137,14 +148,17 @@ class OptimizeShuffles : public IRMutator {
137148
}
138149

139150
public:
140-
OptimizeShuffles(int lut_alignment)
141-
: lut_alignment(lut_alignment) {
151+
OptimizeShuffles(int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector)
152+
: lut_alignment(lut_alignment),
153+
native_vector_bits(native_vector_bits),
154+
get_max_span_sizes(std::move(get_max_span_sizes)),
155+
align_loads_with_native_vector(align_loads_with_native_vector) {
142156
}
143157
};
144158
} // namespace
145159

146-
Stmt optimize_shuffles(Stmt s, int lut_alignment) {
147-
s = OptimizeShuffles(lut_alignment)(s);
160+
Stmt optimize_shuffles(Stmt s, int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector) {
161+
s = OptimizeShuffles(lut_alignment, native_vector_bits, std::move(get_max_span_sizes), align_loads_with_native_vector)(s);
148162
return s;
149163
}
150164

src/OptimizeShuffles.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,19 @@
77
*/
88

99
#include "Expr.h"
10+
#include <functional>
11+
#include <vector>
1012

1113
namespace Halide {
1214
namespace Internal {
1315

1416
/* Replace indirect loads with dynamic_shuffle intrinsics where
1517
possible. */
16-
Stmt optimize_shuffles(Stmt s, int lut_alignment);
18+
Stmt optimize_shuffles(Stmt s,
19+
int lut_alignment,
20+
int native_vector_bits,
21+
std::function<std::vector<int>(const Type &)> get_max_span_sizes,
22+
bool align_loads_with_native_vector);
1723

1824
} // namespace Internal
1925
} // namespace Halide

test/correctness/simd_op_check_sve2.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
855855
if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue; // bail out scalar and <vscale x 1 x ty>
856856

857857
AddTestFunctor add(*this, bits, total_lanes);
858-
Expr index = clamp(cast<int>(in_im(x)), 0, W - 1);
858+
Expr index = clamp(in_i32(x), 0, W - 1);
859859
Func tmp;
860860
tmp(x, y) = cast(elt, y);
861861
tmp(x, index) = cast(elt, 1);
@@ -876,6 +876,38 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
876876
}
877877
}
878878
}
879+
880+
// Gather load where index range is bounded within certain value. e.g. LUT
881+
// In this case, Halide tries to transform it into contiguous load + Call::dynamic_shuffle
882+
// which is lowered to TBL instruction. (see OptimizeShuffles.cpp)
883+
if (has_sve()) {
884+
const int width = base_vec_bits;
885+
const int total_lanes = width / bits;
886+
const int instr_lanes = Instruction::get_instr_lanes(bits, total_lanes, target);
887+
if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue; // bail out scalar and <vscale x 1 x ty>
888+
889+
AddTestFunctor add(*this, bits, total_lanes);
890+
const std::vector<std::pair<int, int>> index_min_max{
891+
{0, total_lanes - 1},
892+
{1, total_lanes},
893+
{0, total_lanes * 2 - 1},
894+
};
895+
for (auto &[index_min, index_max] : index_min_max) {
896+
Expr index = cast(Int(32), in_im(x));
897+
index = clamp(index, index_min, index_max);
898+
Expr look_up = in_im(index);
899+
900+
add("tbl", look_up);
901+
}
902+
903+
// Without clamped but bounded by the range of the data type of the input image (8bit)
904+
Expr index = cast(Int(32), in_u8(x)); // 8 bit fixed
905+
int factor = (1 << 8) / (total_lanes * 2);
906+
index = index / factor; // index should be within native_vector*2 range
907+
Expr look_up = in_im(index);
908+
909+
add("tbl", look_up);
910+
}
879911
}
880912
}
881913

0 commit comments

Comments
 (0)