slothy-optimizer · mkannwischer · Dec 1, 2025 · Nov 23, 2025
diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
@@ -1374,6 +1374,52 @@ class vsub(AArch64Instruction):
 ############################
 
 
+class Ldr_D(AArch64Instruction):  # common base of the d_ldr* load forms
+    pass  # no behaviour of its own; the uarch models key on this class
+
+
+class d_ldr(Ldr_D):  # plain form: ldr Da, [Xc] -- no immediate in the pattern
+    pattern = "ldr <Da>, [<Xc>]"
+    inputs = ["Xc"]  # base register is declared input-only
+    outputs = ["Da"]
+
+    @classmethod
+    def make(cls, src):  # src is passed to AArch64Instruction.build as-is
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = None  # pattern carries no immediate
+        obj.pre_index = None
+        obj.addr = obj.args_in[0]  # first input argument, i.e. Xc
+        return obj
+
+
+class d_ldr_with_postinc(Ldr_D):  # post-index form: ldr Da, [Xc], imm
+    pattern = "ldr <Da>, [<Xc>], <imm>"
+    in_outs = ["Xc"]  # base register is declared as both read and written
+    outputs = ["Da"]
+
+    @classmethod
+    def make(cls, src):  # src is passed to AArch64Instruction.build as-is
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = obj.immediate  # presumably the parsed <imm> operand
+        obj.pre_index = None
+        obj.addr = obj.args_in_out[0]  # Xc, the only in/out argument
+        return obj
+
+
+class d_ldr_with_inc_writeback(Ldr_D):  # writeback form: ldr Da, [Xc, imm]!
+    pattern = "ldr <Da>, [<Xc>, <imm>]!"
+    in_outs = ["Xc"]  # base register is declared as both read and written
+    outputs = ["Da"]
+
+    @classmethod
+    def make(cls, src):  # src is passed to AArch64Instruction.build as-is
+        obj = AArch64Instruction.build(cls, src)
+        obj.increment = obj.immediate  # presumably the parsed <imm> operand
+        obj.pre_index = None  # NOTE(review): None despite "[Xc, imm]!"; confirm
+        obj.addr = obj.args_in_out[0]  # Xc, the only in/out argument
+        return obj
+
+
 class Ldr_Q(AArch64Instruction):
     pass
 

diff --git a/slothy/targets/aarch64/cortex_a55.py b/slothy/targets/aarch64/cortex_a55.py
@@ -52,6 +52,7 @@
     Str_X,
     Stp_X,
     Ldr_Q,
+    Ldr_D,
     Str_Q,
     vmov,
     vadd,
@@ -311,6 +312,7 @@ def get_min_max_objective(slothy):
     ],
     # non-q-form vector instructions
     (
+        Ldr_D,
         umov_d,
         mov_d01,
         mov_b00,
@@ -454,6 +456,7 @@ def get_min_max_objective(slothy):
     ): 1,
     (vshl, vshl_d, vsshr, vushr, vuxtl): 1,
     (trn2, trn1, ASimdCompare): 1,
+    (Ldr_D): 1,
     (Ldr_Q): 2,
     (AArch64NeonCount): 1,
     (Str_Q): 1,
@@ -530,6 +533,7 @@ def get_min_max_objective(slothy):
         vmls,
         vmls_lane,
     ): 4,
+    (Ldr_D): 3,
     (Ldr_Q, Str_Q): 4,
     (sub_imm, cmp): 2,
     AArch64NeonCount: 2,

diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -63,6 +63,7 @@
     Ldr_X,
     Str_X,
     Ldr_Q,
+    Ldr_D,
     Str_Q,
     vadd,
     vmul,
@@ -210,7 +211,7 @@ def get_min_max_objective(slothy):
     vuaddlv_sform: [[ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1]],
     Vins: [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
     umov_d: ExecutionUnit.LOAD(),  # ???
-    (Ldr_Q, Ldr_X): ExecutionUnit.LOAD(),
+    (Ldr_D, Ldr_Q, Ldr_X): ExecutionUnit.LOAD(),
     (Str_Q, Str_X): ExecutionUnit.STORE(),
     AArch64Move: ExecutionUnit.SCALAR(),
     (add, add_imm, add_shifted): ExecutionUnit.SCALAR(),
@@ -252,7 +253,7 @@ def get_min_max_objective(slothy):
     Vins: 1,
     umov_d: 1,
     (add, add_imm, add_shifted): 1,
-    (Ldr_Q, Str_Q, Ldr_X, Str_X): 1,
+    (Ldr_D, Ldr_Q, Str_Q, Ldr_X, Str_X): 1,
     (VShiftImmediateRounding, VShiftImmediateBasic): 1,
     # TODO: this seems in accurate; revisiting may improve performance
     St2: 4,
@@ -298,7 +299,7 @@ def get_min_max_objective(slothy):
     AArch64NeonShiftInsert: 3,
     AArch64ConditionalCompare: 1,
     AArch64Logical: 1,
-    (Ldr_Q, Ldr_X, Str_Q, Str_X): 4,  # approx
+    (Ldr_D, Ldr_Q, Ldr_X, Str_Q, Str_X): 4,  # approx
     Vins: 6,  # approx
     umov_d: 4,  # approx
     (add, add_imm, add_shifted): 2,

diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py
@@ -40,6 +40,7 @@
     Ldr_X,
     Str_X,
     Stp_X,
+    Ldr_D,
     Ldr_Q,
     Str_Q,
     Stp_Q,
@@ -167,7 +168,17 @@ def get_min_max_objective(slothy):
 
 
 execution_units = {
-    (Ldp_X, Ldr_X, Str_X, Stp_X, Ldr_Q, Str_Q, Stp_Q, Ldp_Q): ExecutionUnit.LSU(),
+    (
+        Ldp_X,
+        Ldr_X,
+        Str_X,
+        Stp_X,
+        Ldr_D,
+        Ldr_Q,
+        Str_Q,
+        Stp_Q,
+        Ldp_Q,
+    ): ExecutionUnit.LSU(),
     # TODO: The following would be more accurate, but does not
     #       necessarily lead to better results, while making the
     #       optimization slower. Investigate...
@@ -226,7 +237,7 @@ def get_min_max_objective(slothy):
 }
 
 inverse_throughput = {
-    (Ldr_X, Str_X, Ldr_Q, Str_Q, Ldp_Q): 1,
+    (Ldr_X, Str_X, Ldr_D, Ldr_Q, Str_Q, Ldp_Q): 1,
     (Ldp_X, Stp_X): 2,
     AArch64NeonCount: 1,
     Stp_Q: 2,
@@ -277,7 +288,7 @@ def get_min_max_objective(slothy):
 default_latencies = {
     # For OOO uArch we use relaxed latency modeling for load instructions
     # since the uArch will heavily front-load them anyway
-    (Ldp_X, Ldr_X, Ldr_Q, Stp_Q, Ldp_Q): 4,
+    (Ldp_X, Ldr_X, Ldr_D, Ldr_Q, Stp_Q, Ldp_Q): 4,
     (Stp_X, Str_X, Str_Q): 2,
     St3: 6,  # Multiple structures, Q form, storing bytes
     St4: 4,

diff --git a/tests/naive/aarch64/instructions.s b/tests/naive/aarch64/instructions.s
@@ -52,6 +52,7 @@ movz x13, #0x1234
 movz x14, #0x5678, lsl #16
 ldr q24, [x3, x12, lsl #4]
 ldr x6, [x3, x12]
+ldr d1, [x2], #32
 clz v0.16b, v0.16b
 cnt v0.16b, v0.16b
 tbl v16.16b, {v16.16b}, v24.16b