diff --git a/Cargo.lock b/Cargo.lock
index eb206607583..de03d3c1246 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2114,6 +2114,7 @@ dependencies = [
  "icu_testdata",
  "itertools",
  "libm",
+ "once_cell",
  "serde",
  "serde_json",
  "utf8_iter",
diff --git a/components/segmenter/Cargo.toml b/components/segmenter/Cargo.toml
index 70873821953..d0d11bf7c8e 100644
--- a/components/segmenter/Cargo.toml
+++ b/components/segmenter/Cargo.toml
@@ -41,6 +41,7 @@ databake = { version = "0.1.3", path = "../../utils/databake", optional = true,
 serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
 libm = { version = "0.2", default-features = false, optional = true }
+once_cell = { version = "1.17.1"}
 
 [dev-dependencies]
 criterion = "0.4"
diff --git a/components/segmenter/src/complex/lstm/matrix.rs b/components/segmenter/src/complex/lstm/matrix.rs
index 6b8314f73f6..6592adc52b6 100644
--- a/components/segmenter/src/complex/lstm/matrix.rs
+++ b/components/segmenter/src/complex/lstm/matrix.rs
@@ -2,10 +2,10 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
+use super::ops::{dot_1, dot_2};
 use alloc::vec;
 use alloc::vec::Vec;
 use core::ops::Range;
-use zerovec::ule::AsULE;
 use zerovec::ZeroSlice;
 
 // This will be used in #[no_std] as f32::exp/f32::tanh are not in core.
@@ -276,14 +276,6 @@ impl<'a, const D: usize> MatrixBorrowedMut<'a, D> {
     }
 }
 
-impl<'a> MatrixBorrowed<'a, 1> {
-    #[allow(dead_code)] // could be useful
-    pub(super) fn dot_1d(&self, other: MatrixZero<1>) -> f32 {
-        debug_assert_eq!(self.dims, other.dims);
-        unrolled_dot_1(self.data, other.data)
-    }
-}
-
 impl<'a> MatrixBorrowedMut<'a, 1> {
     /// Calculate the dot product of a and b, adding the result to self.
     ///
@@ -311,7 +303,7 @@ impl<'a> MatrixBorrowedMut<'a, 1> {
         for i in 0..n {
             if let (Some(dest), Some(b_sub)) = (self.as_mut_slice().get_mut(i), b.submatrix::<1>(i))
             {
-                *dest += unrolled_dot_1(a.data, b_sub.data);
+                *dest += dot_1(a.data, b_sub.data);
             } else {
                 debug_assert!(false, "unreachable: dims checked above");
             }
@@ -353,7 +345,7 @@ impl<'a> MatrixBorrowedMut<'a, 2> {
                 self.as_mut_slice().get_mut(i),
                 b.as_slice().get_subslice(i * m..(i + 1) * m),
             ) {
-                *dest += unrolled_dot_1(lhs, rhs);
+                *dest += dot_1(lhs, rhs);
             } else {
                 debug_assert!(false, "unreachable: dims checked above");
             }
@@ -393,7 +385,7 @@ impl<'a> MatrixBorrowedMut<'a, 2> {
                 self.as_mut_slice().get_mut(i),
                 b.as_slice().get_subslice(i * m..(i + 1) * m),
             ) {
-                *dest += unrolled_dot_2(lhs, rhs);
+                *dest += dot_2(lhs, rhs);
             } else {
                 debug_assert!(false, "unreachable: dims checked above");
             }
@@ -476,83 +468,3 @@ impl<'a, const D: usize> MatrixZero<'a, D> {
         (n * index..n * (index + 1), sub_dims)
     }
 }
-
-macro_rules! f32c {
-    ($ule:expr) => {
-        f32::from_unaligned($ule)
-    };
-}
-
-/// Compute the dot product of an aligned and an unaligned f32 slice.
-///
-/// `xs` and `ys` must be the same length
-///
-/// (Based on ndarray 0.15.6)
-fn unrolled_dot_1(xs: &[f32], ys: &ZeroSlice<f32>) -> f32 {
-    debug_assert_eq!(xs.len(), ys.len());
-    // eightfold unrolled so that floating point can be vectorized
-    // (even with strict floating point accuracy semantics)
-    let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
-    let xit = xs.chunks_exact(8);
-    let yit = ys.as_ule_slice().chunks_exact(8);
-    let sum = xit
-        .remainder()
-        .iter()
-        .zip(yit.remainder().iter())
-        .map(|(x, y)| x * f32c!(*y))
-        .sum::<f32>();
-    for (xx, yy) in xit.zip(yit) {
-        // TODO: Use array_chunks once stable to avoid the unwrap.
-        //
-        #[allow(clippy::unwrap_used)]
-        let [x0, x1, x2, x3, x4, x5, x6, x7] = *<&[f32; 8]>::try_from(xx).unwrap();
-        #[allow(clippy::unwrap_used)]
-        let [y0, y1, y2, y3, y4, y5, y6, y7] = *<&[<f32 as AsULE>::ULE; 8]>::try_from(yy).unwrap();
-        p.0 += x0 * f32c!(y0);
-        p.1 += x1 * f32c!(y1);
-        p.2 += x2 * f32c!(y2);
-        p.3 += x3 * f32c!(y3);
-        p.4 += x4 * f32c!(y4);
-        p.5 += x5 * f32c!(y5);
-        p.6 += x6 * f32c!(y6);
-        p.7 += x7 * f32c!(y7);
-    }
-    sum + (p.0 + p.4) + (p.1 + p.5) + (p.2 + p.6) + (p.3 + p.7)
-}
-
-/// Compute the dot product of two unaligned f32 slices.
-///
-/// `xs` and `ys` must be the same length
-///
-/// (Based on ndarray 0.15.6)
-fn unrolled_dot_2(xs: &ZeroSlice<f32>, ys: &ZeroSlice<f32>) -> f32 {
-    debug_assert_eq!(xs.len(), ys.len());
-    // eightfold unrolled so that floating point can be vectorized
-    // (even with strict floating point accuracy semantics)
-    let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
-    let xit = xs.as_ule_slice().chunks_exact(8);
-    let yit = ys.as_ule_slice().chunks_exact(8);
-    let sum = xit
-        .remainder()
-        .iter()
-        .zip(yit.remainder().iter())
-        .map(|(x, y)| f32c!(*x) * f32c!(*y))
-        .sum::<f32>();
-    for (xx, yy) in xit.zip(yit) {
-        // TODO: Use array_chunks once stable to avoid the unwrap.
-        //
-        #[allow(clippy::unwrap_used)]
-        let [x0, x1, x2, x3, x4, x5, x6, x7] = *<&[<f32 as AsULE>::ULE; 8]>::try_from(xx).unwrap();
-        #[allow(clippy::unwrap_used)]
-        let [y0, y1, y2, y3, y4, y5, y6, y7] = *<&[<f32 as AsULE>::ULE; 8]>::try_from(yy).unwrap();
-        p.0 += f32c!(x0) * f32c!(y0);
-        p.1 += f32c!(x1) * f32c!(y1);
-        p.2 += f32c!(x2) * f32c!(y2);
-        p.3 += f32c!(x3) * f32c!(y3);
-        p.4 += f32c!(x4) * f32c!(y4);
-        p.5 += f32c!(x5) * f32c!(y5);
-        p.6 += f32c!(x6) * f32c!(y6);
-        p.7 += f32c!(x7) * f32c!(y7);
-    }
-    sum + (p.0 + p.4) + (p.1 + p.5) + (p.2 + p.6) + (p.3 + p.7)
-}
diff --git a/components/segmenter/src/complex/lstm/mod.rs b/components/segmenter/src/complex/lstm/mod.rs
index c9f72cbd740..af76c29c821 100644
--- a/components/segmenter/src/complex/lstm/mod.rs
+++ b/components/segmenter/src/complex/lstm/mod.rs
@@ -10,6 +10,7 @@ use zerovec::{maps::ZeroMapBorrowed, ule::UnvalidatedStr};
 
 mod matrix;
 use matrix::*;
+mod ops;
 
 // A word break iterator using LSTM model. Input string have to be same language.
diff --git a/components/segmenter/src/complex/lstm/ops.rs b/components/segmenter/src/complex/lstm/ops.rs
new file mode 100644
index 00000000000..53e8e1ad63a
--- /dev/null
+++ b/components/segmenter/src/complex/lstm/ops.rs
@@ -0,0 +1,449 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+#![allow(dead_code)]
+use zerovec::ZeroSlice;
+
+macro_rules! f32c {
f32c { + ($ule:expr) => { + f32::from_unaligned($ule) + }; +} + +pub(crate) fn dot_1(xs: &[f32], ys: &ZeroSlice) -> f32 { + #[cfg(all(target_feature = "avx", target_feature = "fma"))] + { + unsafe { avx::dot_1_avx_fma(xs, ys) } + } + #[cfg(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + ))] + { + unsafe { neon::dot_1_neon(xs, ys) } + } + #[cfg(all( + not(all(target_feature = "avx", target_feature = "fma")), + not(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + )), + not(feature = "std") + ))] + { + unrolled::unrolled_dot_1(xs, ys) + } + #[cfg(all( + not(all(target_feature = "avx", target_feature = "fma")), + not(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + )), + feature = "std" + ))] + { + // runtime dispatch + unsafe { runtime::DOT_1_PTR(xs, ys) } + } +} + +pub(crate) fn dot_2(xs: &ZeroSlice, ys: &ZeroSlice) -> f32 { + #[cfg(all(target_feature = "avx", target_feature = "fma"))] + { + unsafe { avx::dot_2_avx_fma(xs, ys) } + } + #[cfg(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + ))] + { + neon::dot_2_neon(xs, ys) + } + #[cfg(all( + not(all(target_feature = "avx", target_feature = "fma")), + not(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + )), + not(feature = "std") + ))] + { + unrolled::unrolled_dot_2(xs, ys) + } + #[cfg(all( + not(all(target_feature = "avx", target_feature = "fma")), + not(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + )), + feature = "std" + ))] + { + // runtime dispatch + unsafe { runtime::DOT_2_PTR(xs, ys) } + } +} + +#[cfg(all( + not(all(target_feature = "avx", target_feature = "fma")), + not(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + )), + feature = "std" +))] +mod runtime { + use once_cell::sync::Lazy; + use zerovec::ZeroSlice; + + type Dot1Fn = unsafe fn(&[f32], &ZeroSlice) -> f32; + type Dot2Fn = unsafe fn(&ZeroSlice, &ZeroSlice) -> f32; + + pub(crate) static DOT_1_PTR: Lazy = Lazy::new(initialize_dot1); + pub(crate) static DOT_2_PTR: Lazy = Lazy::new(initialize_dot2); + + fn initialize_dot1() -> Dot1Fn { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("avx") + && std::arch::is_x86_feature_detected!("fma") + { + return super::avx::dot_1_avx_fma; + } + } + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + { + if std::arch::is_aarch64_feature_detected!("neon") { + return super::neon::dot_1_neon; + } + } + super::unrolled::unrolled_dot_1 + } + + fn initialize_dot2() -> Dot2Fn { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if std::arch::is_x86_feature_detected!("avx") + && std::arch::is_x86_feature_detected!("fma") + { + return super::avx::dot_2_avx_fma; + } + } + #[cfg(all(target_arch = "aarch64", target_endian = "little"))] + { + if std::arch::is_aarch64_feature_detected!("neon") { + return super::neon::dot_2_neon; + } + } + super::unrolled::unrolled_dot_2 + } +} + +#[cfg(all( + not(all(target_feature = "avx", target_feature = "fma")), + not(all( + target_arch = "aarch64", + target_endian = "little", + target_feature = "neon" + )), +))] +mod unrolled { + use zerovec::ule::AsULE; + use zerovec::ZeroSlice; + + /// Compute the dot product of an aligned and an unaligned f32 slice. 
+    ///
+    /// `xs` and `ys` must be the same length
+    ///
+    /// (Based on ndarray 0.15.6)
+    pub(crate) fn unrolled_dot_1(xs: &[f32], ys: &ZeroSlice<f32>) -> f32 {
+        debug_assert_eq!(xs.len(), ys.len());
+        // eightfold unrolled so that floating point can be vectorized
+        // (even with strict floating point accuracy semantics)
+        let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+        let xit = xs.chunks_exact(8);
+        let yit = ys.as_ule_slice().chunks_exact(8);
+        let sum = xit
+            .remainder()
+            .iter()
+            .zip(yit.remainder().iter())
+            .map(|(x, y)| x * f32c!(*y))
+            .sum::<f32>();
+        for (xx, yy) in xit.zip(yit) {
+            // TODO: Use array_chunks once stable to avoid the unwrap.
+            //
+            #[allow(clippy::unwrap_used)]
+            let [x0, x1, x2, x3, x4, x5, x6, x7] = *<&[f32; 8]>::try_from(xx).unwrap();
+            #[allow(clippy::unwrap_used)]
+            let [y0, y1, y2, y3, y4, y5, y6, y7] =
+                *<&[<f32 as AsULE>::ULE; 8]>::try_from(yy).unwrap();
+            p.0 += x0 * f32c!(y0);
+            p.1 += x1 * f32c!(y1);
+            p.2 += x2 * f32c!(y2);
+            p.3 += x3 * f32c!(y3);
+            p.4 += x4 * f32c!(y4);
+            p.5 += x5 * f32c!(y5);
+            p.6 += x6 * f32c!(y6);
+            p.7 += x7 * f32c!(y7);
+        }
+        sum + (p.0 + p.4) + (p.1 + p.5) + (p.2 + p.6) + (p.3 + p.7)
+    }
+
+    /// Compute the dot product of two unaligned f32 slices.
+    ///
+    /// `xs` and `ys` must be the same length
+    ///
+    /// (Based on ndarray 0.15.6)
+    pub(crate) fn unrolled_dot_2(xs: &ZeroSlice<f32>, ys: &ZeroSlice<f32>) -> f32 {
+        debug_assert_eq!(xs.len(), ys.len());
+        // eightfold unrolled so that floating point can be vectorized
+        // (even with strict floating point accuracy semantics)
+        let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+        let xit = xs.as_ule_slice().chunks_exact(8);
+        let yit = ys.as_ule_slice().chunks_exact(8);
+        let sum = xit
+            .remainder()
+            .iter()
+            .zip(yit.remainder().iter())
+            .map(|(x, y)| f32c!(*x) * f32c!(*y))
+            .sum::<f32>();
+        for (xx, yy) in xit.zip(yit) {
+            // TODO: Use array_chunks once stable to avoid the unwrap.
+            //
+            #[allow(clippy::unwrap_used)]
+            let [x0, x1, x2, x3, x4, x5, x6, x7] =
+                *<&[<f32 as AsULE>::ULE; 8]>::try_from(xx).unwrap();
+            #[allow(clippy::unwrap_used)]
+            let [y0, y1, y2, y3, y4, y5, y6, y7] =
+                *<&[<f32 as AsULE>::ULE; 8]>::try_from(yy).unwrap();
+            p.0 += f32c!(x0) * f32c!(y0);
+            p.1 += f32c!(x1) * f32c!(y1);
+            p.2 += f32c!(x2) * f32c!(y2);
+            p.3 += f32c!(x3) * f32c!(y3);
+            p.4 += f32c!(x4) * f32c!(y4);
+            p.5 += f32c!(x5) * f32c!(y5);
+            p.6 += f32c!(x6) * f32c!(y6);
+            p.7 += f32c!(x7) * f32c!(y7);
+        }
+        sum + (p.0 + p.4) + (p.1 + p.5) + (p.2 + p.6) + (p.3 + p.7)
+    }
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx {
+    use zerovec::ule::AsULE;
+    use zerovec::ZeroSlice;
+
+    #[target_feature(enable = "avx,fma")]
+    pub(crate) unsafe fn dot_1_avx_fma(xs: &[f32], ys: &ZeroSlice<f32>) -> f32 {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        debug_assert_eq!(xs.len(), ys.len());
+
+        let xc = xs.chunks_exact(8);
+        let yc = ys.as_ule_slice().chunks_exact(8);
+
+        let remainder = xc
+            .remainder()
+            .iter()
+            .zip(yc.remainder().iter())
+            .map(|(x, y)| x * f32c!(*y))
+            .sum::<f32>();
+
+        // TODO: Use array_chunks once stable to avoid the unwrap.
+        //
+        #[allow(clippy::unwrap_used)]
+        let xc = xc.map(|xx| *<&[_; 8]>::try_from(xx).unwrap());
+        #[allow(clippy::unwrap_used)]
+        let yc = yc.map(|yy| *<&[_; 8]>::try_from(yy).unwrap());
+
+        // SAFETY: No safety requirement
+        let mut sum = unsafe { _mm256_setzero_ps() };
+
+        for (x, y) in xc.zip(yc) {
+            // SAFETY: _mm256_loadu_ps does not require its argument to be aligned.
+            // (_mm256_load_ps would need 32-byte alignment, which a &[f32; 8] does not
+            // guarantee, and indeed using it here segfaults.)
+            let xv = unsafe { _mm256_loadu_ps(x.as_ptr()) };
+            // SAFETY: _mm256_loadu_ps does not require its argument to be aligned
+            let yv = unsafe { _mm256_loadu_ps(y.as_ptr() as *const f32) };
+            // SAFETY: No safety requirement
+            sum = unsafe { _mm256_fmadd_ps(xv, yv, sum) };
+        }
+
+        // Using hacks in
+        // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
+        // SAFETY: No safety requirement
+        let mut lo = unsafe { _mm256_castps256_ps128(sum) };
+        // SAFETY: No safety requirement
+        let hi = unsafe { _mm256_extractf128_ps(sum, 1) };
+        // SAFETY: No safety requirement
+        lo = unsafe { _mm_add_ps(lo, hi) };
+
+        // SAFETY: No safety requirement
+        let mut shuf = unsafe { _mm_movehdup_ps(lo) };
+        // SAFETY: No safety requirement
+        let mut sums = unsafe { _mm_add_ps(lo, shuf) };
+        // SAFETY: No safety requirement
+        shuf = unsafe { _mm_movehl_ps(shuf, sums) };
+        // SAFETY: No safety requirement
+        sums = unsafe { _mm_add_ss(sums, shuf) };
+        // SAFETY: No safety requirement
+        unsafe { _mm_cvtss_f32(sums) + remainder }
+    }
+
+    #[target_feature(enable = "avx,fma")]
+    pub(crate) unsafe fn dot_2_avx_fma(xs: &ZeroSlice<f32>, ys: &ZeroSlice<f32>) -> f32 {
+        debug_assert_eq!(xs.len(), ys.len());
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        let xc = xs.as_ule_slice().chunks_exact(8);
+        let yc = ys.as_ule_slice().chunks_exact(8);
+
+        let remainder = xc
+            .remainder()
+            .iter()
+            .zip(yc.remainder().iter())
+            .map(|(x, y)| f32c!(*x) * f32c!(*y))
+            .sum::<f32>();
+
+        // TODO: Use array_chunks once stable to avoid the unwrap.
+        //
+        #[allow(clippy::unwrap_used)]
+        let xc = xc.map(|xx| *<&[_; 8]>::try_from(xx).unwrap());
+        #[allow(clippy::unwrap_used)]
+        let yc = yc.map(|yy| *<&[_; 8]>::try_from(yy).unwrap());
+
+        // SAFETY: No safety requirement
+        let mut sum = unsafe { _mm256_setzero_ps() };
+
+        for (x, y) in xc.zip(yc) {
+            // SAFETY: _mm256_loadu_ps does not require its argument to be aligned
+            let xv = unsafe { _mm256_loadu_ps(x.as_ptr() as *const f32) };
+            // SAFETY: _mm256_loadu_ps does not require its argument to be aligned
+            let yv = unsafe { _mm256_loadu_ps(y.as_ptr() as *const f32) };
+            // SAFETY: No safety requirement
+            sum = unsafe { _mm256_fmadd_ps(xv, yv, sum) };
+        }
+
+        // Using hacks in
+        // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
+        // SAFETY: No safety requirement
+        let mut lo = unsafe { _mm256_castps256_ps128(sum) };
+        // SAFETY: No safety requirement
+        let hi = unsafe { _mm256_extractf128_ps(sum, 1) };
+        // SAFETY: No safety requirement
+        lo = unsafe { _mm_add_ps(lo, hi) };
+
+        // SAFETY: No safety requirement
+        let mut shuf = unsafe { _mm_movehdup_ps(lo) };
+        // SAFETY: No safety requirement
+        let mut sums = unsafe { _mm_add_ps(lo, shuf) };
+        // SAFETY: No safety requirement
+        shuf = unsafe { _mm_movehl_ps(shuf, sums) };
+        // SAFETY: No safety requirement
+        sums = unsafe { _mm_add_ss(sums, shuf) };
+        // SAFETY: No safety requirement
+        unsafe { _mm_cvtss_f32(sums) + remainder }
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+#[cfg(target_endian = "little")]
+mod neon {
+    use zerovec::ule::AsULE;
+    use zerovec::ZeroSlice;
+
+    #[target_feature(enable = "neon")]
+    pub(crate) unsafe fn dot_1_neon(xs: &[f32], ys: &ZeroSlice<f32>) -> f32 {
+        use core::arch::aarch64::*;
+
+        debug_assert_eq!(xs.len(), ys.len());
+
+        let xc = xs.chunks_exact(8);
+        let yc = ys.as_ule_slice().chunks_exact(8);
+
+        let remainder = xc
+            .remainder()
+            .iter()
+            .zip(yc.remainder().iter())
+            .map(|(x, y)| x * f32c!(*y))
+            .sum::<f32>();
+
+        // TODO: Use array_chunks once stable to avoid the unwrap.
+        //
+        #[allow(clippy::unwrap_used)]
+        let xc = xc.map(|xx| *<&[_; 8]>::try_from(xx).unwrap());
+        #[allow(clippy::unwrap_used)]
+        let yc = yc.map(|yy| *<&[_; 8]>::try_from(yy).unwrap());
+
+        // https://developer.arm.com/documentation/102197/0100/Calculating-dot-products-using-Neon-Intrinsics
+        let mut sum0 = unsafe { vdupq_n_f32(0.0) };
+        let mut sum1 = unsafe { vdupq_n_f32(0.0) };
+
+        for (x, y) in xc.zip(yc) {
+            let xv0 = unsafe { vld1q_f32(x.as_ptr()) };
+            let yv0 = unsafe { vld1q_f32(y.as_ptr() as *const f32) };
+
+            sum0 = unsafe { vfmaq_f32(sum0, xv0, yv0) };
+
+            let xv1 = unsafe { vld1q_f32(x[4..].as_ptr()) };
+            let yv1 = unsafe { vld1q_f32(y[4..].as_ptr() as *const f32) };
+
+            sum1 = unsafe { vfmaq_f32(sum1, xv1, yv1) };
+        }
+        unsafe { vaddvq_f32(sum0) + vaddvq_f32(sum1) + remainder }
+    }
+
+    #[target_feature(enable = "neon")]
+    pub(crate) unsafe fn dot_2_neon(xs: &ZeroSlice<f32>, ys: &ZeroSlice<f32>) -> f32 {
+        use core::arch::aarch64::*;
+
+        debug_assert_eq!(xs.len(), ys.len());
+
+        let xc = xs.as_ule_slice().chunks_exact(8);
+        let yc = ys.as_ule_slice().chunks_exact(8);
+
+        let remainder = xc
+            .remainder()
+            .iter()
+            .zip(yc.remainder().iter())
+            .map(|(x, y)| f32c!(*x) * f32c!(*y))
+            .sum::<f32>();
+
+        // TODO: Use array_chunks once stable to avoid the unwrap.
+        //
+        #[allow(clippy::unwrap_used)]
+        let xc = xc.map(|xx| *<&[_; 8]>::try_from(xx).unwrap());
+        #[allow(clippy::unwrap_used)]
+        let yc = yc.map(|yy| *<&[_; 8]>::try_from(yy).unwrap());
+
+        // https://developer.arm.com/documentation/102197/0100/Calculating-dot-products-using-Neon-Intrinsics
+        let mut sum0 = unsafe { vdupq_n_f32(0.0) };
+        let mut sum1 = unsafe { vdupq_n_f32(0.0) };
+
+        for (x, y) in xc.zip(yc) {
+            let xv0 = unsafe { vld1q_f32(x.as_ptr() as *const f32) };
+            let yv0 = unsafe { vld1q_f32(y.as_ptr() as *const f32) };
+
+            sum0 = unsafe { vfmaq_f32(sum0, xv0, yv0) };
+
+            let xv1 = unsafe { vld1q_f32(x[4..].as_ptr() as *const f32) };
+            let yv1 = unsafe { vld1q_f32(y[4..].as_ptr() as *const f32) };
+
+            sum1 = unsafe { vfmaq_f32(sum1, xv1, yv1) };
+        }
+        unsafe { vaddvq_f32(sum0) + vaddvq_f32(sum1) + remainder }
+    }
+}
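
For readers unfamiliar with the dispatch pattern the runtime module in ops.rs relies on, here is a minimal, self-contained sketch of the same once_cell technique: probe the CPU once, cache a function pointer in a Lazy static, and call through it afterwards. This is illustrative only and not part of the diff; the names (SumFn, scalar_sum, pick_sum, SUM_PTR) are hypothetical stand-ins for the dot-product kernels above.

use once_cell::sync::Lazy;

type SumFn = fn(&[f32]) -> f32;

// Portable fallback kernel (stands in for the unrolled dot products).
fn scalar_sum(xs: &[f32]) -> f32 {
    xs.iter().sum()
}

// Runs exactly once, on first use of SUM_PTR: probe the CPU and pick a kernel.
fn pick_sum() -> SumFn {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if std::arch::is_x86_feature_detected!("avx") {
            // A real implementation would return an AVX kernel here; in this
            // sketch the scalar kernel stands in for it.
            return scalar_sum;
        }
    }
    scalar_sum
}

// The selected function pointer is cached; later calls are plain indirect calls.
static SUM_PTR: Lazy<SumFn> = Lazy::new(pick_sum);

fn main() {
    let v = [1.0f32, 2.0, 3.0, 4.0];
    // Lazy<SumFn> derefs to the fn pointer; note the explicit deref before calling,
    // matching the (*runtime::DOT_1_PTR)(..) calls in ops.rs.
    assert_eq!((*SUM_PTR)(&v), 10.0);
}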