Improve __clzsi2 performance

AaronKutch · AaronKutch · commit 74f7bcc0c43a · 2020-07-13T17:33:51.000-05:00
diff --git a/src/int/mod.rs b/src/int/mod.rs
@@ -309,11 +309,24 @@ intrinsics! {
         target_pointer_width = "64"
     ))]
     pub extern "C" fn __clzsi2(x: usize) -> usize {
-        // TODO: const this? Would require const-if
-        // Note(Lokathor): the `intrinsics!` macro can't process mut inputs
+        // Note: This routine produces the correct value for `x == 0`. Zero is probably common
+        // enough that it could warrant adding a zero check at the beginning, but this function has
+        // a precondition that `x != 0`. Compilers will insert the check for zero in cases where it
+        // is needed.
+
+        // The basic idea is to test if the higher bits of `x` are zero and bisect the number of
+        // leading zeros (done here by subtraction instead of addition because it simplifies the
+        // final bisection step).
+        // It is possible for all branches of the bisection to use the same code path via
+        // conditionally shifting the higher parts down to let the next bisection step work on the
+        // higher or lower parts of `x`.
+        // This method of using `(x >= power-of-two) as usize` to test if the higher bits are zero
+        // is branchless on most architectures (performed on many architectures with a single
+        // set-if-more-than-or-equal instruction, or done with some kind of conditional move).
+
         let mut x = x;
-        let mut y: usize;
-        let mut n: usize = {
+        // The number of potential leading zeros
+        let mut z = {
             #[cfg(target_pointer_width = "64")]
             {
                 64
@@ -327,42 +340,60 @@ intrinsics! {
                 16
             }
         };
+
+        // a temporary
+        let mut t: usize;
+
         #[cfg(target_pointer_width = "64")]
         {
-            y = x >> 32;
-            if y != 0 {
-                n -= 32;
-                x = y;
-            }
+            // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise `t` is
+            // set to 0.
+            t = ((x >= (1 << 32)) as usize) << 5;
+            // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the next step
+            // to process.
+            x >>= t;
+            // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential leading
+            // zeros
+            z -= t;
         }
+
         #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
         {
-            y = x >> 16;
-            if y != 0 {
-                n -= 16;
-                x = y;
-            }
-        }
-        y = x >> 8;
-        if y != 0 {
-            n -= 8;
-            x = y;
-        }
-        y = x >> 4;
-        if y != 0 {
-            n -= 4;
-            x = y;
-        }
-        y = x >> 2;
-        if y != 0 {
-            n -= 2;
-            x = y;
-        }
-        y = x >> 1;
-        if y != 0 {
-            n - 2
-        } else {
-            n - x
+            t = ((x >= (1 << 16)) as usize) << 4;
+            x >>= t;
+            z -= t;
         }
+
+        t = ((x >= (1 << 8)) as usize) << 3;
+        x >>= t;
+        z -= t;
+
+        t = ((x >= (1 << 4)) as usize) << 2;
+        x >>= t;
+        z -= t;
+
+        t = ((x >= (1 << 2)) as usize) << 1;
+        x >>= t;
+        z -= t;
+
+        t = (x >= (1 << 1)) as usize;
+        x >>= t;
+        z -= t;
+
+        // All bits except the LSB are guaranteed to be zero for this final bisection step. If
+        // `x != 0` then `x == 1`, and subtracting it removes the last potential zero from `z`.
+        z - x
+
+        // We could potentially save a few cycles by using the LUT trick from
+        // "https://embeddedgurus.com/state-space/2014/09/
+        // fast-deterministic-and-portable-counting-leading-zeros/". However, 256 bytes for a LUT is
+        // too large for embedded use cases. We could perform bisection down to
+        // `((x >= (1 << 4)) as usize) << 2` and use this 16 byte LUT for the rest of the work:
+        //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
+        //z -= LUT[x] as usize;
+        //z
+        // However, it ends up generating about the same number of instructions. When benchmarked on
+        // x86_64, it is slightly faster to use the LUT, but this is probably because of OOO
+        // execution effects. Changing to using a LUT and branching is risky for smaller cores.
     }
 }