Skip to content

Commit 74f7bcc

Browse files
committed
Improve __clzsi2 performance
1 parent f3846bc commit 74f7bcc

File tree

1 file changed

+66
-35
lines changed

1 file changed

+66
-35
lines changed

src/int/mod.rs

+66-35
Original file line number | Diff line number | Diff line change
@@ -309,11 +309,24 @@ intrinsics! {
309309
target_pointer_width = "64"
310310
))]
311311
pub extern "C" fn __clzsi2(x: usize) -> usize {
312-
// TODO: const this? Would require const-if
313-
// Note(Lokathor): the `intrinsics!` macro can't process mut inputs
312+
// Note: This routine produces the correct value for `x == 0`. Zero is probably common
313+
// enough that it could warrant adding a zero check at the beginning, but this function has
314+
// a precondition that `x != 0`. Compilers will insert the check for zero in cases where it
315+
// is needed.
316+
317+
// The basic idea is to test if the higher bits of `x` are zero and bisect the number of
318+
// leading zeros (done here by subtraction instead of addition because it simplifies the
319+
// final bisection step).
320+
// It is possible for all branches of the bisection to use the same code path via
321+
// conditionally shifting the higher parts down to let the next bisection step work on the
322+
// higher or lower parts of `x`.
323+
// Testing whether the higher bits are zero with `(x >= power-of-two) as usize` is
324+
// branchless on most architectures (performed on many architectures with a single
325+
// set-if-more-than-or-equal instruction, or done with some kind of conditional move).
326+
314327
let mut x = x;
315-
let mut y: usize;
316-
let mut n: usize = {
328+
// The number of potential leading zeros
329+
let mut z = {
317330
#[cfg(target_pointer_width = "64")]
318331
{
319332
64
@@ -327,42 +340,60 @@ intrinsics! {
327340
16
328341
}
329342
};
343+
344+
// a temporary
345+
let mut t: usize;
346+
330347
#[cfg(target_pointer_width = "64")]
331348
{
332-
y = x >> 32;
333-
if y != 0 {
334-
n -= 32;
335-
x = y;
336-
}
349+
// If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise `t` is
350+
// set to 0.
351+
t = ((x >= (1 << 32)) as usize) << 5;
352+
// If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the next step
353+
// to process.
354+
x >>= t;
355+
// If `t` was set to `1 << 5`, then we subtract 32 from the number of potential leading
356+
// zeros
357+
z -= t;
337358
}
359+
338360
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
339361
{
340-
y = x >> 16;
341-
if y != 0 {
342-
n -= 16;
343-
x = y;
344-
}
345-
}
346-
y = x >> 8;
347-
if y != 0 {
348-
n -= 8;
349-
x = y;
350-
}
351-
y = x >> 4;
352-
if y != 0 {
353-
n -= 4;
354-
x = y;
355-
}
356-
y = x >> 2;
357-
if y != 0 {
358-
n -= 2;
359-
x = y;
360-
}
361-
y = x >> 1;
362-
if y != 0 {
363-
n - 2
364-
} else {
365-
n - x
362+
t = ((x >= (1 << 16)) as usize) << 4;
363+
x >>= t;
364+
z -= t;
366365
}
366+
367+
t = ((x >= (1 << 8)) as usize) << 3;
368+
x >>= t;
369+
z -= t;
370+
371+
t = ((x >= (1 << 4)) as usize) << 2;
372+
x >>= t;
373+
z -= t;
374+
375+
t = ((x >= (1 << 2)) as usize) << 1;
376+
x >>= t;
377+
z -= t;
378+
379+
t = (x >= (1 << 1)) as usize;
380+
x >>= t;
381+
z -= t;
382+
383+
// All bits except LSB are guaranteed to be zero for this final bisection step. If `x != 0`
384+
// then `x == 1`, and `z - x` subtracts the last potential leading zero from `z`.
385+
z - x
386+
387+
// We could potentially save a few cycles by using the LUT trick from
388+
// "https://embeddedgurus.com/state-space/2014/09/
389+
// fast-deterministic-and-portable-counting-leading-zeros/". However, 256 bytes for a LUT is
390+
// too large for embedded use cases. We could perform bisection down to
391+
// `((x >= (1 << 4)) as usize) << 2` and use this 16 byte LUT for the rest of the work:
392+
//const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
393+
//z -= LUT[x] as usize;
394+
//z
395+
// However, it ends up generating about the same number of instructions. When benchmarked on
396+
// x86_64, it is slightly faster to use the LUT, but this is probably because of out-of-order
397+
// execution effects. Changing to using a LUT and branching is risky for smaller cores.
367398
}
368399
}

0 commit comments

Comments
 (0)