@@ -309,11 +309,24 @@ intrinsics! {
309
309
target_pointer_width = "64"
310
310
) ) ]
311
311
pub extern "C" fn __clzsi2( x: usize ) -> usize {
312
- // TODO: const this? Would require const-if
313
- // Note(Lokathor): the `intrinsics!` macro can't process mut inputs
312
+ // Note: This routine produces the correct value for `x == 0`. Zero is probably common
313
+ // enough that it could warrant adding a zero check at the beginning, but this function has
314
+ // a precondition that `x != 0`. Compilers will insert the check for zero in cases where it
315
+ // is needed.
316
+
317
+ // The basic idea is to test if the higher bits of `x` are zero and bisect the number of
318
+ // leading zeros (done here by subtraction instead of addition because it simplifies the
319
+ // final bisection step).
320
+ // It is possible for all branches of the bisection to use the same code path via
321
+ // conditionally shifting the higher parts down to let the next bisection step work on the
322
+ // higher or lower parts of `x`.
323
+ // This method using `(x >= power-of-two) as usize` to test if the higher bits are zero is
324
+ // branchless on most architectures (performed on many architectures with a single
325
+ // set-if-more-than-or-equal instruction, or done with some kind of conditional move).
326
+
314
327
let mut x = x;
315
- let mut y : usize ;
316
- let mut n : usize = {
328
+ // The number of potential leading zeros
329
+ let mut z = {
317
330
#[ cfg( target_pointer_width = "64" ) ]
318
331
{
319
332
64
@@ -327,42 +340,60 @@ intrinsics! {
327
340
16
328
341
}
329
342
} ;
343
+
344
+ // A temporary holding the current bisection step's shift amount (also subtracted from `z`)
345
+ let mut t: usize ;
346
+
330
347
#[ cfg( target_pointer_width = "64" ) ]
331
348
{
332
- y = x >> 32 ;
333
- if y != 0 {
334
- n -= 32 ;
335
- x = y;
336
- }
349
+ // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise `t` is
350
+ // set to 0.
351
+ t = ( ( x >= ( 1 << 32 ) ) as usize ) << 5 ;
352
+ // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the next step
353
+ // to process.
354
+ x >>= t;
355
+ // If `t` was set to `1 << 5`, then we subtract 32 from the number of potential leading
356
+ // zeros
357
+ z -= t;
337
358
}
359
+
338
360
#[ cfg( any( target_pointer_width = "32" , target_pointer_width = "64" ) ) ]
339
361
{
340
- y = x >> 16 ;
341
- if y != 0 {
342
- n -= 16 ;
343
- x = y;
344
- }
345
- }
346
- y = x >> 8 ;
347
- if y != 0 {
348
- n -= 8 ;
349
- x = y;
350
- }
351
- y = x >> 4 ;
352
- if y != 0 {
353
- n -= 4 ;
354
- x = y;
355
- }
356
- y = x >> 2 ;
357
- if y != 0 {
358
- n -= 2 ;
359
- x = y;
360
- }
361
- y = x >> 1 ;
362
- if y != 0 {
363
- n - 2
364
- } else {
365
- n - x
362
+ t = ( ( x >= ( 1 << 16 ) ) as usize ) << 4 ;
363
+ x >>= t;
364
+ z -= t;
366
365
}
366
+
367
+ t = ( ( x >= ( 1 << 8 ) ) as usize ) << 3 ;
368
+ x >>= t;
369
+ z -= t;
370
+
371
+ t = ( ( x >= ( 1 << 4 ) ) as usize ) << 2 ;
372
+ x >>= t;
373
+ z -= t;
374
+
375
+ t = ( ( x >= ( 1 << 2 ) ) as usize ) << 1 ;
376
+ x >>= t;
377
+ z -= t;
378
+
379
+ t = ( x >= ( 1 << 1 ) ) as usize ;
380
+ x >>= t;
381
+ z -= t;
382
+
383
+ // All bits except LSB are guaranteed to be zero for this final bisection step. If `x != 0`
384
+ // then `x == 1`, and subtracting it removes the last potential leading zero from `z`.
385
+ z - x
386
+
387
+ // We could potentially save a few cycles by using the LUT trick from
388
+ // "https://embeddedgurus.com/state-space/2014/09/
389
+ // fast-deterministic-and-portable-counting-leading-zeros/". However, 256 bytes for a LUT is
390
+ // too large for embedded use cases. We could perform bisection down to
391
+ // `((x >= (1 << 4)) as usize) << 2` and use this 16 byte LUT for the rest of the work:
392
+ //const LUT: [u8; 16] = [0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4];
393
+ //z -= LUT[x] as usize;
394
+ //z
395
+ // However, it ends up generating about the same number of instructions. When benchmarked on
396
+ // x86_64, it is slightly faster to use the LUT, but this is probably because of out-of-order (OOO)
397
+ // execution effects. Changing to using a LUT and branching is risky for smaller cores.
367
398
}
368
399
}
0 commit comments