Skip to content

Commit de7dbad

Browse files
committed
const-specialize predict by sample counts
The idea is to allow codegen to exploit specific known constants. It seems that LLVM by itself will not split a loop into specialized blocks based on an input argument that is loop-invariant, whereas we know, from PNG, that some constants yield much better code. In this case the loop computes `coef[i] = coef[i] + coef[i - n]`. This has a destructive dependency chain for `n = 1`, should never get an argument of 0, and is almost embarrassingly parallel in SIMD if `n >= 8`, where we can increase the amount of data loaded at once for each independent loop iteration. By splitting the loop we make the compiler apply independent optimization passes to each case, and then have a fallback for the cases we did not cover with vectorized possibilities.
1 parent 50be15e commit de7dbad

File tree

1 file changed

+60
-5
lines changed

1 file changed

+60
-5
lines changed

src/decoder/mod.rs

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -368,27 +368,82 @@ pub struct IfdDecoder<'lt> {
368368
}
369369

370370
fn rev_hpredict_nsamp(buf: &mut [u8], bit_depth: u8, samples: usize) {
371-
match bit_depth {
372-
0..=8 => {
371+
/// In-place running sum over single bytes with a compile-time lookback of `N` bytes.
///
/// For every index `i` in `N..buf.len()`, in ascending order, `buf[i]` is replaced by
/// `buf[i].wrapping_add(buf[i - N])`. The first `N` bytes are left untouched. Because the
/// update is in place and ascending, the value read at `i - N` is already the updated one
/// once `i >= 2 * N`, so each iteration depends on an earlier one.
///
/// `N` is a const generic so that every lookback distance gets its own monomorphized loop,
/// which the compiler can optimize independently of the others.
///
/// A buffer shorter than or equal to `N` bytes yields an empty range and is left unchanged.
fn one_byte_predict<const N: usize>(buf: &mut [u8]) {
372+
for i in N..buf.len() {
373+
buf[i] = buf[i].wrapping_add(buf[i - N]);
374+
}
375+
}
376+
377+
/// In-place running sum over native-endian `u16` elements with a compile-time lookback of
/// `N` elements (`2 * N` bytes).
///
/// Starting at byte offset `2 * N` and advancing two bytes at a time, each 2-byte group is
/// decoded with `u16::from_ne_bytes`, added (wrapping) to the group `2 * N` bytes earlier,
/// and written back with `to_ne_bytes`. The first `2 * N` bytes are left untouched.
///
/// The `try_into().unwrap()` calls cannot fail on their own: the slice handed to them is
/// always exactly two bytes long. The `[..2]` slicing, however, panics if fewer than two
/// bytes remain at the last offset, i.e. when `buf.len()` is odd and greater than `2 * N`.
/// NOTE(review): presumably callers always pass a length that is a multiple of 2 — confirm.
///
/// `N` is a const generic so that every lookback distance gets its own monomorphized loop.
fn two_bytes_predict<const N: usize>(buf: &mut [u8]) {
378+
for i in (2 * N..buf.len()).step_by(2) {
379+
let v = u16::from_ne_bytes(buf[i..][..2].try_into().unwrap());
380+
let p = u16::from_ne_bytes(buf[i - 2 * N..][..2].try_into().unwrap());
381+
buf[i..][..2].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
382+
}
383+
}
384+
385+
/// In-place running sum over native-endian `u32` elements with a compile-time lookback of
/// `N` elements (`4 * N` bytes).
///
/// Starting at byte offset `4 * N` and advancing four bytes at a time, each 4-byte group is
/// decoded with `u32::from_ne_bytes`, added (wrapping) to the group `4 * N` bytes earlier,
/// and written back with `to_ne_bytes`. The first `4 * N` bytes are left untouched.
///
/// The `try_into().unwrap()` calls cannot fail on their own: the slice handed to them is
/// always exactly four bytes long. The `[..4]` slicing, however, panics if fewer than four
/// bytes remain at the last offset, i.e. when `buf.len()` is not a multiple of 4 and is
/// greater than `4 * N`.
/// NOTE(review): presumably callers always pass a length that is a multiple of 4 — confirm.
///
/// `N` is a const generic so that every lookback distance gets its own monomorphized loop.
fn four_bytes_predict<const N: usize>(buf: &mut [u8]) {
386+
for i in (N * 4..buf.len()).step_by(4) {
387+
let v = u32::from_ne_bytes(buf[i..][..4].try_into().unwrap());
388+
let p = u32::from_ne_bytes(buf[i - 4 * N..][..4].try_into().unwrap());
389+
buf[i..][..4].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
390+
}
391+
}
392+
393+
match (bit_depth, samples) {
394+
// Note we can't use `windows` or so due to the overlap between each iteration. We split
395+
// the cases by the samples / lookback constant so that each is optimized individually.
396+
// This is more code generated but each loop can then have a different vectorization
397+
// strategy.
398+
(0..=8, 1) => one_byte_predict::<1>(buf),
399+
(0..=8, 2) => one_byte_predict::<2>(buf),
400+
(0..=8, 3) => one_byte_predict::<3>(buf),
401+
(0..=8, 4) => one_byte_predict::<4>(buf),
402+
// The generic, sub-optimal case for the above.
403+
(0..=8, _) => {
373404
for i in samples..buf.len() {
374405
buf[i] = buf[i].wrapping_add(buf[i - samples]);
375406
}
376407
}
377-
9..=16 => {
408+
(9..=16, 1) => {
409+
two_bytes_predict::<1>(buf);
410+
}
411+
(9..=16, 2) => {
412+
two_bytes_predict::<2>(buf);
413+
}
414+
(9..=16, 3) => {
415+
two_bytes_predict::<3>(buf);
416+
}
417+
(9..=16, 4) => {
418+
two_bytes_predict::<4>(buf);
419+
}
420+
(9..=16, _) => {
378421
for i in (samples * 2..buf.len()).step_by(2) {
379422
let v = u16::from_ne_bytes(buf[i..][..2].try_into().unwrap());
380423
let p = u16::from_ne_bytes(buf[i - 2 * samples..][..2].try_into().unwrap());
381424
buf[i..][..2].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
382425
}
383426
}
384-
17..=32 => {
427+
(17..=32, 1) => {
428+
four_bytes_predict::<1>(buf);
429+
}
430+
(17..=32, 2) => {
431+
four_bytes_predict::<2>(buf);
432+
}
433+
(17..=32, 3) => {
434+
four_bytes_predict::<3>(buf);
435+
}
436+
(17..=32, 4) => {
437+
four_bytes_predict::<4>(buf);
438+
}
439+
(17..=32, _) => {
385440
for i in (samples * 4..buf.len()).step_by(4) {
386441
let v = u32::from_ne_bytes(buf[i..][..4].try_into().unwrap());
387442
let p = u32::from_ne_bytes(buf[i - 4 * samples..][..4].try_into().unwrap());
388443
buf[i..][..4].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
389444
}
390445
}
391-
33..=64 => {
446+
(33..=64, _) => {
392447
for i in (samples * 8..buf.len()).step_by(8) {
393448
let v = u64::from_ne_bytes(buf[i..][..8].try_into().unwrap());
394449
let p = u64::from_ne_bytes(buf[i - 8 * samples..][..8].try_into().unwrap());

0 commit comments

Comments
 (0)