Skip to content

Commit bec8e49

Browse files
authored
Merge pull request #308 from image-rs/vectorized-predict
const-specialize predict by sample counts
2 parents 2ccbe13 + de7dbad commit bec8e49

File tree

1 file changed

+60
-5
lines changed

1 file changed

+60
-5
lines changed

src/decoder/mod.rs

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -368,27 +368,82 @@ pub struct IfdDecoder<'lt> {
368368
}
369369

370370
fn rev_hpredict_nsamp(buf: &mut [u8], bit_depth: u8, samples: usize) {
371-
match bit_depth {
372-
0..=8 => {
371+
/// Accumulates one-byte samples in place with a fixed look-back of `N` bytes.
///
/// Every byte from index `N` onwards has the byte `N` positions before it
/// added to it using wrapping arithmetic. The earlier byte has already been
/// updated by the time it is read, so each of the `N` interleaved lanes ends
/// up holding a running (prefix) sum. The first `N` bytes are left unchanged,
/// and a buffer of `N` bytes or fewer is not modified at all.
///
/// `N` is a const parameter so that each look-back distance is compiled as
/// its own loop.
fn one_byte_predict<const N: usize>(buf: &mut [u8]) {
    for i in N..buf.len() {
        // `buf[i - N]` was written by an earlier iteration (or is one of the
        // untouched leading bytes); this dependency is what makes it a running sum.
        buf[i] = buf[i].wrapping_add(buf[i - N]);
    }
}
376+
377+
/// Accumulates two-byte samples in place with a fixed look-back of `N` samples.
///
/// `buf` is read as consecutive native-endian `u16` values. Starting at value
/// index `N`, each value has the value `N` positions before it added to it
/// using wrapping arithmetic. The earlier value has already been updated by
/// the time it is read, so each of the `N` interleaved lanes ends up holding a
/// running sum. The first `N` values are left unchanged, and a buffer of
/// `2 * N` bytes or fewer is not modified at all.
///
/// `N` is a const parameter so that each look-back distance is compiled as
/// its own loop.
///
/// # Panics
///
/// Panics if `buf.len() > 2 * N` and `buf.len()` is odd: the final step would
/// need a two-byte window that extends past the end of the buffer. All
/// complete values before that point have already been updated when the
/// panic occurs.
fn two_bytes_predict<const N: usize>(buf: &mut [u8]) {
    for i in (2 * N..buf.len()).step_by(2) {
        // The two-byte windows always have length 2, so the array conversions
        // cannot fail; a short trailing window panics in the slicing instead.
        let v = u16::from_ne_bytes(buf[i..][..2].try_into().unwrap());
        let p = u16::from_ne_bytes(buf[i - 2 * N..][..2].try_into().unwrap());
        buf[i..][..2].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
    }
}
384+
385+
/// Accumulates four-byte samples in place with a fixed look-back of `N` samples.
///
/// `buf` is read as consecutive native-endian `u32` values. Starting at value
/// index `N`, each value has the value `N` positions before it added to it
/// using wrapping arithmetic. The earlier value has already been updated by
/// the time it is read, so each of the `N` interleaved lanes ends up holding a
/// running sum. The first `N` values are left unchanged, and a buffer of
/// `4 * N` bytes or fewer is not modified at all.
///
/// `N` is a const parameter so that each look-back distance is compiled as
/// its own loop.
///
/// # Panics
///
/// Panics if `buf.len() > 4 * N` and `buf.len()` is not a multiple of 4: the
/// final step would need a four-byte window that extends past the end of the
/// buffer. All complete values before that point have already been updated
/// when the panic occurs.
fn four_bytes_predict<const N: usize>(buf: &mut [u8]) {
    for i in (N * 4..buf.len()).step_by(4) {
        // The four-byte windows always have length 4, so the array conversions
        // cannot fail; a short trailing window panics in the slicing instead.
        let v = u32::from_ne_bytes(buf[i..][..4].try_into().unwrap());
        let p = u32::from_ne_bytes(buf[i - 4 * N..][..4].try_into().unwrap());
        buf[i..][..4].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
    }
}
392+
393+
match (bit_depth, samples) {
394+
// Note that we can't use `windows` or similar because each iteration overlaps the previous one. We split
395+
// the cases by the samples / lookback constant so that each is optimized individually.
396+
// This generates more code, but each loop can then have a different vectorization
397+
// strategy.
398+
(0..=8, 1) => one_byte_predict::<1>(buf),
399+
(0..=8, 2) => one_byte_predict::<2>(buf),
400+
(0..=8, 3) => one_byte_predict::<3>(buf),
401+
(0..=8, 4) => one_byte_predict::<4>(buf),
402+
// The generic, sub-optimal case for the above.
403+
(0..=8, _) => {
373404
for i in samples..buf.len() {
374405
buf[i] = buf[i].wrapping_add(buf[i - samples]);
375406
}
376407
}
377-
9..=16 => {
408+
(9..=16, 1) => {
409+
two_bytes_predict::<1>(buf);
410+
}
411+
(9..=16, 2) => {
412+
two_bytes_predict::<2>(buf);
413+
}
414+
(9..=16, 3) => {
415+
two_bytes_predict::<3>(buf);
416+
}
417+
(9..=16, 4) => {
418+
two_bytes_predict::<4>(buf);
419+
}
420+
(9..=16, _) => {
378421
for i in (samples * 2..buf.len()).step_by(2) {
379422
let v = u16::from_ne_bytes(buf[i..][..2].try_into().unwrap());
380423
let p = u16::from_ne_bytes(buf[i - 2 * samples..][..2].try_into().unwrap());
381424
buf[i..][..2].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
382425
}
383426
}
384-
17..=32 => {
427+
(17..=32, 1) => {
428+
four_bytes_predict::<1>(buf);
429+
}
430+
(17..=32, 2) => {
431+
four_bytes_predict::<2>(buf);
432+
}
433+
(17..=32, 3) => {
434+
four_bytes_predict::<3>(buf);
435+
}
436+
(17..=32, 4) => {
437+
four_bytes_predict::<4>(buf);
438+
}
439+
(17..=32, _) => {
385440
for i in (samples * 4..buf.len()).step_by(4) {
386441
let v = u32::from_ne_bytes(buf[i..][..4].try_into().unwrap());
387442
let p = u32::from_ne_bytes(buf[i - 4 * samples..][..4].try_into().unwrap());
388443
buf[i..][..4].copy_from_slice(&(v.wrapping_add(p)).to_ne_bytes());
389444
}
390445
}
391-
33..=64 => {
446+
(33..=64, _) => {
392447
for i in (samples * 8..buf.len()).step_by(8) {
393448
let v = u64::from_ne_bytes(buf[i..][..8].try_into().unwrap());
394449
let p = u64::from_ne_bytes(buf[i - 8 * samples..][..8].try_into().unwrap());

0 commit comments

Comments
 (0)