@@ -368,27 +368,82 @@ pub struct IfdDecoder<'lt> {
368368}
369369
370370fn rev_hpredict_nsamp ( buf : & mut [ u8 ] , bit_depth : u8 , samples : usize ) {
371- match bit_depth {
372- 0 ..=8 => {
/// Adds, in place and with wrapping arithmetic, the byte `N` positions earlier to every
/// byte of `buf` from index `N` onwards.
///
/// Positions are visited front to back, so the look-back byte has already been updated
/// whenever it lies at index `N` or later. Buffers shorter than or equal to `N` bytes are
/// left untouched.
fn one_byte_predict<const N: usize>(buf: &mut [u8]) {
    (N..buf.len()).for_each(|cur| {
        // Read the (possibly already updated) look-back byte before writing.
        let prev = buf[cur - N];
        buf[cur] = buf[cur].wrapping_add(prev);
    });
}
376+
/// Treats `buf` as a sequence of native-endian `u16` values and adds, in place and with
/// wrapping arithmetic, the value `N` elements earlier to every element from index `N`
/// onwards.
///
/// Elements are visited front to back, so the look-back value has already been updated
/// whenever it lies at element `N` or later.
///
/// # Panics
///
/// Panics if the loop reaches a trailing element that has fewer than two bytes left.
fn two_bytes_predict<const N: usize>(buf: &mut [u8]) {
    const WIDTH: usize = 2;
    let lookback = WIDTH * N;

    for pos in (lookback..buf.len()).step_by(WIDTH) {
        // Load the current value first; this is where a truncated tail is detected.
        let cur = u16::from_ne_bytes([buf[pos], buf[pos + 1]]);
        let back = pos - lookback;
        let prev = u16::from_ne_bytes([buf[back], buf[back + 1]]);

        let [first, second] = cur.wrapping_add(prev).to_ne_bytes();
        buf[pos] = first;
        buf[pos + 1] = second;
    }
}
384+
/// Treats `buf` as a sequence of native-endian `u32` values and adds, in place and with
/// wrapping arithmetic, the value `N` elements earlier to every element from index `N`
/// onwards.
///
/// Elements are visited front to back, so the look-back value has already been updated
/// whenever it lies at element `N` or later.
///
/// # Panics
///
/// Panics if the loop reaches a trailing element that has fewer than four bytes left.
fn four_bytes_predict<const N: usize>(buf: &mut [u8]) {
    const WIDTH: usize = 4;
    let lookback = N * WIDTH;

    // Scratch space for the two values being combined in each step.
    let mut cur = [0u8; WIDTH];
    let mut prev = [0u8; WIDTH];

    for pos in (lookback..buf.len()).step_by(WIDTH) {
        // Slicing `pos..pos + WIDTH` is what rejects a truncated trailing element.
        cur.copy_from_slice(&buf[pos..pos + WIDTH]);
        let back = pos - lookback;
        prev.copy_from_slice(&buf[back..back + WIDTH]);

        let sum = u32::from_ne_bytes(cur).wrapping_add(u32::from_ne_bytes(prev));
        buf[pos..pos + WIDTH].copy_from_slice(&sum.to_ne_bytes());
    }
}
392+
393+ match ( bit_depth, samples) {
394+ // Note we can't use `windows` or so due to the overlap between each iteration. We split
395+ // the cases by the samples / lookback constant so that each is optimized individually.
396+ // This is more code generated but each loop can then have a different vectorization
397+ // strategy.
398+ ( 0 ..=8 , 1 ) => one_byte_predict :: < 1 > ( buf) ,
399+ ( 0 ..=8 , 2 ) => one_byte_predict :: < 2 > ( buf) ,
400+ ( 0 ..=8 , 3 ) => one_byte_predict :: < 3 > ( buf) ,
401+ ( 0 ..=8 , 4 ) => one_byte_predict :: < 4 > ( buf) ,
402+ // The generic, sub-optimal case for the above.
403+ ( 0 ..=8 , _) => {
373404 for i in samples..buf. len ( ) {
374405 buf[ i] = buf[ i] . wrapping_add ( buf[ i - samples] ) ;
375406 }
376407 }
377- 9 ..=16 => {
408+ ( 9 ..=16 , 1 ) => {
409+ two_bytes_predict :: < 1 > ( buf) ;
410+ }
411+ ( 9 ..=16 , 2 ) => {
412+ two_bytes_predict :: < 2 > ( buf) ;
413+ }
414+ ( 9 ..=16 , 3 ) => {
415+ two_bytes_predict :: < 3 > ( buf) ;
416+ }
417+ ( 9 ..=16 , 4 ) => {
418+ two_bytes_predict :: < 4 > ( buf) ;
419+ }
420+ ( 9 ..=16 , _) => {
378421 for i in ( samples * 2 ..buf. len ( ) ) . step_by ( 2 ) {
379422 let v = u16:: from_ne_bytes ( buf[ i..] [ ..2 ] . try_into ( ) . unwrap ( ) ) ;
380423 let p = u16:: from_ne_bytes ( buf[ i - 2 * samples..] [ ..2 ] . try_into ( ) . unwrap ( ) ) ;
381424 buf[ i..] [ ..2 ] . copy_from_slice ( & ( v. wrapping_add ( p) ) . to_ne_bytes ( ) ) ;
382425 }
383426 }
384- 17 ..=32 => {
427+ ( 17 ..=32 , 1 ) => {
428+ four_bytes_predict :: < 1 > ( buf) ;
429+ }
430+ ( 17 ..=32 , 2 ) => {
431+ four_bytes_predict :: < 2 > ( buf) ;
432+ }
433+ ( 17 ..=32 , 3 ) => {
434+ four_bytes_predict :: < 3 > ( buf) ;
435+ }
436+ ( 17 ..=32 , 4 ) => {
437+ four_bytes_predict :: < 4 > ( buf) ;
438+ }
439+ ( 17 ..=32 , _) => {
385440 for i in ( samples * 4 ..buf. len ( ) ) . step_by ( 4 ) {
386441 let v = u32:: from_ne_bytes ( buf[ i..] [ ..4 ] . try_into ( ) . unwrap ( ) ) ;
387442 let p = u32:: from_ne_bytes ( buf[ i - 4 * samples..] [ ..4 ] . try_into ( ) . unwrap ( ) ) ;
388443 buf[ i..] [ ..4 ] . copy_from_slice ( & ( v. wrapping_add ( p) ) . to_ne_bytes ( ) ) ;
389444 }
390445 }
391- 33 ..=64 => {
446+ ( 33 ..=64 , _ ) => {
392447 for i in ( samples * 8 ..buf. len ( ) ) . step_by ( 8 ) {
393448 let v = u64:: from_ne_bytes ( buf[ i..] [ ..8 ] . try_into ( ) . unwrap ( ) ) ;
394449 let p = u64:: from_ne_bytes ( buf[ i - 8 * samples..] [ ..8 ] . try_into ( ) . unwrap ( ) ) ;
0 commit comments