|
39 | 39 | # error INFLATE_CHUNK_SIMD_* requires INFLATE_CHUNK_READ_64LE
|
40 | 40 | #endif
|
41 | 41 |
|
42 |
| -#ifdef __aarch64__ |
43 |
| -#include <arm_neon.h> |
44 |
| -static uint8x16_t distance_table[] = { |
45 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
46 |
| - {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, |
47 |
| - {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, |
48 |
| - {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, |
49 |
| - {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, |
50 |
| - {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}, |
51 |
| - {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3}, |
52 |
| - {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1}, |
53 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}, |
54 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6}, |
55 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5}, |
56 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4}, |
57 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3}, |
58 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2}, |
59 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1}, |
60 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0}, |
61 |
| - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, |
62 |
| -}; |
63 |
| - |
64 |
| -static uint64_t distance_offsets[] = {16, 16, 16, 15, 16, 15, 12, 14, 16, 9, 10, 11, 12, 13, 14, 15, 16}; |
65 |
| -#endif |
66 |
| - |
67 | 42 | /*
|
68 | 43 | Decode literal, length, and distance codes and write out the resulting
|
69 | 44 | literal and match bytes until either not enough input or output is
|
@@ -347,39 +322,12 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
|
347 | 322 | else {
|
348 | 323 | /* Whole reference is in range of current output. No
|
349 | 324 | range checks are necessary because we start with room
|
350 |
| - for at least 296 bytes of output, so unroll and roundoff |
| 325 | + for at least 258 bytes of output, so unroll and roundoff |
351 | 326 | operations can write beyond `out+len` so long as they
|
352 |
| - stay within 296 bytes of `out`. |
| 327 | + stay within 258 bytes of `out`. |
353 | 328 | */
|
354 |
| -#ifdef __aarch64__ |
355 |
| - uint8_t *p = out - dist; |
356 |
| - if (dist <= 16) { |
357 |
| - uint8x16_t rep = vqtbl1q_u8(vld1q_u8(p), distance_table[dist]); |
358 |
| - uint64_t size = distance_offsets[dist]; |
359 |
| - uint8_t *o = out; |
360 |
| - int64_t n = len; |
361 |
| - do { |
362 |
| - vst1q_u8(o, rep); |
363 |
| - vst1q_u8(o+size, rep); |
364 |
| - vst1q_u8(o+size*2, rep); |
365 |
| - o += size*3; |
366 |
| - n -= size*3; |
367 |
| - } while (n > 0); |
368 |
| - out += len; |
369 |
| - } else { |
370 |
| - int64_t i = 0; |
371 |
| - do { |
372 |
| - vst1q_u8(out + i, vld1q_u8(p + i)); |
373 |
| - vst1q_u8(out + i+16, vld1q_u8(p + i+16)); |
374 |
| - vst1q_u8(out + i+32, vld1q_u8(p + i+32)); |
375 |
| - i += 48; |
376 |
| - } while (i < len); |
377 |
| - out += len; |
378 |
| - } |
379 |
| -#else |
380 | 329 | out = chunkcopy_lapped_relaxed(out, dist, len);
|
381 | 330 |
|
382 |
| -#endif |
383 | 331 | }
|
384 | 332 |
|
385 | 333 | chunk_continue:
|
|
0 commit comments