Skip to content

Commit 35ab598

Browse files
committed
Revert "Inflate: Simpler, slightly faster NEON chunk-copy"
This reverts commit a2f4bd2, which appears to have caused bug #3
1 parent a2f4bd2 commit 35ab598

File tree

2 files changed

+3
-55
lines changed

2 files changed

+3
-55
lines changed

inffast.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@
2121
length/distance code pair can output up to 258 bytes, which is the maximum
2222
length that can be coded.
2323
*/
24-
#define INFLATE_FAST_MIN_OUTPUT 298
24+
#define INFLATE_FAST_MIN_OUTPUT 258
2525

2626
void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));

inffast_chunk.c

Lines changed: 2 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -39,31 +39,6 @@
3939
# error INFLATE_CHUNK_SIMD_* requires INFLATE_CHUNK_READ_64LE
4040
#endif
4141

42-
#ifdef __aarch64__
43-
#include <arm_neon.h>
44-
static uint8x16_t distance_table[] = {
45-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
46-
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
47-
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
48-
{0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
49-
{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
50-
{0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
51-
{0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
52-
{0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
53-
{0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7},
54-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6},
55-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5},
56-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4},
57-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3},
58-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2},
59-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1},
60-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0},
61-
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
62-
};
63-
64-
static uint64_t distance_offsets[] = {16, 16, 16, 15, 16, 15, 12, 14, 16, 9, 10, 11, 12, 13, 14, 15, 16};
65-
#endif
66-
6742
/*
6843
Decode literal, length, and distance codes and write out the resulting
6944
literal and match bytes until either not enough input or output is
@@ -347,39 +322,12 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
347322
else {
348323
/* Whole reference is in range of current output. No
349324
range checks are necessary because we start with room
350-
for at least 296 bytes of output, so unroll and roundoff
325+
for at least 258 bytes of output, so unroll and roundoff
351326
operations can write beyond `out+len` so long as they
352-
stay within 296 bytes of `out`.
327+
stay within 258 bytes of `out`.
353328
*/
354-
#ifdef __aarch64__
355-
uint8_t *p = out - dist;
356-
if (dist <= 16) {
357-
uint8x16_t rep = vqtbl1q_u8(vld1q_u8(p), distance_table[dist]);
358-
uint64_t size = distance_offsets[dist];
359-
uint8_t *o = out;
360-
int64_t n = len;
361-
do {
362-
vst1q_u8(o, rep);
363-
vst1q_u8(o+size, rep);
364-
vst1q_u8(o+size*2, rep);
365-
o += size*3;
366-
n -= size*3;
367-
} while (n > 0);
368-
out += len;
369-
} else {
370-
int64_t i = 0;
371-
do {
372-
vst1q_u8(out + i, vld1q_u8(p + i));
373-
vst1q_u8(out + i+16, vld1q_u8(p + i+16));
374-
vst1q_u8(out + i+32, vld1q_u8(p + i+32));
375-
i += 48;
376-
} while (i < len);
377-
out += len;
378-
}
379-
#else
380329
out = chunkcopy_lapped_relaxed(out, dist, len);
381330

382-
#endif
383331
}
384332

385333
chunk_continue:

0 commit comments

Comments
 (0)