|
25 | 25 | #include <cstdlib> |
26 | 26 | #include <string_view> |
27 | 27 |
|
| 28 | +// Quick test |
| 29 | +#include <immintrin.h> |
| 30 | + |
28 | 31 | namespace uWS { |
29 | 32 |
|
30 | 33 | /* We should not overcomplicate these */ |
@@ -114,24 +117,59 @@ T cond_byte_swap(T value) { |
114 | 117 | // Optimized for predominantly 7-bit content by Alex Hultman, 2016 |
115 | 118 | // Licensed as Zlib, like the rest of this project |
116 | 119 | // This runs about 40% faster than simdutf with g++ -mavx |
117 | | -static bool isValidUtf8(unsigned char *s, size_t length) |
118 | | -{ |
119 | | - for (unsigned char *e = s + length; s != e; ) { |
120 | | - if (s + 16 <= e) { |
121 | | - uint64_t tmp[2]; |
122 | | - memcpy(tmp, s, 16); |
123 | | - if (((tmp[0] & 0x8080808080808080) | (tmp[1] & 0x8080808080808080)) == 0) { |
124 | | - s += 16; |
125 | | - continue; |
| 120 | +static bool isValidUtf8(unsigned char *s, size_t length) { |
| 121 | + auto firstUtf8EscapeByte = [](unsigned char *s, unsigned char *e) { |
| 122 | + // Align |
| 123 | + if (s + 32 <= e) { |
| 124 | + int mask = _mm256_movemask_epi8(_mm256_loadu_si256((const __m256i*)s)); |
| 125 | + if (mask) { |
| 126 | + return s + __builtin_ctz(mask); |
| 127 | + } |
| 128 | + s += 32 - ((uintptr_t)s % 32); |
| 129 | + } else { |
| 130 | + // Worst path |
| 131 | + while (s < e) { |
| 132 | + if (*s & 0x80) { |
| 133 | + return s; |
| 134 | + } |
| 135 | + s++; |
126 | 136 | } |
| 137 | + return e; |
127 | 138 | } |
128 | 139 |
|
129 | | - while (!(*s & 0x80)) { |
130 | | - if (++s == e) { |
131 | | - return true; |
| 140 | + while (s + 128 <= e) { |
| 141 | + // Aligned |
| 142 | + int mask = _mm256_movemask_epi8(_mm256_load_si256((const __m256i*)s)); |
| 143 | + if (mask) { |
| 144 | + return s + __builtin_ctz(mask); |
| 145 | + } |
| 146 | + s += 32; |
| 147 | + mask = _mm256_movemask_epi8(_mm256_load_si256((const __m256i*)s)); |
| 148 | + if (mask) { |
| 149 | + return s + __builtin_ctz(mask); |
132 | 150 | } |
| 151 | + s += 32; |
| 152 | + mask = _mm256_movemask_epi8(_mm256_load_si256((const __m256i*)s)); |
| 153 | + if (mask) { |
| 154 | + return s + __builtin_ctz(mask); |
| 155 | + } |
| 156 | + s += 32; |
| 157 | + mask = _mm256_movemask_epi8(_mm256_load_si256((const __m256i*)s)); |
| 158 | + if (mask) { |
| 159 | + return s + __builtin_ctz(mask); |
| 160 | + } |
| 161 | + s += 32; |
133 | 162 | } |
134 | 163 |
|
| 164 | + // Exit |
| 165 | + while ((*s & 0x80) == 0 && s < e) { |
| 166 | + s++; |
| 167 | + } |
| 168 | + return s; |
| 169 | + }; |
| 170 | + |
| 171 | + for (unsigned char *e = s + length; (s = (unsigned char *) firstUtf8EscapeByte(s, e)) != e; ) { |
| 172 | + |
135 | 173 | if ((s[0] & 0x60) == 0x40) { |
136 | 174 | if (s + 1 >= e || (s[1] & 0xc0) != 0x80 || (s[0] & 0xfe) == 0xc0) { |
137 | 175 | return false; |
|
0 commit comments