Skip to content

Commit 7133db6

Browse files
authored
Merge: Lifetimes, UBs, and Documentation!
2 parents a4582ce + 5219a4d commit 7133db6

File tree

9 files changed

+331
-190
lines changed

9 files changed

+331
-190
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ build_release/
55
build_relwithdebinfo/
66
build_go/
77
build_golang/
8+
build_test/
89
build_artifacts*
910

1011
# Yes, everyone loves keeping this file in the history.

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,10 +1089,10 @@ Similar to GCC, StringZilla is 32 bytes in size, and similar to Clang it can fit
10891089
Our layout might be preferable, if you want to avoid branches.
10901090
If you use a different compiler, you may want to check its SSO buffer size with a [simple Gist](https://gist.github.com/ashvardanian/c197f15732d9855c4e070797adf17b21).
10911091

1092-
| | `libstdc++` in GCC 13 | `libc++` in Clang 17 | StringZilla |
1093-
| :-------------------- | ---------------------: | -------------------: | ----------: |
1094-
| `sizeof(std::string)` | 32 | 24 | 32 |
1095-
| Small String Capacity | 15 | __22__ | __22__ |
1092+
| | `libstdc++` in GCC 13 | `libc++` in Clang 17 | StringZilla |
1093+
| :-------------- | ---------------------: | -------------------: | ----------: |
1094+
| String `sizeof` | 32 | 24 | 32 |
1095+
| Inner Capacity | 15 | __22__ | __22__ |
10961096

10971097
This design has been since ported to many high-level programming languages.
10981098
Swift, for example, [can store 15 bytes](https://developer.apple.com/documentation/swift/substring/withutf8(_:)#discussion) in the `String` instance itself.

include/stringzilla/find.h

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -905,7 +905,7 @@ SZ_PUBLIC sz_cptr_t sz_find_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
905905
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
906906

907907
// Broadcast those characters into XMM registers.
908-
int matches;
908+
sz_u32_vec_t matches_vec;
909909
sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
910910
n_first_vec.xmm = _mm_set1_epi8(n[offset_first]);
911911
n_mid_vec.xmm = _mm_set1_epi8(n[offset_mid]);
@@ -916,14 +916,14 @@ SZ_PUBLIC sz_cptr_t sz_find_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
916916
h_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(h + offset_first));
917917
h_mid_vec.xmm = _mm_lddqu_si128((__m128i const *)(h + offset_mid));
918918
h_last_vec.xmm = _mm_lddqu_si128((__m128i const *)(h + offset_last));
919-
matches = //
919+
matches_vec.i32 = //
920920
_mm_movemask_epi8(_mm_cmpeq_epi8(h_first_vec.xmm, n_first_vec.xmm)) &
921921
_mm_movemask_epi8(_mm_cmpeq_epi8(h_mid_vec.xmm, n_mid_vec.xmm)) &
922922
_mm_movemask_epi8(_mm_cmpeq_epi8(h_last_vec.xmm, n_last_vec.xmm));
923-
while (matches) {
924-
int potential_offset = sz_u32_ctz(matches);
923+
while (matches_vec.u32) {
924+
int potential_offset = sz_u32_ctz(matches_vec.u32);
925925
if (sz_equal_westmere(h + potential_offset, n, n_length)) return h + potential_offset;
926-
matches &= matches - 1;
926+
matches_vec.u32 &= matches_vec.u32 - 1;
927927
}
928928
}
929929

@@ -940,7 +940,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
940940
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
941941

942942
// Broadcast those characters into XMM registers.
943-
int matches;
943+
sz_u32_vec_t matches_vec;
944944
sz_u128_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
945945
n_first_vec.xmm = _mm_set1_epi8(n[offset_first]);
946946
n_mid_vec.xmm = _mm_set1_epi8(n[offset_mid]);
@@ -953,15 +953,15 @@ SZ_PUBLIC sz_cptr_t sz_rfind_westmere(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
953953
h_first_vec.xmm = _mm_lddqu_si128((__m128i const *)(h_reversed + offset_first));
954954
h_mid_vec.xmm = _mm_lddqu_si128((__m128i const *)(h_reversed + offset_mid));
955955
h_last_vec.xmm = _mm_lddqu_si128((__m128i const *)(h_reversed + offset_last));
956-
matches = //
956+
matches_vec.i32 = //
957957
_mm_movemask_epi8(_mm_cmpeq_epi8(h_first_vec.xmm, n_first_vec.xmm)) &
958958
_mm_movemask_epi8(_mm_cmpeq_epi8(h_mid_vec.xmm, n_mid_vec.xmm)) &
959959
_mm_movemask_epi8(_mm_cmpeq_epi8(h_last_vec.xmm, n_last_vec.xmm));
960-
while (matches) {
961-
int potential_offset = sz_u32_clz(matches) - 16;
960+
while (matches_vec.u32) {
961+
int potential_offset = sz_u32_clz(matches_vec.u32) - 16;
962962
if (sz_equal_westmere(h + h_length - n_length - potential_offset, n, n_length))
963963
return h + h_length - n_length - potential_offset;
964-
matches &= ~(1 << (15 - potential_offset));
964+
matches_vec.u32 &= ~(1u << (15 - potential_offset));
965965
}
966966
}
967967

@@ -1029,7 +1029,7 @@ SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n
10291029
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
10301030

10311031
// Broadcast those characters into YMM registers.
1032-
int matches;
1032+
sz_u32_vec_t matches_vec;
10331033
sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
10341034
n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
10351035
n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
@@ -1040,14 +1040,14 @@ SZ_PUBLIC sz_cptr_t sz_find_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n
10401040
h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_first));
10411041
h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_mid));
10421042
h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h + offset_last));
1043-
matches = //
1043+
matches_vec.i32 = //
10441044
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
10451045
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
10461046
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
1047-
while (matches) {
1048-
int potential_offset = sz_u32_ctz(matches);
1047+
while (matches_vec.u32) {
1048+
int potential_offset = sz_u32_ctz(matches_vec.u32);
10491049
if (sz_equal_haswell(h + potential_offset, n, n_length)) return h + potential_offset;
1050-
matches &= matches - 1;
1050+
matches_vec.u32 &= matches_vec.u32 - 1;
10511051
}
10521052
}
10531053

@@ -1065,7 +1065,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
10651065
sz_locate_needle_anomalies_(n, n_length, &offset_first, &offset_mid, &offset_last);
10661066

10671067
// Broadcast those characters into YMM registers.
1068-
int matches;
1068+
sz_u32_vec_t matches_vec;
10691069
sz_u256_vec_t h_first_vec, h_mid_vec, h_last_vec, n_first_vec, n_mid_vec, n_last_vec;
10701070
n_first_vec.ymm = _mm256_set1_epi8(n[offset_first]);
10711071
n_mid_vec.ymm = _mm256_set1_epi8(n[offset_mid]);
@@ -1078,15 +1078,15 @@ SZ_PUBLIC sz_cptr_t sz_rfind_haswell(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
10781078
h_first_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_first));
10791079
h_mid_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_mid));
10801080
h_last_vec.ymm = _mm256_lddqu_si256((__m256i const *)(h_reversed + offset_last));
1081-
matches = //
1081+
matches_vec.i32 = //
10821082
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_first_vec.ymm, n_first_vec.ymm)) &
10831083
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_mid_vec.ymm, n_mid_vec.ymm)) &
10841084
_mm256_movemask_epi8(_mm256_cmpeq_epi8(h_last_vec.ymm, n_last_vec.ymm));
1085-
while (matches) {
1086-
int potential_offset = sz_u32_clz(matches);
1085+
while (matches_vec.u32) {
1086+
int potential_offset = sz_u32_clz(matches_vec.u32);
10871087
if (sz_equal_haswell(h + h_length - n_length - potential_offset, n, n_length))
10881088
return h + h_length - n_length - potential_offset;
1089-
matches &= ~(1 << (31 - potential_offset));
1089+
matches_vec.u32 &= ~(1u << (31 - potential_offset));
10901090
}
10911091
}
10921092

@@ -1388,9 +1388,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
13881388
int potential_offset = sz_u64_clz(matches);
13891389
if (n_length <= 3 || sz_equal_skylake(h + h_length - n_length - potential_offset, n, n_length))
13901390
return h + h_length - n_length - potential_offset;
1391-
sz_assert_((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 &&
1392-
"The bit must be set before we squash it");
1393-
matches &= ~((sz_u64_t)1 << (63 - potential_offset));
1391+
sz_assert_((matches & (1ull << (63 - potential_offset))) != 0 && "The bit must be set before we squash it");
1392+
matches &= ~(1ull << (63 - potential_offset));
13941393
}
13951394
}
13961395

@@ -1409,9 +1408,8 @@ SZ_PUBLIC sz_cptr_t sz_rfind_skylake(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
14091408
int potential_offset = sz_u64_clz(matches);
14101409
if (n_length <= 3 || sz_equal_skylake(h + 64 - potential_offset - 1, n, n_length))
14111410
return h + 64 - potential_offset - 1;
1412-
sz_assert_((matches & ((sz_u64_t)1 << (63 - potential_offset))) != 0 &&
1413-
"The bit must be set before we squash it");
1414-
matches &= ~((sz_u64_t)1 << (63 - potential_offset));
1411+
sz_assert_((matches & (1ull << (63 - potential_offset))) != 0 && "The bit must be set before we squash it");
1412+
matches &= ~(1ull << (63 - potential_offset));
14151413
}
14161414
}
14171415

include/stringzilla/intersect.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_serial(
288288
// Allocate memory for the hash table and initialize it with 0xFF.
289289
// The higher the `hash_table_slots` multiple, the more memory we will use,
290290
// but the less likely the collisions will be.
291-
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1 << SZ_SEQUENCE_INTERSECT_BUDGET);
291+
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1u << SZ_SEQUENCE_INTERSECT_BUDGET);
292292
sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t);
293293
sz_size_t *const table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc);
294294
if (!table_positions) return sz_bad_alloc_k;
@@ -414,7 +414,7 @@ SZ_PUBLIC sz_status_t sz_sequence_intersect_ice(
414414
// Allocate memory for the hash table and initialize it with 0xFF.
415415
// The higher is the `hash_table_slots` multiple - the more memory we will use,
416416
// but the less likely the collisions will be.
417-
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1 << SZ_SEQUENCE_INTERSECT_BUDGET);
417+
sz_size_t const hash_table_slots = sz_size_bit_ceil(small_sequence->count) * (1u << SZ_SEQUENCE_INTERSECT_BUDGET);
418418
sz_size_t const bytes_per_entry = sizeof(sz_size_t) + sizeof(sz_u64_t);
419419
sz_size_t *table_positions = (sz_size_t *)alloc->allocate(hash_table_slots * bytes_per_entry, alloc);
420420
if (!table_positions) return sz_bad_alloc_k;

include/stringzilla/memory.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,7 @@ SZ_PUBLIC void sz_lookup_neon(sz_ptr_t target, sz_size_t length, sz_cptr_t sourc
11941194

11951195
sz_size_t head_length = (16 - ((sz_size_t)target % 16)) % 16; // 15 or less.
11961196
sz_size_t tail_length = (sz_size_t)(target + length) % 16; // 15 or less.
1197+
sz_size_t body_length = length - head_length - tail_length;
11971198

11981199
// We need to pull the lookup table into 16x NEON registers. We have a total of 32 such registers.
11991200
// According to the Neoverse V2 manual, the 4-table lookup has a latency of 6 cycles, and 4x throughput.
@@ -1216,9 +1217,7 @@ SZ_PUBLIC void sz_lookup_neon(sz_ptr_t target, sz_size_t length, sz_cptr_t sourc
12161217
// to perform a 4-table lookup in a single instruction. The XORs are used to adjust the lookup position
12171218
// within each 64-byte range of the table.
12181219
// Details on the 4-table lookup: https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/
1219-
length -= head_length;
1220-
length -= tail_length;
1221-
for (; length >= 16; source += 16, target += 16, length -= 16) {
1220+
for (; body_length >= 16; source += 16, target += 16, body_length -= 16) {
12221221
source_vec.u8x16 = vld1q_u8((sz_u8_t const *)source);
12231222
lookup_0_to_63_vec.u8x16 = vqtbl4q_u8(lut_0_to_63_vec, source_vec.u8x16);
12241223
lookup_64_to_127_vec.u8x16 = vqtbl4q_u8(lut_64_to_127_vec, veorq_u8(source_vec.u8x16, vdupq_n_u8(0x40)));

0 commit comments

Comments
 (0)