Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 62 additions & 9 deletions zmij.cc
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,14 @@ struct fixed_layout_table {
unsigned char point_pos;
// Start position for shifting digits right by one to insert the point.
unsigned char shift_pos;
#if ZMIJ_USE_SSE4_1
// Buffer-relative position of the last_digit byte, indexed by extra_digit.
// This is only for bcd_size == 16, i.e. doubles.
unsigned char last_digit_pos[2];
// Offset added when loading the bswap constant to correctly shift the digits
// to account for the decimal point.
unsigned char shift_extra;
#endif
// Offset past the end of fixed-notation output, indexed by sig length - 1.
unsigned char end_pos[traits::max_digits10];
};
Expand All @@ -569,6 +577,16 @@ struct fixed_layout_table {
e.point_pos = dec_exp >= 0 ? 1 + dec_exp : 1;
e.shift_pos = e.point_pos + (dec_exp >= 0);

#if ZMIJ_USE_SSE4_1
e.shift_extra = dec_exp >= 0 ? e.point_pos : 0;
for (int extra = 0; extra < 2; ++extra) {
constexpr int bcd_size = 16;
int pre = bcd_size + extra - 1;
e.last_digit_pos[extra] =
pre + (pre >= e.point_pos ? e.shift_pos - e.point_pos : 0);
}
#endif

for (int n = 1; n <= traits::max_digits10; ++n) {
int end_pos = n;
if (dec_exp >= 0) end_pos = n > dec_exp + 1 ? n + 1 : dec_exp + 1;
Expand Down Expand Up @@ -635,7 +653,7 @@ inline auto write_if(char* buffer, uint32_t digit, bool condition) noexcept
return buffer + condition;
}

struct data {
struct alignas(64) data {
static constexpr auto splat64(uint64_t x) -> uint128 { return {x, x}; }
static constexpr auto splat32(uint32_t x) -> uint128 {
return splat64(uint64_t(x) << 32 | x);
Expand Down Expand Up @@ -673,8 +691,12 @@ struct data {
# if ZMIJ_USE_SSE4_1
uint128 neg100 = splat32(::neg100);
uint128 neg10 = splat16((1 << 8) - 10);
uint128 bswap = uint128{pack8(15, 14, 13, 12, 11, 10, 9, 8),
pack8(7, 6, 5, 4, 3, 2, 1, 0)};
// We will read from bswap at offsets for fixed format output. Accommodate
// the necessary offset calculations by using char*.
// This is aligned to 64 bytes, so we won't cross a cache line when reading,
// even with offsets.
char bswap[16] = {15, 14, 13, 12, 11, 10, 9, 8,
7, 6, 5, 4, 3, 2, 1, 0};
# else
uint128 hundred = splat32(100);
uint128 moddiv10 = splat16(10 * (1 << 8) - 1);
Expand All @@ -689,12 +711,14 @@ struct data {
alignas(64) pow10_significand_table pow10_significands;
fixed_layout_table fixed_layouts;

// Shuffle indices for SIMD digit shift. Offset 0 = identity, offset 1 =
#if ZMIJ_USE_NEON
// Shuffle indices for NEON's write_digits. Offset 0 = identity, offset 1 =
// shift left by 1 (drops the leading '0' of a 16-digit significand).
unsigned char shift_shuffle[17] = {0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 0};
#endif
};
alignas(64) constexpr data static_data;
constexpr data static_data;

#if ZMIJ_USE_NEON // An optimized version for NEON by Dougall Johnson.

Expand Down Expand Up @@ -854,6 +878,8 @@ template <> struct dec_digits<64> {
// Converts a significand to decimal digits, removing trailing zeros. value has
// up to 17 decimal digits (16-17 for normals) for double (num_bits == 64) and
// up to 9 digits (8-9 for normals) for float.
// SSE4.1 returns the bytes in reverse order to save one shuffle on the
// fixed format path.
template <int num_bits>
ZMIJ_INLINE auto to_digits(uint64_t value, const data& d) noexcept
-> dec_digits<num_bits> {
Expand Down Expand Up @@ -896,9 +922,6 @@ ZMIJ_INLINE auto to_digits(uint64_t value, const data& d) noexcept
uint64_t mask = _mm_movemask_epi8(_mm_cmpgt_epi8(bcd, _mm_setzero_si128()));
// Trailing zeros are in the low bits for SSE4.1, the high bits for SSE2.
int len = ZMIJ_USE_SSE4_1 ? 16 - ctz(mask) : 64 - clz(mask);
# if ZMIJ_USE_SSE4_1
bcd = _mm_shuffle_epi8(bcd, _mm_load_si128(m128ptr(&d.bswap))); // SSSE3
# endif
return {_mm_or_si128(bcd, zeros), len};
#endif // ZMIJ_USE_SSE
}
Expand Down Expand Up @@ -926,7 +949,7 @@ ZMIJ_INLINE void write_digits(char* buffer, dec_digits<64>::digits_type digits,
vst1q_u8(reinterpret_cast<uint8_t*>(buffer), shifted);
#elif ZMIJ_USE_SSE4_1
__m128i shuffle = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(d.shift_shuffle + drop_leading_zero));
reinterpret_cast<const __m128i*>(d.bswap + drop_leading_zero));
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer),
_mm_shuffle_epi8(digits, shuffle));
#endif
Expand Down Expand Up @@ -1142,6 +1165,29 @@ auto write(Float value, char* buffer) noexcept -> char* {

const auto& layout = fixed_layouts->get(dec_exp);
buffer += layout.start_pos;
#if ZMIJ_USE_SSE4_1
if (bcd_size == 16) {
// dig.digits is uint64_t for float, alias as __m128i to avoid a compiler error.
auto& digits = reinterpret_cast<const __m128i&>(dig.digits);
// Two pshufbs over dig.digits, each loading its shuffle table from
// d->bswap + some offset. For dec_exp >= 0 the first shuffle and store move
// the integral part into place, and the second takes care of the decimal part.
// For dec_exp < 0 the digits are contiguous, and only one shuffle is needed,
// but it is cheaper to not branch and instead do the same thing twice.
const char* bswap_base = d->bswap + !extra_digit;
__m128i digits_shifted = _mm_shuffle_epi8(digits, _mm_loadu_si128(m128ptr(bswap_base)));
memcpy(buffer, &digits_shifted, bcd_size);
// For the case dec_exp < 0, shift_extra == 0, so this repeats the previous shuffle
// and store. This is faster than branching.
unsigned shift_extra = layout.shift_extra;
__m128i digits_shifted_2 = _mm_shuffle_epi8(digits,
_mm_loadu_si128(m128ptr(bswap_base + shift_extra)));
memcpy(buffer + shift_extra + (shift_extra != 0), &digits_shifted_2, bcd_size);
buffer[layout.last_digit_pos[extra_digit]] = last_digit;
start[layout.point_pos] = '.';
return buffer + layout.end_pos[num_digits + extra_digit - 1];
}
#endif
write_digits(buffer, dig.digits, !extra_digit, *d);
buffer[bcd_size + extra_digit - 1] = last_digit;
unsigned point_pos = layout.point_pos;
Expand All @@ -1150,6 +1196,13 @@ auto write(Float value, char* buffer) noexcept -> char* {
return buffer + layout.end_pos[num_digits + extra_digit - 1];
}
buffer += extra_digit;
#if ZMIJ_USE_SSE4_1
if (bcd_size == 16) {
// dig.digits is uint64_t for float, alias as __m128i to avoid a compiler error.
auto& digits = reinterpret_cast<__m128i&>(dig.digits);
digits = _mm_shuffle_epi8(digits, _mm_load_si128(m128ptr(d->bswap)));
}
#endif
memcpy(buffer, &dig.digits, bcd_size);
buffer[bcd_size] = '0' + dec.last_digit;
buffer += select(has_last_digit, bcd_size + 1, dig.num_digits);
Expand Down