Skip to content

Commit df82cf0

Browse files
committed
Only use LoadN fallback when compiling with GCC
Clang does not need the `LoadN` fallback and can happily use the scalar code. Coincidentally, this also avoids triggering a Clang crash when `ReorderWidenMulAccumulate` is used alongside capped vectors.
1 parent 0478c13 commit df82cf0

File tree

1 file changed

+36
-2
lines changed

1 file changed

+36
-2
lines changed

hwy/contrib/dot/dot-inl.h

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,12 +174,30 @@ struct Dot {
174174
(kAssumptions & kMultipleOfVector) != 0;
175175
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
176176

177-
// Won't be able to do a full vector load without padding => partial load.
177+
// Won't be able to do a full vector load without padding. Use a scalar
178+
// loop under Clang. GCC has very suboptimal codegen for scalar BF16->float
179+
// conversions, so use vector ops with LoadN instead.
180+
// TODO: https://github.com/google/highway/pull/2703
178181
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
179182
HWY_UNLIKELY(num_elements < NF)) {
183+
#if HWY_COMPILER_GCC_ACTUAL
180184
const VF a = LoadN(df, pa, num_elements);
181185
const VF b = PromoteTo(df, LoadN(dbfh, pb, num_elements));
182186
return ReduceSum(df, Mul(a, b));
187+
#else
188+
// Only 2x unroll to avoid excessive code size.
189+
float sum0 = 0.0f;
190+
float sum1 = 0.0f;
191+
size_t i = 0;
192+
for (; i + 2 <= num_elements; i += 2) {
193+
sum0 += pa[i + 0] * ConvertScalarTo<float>(pb[i + 0]);
194+
sum1 += pa[i + 1] * ConvertScalarTo<float>(pb[i + 1]);
195+
}
196+
for (; i < num_elements; ++i) {
197+
sum1 += pa[i] * ConvertScalarTo<float>(pb[i]);
198+
}
199+
return sum0 + sum1;
200+
#endif
183201
}
184202

185203
// Compiler doesn't make independent sum* accumulators, so unroll manually.
@@ -270,14 +288,30 @@ struct Dot {
270288
(kAssumptions & kMultipleOfVector) != 0;
271289
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
272290

273-
// Won't be able to do a full vector load without padding => partial load.
291+
// Won't be able to do a full vector load without padding. Use a scalar
292+
// loop under Clang. GCC has very suboptimal codegen for scalar BF16->float
293+
// conversions, so use vector ops with LoadN instead.
294+
// TODO: https://github.com/google/highway/pull/2703
274295
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
275296
HWY_UNLIKELY(num_elements < N)) {
297+
#if HWY_COMPILER_GCC_ACTUAL
276298
const auto a = LoadN(d, pa, num_elements);
277299
const auto b = LoadN(d, pb, num_elements);
278300
V sum1 = Zero(df32);
279301
V sum0 = ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
280302
return ReduceSum(df32, Add(sum0, sum1));
303+
#else
304+
float sum0 = 0.0f; // Only 2x unroll to avoid excessive code size for
305+
float sum1 = 0.0f; // this unlikely(?) case.
size_t i = 0;  // NOTE(review): this declaration is missing from the hunk as
               // rendered even though `i` is used below — the first hunk
               // declares it (its new line 191). Without it the Clang/scalar
               // branch does not compile; confirm against the repository.
306+
for (; i + 2 <= num_elements; i += 2) {
307+
sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
308+
sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
309+
}
310+
if (i < num_elements) {
311+
sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
312+
}
313+
return sum0 + sum1;
314+
#endif
281315
}
282316

283317
// See comment in the other Compute() overload. Unroll 2x, but we need

0 commit comments

Comments
 (0)