@@ -174,12 +174,30 @@ struct Dot {
174174 (kAssumptions & kMultipleOfVector ) != 0 ;
175175 constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector ) != 0 ;
176176
177- // Won't be able to do a full vector load without padding => partial load.
177+ // Won't be able to do a full vector load without padding. Use a scalar
178+ // loop on compilers other than GCC (e.g. Clang). GCC has very suboptimal
179+ // codegen for scalar BF16->float conversions, so use LoadN vector ops there.
180+ // TODO: https://github.com/google/highway/pull/2703
178181 if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
179182 HWY_UNLIKELY (num_elements < NF)) {
183+ #if HWY_COMPILER_GCC_ACTUAL
180184 const VF a = LoadN (df, pa, num_elements);
181185 const VF b = PromoteTo (df, LoadN (dbfh, pb, num_elements));
182186 return ReduceSum (df, Mul (a, b));
187+ #else
188+ // Only 2x unroll to avoid excessive code size.
189+ float sum0 = 0 .0f ;
190+ float sum1 = 0 .0f ;
191+ size_t i = 0 ;
192+ for (; i + 2 <= num_elements; i += 2 ) {
193+ sum0 += pa[i + 0 ] * ConvertScalarTo<float >(pb[i + 0 ]);
194+ sum1 += pa[i + 1 ] * ConvertScalarTo<float >(pb[i + 1 ]);
195+ }
196+ for (; i < num_elements; ++i) {
197+ sum1 += pa[i] * ConvertScalarTo<float >(pb[i]);
198+ }
199+ return sum0 + sum1;
200+ #endif
183201 }
184202
185203 // Compiler doesn't make independent sum* accumulators, so unroll manually.
@@ -270,14 +288,30 @@ struct Dot {
270288 (kAssumptions & kMultipleOfVector ) != 0 ;
271289 constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector ) != 0 ;
272290
273- // Won't be able to do a full vector load without padding => partial load.
291+ // Won't be able to do a full vector load without padding. Use a scalar
292+ // loop on compilers other than GCC (e.g. Clang). GCC has very suboptimal
293+ // codegen for scalar BF16->float conversions, so use LoadN vector ops there.
294+ // TODO: https://github.com/google/highway/pull/2703
274295 if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
275296 HWY_UNLIKELY (num_elements < N)) {
297+ #if HWY_COMPILER_GCC_ACTUAL
276298 const auto a = LoadN (d, pa, num_elements);
277299 const auto b = LoadN (d, pb, num_elements);
278300 V sum1 = Zero (df32);
279301 V sum0 = ReorderWidenMulAccumulate (df32, a, b, Zero (df32), sum1);
280302 return ReduceSum (df32, Add (sum0, sum1));
303+ #else
304+ float sum0 = 0 .0f ; // Only 2x unroll to avoid excessive code size for
305+ float sum1 = 0 .0f ; // this unlikely(?) case.
306+ for (; i + 2 <= num_elements; i += 2 ) {
307+ sum0 += F32FromBF16 (pa[i + 0 ]) * F32FromBF16 (pb[i + 0 ]);
308+ sum1 += F32FromBF16 (pa[i + 1 ]) * F32FromBF16 (pb[i + 1 ]);
309+ }
310+ if (i < num_elements) {
311+ sum1 += F32FromBF16 (pa[i]) * F32FromBF16 (pb[i]);
312+ }
313+ return sum0 + sum1;
314+ #endif
281315 }
282316
283317 // See comment in the other Compute() overload. Unroll 2x, but we need
0 commit comments