Only use LoadN fallback when compiling with GCC

vekterli · vekterli · commit df82cf07248a · 2025-09-26T11:44:34.000Z
Clang does not need the `LoadN` fallback and can happily use
the scalar code. GCC keeps the fallback because its codegen for
scalar BF16->float conversions is very suboptimal. As a side
effect, this also avoids triggering a Clang crash when
`ReorderWidenMulAccumulate` is used alongside capped vectors.
diff --git a/hwy/contrib/dot/dot-inl.h b/hwy/contrib/dot/dot-inl.h
@@ -174,12 +174,30 @@ struct Dot {
         (kAssumptions & kMultipleOfVector) != 0;
     constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
 
-    // Won't be able to do a full vector load without padding => partial load.
+    // Won't be able to do a full vector load without padding. Use a scalar
+    // loop under Clang. GCC has very suboptimal codegen for scalar BF16->float
+    // conversions, so use vector ops with LoadN instead.
+    // TODO: https://github.com/google/highway/pull/2703
     if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
         HWY_UNLIKELY(num_elements < NF)) {
+#if HWY_COMPILER_GCC_ACTUAL
       const VF a = LoadN(df, pa, num_elements);
       const VF b = PromoteTo(df, LoadN(dbfh, pb, num_elements));
       return ReduceSum(df, Mul(a, b));
+#else
+      // Only 2x unroll to avoid excessive code size.
+      float sum0 = 0.0f;
+      float sum1 = 0.0f;
+      size_t i = 0;
+      for (; i + 2 <= num_elements; i += 2) {
+        sum0 += pa[i + 0] * ConvertScalarTo<float>(pb[i + 0]);
+        sum1 += pa[i + 1] * ConvertScalarTo<float>(pb[i + 1]);
+      }
+      for (; i < num_elements; ++i) {
+        sum1 += pa[i] * ConvertScalarTo<float>(pb[i]);
+      }
+      return sum0 + sum1;
+#endif
     }
 
     // Compiler doesn't make independent sum* accumulators, so unroll manually.
@@ -270,14 +288,30 @@ struct Dot {
         (kAssumptions & kMultipleOfVector) != 0;
     constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
 
-    // Won't be able to do a full vector load without padding => partial load.
+    // Won't be able to do a full vector load without padding. Use a scalar
+    // loop under Clang. GCC has very suboptimal codegen for scalar BF16->float
+    // conversions, so use vector ops with LoadN instead.
+    // TODO: https://github.com/google/highway/pull/2703
     if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
         HWY_UNLIKELY(num_elements < N)) {
+#if HWY_COMPILER_GCC_ACTUAL
       const auto a = LoadN(d, pa, num_elements);
       const auto b = LoadN(d, pb, num_elements);
       V sum1 = Zero(df32);
       V sum0 = ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
       return ReduceSum(df32, Add(sum0, sum1));
+#else
+      float sum0 = 0.0f;  // Only 2x unroll to avoid excessive code size for..
+      float sum1 = 0.0f;  // this unlikely(?) case.
+      for (; i + 2 <= num_elements; i += 2) {
+        sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
+        sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
+      }
+      if (i < num_elements) {
+        sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
+      }
+      return sum0 + sum1;
+#endif
     }
 
     // See comment in the other Compute() overload. Unroll 2x, but we need