Current implementation:
for (size_t i = 0; i < m; ++i) {
y[i * incy] = Sleef_fmaq1_u05(col[i], x_j, y[i * incy]);
}
can be parallelised as following given the vector stride is 1 otherwise we can fallback to the original
if (incy == 1 && m >= VECTOR_SIZE) {
// Vectorized path for unit stride
for (size_t i = 0; i < m - (m % VECTOR_SIZE); i += VECTOR_SIZE) {
QuadVector y_vec = QuadVector::load(&y[i]);
QuadVector col_vec = QuadVector::load(&col[i]);
QuadVector x_j_vec(x_j);
y_vec = col_vec.fma(x_j_vec, y_vec);
y_vec.store(&y[i]);
}
// Handle remainder
for (size_t i = m - (m % VECTOR_SIZE); i < m; ++i) {
y[i] = Sleef_fmaq1_u05(col[i], x_j, y[i]);
}
} else {
// Current scalar path for strided access
for (size_t i = 0; i < m; ++i) {
y[i * incy] = Sleef_fmaq1_u05(col[i], x_j, y[i * incy]);
}
}