intel · anmyachev · May 11, 2025 · Jun 1, 2025
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
@@ -200,11 +200,13 @@ void init_triton_intel(py::module &&m) {
     tuningOptions.LoopUnrolling = true;
     tuningOptions.LoopInterleaving = true;
     tuningOptions.LoopVectorization = true;
-    // SLPVectorizer causes test_core.py::test_dot_mulbroadcasted to fail.
-    // It vectorizes @llvm.fmuladd.f32 with @llvm.fmuladd.v32f32. We can
-    // consider to reenable SLP vectorization when the failure is
-    // investigated.
-    tuningOptions.SLPVectorization = false;
+    // TODO: currently we run SLP vectorizer with an empty target machine.
+    // This causes the vectorizer to create larger vectors, which could be bad.
+    // Disabling it would currently cause regressions as this pass also
+    // applies some scheduling that helps performance in some cases. We
+    // should work on using NVPTX target instead and address the performance
+    // regressions with some scheduling solution.
+    tuningOptions.SLPVectorization = true;
 
     PassBuilder pb(nullptr /*targetMachine*/, tuningOptions, std::nullopt,
                    instrCbPtr);