From f1c0f3e38e1b657876692b6f070baf7c0a10f1aa Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Sun, 11 May 2025 17:49:35 +0200
Subject: [PATCH] Enable SLPVectorization

Signed-off-by: Anatoly Myachev
---
 third_party/intel/triton_xpu.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 1e3cd76991..14bad9dd8f 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -194,11 +194,13 @@ void init_triton_intel(py::module &&m) {
         tuningOptions.LoopUnrolling = true;
         tuningOptions.LoopInterleaving = true;
         tuningOptions.LoopVectorization = true;
-        // SLPVectorizer causes test_core.py::test_dot_mulbroadcasted to fail.
-        // It vectorizes @llvm.fmuladd.f32 with @llvm.fmuladd.v32f32. We can
-        // consider to reenable SLP vectorization when the failure is
-        // investigated.
-        tuningOptions.SLPVectorization = false;
+        // TODO: currently we run SLP vectorizer with an empty target machine.
+        // This causes the vectorizer to create larger vectors, which could be
+        // bad. Disabling it would currently cause regressions as this pass
+        // also applies some scheduling that helps performance in some cases.
+        // We should work on passing a real target machine instead and address
+        // the performance regressions with some scheduling solution.
+        tuningOptions.SLPVectorization = true;
         PassBuilder pb(nullptr /*targetMachine*/, tuningOptions, std::nullopt,
                        instrCbPtr);
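
For reference, below is a minimal standalone sketch (not the Triton code itself) of how PipelineTuningOptions flags such as SLPVectorization drive LLVM's new pass manager through PassBuilder. The helper name optimizeWithSLP and the local analysis-manager setup are illustrative assumptions; only the LLVM APIs used are real, and passing a null TargetMachine mirrors the situation described in the TODO above.

// Minimal sketch: wire PipelineTuningOptions into PassBuilder and run the
// default O3 module pipeline. Assumes an llvm::Module *mod already exists.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"

void optimizeWithSLP(llvm::Module *mod) {
  llvm::PipelineTuningOptions tuningOptions;
  tuningOptions.LoopUnrolling = true;
  tuningOptions.LoopInterleaving = true;
  tuningOptions.LoopVectorization = true;
  // The flag this patch flips: let the default pipeline run the SLP
  // vectorizer even though no TargetMachine is given to PassBuilder below.
  tuningOptions.SLPVectorization = true;

  // No TargetMachine: cost modelling falls back to generic defaults, which
  // is why the TODO warns the vectorizer may pick overly wide vectors.
  llvm::PassBuilder pb(/*TM=*/nullptr, tuningOptions);

  // Standard new-pass-manager boilerplate: create and cross-register the
  // analysis managers before building the pipeline.
  llvm::LoopAnalysisManager lam;
  llvm::FunctionAnalysisManager fam;
  llvm::CGSCCAnalysisManager cgam;
  llvm::ModuleAnalysisManager mam;
  pb.registerModuleAnalyses(mam);
  pb.registerCGSCCAnalyses(cgam);
  pb.registerFunctionAnalyses(fam);
  pb.registerLoopAnalyses(lam);
  pb.crossRegisterProxies(lam, fam, cgam, mam);

  // Build and run the default -O3 pipeline; SLPVectorization controls
  // whether the SLPVectorizer pass is included in it.
  llvm::ModulePassManager mpm =
      pb.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
  mpm.run(*mod, mam);
}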