From f1c0f3e38e1b657876692b6f070baf7c0a10f1aa Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Sun, 11 May 2025 17:49:35 +0200
Subject: [PATCH] Enable SLPVectorization

Signed-off-by: Anatoly Myachev
---
 third_party/intel/triton_xpu.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 1e3cd76991..14bad9dd8f 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -194,11 +194,13 @@ void init_triton_intel(py::module &&m) {
         tuningOptions.LoopUnrolling = true;
         tuningOptions.LoopInterleaving = true;
         tuningOptions.LoopVectorization = true;
-        // SLPVectorizer causes test_core.py::test_dot_mulbroadcasted to fail.
-        // It vectorizes @llvm.fmuladd.f32 with @llvm.fmuladd.v32f32. We can
-        // consider to reenable SLP vectorization when the failure is
-        // investigated.
-        tuningOptions.SLPVectorization = false;
+        // TODO: currently we run SLP vectorizer with an empty target machine.
+        // This causes the vectorizer to create larger vectors, which could be
+        // bad. Disabling it would currently cause regressions as this pass
+        // also applies some scheduling that helps performance in some cases.
+        // We should work on passing a real target machine instead and address
+        // the performance regressions with some scheduling solution.
+        tuningOptions.SLPVectorization = true;
         PassBuilder pb(nullptr /*targetMachine*/, tuningOptions, std::nullopt,
                        instrCbPtr);
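
For reference, below is a minimal standalone sketch (not the Triton code itself) of how PipelineTuningOptions flags such as SLPVectorization drive LLVM's new pass manager through PassBuilder. The helper name optimizeWithSLP and the local analysis-manager setup are illustrative assumptions; only the LLVM APIs used are real, and passing a null TargetMachine mirrors the situation described in the TODO above.

// Minimal sketch: wire PipelineTuningOptions into PassBuilder and run the
// default O3 module pipeline. Assumes an llvm::Module *mod already exists.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"

void optimizeWithSLP(llvm::Module *mod) {
  llvm::PipelineTuningOptions tuningOptions;
  tuningOptions.LoopUnrolling = true;
  tuningOptions.LoopInterleaving = true;
  tuningOptions.LoopVectorization = true;
  // The flag this patch flips: let the default pipeline run the SLP
  // vectorizer even though no TargetMachine is given to PassBuilder below.
  tuningOptions.SLPVectorization = true;

  // No TargetMachine: cost modelling falls back to generic defaults, which
  // is why the TODO warns the vectorizer may pick overly wide vectors.
  llvm::PassBuilder pb(/*TM=*/nullptr, tuningOptions);

  // Standard new-pass-manager boilerplate: create and cross-register the
  // analysis managers before building the pipeline.
  llvm::LoopAnalysisManager lam;
  llvm::FunctionAnalysisManager fam;
  llvm::CGSCCAnalysisManager cgam;
  llvm::ModuleAnalysisManager mam;
  pb.registerModuleAnalyses(mam);
  pb.registerCGSCCAnalyses(cgam);
  pb.registerFunctionAnalyses(fam);
  pb.registerLoopAnalyses(lam);
  pb.crossRegisterProxies(lam, fam, cgam, mam);

  // Build and run the default -O3 pipeline; SLPVectorization controls
  // whether the SLPVectorizer pass is included in it.
  llvm::ModulePassManager mpm =
      pb.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
  mpm.run(*mod, mam);
}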