Open
Description
https://godbolt.org/z/9a6b9oe5c
Given a loop that has a store to an invariant address:
; Reproducer: every iteration loads the i32 at %p[%iv], adds 1, writes the
; result back to the same element, and also writes it to %q. %q is never
; modified inside the loop, so that second store targets a loop-invariant
; address.
define void @f(ptr %p, ptr %q, i32 %n) {
entry:
br label %loop
loop:
; Induction variable: 0 when entered from %entry, %iv.next on the back-edge.
%iv = phi i32 [0, %entry], [%iv.next, %loop]
%gep = getelementptr i32, ptr %p, i32 %iv
%x = load i32, ptr %gep
%y = add i32 %x, 1
; Store to an address that varies with %iv.
store i32 %y, ptr %gep
; Store to the loop-invariant address %q; this is the store the issue is about.
store i32 %y, ptr %q ; address invariant
%iv.next = add i32 %iv, 1
; The exit test runs after the body and leaves the loop once %iv.next == %n.
%done = icmp eq i32 %iv.next, %n
br i1 %done, label %exit, label %loop
exit:
ret void
}
Typically the loop vectorizer extracts the corresponding element and emits a single scalar store to the invariant address:
%wide.load = load <vscale x 4 x i32>, ptr %14, align 4
%15 = add <vscale x 4 x i32> %wide.load, splat (i32 1)
%19 = extractelement <vscale x 4 x i32> %15, i32 %18
store i32 %19, ptr %q, align 4
However, with EVL tail folding we fail to do this and instead emit a scatter whose pointer operand is a splat of the invariant address:
%vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %13, <vscale x 4 x i1> splat (i1 true), i32 %11)
%14 = add <vscale x 4 x i32> %vp.op.load, splat (i32 1)
call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %14, ptr align 4 %13, <vscale x 4 x i1> splat (i1 true), i32 %11)
call void @llvm.vp.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %14, <vscale x 4 x ptr> align 4 %broadcast.splat, <vscale x 4 x i1> splat (i1 true), i32 %11)