From 11bbd06a2ece97c7aed211289646b60089bfe12c Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 27 Jun 2025 15:29:10 -0700 Subject: [PATCH 1/5] [AMDGPU] Allocate AVRegClass last Change-Id: Iace3462f27ea276b22716793ebfa13b5026b0e58 --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 13 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 74 +-- .../buffer-fat-pointer-atomicrmw-fmax.ll | 72 +-- .../buffer-fat-pointer-atomicrmw-fmin.ll | 72 +-- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 7 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 218 ++++---- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 501 +++++++++--------- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 501 +++++++++--------- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 218 ++++---- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 133 +++-- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 8 +- llvm/test/CodeGen/AMDGPU/freeze.ll | 11 +- llvm/test/CodeGen/AMDGPU/frem.ll | 396 +++++++------- .../AMDGPU/gfx-callable-argument-types.ll | 14 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 218 ++++---- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 501 +++++++++--------- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 501 +++++++++--------- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 218 ++++---- llvm/test/CodeGen/AMDGPU/half.ll | 20 +- .../AMDGPU/infer-addrspace-flat-atomic.ll | 24 +- ...-reg-class-snippet-copy-use-after-free.mir | 32 +- .../AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll | 61 ++- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 64 +-- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 192 ++++--- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 228 ++++---- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 80 +-- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 8 +- ...al-regcopy-and-spill-missed-at-regalloc.ll | 19 +- .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 36 +- .../AMDGPU/sext-in-reg-vector-shuffle.ll | 34 +- llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 10 +- llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 10 +- 
.../AMDGPU/undef-handling-crash-in-ra.ll | 47 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 8 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 30 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 10 +- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 20 +- 37 files changed, 2286 insertions(+), 2323 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index d595163f820cb..b4e968f5f455a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -109,6 +109,10 @@ class SIRegisterClass rTypes, int Align, dag rList> let TSFlags{2} = HasVGPR; let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; + + // RegisterClass (e.g. AGPR / VGPR) priority for allocation + field int RegClassPriority = 1; + } multiclass SIRegLoHi16 regIdx, bit ArtificialHigh = 1, @@ -936,14 +940,15 @@ class VRegClassBase regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; - let AllocationPriority = !sub(numRegs, 1); + defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); + let AllocationPriority = !add(SizePriority, !mul(RegClassPriority, 16)); let Weight = numRegs; } // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass regTypes, dag regList> { - let HasVGPR = 1 in { + let HasVGPR = 1, RegClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase { let BaseClassOrder = !mul(numRegs, 32); @@ -977,7 +982,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, RegClassPriority = 1 in { // Define the regular class. 
def "" : VRegClassBase { let BaseClassOrder = !mul(numRegs, 32); @@ -1070,7 +1075,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 // aligned base register. multiclass AVRegClass regTypes, dag vregList, dag aregList> { - let HasVGPR = 1, HasAGPR = 1 in { + let HasVGPR = 1, HasAGPR = 1, RegClassPriority = 0 in { // Define the regular class. def "" : VRegClassBase; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 36370361b677d..dc84a85d6c207 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -4295,15 +4295,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4317,29 +4317,30 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner 
Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4354,13 +4355,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4368,7 +4369,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4526,16 +4527,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4547,29 +4548,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4583,13 +4585,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -4598,7 +4600,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 3ad1e5c0b81e0..e0a81187aeed1 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -3417,11 +3417,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3435,32 +3435,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; 
GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v7.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v7.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3475,13 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3489,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3657,11 +3657,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff +; GFX11-TRUE16-NEXT: 
v_not_b32_e32 v10, v6 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3673,32 +3673,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v7.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; 
GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3712,13 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3727,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git 
a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 1f51c93d08db1..d4dd9327f77b6 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -3417,11 +3417,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3435,32 +3435,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v7.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; 
=>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, v4.l, v7.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3475,13 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen 
th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3489,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3657,11 +3657,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3673,32 +3673,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: 
s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v7.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, v4.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 
v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3712,13 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3727,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 9f48c8b5fe49c..0e93e4d591f6a 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -463,12 +463,13 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) { ; GFX11-TRUE16: ; %bb.0: ; %bb ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:2 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2] +; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v2, v[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 8581e4d030261..8281a13fb87cb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -9902,33 +9902,34 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10020,34 +10021,35 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; 
GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10273,33 +10275,34 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10392,34 +10395,35 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10645,21 +10649,23 @@ 
define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10729,22 +10735,24 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11595,34 +11603,35 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 
s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11715,34 +11724,35 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index e3bd4e7383598..de3e6838e15ab 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6416,35 +6416,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: 
s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6455,7 +6454,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6548,35 +6547,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6587,7 +6584,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: 
s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6825,35 +6822,34 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6864,7 +6860,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6958,35 +6954,33 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6997,7 +6991,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7616,38 +7610,36 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: 
s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7745,39 +7737,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8014,38 +8004,36 @@ define void 
@flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 
v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8144,39 +8132,37 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8717,24 +8703,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8809,25 +8795,25 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; 
GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9013,36 +8999,35 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9053,7 +9038,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9147,35 +9132,33 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; 
GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9186,7 +9169,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9426,39 +9409,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; 
GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9557,39 +9538,37 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, 
v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git 
a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 25d59a26189c9..53596ea45c3df 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6416,35 +6416,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6455,7 +6454,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6548,35 +6547,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6587,7 +6584,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6825,35 +6822,34 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6864,7 +6860,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6958,35 +6954,33 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6997,7 +6991,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7616,38 +7610,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7745,39 +7737,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8014,38 +8004,36 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 
+; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8144,39 +8132,37 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8717,24 +8703,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: 
v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8809,25 +8795,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: 
v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9013,36 +8999,35 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; 
GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9053,7 +9038,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9147,35 +9132,33 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9186,7 +9169,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9426,39 +9409,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 
-; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9557,39 +9538,37 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index c987effec3be3..df4ff33da372e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -7347,33 +7347,34 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7465,34 +7466,35 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7718,33 +7720,34 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7837,34 +7840,35 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; 
GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8376,21 +8380,23 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8460,22 +8466,24 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-TRUE16-LABEL: 
flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9040,34 +9048,35 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9160,34 +9169,35 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 3304dbf3eaa3d..97ece6090cb5d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -7949,13 +7949,12 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v2, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i8_offset: @@ -8025,13 +8024,12 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v2, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i8: @@ -8114,17 +8112,16 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v2, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: 
flat_store_b8 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i8_addr64_offset: @@ -8196,9 +8193,9 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i8_offset: @@ -8257,9 +8254,9 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i8: @@ -8329,9 +8326,10 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 +; GFX11-TRUE16-NEXT: flat_store_b8 
v[0:1], v2 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i8_addr64_offset: @@ -8406,13 +8404,12 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i16_offset: @@ -8482,13 +8479,12 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i16: @@ -8575,17 +8571,16 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshl_b64 s[4:5], 
s[6:7], 1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i16_addr64_offset: @@ -8658,9 +8653,9 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i16_offset: @@ -8719,9 +8714,9 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 
v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i16: @@ -8793,12 +8788,12 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i16_addr64_offset: @@ -8866,9 +8861,9 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_f16_offset: @@ -8927,9 +8922,9 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_f16: @@ -8987,9 +8982,9 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_bf16_offset: @@ -9047,9 +9042,9 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_bf16: @@ -10598,13 +10593,12 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, 
v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_f16_offset: @@ -10673,13 +10667,12 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_f16: @@ -10751,13 +10744,12 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 
v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_bf16_offset: @@ -10826,13 +10818,12 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 0a900f904bec5..7113d5942208e 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -554,13 +554,13 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, 
s[4:7], 0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index ac438062ae208..cf041ba1c2fbc 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -13984,13 +13984,14 @@ define void @freeze_v2i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX11-SDAG-TRUE16-LABEL: freeze_v2i1_vcc: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-SDAG-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l ; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 ; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index d3432daedadf8..8ed8a4df3e761 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -3430,103 +3430,103 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX11-TRUE16-NEXT: global_load_b64 
v[0:1], v5, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v5, s[4:5] offset:32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v9, v8.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v9, v9 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v4, v1 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v4, v1 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v4.l, v4.l, v3.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v7, v7, v9 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v10, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_f16 v4.l, -v4.l, v3.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v9 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v9 -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v8.l, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h -; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v6, v0, v7 op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v8, v4 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v1.l, v1.l, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v6, -v6, v0, v7 op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 +; GFX11-TRUE16-NEXT: v_fma_f16 v1.l, -v1.l, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v4.l, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v6, v3, v7 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v8, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v6, 
-v6, v3, v7 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v8, v8 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v9, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v3.l, v3.l, v2.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v1.l ; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v6, v9, v8 +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_fma_f16 v3.l, -v3.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v1, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v8 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v7.l, v4.l +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v4.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h -; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v7.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v0.h -; GFX11-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v7.l, v4.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v0.l +; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: frem_v4f16: @@ -3642,107 +3642,106 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] -; 
GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX1150-TRUE16-NEXT: global_load_b64 v[0:1], v5, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b64 v[2:3], v5, s[4:5] offset:32 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.h ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v2.h +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 
v4, v6, v4 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v4.l, v7.l, v6.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v4.l, v7.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v2.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v4, v9 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v4, v10, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v4, v9 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 ; GFX1150-TRUE16-NEXT: 
v_and_b32_e32 v7, 0xff800000, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v4, v7, v4 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v2.l, v0.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v4.l, v2.l, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v2, v6, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v2.l, v2.l, v6.l, v4.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v2.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, 
v0.l, v6.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v2.l, v6.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v2, v7, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v6, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v2 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v1.h, v1.h, v3.l, v1.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, 
v0.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v1.h +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l -; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX1150-TRUE16-NEXT: v_fma_f16 v1.l, v2.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v4.l +; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: frem_v4f16: @@ -3863,107 +3862,106 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 +; GFX1200-TRUE16-NEXT: global_load_b64 v[0:1], v5, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_b64 v[2:3], v5, s[4:5] offset:32 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.h ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v2.h +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v4.l, v7.l, v6.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX1200-TRUE16-NEXT: 
v_fmac_f16_e32 v6.l, v4.l, v7.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v2.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v4, v9 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v4, v10, v7 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v4, v9 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v7, v4 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v2.l, v0.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v4.l, v2.l, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v2, v6, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] 
op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v2.l, v2.l, v6.l, v4.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v2.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v2.l, v6.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] +; 
GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v2, v7, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v6, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v2 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v1.h, v1.h, v3.l, v1.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v1.h +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l -; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] +; GFX1200-TRUE16-NEXT: v_fma_f16 v1.l, v2.l, v3.l, v1.l +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v4.l +; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: frem_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index e40917d4307fb..505e5400a990a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll 
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4800,15 +4800,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b8 v[3:4], v2, off -; GFX11-TRUE16-NEXT: global_store_b16 v[40:41], v0, off +; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[40:41], v2, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index afa57b8692aa5..501455a551db3 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -9971,33 +9971,34 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; 
GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10089,34 +10090,35 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 
0xffff, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10392,33 +10394,34 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10511,34 +10514,35 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: 
global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: 
v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11139,21 +11143,23 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11223,22 +11229,24 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11892,34 +11900,35 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12012,34 +12021,35 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index 
b9774808f1ad1..c2ba80f75c630 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4895,35 +4895,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4934,7 +4933,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5027,35 +5026,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: 
v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5066,7 +5063,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5356,35 +5353,34 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5395,7 +5391,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5489,35 +5485,33 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 
v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5528,7 +5522,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6248,38 +6242,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6377,39 +6369,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, 
vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6696,38 +6686,36 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, 
v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ 
-6826,39 +6814,37 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 
v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7488,24 +7474,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l 
+; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7580,25 +7566,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l 
+; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7821,36 +7807,35 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], 
v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7861,7 +7846,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7955,35 +7940,33 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7994,7 +7977,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8286,39 +8269,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8417,39 +8398,37 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, 
v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index c30543642d314..b4772c723a2eb 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4895,35 +4895,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], 
off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4934,7 +4933,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5027,35 +5026,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5066,7 +5063,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5356,35 +5353,34 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5395,7 +5391,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5489,35 +5485,33 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 
v0, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5528,7 +5522,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6248,38 +6242,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6377,39 +6369,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: 
v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6696,38 +6686,36 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 
v0, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, 
v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6826,39 +6814,37 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 
0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7488,24 +7474,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7580,25 +7566,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, 
v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7821,36 +7807,35 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7861,7 +7846,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7955,35 +7940,33 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7994,7 +7977,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8286,39 +8269,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, 
v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8417,39 +8398,37 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 
5e4a5c649bb24..985e1f20b1d33 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -6917,33 +6917,34 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7035,34 +7036,35 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7338,33 +7340,34 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7457,34 +7460,35 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8085,21 +8089,23 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; 
GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8169,22 +8175,24 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, 
v3, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8838,34 +8846,35 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8958,34 +8967,35 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 117cf40de72d2..90cf66b3f7b74 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -2595,15 +2595,15 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; GFX11-TRUE16-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v3, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v1.l +; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: global_truncstore_v2f32_to_v2f16: @@ -2772,12 +2772,12 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v1 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v1.l -; GFX11-TRUE16-NEXT: global_store_b64 v4, v[1:2], s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v2.h +; GFX11-TRUE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: global_truncstore_v4f32_to_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 258aa9e299c3d..0a493e5188ad5 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -8,15 +8,15 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s7, s6, 31 -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s2, s0 ; CHECK-NEXT: s_addc_u32 s1, s3, s1 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm @@ -35,15 +35,15 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s7, s6, 31 -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: 
v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir index c1e0d0716acae..a6dfa451c0486 100644 --- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir +++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir @@ -32,32 +32,14 @@ # CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 { # CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0 # CHECK-NEXT: } -# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 { -# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0 -# CHECK-NEXT: } -# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 { -# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0 -# CHECK-NEXT: } -# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5) -# CHECK-NEXT: [[RESTORE1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) -# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE1]].sub0_sub1 -# CHECK-NEXT: 
[[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5) -# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE2]].sub2_sub3 { -# CHECK-NEXT: internal [[SPLIT3]].sub0:av_512_align2 = COPY [[RESTORE2]].sub0 -# CHECK-NEXT: } -# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT3]].sub2_sub3 { -# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[SPLIT3]].sub0 -# CHECK-NEXT: } -# CHECK-NEXT: [[SPLIT5:%[0-9]+]].sub2:av_512_align2 = COPY [[SPLIT4]].sub3 -# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT5]].sub0_sub1_sub2 -# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2 -# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0 { -# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT4]].sub2 +# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 { +# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0 # CHECK-NEXT: } -# CHECK-NEXT: [[SPLIT9:%[0-9]+]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2 -# CHECK-NEXT: undef [[SPLIT10:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3 -# CHECK-NEXT: undef [[SPLIT13:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT10]].sub0_sub1_sub2_sub3 -# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0 +# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1 +# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3 +# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2 +# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = 
COPY [[SPLIT2]].sub0 # CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec # CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec # CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll index 4a2c1fe2cf91c..9bc551adc7d88 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -18,19 +18,22 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GCN-LABEL: load_1d_lwe: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v9, v8 ; GCN-NEXT: v_mov_b32_e32 v10, v8 ; GCN-NEXT: v_mov_b32_e32 v11, v8 ; GCN-NEXT: v_mov_b32_e32 v12, v8 -; GCN-NEXT: v_mov_b32_e32 v0, v8 -; GCN-NEXT: v_mov_b32_e32 v1, v9 -; GCN-NEXT: v_mov_b32_e32 v2, v10 -; GCN-NEXT: v_mov_b32_e32 v3, v11 -; GCN-NEXT: v_mov_b32_e32 v4, v12 -; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe +; GCN-NEXT: v_mov_b32_e32 v2, v8 +; GCN-NEXT: v_mov_b32_e32 v3, v9 +; GCN-NEXT: v_mov_b32_e32 v4, v10 +; GCN-NEXT: v_mov_b32_e32 v5, v11 +; GCN-NEXT: v_mov_b32_e32 v6, v12 +; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dword v8, v4, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: v_mov_b32_e32 v2, v4 +; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: global_store_dword v8, v6, s[8:9] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -75,6 +78,27 @@ main_body: } define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) { +; GCN-LABEL: load_cube_lwe: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; 
GCN-NEXT: v_mov_b32_e32 v11, v10 +; GCN-NEXT: v_mov_b32_e32 v12, v10 +; GCN-NEXT: v_mov_b32_e32 v13, v10 +; GCN-NEXT: v_mov_b32_e32 v14, v10 +; GCN-NEXT: v_mov_b32_e32 v4, v10 +; GCN-NEXT: v_mov_b32_e32 v5, v11 +; GCN-NEXT: v_mov_b32_e32 v6, v12 +; GCN-NEXT: v_mov_b32_e32 v7, v13 +; GCN-NEXT: v_mov_b32_e32 v8, v14 +; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NEXT: v_mov_b32_e32 v2, v6 +; GCN-NEXT: v_mov_b32_e32 v3, v7 +; GCN-NEXT: global_store_dword v10, v8, s[8:9] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) %v.vec = extractvalue {<4 x float>, i32} %v, 0 @@ -106,6 +130,27 @@ main_body: } define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) { +; GCN-LABEL: load_2darray_lwe: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, v10 +; GCN-NEXT: v_mov_b32_e32 v12, v10 +; GCN-NEXT: v_mov_b32_e32 v13, v10 +; GCN-NEXT: v_mov_b32_e32 v14, v10 +; GCN-NEXT: v_mov_b32_e32 v4, v10 +; GCN-NEXT: v_mov_b32_e32 v5, v11 +; GCN-NEXT: v_mov_b32_e32 v6, v12 +; GCN-NEXT: v_mov_b32_e32 v7, v13 +; GCN-NEXT: v_mov_b32_e32 v8, v14 +; GCN-NEXT: image_load v[4:8], v[0:2], s[0:7] dmask:0xf unorm lwe da +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NEXT: v_mov_b32_e32 v2, v6 +; GCN-NEXT: v_mov_b32_e32 v3, v7 +; GCN-NEXT: global_store_dword v10, v8, s[8:9] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) %v.vec = extractvalue {<4 x float>, 
i32} %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 452033f332659..520884534ea77 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -15,9 +15,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[12:13], 48 -; GCN-NEXT: v_mov_b64_e32 v[14:15], 32 -; GCN-NEXT: v_mov_b64_e32 v[16:17], 16 +; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 +; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 +; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -39,42 +39,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a13, s21 ; GCN-NEXT: v_accvgpr_write_b32 a14, s22 ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 -; GCN-NEXT: v_mov_b64_e32 v[18:19], 0 -; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NEXT: v_mov_b32_e32 v10, s18 -; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -88,9 +88,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[12:13], 48 -; GCN-NEXT: v_mov_b64_e32 v[14:15], 32 -; GCN-NEXT: v_mov_b64_e32 v[16:17], 16 +; GCN-NEXT: v_mov_b64_e32 v[8:9], 48 +; GCN-NEXT: v_mov_b64_e32 v[10:11], 32 +; GCN-NEXT: v_mov_b64_e32 v[12:13], 16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GCN-NEXT: 
v_mov_b64_e32 v[2:3], s[26:27] @@ -112,42 +112,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_accvgpr_write_b32 a13, s21 ; GCN-NEXT: v_accvgpr_write_b32 a14, s22 ; GCN-NEXT: v_accvgpr_write_b32 a15, s23 -; GCN-NEXT: v_mov_b64_e32 v[18:19], 0 -; GCN-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NEXT: v_mov_b64_e32 v[14:15], 0 +; GCN-NEXT: v_mov_b32_e32 v16, s16 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NEXT: v_mov_b32_e32 v1, s21 ; GCN-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NEXT: v_mov_b32_e32 v10, s18 -; GCN-NEXT: v_mov_b32_e32 v11, s19 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 ; GCN-NEXT: s_nop 4 -; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; 
GCN-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 4628a9c15391b..368b43017da97 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -152,9 +152,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -176,42 +176,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; 
SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 ; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; 
@@ -282,9 +282,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32 -; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -306,42 +306,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, s20 ; SDAG-NEXT: v_mov_b32_e32 v1, s21 ; SDAG-NEXT: v_mov_b32_e32 v2, s22 ; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 ; SDAG-NEXT: s_nop 4 -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off 
sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1062,19 +1062,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v8, s24 +; SDAG-NEXT: v_mov_b32_e32 v9, s25 +; SDAG-NEXT: v_mov_b32_e32 v10, s26 +; SDAG-NEXT: v_mov_b32_e32 v11, s27 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; 
SDAG-NEXT: v_mov_b32_e32 v6, s30 -; SDAG-NEXT: v_mov_b32_e32 v7, s31 +; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v13, s29 +; SDAG-NEXT: v_mov_b32_e32 v14, s30 +; SDAG-NEXT: v_mov_b32_e32 v15, s31 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -1090,44 +1090,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; 
SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1198,19 +1196,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32 -; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s24 -; SDAG-NEXT: v_mov_b32_e32 v1, s25 -; SDAG-NEXT: v_mov_b32_e32 v2, s26 -; SDAG-NEXT: v_mov_b32_e32 v3, s27 +; SDAG-NEXT: v_mov_b32_e32 v8, s24 +; SDAG-NEXT: v_mov_b32_e32 v9, s25 +; SDAG-NEXT: v_mov_b32_e32 v10, s26 +; SDAG-NEXT: v_mov_b32_e32 v11, s27 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: 
v_mov_b32_e32 v4, s28 -; SDAG-NEXT: v_mov_b32_e32 v5, s29 -; SDAG-NEXT: v_mov_b32_e32 v6, s30 -; SDAG-NEXT: v_mov_b32_e32 v7, s31 +; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v13, s29 +; SDAG-NEXT: v_mov_b32_e32 v14, s30 +; SDAG-NEXT: v_mov_b32_e32 v15, s31 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -1226,44 +1224,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], 
off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s20 +; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s22 +; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 91197f915b659..6c0faadf9eae5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -4773,42 +4773,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_mov_b32_e32 v16, s1 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: 
v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; 
SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4922,42 +4921,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; 
SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5069,42 +5067,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; SDAG-NEXT: 
v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 
v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5216,42 +5213,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 48 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 +; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 32 -; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v6, s18 +; SDAG-NEXT: v_mov_b32_e32 v7, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s16 +; SDAG-NEXT: v_mov_b32_e32 v5, 
s17 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 +; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 +; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v10, s10 +; SDAG-NEXT: v_mov_b32_e32 v11, s11 +; SDAG-NEXT: v_mov_b32_e32 v8, s8 +; SDAG-NEXT: v_mov_b32_e32 v9, s9 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 +; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[10:11], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[8:9], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 77d4aad5f3174..ba8c9c5837a4e 100644 
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -19,11 +19,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[12:15], v0, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -32,9 +32,9 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[8:11], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -44,11 +44,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[6:7] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], 
s[0:1] ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -57,10 +57,10 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_mov_b32_e32 v16, s16 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -549,11 +549,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[12:15], v0, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] @@ -562,9 +562,9 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[8:11], v[0:7], v17 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -887,13 
+887,13 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] @@ -901,10 +901,10 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1429,13 +1429,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; 
GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] @@ -1443,10 +1443,10 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1598,13 +1598,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] @@ -1612,10 +1612,10 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 
cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1767,13 +1767,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] @@ -1781,10 +1781,10 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1936,13 +1936,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; 
GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] @@ -1950,10 +1950,10 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b32_e32 v16, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index bef38c1a65ef8..c470a2f9e7ee8 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -2043,12 +2043,12 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace( ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; PACKED-GISEL-NEXT: ds_read_b64 v[0:1], v2 -; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 +; PACKED-GISEL-NEXT: ds_read_b64 v[4:5], v2 offset:8 ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3 -; PACKED-GISEL-NEXT: 
v_mov_b32_e32 v5, v2 -; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v3, v4 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 24677b60be6c2..5fb1314c6c11d 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -6,6 +6,7 @@ ; Partial reg copy and spill missed during regalloc handled later at frame lowering. define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { + ; REGALLOC-GFX908-LABEL: name: partial_copy ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 @@ -13,7 +14,6 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8 - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 @@ -21,8 +21,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; 
REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy @@ -71,26 +71,19 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; ; PEI-GFX90A-LABEL: name: partial_copy ; PEI-GFX90A: bb.0 (%ir-block.0): - ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 + ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 - ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable 
$agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) - ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) - ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, 
addrspace 1) + ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: S_ENDPGM 0 call void asm sideeffect "; use $0", "a" (i32 poison) diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 29448ab2d822e..d8c7013a2543f 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -59,19 +59,19 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX90A-LABEL: scalar_to_vector_v8i16: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX90A-NEXT: s_endpgm entry: %val.1.i32 = extractelement <2 
x i32> %in, i64 0 @@ -146,19 +146,19 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX90A-LABEL: scalar_to_vector_v8f16: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s0 +; GFX90A-NEXT: v_mov_b32_e32 v4, s0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX90A-NEXT: s_endpgm entry: %val.1.float = extractelement <2 x float> %in, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll index 49dec15f9f7d7..16e5c92f8e6d8 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll @@ -9,31 +9,33 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v0, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v3.l, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX11-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 -; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v5, 24, v2 -; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v6, 24, v1 -; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v4, 24, v0 +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v0.l +; GFX11-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 +; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v5, 24, v1 +; GFX11-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX11-TRUE16-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v2.l -; GFX11-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.h, v6.l +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.h, v5.l -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.l, v1.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.l, v1.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.l, v3.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v1.h ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_pack_b32_f16 
v0, v4.h, v4.l ; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index ebe6b232bfcbc..f7caca2d143c8 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -1110,13 +1110,13 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v1.h, v1.l -; GFX11-TRUE16-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v2.h, v2.l +; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_sint_to_fp_v4i64_to_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 5b1a5206c3403..1f58623cfd13c 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -894,13 +894,13 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3 +; 
GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v1.h, v1.l -; GFX11-TRUE16-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v2.h, v2.l +; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_uint_to_fp_v4i64_to_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll index d0d1ba82dc000..b3166fa3f4548 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll +++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll @@ -8,9 +8,8 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; CHECK-NEXT: v_mov_b32_e32 v40, v0 -; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; CHECK-NEXT: flat_load_dword v42, v[0:1] +; CHECK-NEXT: v_pk_mov_b32 v[44:45], 0, 0 +; CHECK-NEXT: flat_load_dword v42, v[44:45] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8 @@ -19,48 +18,44 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v46, s6 -; CHECK-NEXT: v_mov_b32_e32 v47, s7 +; CHECK-NEXT: v_accvgpr_write_b32 a32, s6 +; CHECK-NEXT: v_accvgpr_write_b32 a33, s7 ; CHECK-NEXT: s_mov_b64 s[6:7], src_private_base ; CHECK-NEXT: s_cmp_lg_u32 s64, -1 ; CHECK-NEXT: s_cselect_b32 s7, s7, 0 ; 
CHECK-NEXT: s_cselect_b32 s8, s64, 0 ; CHECK-NEXT: s_add_u32 s50, s34, 48 ; CHECK-NEXT: s_addc_u32 s51, s35, 0 -; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[4:5], s[4:5] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[56:57], s[4:5], s[4:5] op_sel:[0,1] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, G@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, G@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[54:55], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; CHECK-NEXT: v_mov_b32_e32 v57, s7 +; CHECK-NEXT: v_mov_b32_e32 v47, s7 ; CHECK-NEXT: s_mov_b32 s7, s6 ; CHECK-NEXT: s_mov_b32 s53, s14 -; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 -; CHECK-NEXT: v_mov_b32_e32 v56, s8 -; CHECK-NEXT: v_pk_mov_b32 v[60:61], s[6:7], s[6:7] op_sel:[0,1] +; CHECK-NEXT: v_mov_b32_e32 v46, s8 +; CHECK-NEXT: v_pk_mov_b32 v[58:59], s[6:7], s[6:7] op_sel:[0,1] ; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s14, s16 -; CHECK-NEXT: v_mov_b32_e32 v31, v40 +; CHECK-NEXT: v_mov_b32_e32 v31, v0 ; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_mov_b32 s33, s16 ; CHECK-NEXT: s_mov_b32 s52, s15 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] -; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: v_mov_b32_e32 v40, v0 +; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59] ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[62:63], v[58:59] -; CHECK-NEXT: v_accvgpr_read_b32 v0, a32 -; CHECK-NEXT: v_mov_b32_e32 v44, 0 -; CHECK-NEXT: v_mov_b32_e32 v45, 0x3ff00000 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a33 +; CHECK-NEXT: flat_load_dwordx2 v[60:61], v[56:57] +; CHECK-NEXT: v_mov_b32_e32 v62, 0 +; CHECK-NEXT: v_mov_b32_e32 v63, 0x3ff00000 ; 
CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[50:51] @@ -69,20 +64,20 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x ; CHECK-NEXT: s_mov_b32 s13, s52 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 -; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[44:45] -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[60:61] +; CHECK-NEXT: flat_store_dwordx2 v[44:45], v[62:63] +; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[58:59] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ; kill: def $sgpr15 killed $sgpr15 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[54:55] -; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[56:57] glc +; CHECK-NEXT: flat_load_dwordx2 v[0:1], v[46:47] glc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s64 ; CHECK-NEXT: v_cmp_lt_i32_e32 vcc, 0, v42 -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[62:63] +; CHECK-NEXT: flat_store_dwordx2 v[56:57], v[60:61] ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_dwordx2 v[58:59], v[46:47] -; CHECK-NEXT: buffer_store_dword v47, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v44, v0, s[0:3], 0 offen +; CHECK-NEXT: flat_store_dwordx2 v[56:57], a[32:33] +; CHECK-NEXT: buffer_store_dword a33, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v62, v0, s[0:3], 0 offen ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 8a8829832f688..cd6bc687c1851 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -348,13 +348,13 @@ define amdgpu_kernel void @fptrunc( ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s8, s2 ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s9, s3 ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s4, s0 -; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; 
GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s5, s1 ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 -; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v1.l ; GFX11-GCN-REAL16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-GCN-REAL16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 58602a1ccd5ba..8abf92979b9bf 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1961,16 +1961,15 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) ; GFX942-LABEL: shuffle_v6f16_452367: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off -; GFX942-NEXT: global_load_dword v3, v[4:5], off +; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX942-NEXT: global_load_dword v4, v[2:3], off +; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v6f16_452367: @@ -5151,16 +5150,15 @@ define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr 
addrspace ; GFX942-LABEL: shuffle_v6bf16_452367: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v7, v1 -; GFX942-NEXT: v_mov_b32_e32 v6, v0 -; GFX942-NEXT: v_mov_b32_e32 v5, v3 -; GFX942-NEXT: v_mov_b32_e32 v4, v2 -; GFX942-NEXT: global_load_dwordx3 v[0:2], v[6:7], off -; GFX942-NEXT: global_load_dword v3, v[4:5], off +; GFX942-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX942-NEXT: global_load_dword v4, v[2:3], off +; GFX942-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX942-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 +; GFX942-NEXT: v_mov_b32_e32 v1, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v2, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, v4 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v6bf16_452367: diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 310f3ad04917b..0ae6dbd06e33d 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -623,25 +623,25 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX942-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3] +; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_4 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_3 ; 
GFX942-NEXT: ; %bb.2: ; %bb.2 ; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13] +; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[12:13] ; GFX942-NEXT: .LBB11_3: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: .LBB11_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 77d1e6c2593c1..03bf1d62a1c74 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -261,13 +261,13 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; GFX11-TRUE16-LABEL: widen_f16_constant_load: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, 4.0 -; GFX11-TRUE16-NEXT: global_store_b16 v[1:2], v0, off +; GFX11-TRUE16-NEXT: v_add_f16_e64 v2.l, s0, 4.0 +; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: widen_f16_constant_load: @@ -393,16 +393,18 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; GFX11-TRUE16-LABEL: no_widen_i16_constant_divergent_load: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x3e7, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, 4 -; GFX11-TRUE16-NEXT: global_store_b16 v[1:2], v0, off +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x3e7, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, 4 +; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: no_widen_i16_constant_divergent_load: From a46bd5e712c82107f84ea2da587e27d324a9e774 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 1 Jul 2025 15:08:59 -0700 Subject: [PATCH 2/5] Bypass verifier for problematic tests Change-Id: Ia1e4719bd9edada963d9e5f07371f786eb490d15 --- .../AMDGPU/regalloc-failure-overlapping-insert-assert.mir | 2 +- .../test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll | 6 +++--- .../AMDGPU/register-killed-error-after-alloc-failure1.ll | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index c9d0cf3893a2b..4b5c41b834a9e 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -1,4 +1,4 @@ -# RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,2 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s +# RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -start-before=greedy,1 
-stop-after=virtregrewriter,2 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s # Make sure there's no machine verifier error after failure. diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll index 45ca0d4e156b1..c035e9f9415c1 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/regalloc-illegal-eviction-assert.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -o - %s 2>%t.err | FileCheck -implicit-check-not=error %s +; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -o - %s 2>%t.err | FileCheck -implicit-check-not=error %s ; RUN: FileCheck -check-prefix=ERR %s < %t.err ; This testcase would fail on an "illegal eviction". If the assert was @@ -9,9 +9,9 @@ %asm.output = type { <16 x i32>, <8 x i32>, <5 x i32>, <4 x i32>, <16 x i32> } ; CHECK-LABEL: {{^}}illegal_eviction_assert: -; CHECK: ; def v[4:19] v[20:27] v[0:4] v[0:3] a[0:15] +; CHECK: ; def v[13:28] v[0:7] v[8:12] v[0:3] a[0:15] ; CHECK: ; clobber -; CHECK: ; use v[4:19] v[20:27] v[0:4] v[0:3] a[1:16] +; CHECK: ; use v[13:28] v[0:7] v[8:12] v[0:3] a[1:16] define void @illegal_eviction_assert(ptr addrspace(1) %arg) #0 { ;%agpr0 = call i32 asm sideeffect "; def $0","=${a0}"() %asm = call %asm.output asm sideeffect "; def $0 $1 $2 $3 $4","=v,=v,=v,=v,={a[0:15]}"() diff --git a/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll b/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll index 5e466a9470fc5..6c84713d9025c 100644 --- a/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll +++ b/llvm/test/CodeGen/AMDGPU/register-killed-error-after-alloc-failure1.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR -implicit-check-not=error %s +; RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -filetype=null %s 2>&1 | 
FileCheck -check-prefix=ERR -implicit-check-not=error %s ; ERR: error: inline assembly requires more registers than available ; ERR-NOT: ERROR From 5bf1b0a49a86350493f363379385b8879d56814b Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 2 Jul 2025 14:10:03 -0700 Subject: [PATCH 3/5] Also update priorities for VGPR32/VGPR16 Change-Id: I2c56c9148c68fe820895d6eabb0a9058d04d7b4d --- llvm/lib/CodeGen/RegAllocGreedy.cpp | 1 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 10 +- .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 240 ++++---- .../buffer-fat-pointer-atomicrmw-fadd.ll | 74 ++- .../buffer-fat-pointer-atomicrmw-fmax.ll | 72 +-- .../buffer-fat-pointer-atomicrmw-fmin.ll | 72 +-- llvm/test/CodeGen/AMDGPU/build_vector.ll | 8 +- llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 7 +- .../CodeGen/AMDGPU/fix-crash-valu-hazard.ll | 10 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 218 ++++--- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 501 +++++++-------- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 501 +++++++-------- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 218 ++++--- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 133 ++-- .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 240 ++++---- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 8 +- llvm/test/CodeGen/AMDGPU/freeze.ll | 11 +- llvm/test/CodeGen/AMDGPU/frem.ll | 396 ++++++------ .../AMDGPU/gfx-callable-argument-types.ll | 14 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 218 ++++--- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 501 +++++++-------- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 501 +++++++-------- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 218 ++++--- llvm/test/CodeGen/AMDGPU/half.ll | 20 +- .../AMDGPU/insert_vector_elt.v2bf16.ll | 14 +- ...llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll | 36 +- .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll | 52 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 232 +++---- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 148 ++--- ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 234 
+++---- ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 172 +++--- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 574 +++++++++--------- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 8 +- .../AMDGPU/ptradd-sdag-optimizations.ll | 42 +- .../AMDGPU/sext-in-reg-vector-shuffle.ll | 34 +- .../AMDGPU/shufflevector-physreg-copy.ll | 106 ++-- llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 10 +- .../AMDGPU/tuple-allocation-failure.ll | 120 ++-- llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 10 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 8 +- .../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 250 ++++---- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 20 +- 42 files changed, 3155 insertions(+), 3107 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index fa384b296f2e6..f1eba3c52a764 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2290,6 +2290,7 @@ MCRegister RAGreedy::selectOrSplit(const LiveInterval &VirtReg, LLVMContext &Ctx = MF->getFunction().getContext(); SmallVirtRegSet FixedRegisters; RecoloringStack RecolorStack; + MCRegister Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters, RecolorStack); if (Reg == ~0U && (CutOffInfo != CO_None)) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index b4e968f5f455a..ad09a3383a04b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -112,6 +112,7 @@ class SIRegisterClass rTypes, int Align, dag rList> // RegisterClass (e.g. 
AGPR / VGPR) priority for allocation field int RegClassPriority = 1; + field int RegClassBit = 5; } @@ -575,7 +576,7 @@ let HasVGPR = 1 in { def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 255), (sequence "VGPR%u_HI16", 0, 255)))> { - let AllocationPriority = 2; + let AllocationPriority = !add(2, !mul(RegClassPriority, !shl(1, RegClassBit))); let Size = 16; let GeneratePressureSet = 0; @@ -601,7 +602,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(RegClassPriority, !shl(1, RegClassBit))); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -610,7 +611,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types // Identical to VGPR_32 except it only contains the low 128 (Lo128) registers. 
def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 127))> { - let AllocationPriority = 0; + let AllocationPriority = !add(0, !mul(RegClassPriority, !shl(1, RegClassBit))); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -941,7 +942,7 @@ class VRegClassBase regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); - let AllocationPriority = !add(SizePrioriity, !mul(RegClassPriority, 16)); + let AllocationPriority = !add(SizePrioriity, !mul(RegClassPriority, !shl(1, RegClassBit))); let Weight = numRegs; } @@ -1067,6 +1068,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; + let RegClassPriority = 0; let Size = 32; } } // End GeneratePressureSet = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 63009bdc2643f..4131e007fa18c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -70,12 +70,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: @@ -85,12 +85,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) @@ -153,12 +153,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; 
GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: @@ -168,12 +168,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) @@ -236,12 +236,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: @@ -251,12 +251,12 @@ define amdgpu_kernel void 
@struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -319,12 +319,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: @@ -334,12 +334,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: 
s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -402,12 +402,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: @@ -417,12 +417,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: 
v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) @@ -485,12 +485,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: @@ -500,12 +500,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] 
+; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) @@ -568,12 +568,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: @@ -583,12 +583,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -651,12 +651,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: @@ -666,12 +666,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: 
global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -734,12 +734,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: @@ -749,12 +749,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> 
%rsrc, i32 %vindex, i32 4, i32 2) @@ -817,12 +817,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: @@ -832,12 +832,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) @@ -900,12 +900,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: @@ -915,12 +915,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -983,12 +983,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: @@ -998,12 +998,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index dc84a85d6c207..36370361b677d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -4295,15 +4295,15 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4317,30 +4317,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 -; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4355,13 +4354,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v4, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -4369,7 +4368,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4527,16 +4526,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -4548,30 +4547,29 @@ define half 
@buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 -; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; 
GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -4585,13 +4583,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -4600,7 +4598,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index e0a81187aeed1..3ad1e5c0b81e0 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -3417,11 +3417,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3435,32 +3435,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v7.l, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l 
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v7.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3475,13 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffe @@ -3489,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3657,11 +3657,11 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3673,32 +3673,32 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v7.l, v5.l, v5.l +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3712,13 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3727,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index d4dd9327f77b6..1f51c93d08db1 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -3417,11 +3417,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v6 ; GFX12-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3435,32 +3435,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v7.l, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, v4.l, v7.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, 
v4.h, v4.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3475,13 +3475,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -3489,7 +3489,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v9, v6 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3657,11 +3657,11 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 3, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v9, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v6 ; GFX11-TRUE16-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -3673,32 +3673,32 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v7.l, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v5.l, v5.l ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB12_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v8, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v8 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, v4.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v11, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -3712,13 +3712,13 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 
exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -3727,7 +3727,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB12_3 ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v9, v6 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 7208eaeff8eb1..d47d396f5ce8b 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -307,13 +307,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; GFX942-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_lshl_b32 s3, s3, 16 ; GFX942-NEXT: s_lshl_b32 s2, s2, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, s2 -; GFX942-NEXT: v_mov_b32_e32 v1, s3 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: v_mov_b32_e32 v3, s3 +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 0e93e4d591f6a..9f48c8b5fe49c 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ 
b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -463,13 +463,12 @@ define <2 x half> @chain_hi_to_lo_flat(ptr inreg %ptr) { ; GFX11-TRUE16: ; %bb.0: ; %bb ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v2, v[0:1] +; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat: diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll index 87811968c7871..4f752d102db74 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll @@ -8,10 +8,10 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt ; GFX942-LABEL: global_load_lds_dword_saddr: ; GFX942: ; %bb.0: ; %main_body ; GFX942-NEXT: v_readfirstlane_b32 s2, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_mov_b32 m0, s2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: global_load_lds_dword v2, s[0:1] offset:32 nt +; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:32 nt ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, G@gotpcrel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, G@gotpcrel32@hi+12 @@ -21,9 +21,9 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_mul_i32 s3, s3, 10 ; GFX942-NEXT: s_mul_i32 s2, s2, 10 -; GFX942-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX942-NEXT: v_mov_b32_e32 v1, s3 -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, s2 +; GFX942-NEXT: v_mov_b32_e32 v3, s3 +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX90A-LABEL: global_load_lds_dword_saddr: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 8281a13fb87cb..8581e4d030261 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -9902,34 +9902,33 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, 
v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10021,35 +10020,34 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 
3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10275,34 +10273,33 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 
v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10395,35 +10392,34 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 
v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10649,23 +10645,21 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 
; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10735,24 +10729,22 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 
v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11603,35 +11595,34 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11724,35 +11715,34 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: 
flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, 
v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index de3e6838e15ab..e3bd4e7383598 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6416,34 +6416,35 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6454,7 +6455,7 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6547,33 +6548,35 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6584,7 +6587,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6822,34 +6825,35 @@ define half 
@flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, 
v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6860,7 +6864,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6954,33 +6958,35 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, 
v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6991,7 +6997,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7610,36 +7616,38 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7737,37 +7745,39 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8004,36 +8014,38 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 
-1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, 
v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8132,37 +8144,39 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ 
-8703,24 +8717,24 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8795,25 +8809,25 @@ define void 
@flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8999,35 +9013,36 @@ define half 
@flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; 
GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9038,7 +9053,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9132,33 +9147,35 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9169,7 +9186,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9409,37 +9426,39 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 
v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9538,37 +9557,39 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 53596ea45c3df..25d59a26189c9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6416,34 +6416,35 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: 
s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6454,7 +6455,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6547,33 +6548,35 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, 
vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6584,7 +6587,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -6822,34 +6825,35 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; 
GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -6860,7 +6864,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; 
GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6954,33 +6958,35 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -6991,7 +6997,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -7610,36 +7616,38 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; 
GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7737,37 +7745,39 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; 
GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8004,36 +8014,38 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8132,37 +8144,39 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 
0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8703,24 +8717,24 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8795,25 +8809,25 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8999,35 +9013,36 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -9038,7 +9053,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9132,33 +9147,35 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, 
v6 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9169,7 +9186,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -9409,37 +9426,39 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; 
GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9538,37 +9557,39 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: flat_load_b32 v6, v[3:4] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: 
buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index df4ff33da372e..c987effec3be3 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -7347,34 +7347,33 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7466,35 +7465,34 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7720,34 +7718,33 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7840,35 +7837,34 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8380,23 +8376,21 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: 
flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8466,24 +8460,22 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -9048,35 +9040,34 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -9169,35 +9160,34 @@ define void 
@flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 97ece6090cb5d..3304dbf3eaa3d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -7949,12 +7949,13 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_u8 v2, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i8_offset: @@ -8024,12 +8025,13 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_u8 v2, v[0:1] glc +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i8: @@ -8112,16 +8114,17 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_u8 v2, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: flat_load_d16_u8 v0, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i8_addr64_offset: @@ -8193,9 +8196,9 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i8_offset: @@ -8254,9 +8257,9 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i8: @@ -8326,10 +8329,9 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 -; GFX11-TRUE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: flat_store_b8 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i8_addr64_offset: @@ -8404,12 +8406,13 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i16_offset: @@ -8479,12 +8482,13 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] glc +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i16: @@ -8571,16 +8575,17 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s4 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc +; 
GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_i16_addr64_offset: @@ -8653,9 +8658,9 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i16_offset: @@ -8714,9 +8719,9 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i16: @@ -8788,12 +8793,12 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshl_b64 
s[2:3], s[2:3], 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 ; GFX11-TRUE16-NEXT: s_add_u32 s0, s0, s2 ; GFX11-TRUE16-NEXT: s_addc_u32 s1, s1, s3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_i16_addr64_offset: @@ -8861,9 +8866,9 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 offset:16 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_f16_offset: @@ -8922,9 +8927,9 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_f16: @@ -8982,9 +8987,9 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GFX11-TRUE16-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_bf16_offset: @@ -9042,9 +9047,9 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_store_bf16: @@ -10593,12 +10598,13 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: 
atomic_load_f16_offset: @@ -10667,12 +10673,13 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] glc +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_f16: @@ -10744,12 +10751,13 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:16 glc +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] offset:16 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_bf16_offset: @@ -10818,12 +10826,13 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) { ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] glc +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-TRUE16-NEXT: flat_store_b16 v[0:1], v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: flat_store_b16 v[1:2], v0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: atomic_load_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 873fceedd7b72..6067194d947fa 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -71,12 +71,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: @@ -86,12 +86,12 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: 
buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) @@ -154,12 +154,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: @@ -169,12 +169,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 
+; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) @@ -237,12 +237,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: @@ -252,12 +252,12 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: 
s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -320,12 +320,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: @@ -335,12 +335,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_add_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], 
s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -403,12 +403,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: @@ -418,12 +418,12 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) 
@@ -486,12 +486,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: @@ -501,12 +501,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) @@ -569,12 +569,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; 
GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: @@ -584,12 +584,12 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -652,12 +652,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], 
s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: @@ -667,12 +667,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_min_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -735,12 +735,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 
offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: @@ -750,12 +750,12 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) @@ -818,12 +818,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: 
buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: @@ -833,12 +833,12 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 4 offen sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 4 offen sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2) @@ -901,12 +901,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: @@ -916,12 +916,12 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> % ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) @@ -984,12 +984,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] -; GFX90A-NEXT: v_mov_b32_e32 v2, s10 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 glc slc -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v0, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX90A-NEXT: s_endpgm ; ; 
GFX942-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: @@ -999,12 +999,12 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen offset:4 sc0 nt -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-NEXT: v_mov_b32_e32 v0, s10 +; GFX942-NEXT: buffer_atomic_max_f64 v[2:3], v0, s[0:3], 0 idxen offset:4 sc0 nt +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9] ; GFX942-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 7113d5942208e..0a900f904bec5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -554,13 +554,13 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, 
v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index cf041ba1c2fbc..ac438062ae208 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -13984,14 +13984,13 @@ define void @freeze_v2i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX11-SDAG-TRUE16-LABEL: freeze_v2i1_vcc: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-SDAG-TRUE16-NEXT: global_load_b64 v[4:5], v[0:1], off ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 1, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l ; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 ; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 8ed8a4df3e761..d3432daedadf8 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -3430,103 +3430,103 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b64 
v[0:1], v5, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v5, s[4:5] offset:32 +; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.l -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v9, v8.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v9, v9 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v6 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v4, v1 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v6 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v4, v1 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v6.l -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v4.l, v4.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v7, v7, v9 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v4.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_fma_f16 v4.l, -v4.l, v3.l, v1.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v10, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v9 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v9 +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v8.l, v6.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v1.l, v1.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_f16 v1.l, -v1.l, v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v4.l, v1.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v6, v3, v7 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v8, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v6, -v6, v3, v7 op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, -v6, v0, v7 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: 
v_fmac_f32_e32 v0, v8, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v6, -v6, v0, v7 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v7.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v7.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v8, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v6, v6, v8 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v3.l, v3.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v9, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v1.l ; GFX11-TRUE16-NEXT: v_fmac_f32_e32 
v6, v9, v8 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_fma_f16 v3.l, -v3.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v1, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v4.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v7.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v7.l, v4.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_fma_f16 v0.h, -v0.h, v7.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v0.h +; GFX11-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: frem_v4f16: @@ -3642,106 +3642,107 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_b64 v[0:1], v5, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_b64 v[2:3], v5, s[4:5] offset:32 +; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.h +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v2.h -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v6 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-TRUE16-NEXT: 
v_add_f32_e32 v4, v6, v4 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v4.l, v7.l, v6.l +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v4.l, v7.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v2.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v7 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v4, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v4, v10, v7 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v4, v9 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v4, v7, v4 -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v4 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v2.l, v0.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v4.l, v2.l, v0.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.h -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; 
GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v2, v6, v4 -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v4, v2 -; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v2.l, v2.l, v6.l, v4.l -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v2.l +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v2.l, v6.l -; 
GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.l -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v2, v7, v6 +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v6, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v2 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v1.h, v1.h, v3.l, v1.l +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v1.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_f16 v1.l, v2.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v4.l -; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: frem_v4f16: @@ -3862,106 +3863,107 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_b64 v[0:1], v5, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_b64 v[2:3], v5, s[4:5] offset:32 +; GFX1200-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.h +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v2.h -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 
v4, v4, v6 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v6, v4 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v4.l, v7.l, v6.l +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v4.l, v7.l -; 
GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v2.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v7 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v7 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v4, v9 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v4, v10, v7 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v4, v9 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7 ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v7, v4 -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v4 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v2.l, v0.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v4.l, v0.h +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4 -; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v4.l, v2.l, v0.l 
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.h -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v2, v6, v4 -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v2, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v6, v4 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v4, v2 -; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v2.l, v2.l, v6.l, v4.l -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v2.l +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0 +; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v2.l, v6.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.l -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: 
v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v2, v7, v6 +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v2, v1 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v6, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v2 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v1.h, v1.h, v3.l, v1.l +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0 +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v2.l, v1.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_f16 v1.l, v2.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v4.l -; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l +; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: frem_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll 
index 505e5400a990a..e40917d4307fb 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4800,15 +4800,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v2, off -; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[40:41], v2, off +; GFX11-TRUE16-NEXT: global_store_b8 v[3:4], v2, off +; GFX11-TRUE16-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 501455a551db3..afa57b8692aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -9971,34 +9971,33 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10090,35 +10089,34 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -10394,34 +10392,33 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: 
global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10514,35 +10511,34 @@ define void 
@global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 
v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11143,23 +11139,21 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11229,24 +11223,22 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; 
GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11900,35 +11892,34 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, 
v2 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12021,35 +12012,34 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: 
global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll 
b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index c2ba80f75c630..b9774808f1ad1 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4895,34 +4895,35 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4933,7 +4934,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5026,33 +5027,35 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5063,7 +5066,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5353,34 +5356,35 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, 
null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, 
v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5391,7 +5395,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5485,33 +5489,35 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5522,7 +5528,7 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6242,36 +6248,38 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6369,37 +6377,39 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6686,36 +6696,38 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: 
s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6814,37 +6826,39 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7474,24 +7488,24 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7566,25 +7580,25 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7807,35 +7821,36 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7846,7 +7861,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7940,33 +7955,35 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7977,7 +7994,7 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; 
GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8269,37 +8286,39 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: 
; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8398,37 +8417,39 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 
0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index b4772c723a2eb..c30543642d314 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4895,34 +4895,35 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 
s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -4933,7 +4934,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5026,33 +5027,35 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, 
v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5063,7 +5066,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB27_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -5353,34 +5356,35 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -5391,7 +5395,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5485,33 +5489,35 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -5522,7 +5528,7 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: @@ -6242,36 +6248,38 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6369,37 +6377,39 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; 
GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -6686,36 +6696,38 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6814,37 +6826,39 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; 
GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7474,24 +7488,24 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v3.l, v3.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7566,25 +7580,25 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v3.l, v3.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7807,35 +7821,36 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 @@ -7846,7 +7861,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7940,33 +7955,35 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: 
global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v5, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v6 ; GFX11-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7977,7 +7994,7 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: @@ -8269,37 +8286,39 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v2.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX12-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX12-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v2.l, v2.l -; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, v2.l, v4.l +; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l +; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; 
GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8398,37 +8417,39 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v2.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, -4, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-TRUE16-NEXT: global_load_b32 v6, v[3:4], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v1, 0xffff +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_not_b32_e32 v2, v5 ; GFX11-TRUE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v2.l, v2.l -; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git 
a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 985e1f20b1d33..5e4a5c649bb24 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -6917,34 +6917,33 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 
v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7036,35 +7035,34 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -7340,34 +7338,33 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: 
v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -7460,35 +7457,34 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8089,23 +8085,21 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8175,24 +8169,22 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_f16__offset12b__align4_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -8846,35 +8838,34 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; 
GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8967,35 +8958,34 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_f16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v5, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v2.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v6, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; 
GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 90cf66b3f7b74..117cf40de72d2 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -2595,15 +2595,15 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; GFX11-TRUE16-LABEL: global_truncstore_v2f32_to_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v3, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v1.l -; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-TRUE16-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: global_truncstore_v2f32_to_v2f16: @@ -2772,12 +2772,12 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; 
GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v2.h -; GFX11-TRUE16-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v1.l +; GFX11-TRUE16-NEXT: global_store_b64 v4, v[1:2], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: global_truncstore_v4f32_to_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 47a371d8de07c..007b92d1780b2 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1331,16 +1331,16 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX942-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX942-NEXT: v_mov_b32_e32 v9, 0x5040100 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX942-NEXT: global_load_dwordx4 v[6:9], v4, s[2:3] offset:16 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_perm_b32 v1, s6, v1, v9 +; GFX942-NEXT: v_perm_b32 v1, s6, v1, v5 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v4, v[6:9], s[0:1] offset:16 +; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 
%tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll index a319f1260d870..0fe731f8c8474 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.AFLCustomIRMutator.opt.ll @@ -4,30 +4,30 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(<1 x i64> %L1) { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GCN-NEXT: ds_read_b128 v[28:31], v32 offset:112 -; GCN-NEXT: ds_read_b128 v[24:27], v32 offset:96 -; GCN-NEXT: ds_read_b128 v[20:23], v32 offset:80 -; GCN-NEXT: ds_read_b128 v[16:19], v32 offset:64 -; GCN-NEXT: ds_read_b128 v[0:3], v32 -; GCN-NEXT: ds_read_b128 v[4:7], v32 offset:16 -; GCN-NEXT: ds_read_b128 v[8:11], v32 offset:32 -; GCN-NEXT: ds_read_b128 v[12:15], v32 offset:48 +; GCN-NEXT: ds_read_b128 v[30:33], v0 offset:112 +; GCN-NEXT: ds_read_b128 v[26:29], v0 offset:96 +; GCN-NEXT: ds_read_b128 v[22:25], v0 offset:80 +; GCN-NEXT: ds_read_b128 v[18:21], v0 offset:64 +; GCN-NEXT: ds_read_b128 v[2:5], v0 +; GCN-NEXT: ds_read_b128 v[6:9], v0 offset:16 +; GCN-NEXT: ds_read_b128 v[10:13], v0 offset:32 +; GCN-NEXT: ds_read_b128 v[14:17], v0 offset:48 ; GCN-NEXT: v_mov_b32_e32 v34, 0 ; GCN-NEXT: v_mov_b32_e32 v35, v34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) -; GCN-NEXT: ds_write_b128 v32, v[28:31] offset:112 -; GCN-NEXT: ds_write_b128 v32, v[24:27] offset:96 -; GCN-NEXT: ds_write_b128 v32, v[20:23] offset:80 -; GCN-NEXT: ds_write_b128 v32, v[16:19] offset:64 -; GCN-NEXT: ds_write_b128 v32, v[12:15] offset:48 -; GCN-NEXT: ds_write_b128 v32, v[8:11] offset:32 -; GCN-NEXT: ds_write_b128 v32, v[4:7] offset:16 -; GCN-NEXT: ds_write_b128 v32, v[0:3] -; GCN-NEXT: 
ds_write_b64 v32, v[34:35] +; GCN-NEXT: ds_write_b128 v0, v[30:33] offset:112 +; GCN-NEXT: ds_write_b128 v0, v[26:29] offset:96 +; GCN-NEXT: ds_write_b128 v0, v[22:25] offset:80 +; GCN-NEXT: ds_write_b128 v0, v[18:21] offset:64 +; GCN-NEXT: ds_write_b128 v0, v[14:17] offset:48 +; GCN-NEXT: ds_write_b128 v0, v[10:13] offset:32 +; GCN-NEXT: ds_write_b128 v0, v[6:9] offset:16 +; GCN-NEXT: ds_write_b128 v0, v[2:5] +; GCN-NEXT: ds_write_b64 v0, v[34:35] ; GCN-NEXT: s_endpgm entry: call void @llvm.amdgcn.iglp.opt(i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 520884534ea77..d358837452eab 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -252,7 +252,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -274,40 +274,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: v_accvgpr_write_b32 a18, s10 ; GCN-NEXT: v_accvgpr_write_b32 a17, s9 ; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] -; GCN-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 ; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: 
global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -322,7 +322,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: 
s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -344,40 +344,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: v_accvgpr_write_b32 a18, s10 ; GCN-NEXT: v_accvgpr_write_b32 a17, s9 ; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v8, s20 -; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s20 +; GCN-NEXT: v_mov_b32_e32 v11, s21 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b32_e32 v12, s22 +; GCN-NEXT: v_mov_b32_e32 v13, s23 ; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) 
-; GCN-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 368b43017da97..9288ed2770647 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -503,7 +503,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -525,40 +525,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], 
v[4:7], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: 
s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -627,7 +627,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] @@ -649,40 +649,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: v_mov_b32_e32 v0, s16 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s12 ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mov_b32_e32 v0, s8 ; SDAG-NEXT: 
v_mov_b32_e32 v1, s9 ; SDAG-NEXT: v_mov_b32_e32 v2, s10 ; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -955,24 +955,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 ; SDAG-NEXT: 
v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -1006,24 +1006,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -1420,17 +1420,17 @@ define amdgpu_kernel void 
@test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v6, s24 +; SDAG-NEXT: v_mov_b32_e32 v7, s25 +; SDAG-NEXT: v_mov_b32_e32 v8, s26 +; SDAG-NEXT: v_mov_b32_e32 v9, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 @@ -1449,41 +1449,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1551,17 
+1551,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v6, s24 +; SDAG-NEXT: v_mov_b32_e32 v7, s25 +; SDAG-NEXT: v_mov_b32_e32 v8, s26 +; SDAG-NEXT: v_mov_b32_e32 v9, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 @@ -1580,41 +1580,41 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v2, s20 +; SDAG-NEXT: v_mov_b32_e32 v3, s21 +; SDAG-NEXT: v_mov_b32_e32 v4, s22 +; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 
v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 
sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 67ae05eb6f0b8..159e7e359337d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1114,19 +1114,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s15 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 -; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s3 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2 +; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; NOLIT-SRCC-NEXT: s_nop 1 -; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 @@ -1254,19 +1254,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 -; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s15 +; 
LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) -; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 -; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 +; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v2 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v3 -; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 -; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s3 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1 +; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2 +; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; LIT-SRCC-NEXT: s_nop 1 -; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 @@ -1330,7 +1330,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-LABEL: test_mfma_f32_32x32x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 @@ -1345,8 +1345,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 @@ -1371,27 +1371,27 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: v_mov_b32_e32 v4, s2 +; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96 -; GFX90A-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112 -; GFX90A-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64 -; GFX90A-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80 -; GFX90A-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32 -; GFX90A-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48 -; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37] -; GFX90A-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16 +; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 +; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 +; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64 +; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80 +; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32 +; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37] +; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], 
s[36:37], 0x40 ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 @@ -1406,8 +1406,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s0 -; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v2, s0 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 @@ -1432,21 +1432,21 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 -; GFX942-NEXT: v_mov_b32_e32 v2, s2 -; GFX942-NEXT: v_mov_b32_e32 v3, s3 +; GFX942-NEXT: v_mov_b32_e32 v4, s2 +; GFX942-NEXT: v_mov_b32_e32 v5, s3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v4, a[24:27], s[36:37] offset:96 -; GFX942-NEXT: global_store_dwordx4 v4, a[28:31], s[36:37] offset:112 -; GFX942-NEXT: global_store_dwordx4 v4, a[16:19], s[36:37] offset:64 -; GFX942-NEXT: global_store_dwordx4 v4, a[20:23], s[36:37] offset:80 -; GFX942-NEXT: global_store_dwordx4 v4, a[8:11], s[36:37] offset:32 -; GFX942-NEXT: global_store_dwordx4 v4, a[12:15], s[36:37] offset:48 -; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[36:37] -; GFX942-NEXT: global_store_dwordx4 v4, a[4:7], s[36:37] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64 
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37] +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16 ; GFX942-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -1752,45 +1752,45 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add ; GFX90A-LABEL: test_mfma_f32_4x4x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: v_mov_b32_e32 v3, s7 +; GFX90A-NEXT: v_mov_b32_e32 v4, s6 +; GFX90A-NEXT: v_mov_b32_e32 v5, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 -; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-NEXT: 
s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s4 -; GFX942-NEXT: v_mov_b32_e32 v1, s5 +; GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: v_mov_b32_e32 v3, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX942-NEXT: v_mov_b32_e32 v2, s6 -; GFX942-NEXT: v_mov_b32_e32 v3, s7 +; GFX942-NEXT: v_mov_b32_e32 v4, s6 +; GFX942-NEXT: v_mov_b32_e32 v5, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 4 -; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg @@ -2099,46 +2099,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX90A-LABEL: test_mfma_f32_16x16x16f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: v_mov_b32_e32 v1, s5 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_mov_b32_e32 v3, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX90A-NEXT: v_mov_b32_e32 v2, s6 -; GFX90A-NEXT: v_mov_b32_e32 v3, s7 +; GFX90A-NEXT: v_mov_b32_e32 v4, s6 +; GFX90A-NEXT: v_mov_b32_e32 v5, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 
blgp:3 +; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x16f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s4 -; GFX942-NEXT: v_mov_b32_e32 v1, s5 +; GFX942-NEXT: v_mov_b32_e32 v2, s4 +; GFX942-NEXT: v_mov_b32_e32 v3, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 -; GFX942-NEXT: v_mov_b32_e32 v2, s6 -; GFX942-NEXT: v_mov_b32_e32 v3, s7 +; GFX942-NEXT: v_mov_b32_e32 v4, s6 +; GFX942-NEXT: v_mov_b32_e32 v5, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1] +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 04ee0bbd17673..37809da10241b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1485,30 +1485,30 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; 
SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v12, s0 -; SDAG-NEXT: v_mov_b32_e32 v13, s1 -; SDAG-NEXT: v_mov_b32_e32 v14, s2 -; SDAG-NEXT: v_mov_b32_e32 v15, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 -; SDAG-NEXT: v_mov_b32_e32 v17, s17 -; SDAG-NEXT: v_mov_b32_e32 v18, s18 -; SDAG-NEXT: v_mov_b32_e32 v19, s19 -; SDAG-NEXT: v_mov_b32_e32 v20, s28 -; SDAG-NEXT: v_mov_b32_e32 v21, s29 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_mov_b32_e32 v8, s24 -; SDAG-NEXT: v_mov_b32_e32 v9, s25 -; SDAG-NEXT: v_mov_b32_e32 v10, s26 -; SDAG-NEXT: v_mov_b32_e32 v11, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v21 +; SDAG-NEXT: v_mov_b32_e32 v14, s0 +; SDAG-NEXT: v_mov_b32_e32 v15, s1 +; SDAG-NEXT: v_mov_b32_e32 v16, s2 +; SDAG-NEXT: v_mov_b32_e32 v17, s3 +; SDAG-NEXT: v_mov_b32_e32 v18, s16 +; SDAG-NEXT: v_mov_b32_e32 v19, s17 +; SDAG-NEXT: v_mov_b32_e32 v20, s18 +; SDAG-NEXT: v_mov_b32_e32 v21, s19 +; SDAG-NEXT: v_mov_b32_e32 v4, s28 +; SDAG-NEXT: v_mov_b32_e32 v5, s29 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s24 +; SDAG-NEXT: v_mov_b32_e32 v11, s25 +; SDAG-NEXT: v_mov_b32_e32 v12, s26 +; SDAG-NEXT: v_mov_b32_e32 v13, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v4 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v5 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 
v0, a0 @@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s12, v17 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: 
v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1964,33 +1964,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: 
v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: @@ -2031,33 +2031,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; 
SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: @@ -2096,34 +2096,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 
v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2162,34 +2162,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; 
SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v16, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 6c0faadf9eae5..bc50058778dbf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -3515,26 +3515,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s0 -; SDAG-NEXT: v_mov_b32_e32 v25, s1 -; SDAG-NEXT: v_mov_b32_e32 v26, s2 -; SDAG-NEXT: v_mov_b32_e32 v27, s3 
-; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: v_mov_b32_e32 v29, s17 -; SDAG-NEXT: v_mov_b32_e32 v30, s18 -; SDAG-NEXT: v_mov_b32_e32 v31, s19 -; SDAG-NEXT: v_mov_b32_e32 v32, s28 -; SDAG-NEXT: v_mov_b32_e32 v33, s29 -; SDAG-NEXT: v_mov_b32_e32 v16, s20 -; SDAG-NEXT: v_mov_b32_e32 v17, s21 -; SDAG-NEXT: v_mov_b32_e32 v18, s22 -; SDAG-NEXT: v_mov_b32_e32 v19, s23 -; SDAG-NEXT: v_mov_b32_e32 v20, s24 -; SDAG-NEXT: v_mov_b32_e32 v21, s25 -; SDAG-NEXT: v_mov_b32_e32 v22, s26 -; SDAG-NEXT: v_mov_b32_e32 v23, s27 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v32 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v33 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s28 +; SDAG-NEXT: v_mov_b32_e32 v17, s29 +; SDAG-NEXT: v_mov_b32_e32 v18, s20 +; SDAG-NEXT: v_mov_b32_e32 v19, s21 +; SDAG-NEXT: v_mov_b32_e32 v20, s22 +; SDAG-NEXT: v_mov_b32_e32 v21, s23 +; SDAG-NEXT: v_mov_b32_e32 v22, s24 +; SDAG-NEXT: v_mov_b32_e32 v23, s25 +; SDAG-NEXT: v_mov_b32_e32 v24, s26 +; SDAG-NEXT: v_mov_b32_e32 v25, s27 +; SDAG-NEXT: v_accvgpr_write_b32 a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 ; SDAG-NEXT: v_accvgpr_write_b32 a2, v0 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: v_accvgpr_write_b32 a4, v2 @@ -3550,7 +3550,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -3993,34 +3993,34 @@ define <16 x 
float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v16, s0 -; SDAG-NEXT: v_mov_b32_e32 v17, s1 -; SDAG-NEXT: v_mov_b32_e32 v18, s2 -; SDAG-NEXT: v_mov_b32_e32 v19, s3 -; SDAG-NEXT: v_mov_b32_e32 v20, s16 -; SDAG-NEXT: v_mov_b32_e32 v21, s17 -; SDAG-NEXT: v_mov_b32_e32 v22, s18 -; SDAG-NEXT: v_mov_b32_e32 v23, s19 -; SDAG-NEXT: v_mov_b32_e32 v24, s20 -; SDAG-NEXT: v_mov_b32_e32 v25, s21 -; SDAG-NEXT: v_mov_b32_e32 v26, s22 -; SDAG-NEXT: v_mov_b32_e32 v27, s23 -; SDAG-NEXT: v_mov_b32_e32 v28, s24 -; SDAG-NEXT: v_mov_b32_e32 v29, s25 -; SDAG-NEXT: v_mov_b32_e32 v30, s26 -; SDAG-NEXT: v_mov_b32_e32 v31, s27 -; SDAG-NEXT: v_mov_b32_e32 v32, s28 -; SDAG-NEXT: v_mov_b32_e32 v33, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v27 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v28 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v29 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v30 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v31 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v32 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v33 +; SDAG-NEXT: v_mov_b32_e32 v26, s0 +; SDAG-NEXT: v_mov_b32_e32 v27, s1 +; SDAG-NEXT: v_mov_b32_e32 v28, s2 +; SDAG-NEXT: v_mov_b32_e32 v29, s3 +; SDAG-NEXT: v_mov_b32_e32 v30, s16 +; SDAG-NEXT: v_mov_b32_e32 v31, s17 +; SDAG-NEXT: v_mov_b32_e32 v32, s18 +; SDAG-NEXT: v_mov_b32_e32 v33, s19 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 +; SDAG-NEXT: v_mov_b32_e32 v24, s28 +; SDAG-NEXT: v_mov_b32_e32 v25, s29 +; SDAG-NEXT: v_accvgpr_write_b32 
a0, v16 +; SDAG-NEXT: v_accvgpr_write_b32 a1, v17 +; SDAG-NEXT: v_accvgpr_write_b32 a2, v18 +; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 +; SDAG-NEXT: v_accvgpr_write_b32 a4, v20 +; SDAG-NEXT: v_accvgpr_write_b32 a5, v21 +; SDAG-NEXT: v_accvgpr_write_b32 a6, v22 +; SDAG-NEXT: v_accvgpr_write_b32 a7, v23 +; SDAG-NEXT: v_accvgpr_write_b32 a8, v24 +; SDAG-NEXT: v_accvgpr_write_b32 a9, v25 ; SDAG-NEXT: v_accvgpr_write_b32 a10, v8 ; SDAG-NEXT: v_accvgpr_write_b32 a11, v9 ; SDAG-NEXT: v_accvgpr_write_b32 a12, v10 @@ -4028,7 +4028,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a14, v12 ; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 @@ -4540,22 +4540,22 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s8 +; SDAG-NEXT: v_mov_b32_e32 v3, s9 +; SDAG-NEXT: v_mov_b32_e32 v4, s10 +; SDAG-NEXT: v_mov_b32_e32 v5, s11 +; SDAG-NEXT: v_mov_b32_e32 v6, s12 +; 
SDAG-NEXT: v_mov_b32_e32 v7, s13 +; SDAG-NEXT: v_mov_b32_e32 v8, s14 +; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v10, s16 +; SDAG-NEXT: v_mov_b32_e32 v11, s17 +; SDAG-NEXT: v_mov_b32_e32 v12, s18 +; SDAG-NEXT: v_mov_b32_e32 v13, s19 +; SDAG-NEXT: v_mov_b32_e32 v14, s20 +; SDAG-NEXT: v_mov_b32_e32 v15, s21 +; SDAG-NEXT: v_mov_b32_e32 v16, s22 +; SDAG-NEXT: v_mov_b32_e32 v17, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 @@ -4571,9 +4571,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: v_mov_b32_e32 v0, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 @@ -4735,26 +4735,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v6, s16 
+; SDAG-NEXT: v_mov_b32_e32 v7, s17 +; SDAG-NEXT: v_mov_b32_e32 v8, s18 +; SDAG-NEXT: v_mov_b32_e32 v9, s19 +; SDAG-NEXT: v_mov_b32_e32 v10, s20 +; SDAG-NEXT: v_mov_b32_e32 v11, s21 +; SDAG-NEXT: v_mov_b32_e32 v12, s22 +; SDAG-NEXT: v_mov_b32_e32 v13, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 +; SDAG-NEXT: v_mov_b32_e32 v14, s24 +; SDAG-NEXT: v_mov_b32_e32 v15, s25 +; SDAG-NEXT: v_mov_b32_e32 v16, s26 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v17, s27 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 @@ -4770,9 +4770,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x ; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 ; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 ; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 -; SDAG-NEXT: v_mov_b32_e32 v16, s1 +; SDAG-NEXT: v_mov_b32_e32 v0, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel_hi:[0,0,0] +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 op_sel_hi:[0,0,0] ; SDAG-NEXT: v_mov_b32_e32 v2, s20 ; SDAG-NEXT: v_mov_b32_e32 v3, s21 ; SDAG-NEXT: v_mov_b32_e32 v4, s22 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index ba8c9c5837a4e..80568810e42b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,9 +17,9 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 
4, v0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[12:15], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] @@ -29,12 +29,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v13, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[8:11], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -44,7 +44,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[6:7] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] @@ -54,13 +54,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b32_e32 v16, s16 +; GISEL-NEXT: v_mov_b32_e32 v12, s16 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[8:11], 
v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, < ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -547,9 +547,9 @@ define amdgpu_kernel void 
@test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] @@ -559,12 +559,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v17, s16 +; GCN-NEXT: v_mov_b32_e32 v13, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[8:11], v[0:7], v17 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -624,25 +624,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v10, s2 -; GCN-NEXT: v_mov_b32_e32 v11, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: v_mov_b32_e32 v4, s20 -; GCN-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NEXT: v_mov_b32_e32 v11, s1 +; GCN-NEXT: v_mov_b32_e32 v12, s2 +; 
GCN-NEXT: v_mov_b32_e32 v13, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NEXT: v_mov_b32_e32 v6, s20 +; GCN-NEXT: v_mov_b32_e32 v7, s21 +; GCN-NEXT: v_mov_b32_e32 v8, s22 +; GCN-NEXT: v_mov_b32_e32 v9, s23 ; GCN-NEXT: v_accvgpr_write_b32 a0, s24 ; GCN-NEXT: v_accvgpr_write_b32 a1, s25 ; GCN-NEXT: v_accvgpr_write_b32 a2, s26 ; GCN-NEXT: v_accvgpr_write_b32 a3, s27 -; GCN-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: 
v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -887,7 +887,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 @@ -898,13 +898,13 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ 
-964,25 +964,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; 
SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 
v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1429,7 +1429,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 @@ -1440,13 +1440,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: 
v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1506,25 +1506,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1566,30 +1566,30 @@ define amdgpu_kernel void 
@test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1598,7 +1598,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr 
addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 @@ -1609,13 +1609,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1675,25 +1675,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: 
v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 
v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1767,7 +1767,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 @@ -1778,13 +1778,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 
@llvm.amdgcn.workitem.id.x() @@ -1844,25 +1844,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; SDAG-NEXT: 
global_load_dwordx4 v[10:13], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v12, s8 -; SDAG-NEXT: v_mov_b32_e32 v13, s9 -; SDAG-NEXT: v_mov_b32_e32 v14, s10 -; SDAG-NEXT: v_mov_b32_e32 v15, s11 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v14, s8 +; SDAG-NEXT: v_mov_b32_e32 v15, s9 +; SDAG-NEXT: v_mov_b32_e32 v16, s10 +; SDAG-NEXT: v_mov_b32_e32 v17, s11 +; SDAG-NEXT: v_mov_b32_e32 v2, s12 +; SDAG-NEXT: v_mov_b32_e32 v3, s13 +; SDAG-NEXT: v_mov_b32_e32 v4, s14 +; SDAG-NEXT: v_mov_b32_e32 v5, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v4, s0 -; SDAG-NEXT: v_mov_b32_e32 v5, s1 -; SDAG-NEXT: v_mov_b32_e32 v6, s2 -; SDAG-NEXT: v_mov_b32_e32 v7, s3 -; SDAG-NEXT: v_mov_b32_e32 v17, s16 +; SDAG-NEXT: v_mov_b32_e32 v6, s0 +; SDAG-NEXT: v_mov_b32_e32 v7, s1 +; SDAG-NEXT: v_mov_b32_e32 v8, s2 +; SDAG-NEXT: v_mov_b32_e32 v9, s3 +; SDAG-NEXT: v_mov_b32_e32 v1, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: @@ -1936,7 +1936,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[12:15], v0, s[0:1] +; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 
s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 @@ -1947,13 +1947,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19] -; GISEL-NEXT: v_mov_b32_e32 v16, s2 +; GISEL-NEXT: v_mov_b32_e32 v12, s2 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[8:11], v[0:7], v16 cbsz:1 abid:2 +; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, v[12:15], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] ; GISEL-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -2013,25 +2013,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 +; SDAG-NEXT: v_mov_b32_e32 v10, s0 +; SDAG-NEXT: v_mov_b32_e32 v11, s1 +; SDAG-NEXT: v_mov_b32_e32 v12, s2 +; SDAG-NEXT: v_mov_b32_e32 v13, s3 +; SDAG-NEXT: v_mov_b32_e32 v2, s16 +; SDAG-NEXT: v_mov_b32_e32 v3, s17 +; SDAG-NEXT: v_mov_b32_e32 v4, s18 +; SDAG-NEXT: v_mov_b32_e32 v5, s19 +; SDAG-NEXT: v_mov_b32_e32 v6, s20 +; SDAG-NEXT: v_mov_b32_e32 v7, s21 +; SDAG-NEXT: v_mov_b32_e32 v8, s22 +; SDAG-NEXT: v_mov_b32_e32 v9, s23 ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 ; 
SDAG-NEXT: v_accvgpr_write_b32 a1, s25 ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 +; SDAG-NEXT: v_mov_b32_e32 v0, s28 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2454,22 +2454,22 @@ define amdgpu_kernel 
void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: 
v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v24, s8 -; SDAG-NEXT: v_mov_b32_e32 v25, s9 -; SDAG-NEXT: v_mov_b32_e32 v26, s10 -; SDAG-NEXT: v_mov_b32_e32 v27, s11 -; SDAG-NEXT: v_mov_b32_e32 v16, s12 -; SDAG-NEXT: v_mov_b32_e32 v17, s13 -; SDAG-NEXT: v_mov_b32_e32 v18, s14 -; SDAG-NEXT: v_mov_b32_e32 v19, s15 -; SDAG-NEXT: v_mov_b32_e32 v20, s0 -; SDAG-NEXT: v_mov_b32_e32 v21, s1 -; SDAG-NEXT: v_mov_b32_e32 v22, s2 -; SDAG-NEXT: v_mov_b32_e32 v23, s3 -; SDAG-NEXT: v_mov_b32_e32 v28, s16 +; SDAG-NEXT: v_mov_b32_e32 v26, s8 +; SDAG-NEXT: v_mov_b32_e32 v27, s9 +; SDAG-NEXT: v_mov_b32_e32 v28, s10 +; SDAG-NEXT: v_mov_b32_e32 v29, s11 +; SDAG-NEXT: v_mov_b32_e32 v18, s12 +; SDAG-NEXT: v_mov_b32_e32 v19, s13 +; SDAG-NEXT: v_mov_b32_e32 v20, s14 +; SDAG-NEXT: v_mov_b32_e32 v21, s15 +; SDAG-NEXT: v_mov_b32_e32 v22, s0 +; 
SDAG-NEXT: v_mov_b32_e32 v23, s1 +; SDAG-NEXT: v_mov_b32_e32 v24, s2 +; SDAG-NEXT: v_mov_b32_e32 v25, s3 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index c470a2f9e7ee8..a520c21d64212 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -2380,13 +2380,13 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x ; PACKED-SDAG-LABEL: fneg_v2f32_scalar: ; PACKED-SDAG: ; %bb.0: ; PACKED-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: s_xor_b32 s3, s3, 0x80000000 ; PACKED-SDAG-NEXT: s_xor_b32 s2, s2, 0x80000000 -; PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s3 -; PACKED-SDAG-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; PACKED-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; PACKED-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; PACKED-SDAG-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; PACKED-SDAG-NEXT: s_endpgm ; ; PACKED-GISEL-LABEL: fneg_v2f32_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 1ec94162951a6..ab4effdaf465e 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -53,31 +53,31 @@ define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) { ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; GFX942-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s20 -; GFX942-NEXT: v_mov_b32_e32 v1, s21 -; GFX942-NEXT: v_mov_b32_e32 v2, s22 -; GFX942-NEXT: v_mov_b32_e32 v3, s23 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; GFX942-NEXT: v_mov_b32_e32 v2, s20 +; GFX942-NEXT: v_mov_b32_e32 v3, s21 +; GFX942-NEXT: v_mov_b32_e32 v4, s22 +; GFX942-NEXT: v_mov_b32_e32 v5, s23 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, s16 -; GFX942-NEXT: v_mov_b32_e32 v1, s17 -; GFX942-NEXT: v_mov_b32_e32 v2, s18 -; GFX942-NEXT: v_mov_b32_e32 v3, s19 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; GFX942-NEXT: v_mov_b32_e32 v2, s16 +; GFX942-NEXT: v_mov_b32_e32 v3, s17 +; GFX942-NEXT: v_mov_b32_e32 v4, s18 +; GFX942-NEXT: v_mov_b32_e32 v5, s19 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, s12 -; GFX942-NEXT: v_mov_b32_e32 v1, s13 -; GFX942-NEXT: v_mov_b32_e32 v2, s14 -; GFX942-NEXT: v_mov_b32_e32 v3, s15 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; GFX942-NEXT: v_mov_b32_e32 v2, s12 +; GFX942-NEXT: v_mov_b32_e32 v3, s13 +; GFX942-NEXT: v_mov_b32_e32 v4, s14 +; GFX942-NEXT: v_mov_b32_e32 v5, s15 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mov_b32_e32 v0, s8 -; GFX942-NEXT: v_mov_b32_e32 v1, s9 -; GFX942-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v2, s8 +; GFX942-NEXT: v_mov_b32_e32 v3, s9 +; GFX942-NEXT: v_mov_b32_e32 v4, s10 +; GFX942-NEXT: v_mov_b32_e32 v5, s11 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] ; GFX942-NEXT: s_endpgm entry: store <16 x i32> %a, ptr addrspace(1) %out diff --git 
a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll index 16e5c92f8e6d8..49dec15f9f7d7 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll @@ -9,33 +9,31 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v0, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v4, 24, v0 -; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v0.l -; GFX11-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v2, 0, 8 -; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v5, 24, v1 -; GFX11-TRUE16-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX11-TRUE16-NEXT: v_bfe_i32 v7, v0, 0, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h +; GFX11-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v5, 24, v2 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v6, 24, v1 +; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8 ; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.h, v4.l +; GFX11-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v2.l +; GFX11-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v0.h ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: 
v_cvt_f16_i16_e32 v2.h, v6.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.h, v5.l -; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.h, v2.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.l, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.h, v2.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.l, v3.l ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.h, v1.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v1.h ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v4.h, v4.l ; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll index 936118750ff10..4d864ad15b411 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector-physreg-copy.ll @@ -25,27 +25,27 @@ define void @shufflevector_v2i32_10_physreg_even_vgpr_pair_copy(ptr addrspace(1) ; GFX90A-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4, v5 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: shufflevector_v2i32_10_physreg_even_vgpr_pair_copy: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v4, v5 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"() @@ -214,27 +214,27 @@ define void @shufflevector_v2i32_11_physreg_even_vgpr_pair_copy(ptr addrspace(1) ; GFX90A-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4, v5 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v5 -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: shufflevector_v2i32_11_physreg_even_vgpr_pair_copy: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v4, v5 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v5 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32 } asm "; def $0, $1", "={v4},={v5}"() @@ -265,31 +265,31 @@ define 
void @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy(ptr addrspace( ; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4, v5, v6, v7 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v10, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v4, v5, v6, v7 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v10, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v8, v7 +; GFX940-NEXT: v_mov_b32_e32 v11, v4 +; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"() @@ -327,31 +327,31 @@ define void @shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy(ptr addrspace( ; GFX90A-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4, v5, v6, v7 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v6 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v8, v5 +; GFX90A-NEXT: v_mov_b32_e32 v11, v6 +; GFX90A-NEXT: v_mov_b32_e32 v10, v7 +; GFX90A-NEXT: v_mov_b32_e32 v9, v4 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: shufflevector_v4i32_1032_physreg_even_vgpr_quad_copy: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v4, v5, v6, v7 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v6 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v8, v5 +; GFX940-NEXT: v_mov_b32_e32 v11, v6 +; GFX940-NEXT: v_mov_b32_e32 v10, v7 +; GFX940-NEXT: v_mov_b32_e32 v9, v4 +; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %asm = call { i32, i32, i32, i32 } asm "; def $0, $1, $2, $3", "={v4},={v5},={v6},={v7}"() @@ -746,16 +746,15 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p ; GFX90A-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v4, v5, v6, v7 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v4 -; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[16:17] -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, v5 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 +; GFX90A-NEXT: v_mov_b32_e32 v8, v7 +; GFX90A-NEXT: v_mov_b32_e32 v11, v4 +; GFX90A-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17] ; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -763,17 +762,16 @@ define i32 @shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt(p ; GFX940-LABEL: shufflevector_v4i32_3210_physreg_even_vgpr_quad_copy_other_use_elt: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v8, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v4, v5, v6, v7 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v4 -; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_mov_b32_e32 v10, v5 +; GFX940-NEXT: v_mov_b32_e32 v9, v6 +; GFX940-NEXT: v_mov_b32_e32 v8, v7 +; GFX940-NEXT: v_mov_b32_e32 v11, v4 +; GFX940-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1] ; GFX940-NEXT: v_mov_b32_e32 v0, v6 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index f7caca2d143c8..ebe6b232bfcbc 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -1110,13 +1110,13 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v5 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v3 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v2.h, v2.l -; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v1.h, v1.l +; GFX11-TRUE16-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_sint_to_fp_v4i64_to_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index d23e314b9465f..f6c357dc38b48 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -70,12 +70,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS1-NEXT: s_mov_b32 s82, s16 @@ -84,7 +84,7 @@ define 
amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr58_vgpr59 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -93,24 +93,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 2 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 2 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 3 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 3 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 4 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 5 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 5 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 6 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 7 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s4, 6 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s5, 7 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s70, 8 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s71, 9 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s70, 8 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s71, 9 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 6 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 7 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 6 +; 
GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 7 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 @@ -120,7 +120,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 @@ -128,7 +128,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47] +; GLOBALNESS1-NEXT: flat_load_dword v56, v[46:47] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -186,10 +186,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11 -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s8, 10 +; GLOBALNESS1-NEXT: v_writelane_b32 v57, s9, 11 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 3 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i @@ -198,7 +198,7 @@ define 
amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] @@ -228,8 +228,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 1 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v57, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v57, 1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS1-NEXT: ; %bb.20: ; %bb6.i.i @@ -265,7 +265,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s13, s83 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s82 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[58:59], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[54:55] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 @@ -277,13 +277,13 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 -; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10 +; GLOBALNESS1-NEXT: v_readlane_b32 s70, v57, 8 +; GLOBALNESS1-NEXT: v_readlane_b32 s8, v57, 10 ; 
GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9 +; GLOBALNESS1-NEXT: v_readlane_b32 s71, v57, 9 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 -; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11 +; GLOBALNESS1-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] @@ -291,8 +291,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v59, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v59, 5 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v57, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v57, 5 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb69.i @@ -384,12 +384,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr57 : SGPR spill to VGPR lane ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 @@ -398,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; 
GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr58_vgpr59 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -407,24 +407,24 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 2 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 3 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v3 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 4 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 5 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v2 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 6 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 7 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s4, 6 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s5, 7 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[80:81], 1, v1 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s84, 8 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s85, 9 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s84, 8 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s85, 9 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 6 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 7 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 6 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 7 ; GLOBALNESS0-NEXT: s_and_b64 vcc, 
exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_28 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 @@ -434,7 +434,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[58:59], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 @@ -442,7 +442,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47] +; GLOBALNESS0-NEXT: flat_load_dword v56, v[46:47] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -500,10 +500,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s8, 10 +; GLOBALNESS0-NEXT: v_writelane_b32 v57, s9, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 3 ; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 @@ -513,7 +513,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; 
GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] @@ -543,8 +543,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v57, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v57, 1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_21 ; GLOBALNESS0-NEXT: ; %bb.20: ; %bb6.i.i @@ -580,7 +580,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[58:59], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[54:55] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 @@ -591,12 +591,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s84, v57, 8 +; GLOBALNESS0-NEXT: v_readlane_b32 s8, v57, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s55, s83 -; 
GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s85, v57, 9 +; GLOBALNESS0-NEXT: v_readlane_b32 s9, v57, 11 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] @@ -604,8 +604,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.26: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v59, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v59, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v57, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v57, 5 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb69.i diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 1f58623cfd13c..5b1a5206c3403 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -894,13 +894,13 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v1 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v3 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v2.h, v2.l -; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v0.h, v0.l +; GFX11-TRUE16-NEXT: 
v_pack_b32_f16 v1, v1.h, v1.l +; GFX11-TRUE16-NEXT: global_store_b64 v3, v[1:2], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_uint_to_fp_v4i64_to_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index cd6bc687c1851..8a8829832f688 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -348,13 +348,13 @@ define amdgpu_kernel void @fptrunc( ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s8, s2 ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s9, s3 ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s4, s0 -; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 ; GFX11-GCN-REAL16-NEXT: s_mov_b32 s5, s1 ; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 -; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 ; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-GCN-REAL16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-GCN-REAL16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index 0ae6dbd06e33d..14ff20b63abf2 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -58,19 +58,19 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v3 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dword v1, v2, s[0:1] +; GFX942-NEXT: 
global_load_dword v2, v1, s[0:1] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB1_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dword v1, v2, s[2:3] +; GFX942-NEXT: global_load_dword v2, v1, s[2:3] ; GFX942-NEXT: .LBB1_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dword v0, v1, s[6:7] +; GFX942-NEXT: global_store_dword v0, v2, s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -136,19 +136,19 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[0:1] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB3_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] ; GFX942-NEXT: .LBB3_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -173,19 +173,19 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: v_and_b32_e32 v6, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 4, v6 -; 
GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 4, v6 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[0:1] +; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v6 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB4_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] +; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] ; GFX942-NEXT: .LBB4_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -210,23 +210,23 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1 ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v9, 5, v10 -; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 5, v10 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[0:1] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[0:1] +; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v10 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB5_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx4 v[4:7], v9, s[2:3] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[0:3], v9, s[2:3] +; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] ; GFX942-NEXT: .LBB5_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:16 ; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -250,72 +250,72 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX942-NEXT: v_and_b32_e32 v62, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v62 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[0:1] offset:240 -; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[0:1] offset:224 -; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[0:1] offset:208 -; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[0:1] offset:192 -; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[0:1] offset:176 -; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[0:1] offset:160 -; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[0:1] offset:144 -; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:128 -; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[0:1] offset:112 -; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[0:1] offset:96 -; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[0:1] offset:80 -; GFX942-NEXT: global_load_dwordx4 v[48:51], v1, s[0:1] offset:64 -; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[0:1] offset:48 -; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[0:1] offset:32 -; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[0:1] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[0:1] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 +; GFX942-NEXT: 
global_load_dwordx4 v[30:33], v1, s[0:1] offset:240 +; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[0:1] offset:224 +; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[0:1] offset:208 +; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[0:1] offset:192 +; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[0:1] offset:176 +; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[0:1] offset:160 +; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[0:1] offset:144 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[0:1] offset:128 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[0:1] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[0:1] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[0:1] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[0:1] offset:64 +; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[0:1] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[0:1] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[0:1] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[0:1] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v62 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB6_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:240 -; GFX942-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:224 -; GFX942-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:208 -; GFX942-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:192 -; GFX942-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:176 -; GFX942-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:160 -; GFX942-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:144 -; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:128 -; GFX942-NEXT: global_load_dwordx4 v[60:63], v1, s[2:3] offset:112 -; GFX942-NEXT: global_load_dwordx4 v[56:59], v1, s[2:3] offset:96 -; GFX942-NEXT: global_load_dwordx4 v[52:55], v1, s[2:3] offset:80 -; GFX942-NEXT: global_load_dwordx4 v[48:51], 
v1, s[2:3] offset:64 -; GFX942-NEXT: global_load_dwordx4 v[44:47], v1, s[2:3] offset:48 -; GFX942-NEXT: global_load_dwordx4 v[40:43], v1, s[2:3] offset:32 -; GFX942-NEXT: global_load_dwordx4 v[36:39], v1, s[2:3] offset:16 -; GFX942-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3] +; GFX942-NEXT: global_load_dwordx4 v[30:33], v1, s[2:3] offset:240 +; GFX942-NEXT: global_load_dwordx4 v[26:29], v1, s[2:3] offset:224 +; GFX942-NEXT: global_load_dwordx4 v[22:25], v1, s[2:3] offset:208 +; GFX942-NEXT: global_load_dwordx4 v[18:21], v1, s[2:3] offset:192 +; GFX942-NEXT: global_load_dwordx4 v[14:17], v1, s[2:3] offset:176 +; GFX942-NEXT: global_load_dwordx4 v[10:13], v1, s[2:3] offset:160 +; GFX942-NEXT: global_load_dwordx4 v[6:9], v1, s[2:3] offset:144 +; GFX942-NEXT: global_load_dwordx4 v[2:5], v1, s[2:3] offset:128 +; GFX942-NEXT: global_load_dwordx4 a[0:3], v1, s[2:3] offset:112 +; GFX942-NEXT: global_load_dwordx4 v[58:61], v1, s[2:3] offset:96 +; GFX942-NEXT: global_load_dwordx4 v[54:57], v1, s[2:3] offset:80 +; GFX942-NEXT: global_load_dwordx4 v[50:53], v1, s[2:3] offset:64 +; GFX942-NEXT: global_load_dwordx4 v[46:49], v1, s[2:3] offset:48 +; GFX942-NEXT: global_load_dwordx4 v[42:45], v1, s[2:3] offset:32 +; GFX942-NEXT: global_load_dwordx4 v[38:41], v1, s[2:3] offset:16 +; GFX942-NEXT: global_load_dwordx4 v[34:37], v1, s[2:3] ; GFX942-NEXT: .LBB6_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[60:63], s[6:7] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:112 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[56:59], s[6:7] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, v[58:61], s[6:7] offset:96 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[52:55], s[6:7] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, v[54:57], s[6:7] offset:80 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 
v0, v[48:51], s[6:7] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, v[50:53], s[6:7] offset:64 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[44:47], s[6:7] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, v[46:49], s[6:7] offset:48 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[40:43], s[6:7] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, v[42:45], s[6:7] offset:32 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[36:39], s[6:7] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, v[38:41], s[6:7] offset:16 ; GFX942-NEXT: s_waitcnt vmcnt(7) -; GFX942-NEXT: global_store_dwordx4 v0, v[32:35], s[6:7] -; GFX942-NEXT: global_store_dwordx4 v0, v[28:31], s[6:7] offset:240 -; GFX942-NEXT: global_store_dwordx4 v0, v[24:27], s[6:7] offset:224 -; GFX942-NEXT: global_store_dwordx4 v0, v[20:23], s[6:7] offset:208 -; GFX942-NEXT: global_store_dwordx4 v0, v[16:19], s[6:7] offset:192 -; GFX942-NEXT: global_store_dwordx4 v0, v[12:15], s[6:7] offset:176 -; GFX942-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] offset:160 -; GFX942-NEXT: global_store_dwordx4 v0, v[4:7], s[6:7] offset:144 -; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] offset:128 +; GFX942-NEXT: global_store_dwordx4 v0, v[34:37], s[6:7] +; GFX942-NEXT: global_store_dwordx4 v0, v[30:33], s[6:7] offset:240 +; GFX942-NEXT: global_store_dwordx4 v0, v[26:29], s[6:7] offset:224 +; GFX942-NEXT: global_store_dwordx4 v0, v[22:25], s[6:7] offset:208 +; GFX942-NEXT: global_store_dwordx4 v0, v[18:21], s[6:7] offset:192 +; GFX942-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7] offset:176 +; GFX942-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] offset:160 +; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] offset:144 +; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[6:7] offset:128 ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -391,17 +391,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr 
addrspace(1) %src1, ptr addrspace( ; GFX942-LABEL: v8i8_phi_chain: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9] ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB8_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2 +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] @@ -410,14 +410,14 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB8_4 ; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[12:13] ; GFX942-NEXT: .LBB8_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -447,18 +447,18 @@ define amdgpu_kernel void 
@v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX942-LABEL: v8i8_phi_zeroinit: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2 +; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9] -; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB9_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4 +; GFX942-NEXT: global_load_dwordx2 v[4:5], v3, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2 ; GFX942-NEXT: s_waitcnt vmcnt(1) ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -470,15 +470,15 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] ; GFX942-NEXT: s_cbranch_execz .LBB9_4 ; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-NEXT: .LBB9_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 
@llvm.amdgcn.workitem.id.x() @@ -618,30 +618,30 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX942-LABEL: v8i8_multi_block: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5 +; GFX942-NEXT: v_and_b32_e32 v3, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v4, 3, v3 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_4 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5 +; GFX942-NEXT: global_load_dwordx2 v[6:7], v4, s[10:11] +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v3 ; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX942-NEXT: s_cbranch_execz .LBB11_3 ; GFX942-NEXT: ; %bb.2: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: global_store_dwordx2 v5, v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: global_store_dwordx2 v3, v[0:1], s[12:13] ; GFX942-NEXT: .LBB11_3: ; %Flow ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX942-NEXT: .LBB11_4: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX942-NEXT: global_store_dwordx2 v2, v[6:7], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -860,15 +860,15 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942: ; %bb.0: ; %entry ; 
GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[8:9] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB14_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[10:11] ; GFX942-NEXT: .LBB14_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0 @@ -879,9 +879,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1) ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 -; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[12:13] +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -910,15 +910,15 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24 ; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v4 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[36:37] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37] ; GFX942-NEXT: 
s_and_saveexec_b64 s[0:1], vcc ; GFX942-NEXT: s_cbranch_execz .LBB15_2 ; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[38:39] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39] ; GFX942-NEXT: .LBB15_2: ; %bb.2 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0 @@ -958,18 +958,18 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[0:1], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 +; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 2 -; GFX942-NEXT: global_store_dwordx4 v2, a[28:31], s[40:41] offset:112 -; GFX942-NEXT: global_store_dwordx4 v2, a[24:27], s[40:41] offset:96 -; GFX942-NEXT: global_store_dwordx4 v2, a[20:23], s[40:41] offset:80 -; GFX942-NEXT: global_store_dwordx4 v2, a[16:19], s[40:41] offset:64 -; GFX942-NEXT: global_store_dwordx4 v2, a[12:15], s[40:41] offset:48 -; GFX942-NEXT: global_store_dwordx4 v2, a[8:11], s[40:41] offset:32 -; GFX942-NEXT: global_store_dwordx4 v2, a[4:7], s[40:41] offset:16 -; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[40:41] +; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112 +; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96 +; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80 +; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64 +; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48 +; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32 +; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16 +; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git 
a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 03bf1d62a1c74..77d1e6c2593c1 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -261,13 +261,13 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) { ; GFX11-TRUE16-LABEL: widen_f16_constant_load: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_f16_e64 v2.l, s0, 4.0 -; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, 4.0 +; GFX11-TRUE16-NEXT: global_store_b16 v[1:2], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: widen_f16_constant_load: @@ -393,18 +393,16 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4) ; GFX11-TRUE16-LABEL: no_widen_i16_constant_divergent_load: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x3e7, v0.l -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, 4 -; 
GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x3e7, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, 4 +; GFX11-TRUE16-NEXT: global_store_b16 v[1:2], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: no_widen_i16_constant_divergent_load: From dc9323b522ebacc133b529bad1e2609c646c3b90 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 2 Jul 2025 15:06:34 -0700 Subject: [PATCH 4/5] Address Review comments Change-Id: I7df86a09024b78c48e728119d42c2d3d812bbebd --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 42 +++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index ad09a3383a04b..9367fddd8a6dc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -110,9 +110,21 @@ class SIRegisterClass rTypes, int Align, dag rList> let TSFlags{3} = HasAGPR; let TSFlags{4} = HasSGPR; - // RegisterClass (e.g. AGPR / VGPR) priority for allocation - field int RegClassPriority = 1; - field int RegClassBit = 5; + // RA will use RegisterClass AllocationPriority amongst other info (e.g. ordering in the basic block) + // to decide which registers to try to assign first. Usually, this RegisterClass priority is given + // very high priority, if not the highest priority, when considering which VirtReg to allocate next. + // + // We have 5 bits to assign AllocationPriorities to RegisterClasses. Generally, it is beneficial to + // assign more constrained RegisterClasses first. As a result, we prioritize larger register classes + // over smaller register classes. + // + // The interesting case is the vector register case on architectures which have ARegs, VRegs, AVRegs. + // In this case, we would like to assign ARegs and VRegs before AVRegs, as AVRegs are less constrained + // and can be assigned to both AGPRs and VGPRs. 
We use the 5th bit to encode this into the + // RegisterClass AllocationPriority. BaseClassPriority is used to turn the bit on, and BaseClassScaleFactor + // is used for scaling of the bit (i.e. 1 << 4). + field int BaseClassPriority = 1; + field int BaseClassScaleFactor = 16; } @@ -576,7 +588,7 @@ let HasVGPR = 1 in { def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, (add (interleave (sequence "VGPR%u_LO16", 0, 255), (sequence "VGPR%u_HI16", 0, 255)))> { - let AllocationPriority = !add(2, !mul(RegClassPriority, !shl(1, RegClassBit))); + let AllocationPriority = !add(2, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 16; let GeneratePressureSet = 0; @@ -602,7 +614,7 @@ def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16, // i16/f16 only on VI+ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = !add(0, !mul(RegClassPriority, !shl(1, RegClassBit))); + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let Size = 32; let Weight = 1; let BaseClassOrder = 32; @@ -611,7 +623,7 @@ def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types // Identical to VGPR_32 except it only contains the low 128 (Lo128) registers. def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 127))> { - let AllocationPriority = !add(0, !mul(RegClassPriority, !shl(1, RegClassBit))); + let AllocationPriority = !add(0, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1; @@ -941,15 +953,23 @@ class VRegClassBase regTypes, dag regList> : // Requires n v_mov_b32 to copy let CopyCost = numRegs; + + // Since we only have 5 bits for the RegisterClass Allocation Priority, and since we use the + // 5th bit for BaseClassPriority, we need to encode the SizePriority into 4 bits.
As a result + // of this encoding, for registers with numRegs 15 or 16, we give SizePriority of 14, and for + // registers with numRegs 17+ we give SizePriority of 15. In practice, there is only one + // RegClass per Vector Register type in each of these groups (i.e. numRegs = 15,16 : {VReg_512}, + // and numRegs = 17+ : {VReg_1024}). Therefore, we have not lost any info by compressing. defvar SizePrioriity = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15)); - let AllocationPriority = !add(SizePrioriity, !mul(RegClassPriority, !shl(1, RegClassBit))); + + let AllocationPriority = !add(SizePrioriity, !mul(BaseClassPriority, BaseClassScaleFactor)); let Weight = numRegs; } // Define a register tuple class, along with one requiring an even // aligned base register. multiclass VRegClass regTypes, dag regList> { - let HasVGPR = 1, RegClassPriority = 1 in { + let HasVGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase { let BaseClassOrder = !mul(numRegs, 32); @@ -983,7 +1003,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>; } multiclass ARegClass regTypes, dag regList> { - let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, RegClassPriority = 1 in { + let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, BaseClassPriority = 1 in { // Define the regular class. def "" : VRegClassBase { let BaseClassOrder = !mul(numRegs, 32); @@ -1068,7 +1088,7 @@ def VS_64 : SIRegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add VReg_64, SReg_6 def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> { let HasVGPR = 1; let HasAGPR = 1; - let RegClassPriority = 0; + let BaseClassPriority = 0; let Size = 32; } } // End GeneratePressureSet = 0 @@ -1077,7 +1097,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3 // aligned base register.
multiclass AVRegClass regTypes, dag vregList, dag aregList> { - let HasVGPR = 1, HasAGPR = 1, RegClassPriority = 0 in { + let HasVGPR = 1, HasAGPR = 1, BaseClassPriority = 0 in { // Define the regular class. def "" : VRegClassBase; From 419e53e76583b1c6c33ea237c6af19ecc0956074 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 2 Jul 2025 15:10:33 -0700 Subject: [PATCH 5/5] Delete accidental change Change-Id: Iaaa9a2317a2dc67f35d8664735c0dd9b2056314f --- llvm/lib/CodeGen/RegAllocGreedy.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index f1eba3c52a764..fa384b296f2e6 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2290,7 +2290,6 @@ MCRegister RAGreedy::selectOrSplit(const LiveInterval &VirtReg, LLVMContext &Ctx = MF->getFunction().getContext(); SmallVirtRegSet FixedRegisters; RecoloringStack RecolorStack; - MCRegister Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters, RecolorStack); if (Reg == ~0U && (CutOffInfo != CO_None)) {