diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 11017fe4e01b4..ab2d1b4e7eaca 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -971,6 +971,10 @@ Syntax:
   declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
   declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+  declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
   declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
   declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
 
@@ -983,7 +987,10 @@ The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsics
 correspond to the '``prefetch.*``' and '``prefetchu.*``' family of PTX instructions.
 The '``prefetch.*``' instructions bring the cache line containing the
 specified address in the global or local memory address space into the
-specified cache level (L1 or L2). The '``prefetchu.*``' instruction brings the cache line
+specified cache level (L1 or L2). If the '``.tensormap``' qualifier is specified, the
+prefetch instruction brings the cache line containing the specified address in the
+'``.const``' or '``.param``' state space for subsequent use by the '``cp.async.bulk.tensor``'
+instruction. The '``prefetchu.*``' instruction brings the cache line
 containing the specified generic address into the specified uniform cache level.
 If no address space is specified, the address is assumed to be generic. The intrinsic
 uses an eviction priority, which can be specified by the '``.level::eviction_priority``' modifier.
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 0375f29ad8906..303a807f408cf 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -137,6 +137,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty : LLVMQualPointerType<3>;  // (shared)ptr
+def llvm_constant_ptr_ty: LLVMQualPointerType<4>;  // (const)ptr
 def llvm_local_ptr_ty : LLVMQualPointerType<5>;  // (local)ptr
 def llvm_tmem_ptr_ty : LLVMQualPointerType<6>;  // (tensor memory)ptr
 def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>;  // (shared_cluster)ptr
@@ -2087,15 +2088,17 @@ foreach dim = 1...5 in {
 // Intrinsics for Prefetch and Prefetchu
 let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in {
   foreach level = ["L1", "L2"] in {
-    def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>;
-    def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>;
-    def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>;
+    def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
+    def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>;
   }
 
+  def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;
+
   foreach eviction_priority = ["evict_normal", "evict_last"] in
-    def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>;
+    def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
 
-  def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>;
+  def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
 }
 
 // applypriority
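A note on the definition above: `int_nvvm_prefetch_tensormap` takes `llvm_anyptr_ty`, so it is overloaded on the pointer's address space and the mangled suffix (`.p0`, `.p4`, `.p101`) follows the operand. A minimal IR sketch of how a front end might use the overloads (the global and function names here are illustrative, not part of this patch):

```llvm
; Prefetch a 128-byte tensormap held in .const space, then one passed
; through the kernel parameter space, using the overloads defined above.
@tmap = external addrspace(4) global [128 x i8]

define void @prefetch_tensormaps(ptr addrspace(101) %param_tmap) {
  call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @tmap)
  call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_tmap)
  ret void
}

declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))
```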
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d9192fbfceff1..d6c9bff9dc42b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3973,6 +3973,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
     return true;
   }
 
+  case Intrinsic::nvvm_prefetch_tensormap: {
+    auto &DL = I.getDataLayout();
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = getPointerTy(DL);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.flags =
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
+    Info.align.reset();
+    return true;
+  }
+
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_p: {
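The `getTgtMemIntrinsic` entry above attaches a MachineMemOperand (a dereferenceable pointer-width load) to the call, so instruction selection treats the prefetch as a read of its operand rather than as an opaque side effect. A small sketch of the kind of IR where that matters, with illustrative names:

```llvm
; Because the prefetch carries a load memory operand on %tmap, a prior
; store to the same location stays ordered before it during selection.
define void @update_then_prefetch(ptr %tmap, i64 %dim0) {
  store i64 %dim0, ptr %tmap
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %tmap)
  ret void
}

declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
```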
PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>; +def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>; +def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>; +def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>; +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal", + int_nvvm_prefetch_global_L2_evict_normal>; +def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last", + int_nvvm_prefetch_global_L2_evict_last>; //Applypriority intrinsics class APPLYPRIORITY_L2_INTRS : diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3ae2d9d5181a3..28cb4ad17580f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl &OpIndexes, case Intrinsic::nvvm_isspacep_global: case Intrinsic::nvvm_isspacep_local: case Intrinsic::nvvm_isspacep_shared: - case Intrinsic::nvvm_isspacep_shared_cluster: { + case Intrinsic::nvvm_isspacep_shared_cluster: + case Intrinsic::nvvm_prefetch_tensormap: { OpIndexes.push_back(0); return true; } @@ -587,6 +588,15 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return ConstantInt::get(II->getType(), *R); return nullptr; } + case Intrinsic::nvvm_prefetch_tensormap: { + IRBuilder<> Builder(II); + Module *M = II->getModule(); + Function *NewIntrinsic = Intrinsic::getDeclaration( + M, Intrinsic::nvvm_prefetch_tensormap, {NewV->getType()}); + CallInst *NewCall = Builder.CreateCall(NewIntrinsic, {NewV}); + NewCall->setAttributes(II->getAttributes()); + return NewCall; + } } return nullptr; } diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll index a64e4fe7a508e..862e26d704679 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch.ll @@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr) declare void @llvm.nvvm.prefetch.L1(ptr %ptr) declare void @llvm.nvvm.prefetch.L2(ptr %ptr) +declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr) +declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr) +declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr) + declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr) declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr) @@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) { ; CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr) ret void +} + +define void @prefetch_tensormap(ptr %ptr) { +; CHECK-PTX64-LABEL: prefetch_tensormap( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_tensormap_param_0]; +; CHECK-PTX64-NEXT: prefetch.tensormap [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr) + ret void +} + +define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) { +; CHECK-PTX64-LABEL: prefetch_const_tensormap( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_const_tensormap_param_0]; +; CHECK-PTX64-NEXT: prefetch.const.tensormap [%rd1]; +; 
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
index a64e4fe7a508e..862e26d704679 100644
--- a/llvm/test/CodeGen/NVPTX/prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr)
 declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
 declare void @llvm.nvvm.prefetch.L2(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+
 declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
 declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)
 
@@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) {
 ; CHECK-PTX64-NEXT:    ret;
   tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
   ret void
+}
+
+define void @prefetch_tensormap(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_tensormap(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_tensormap_param_0];
+; CHECK-PTX64-NEXT:    prefetch.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
+  ret void
+}
+
+define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
+; CHECK-PTX64-LABEL: prefetch_const_tensormap(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
+; CHECK-PTX64-NEXT:    prefetch.const.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
+  ret void
+}
+
+define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
+; CHECK-PTX64-LABEL: prefetch_param_tensormap(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
+; CHECK-PTX64-NEXT:    prefetch.param.tensormap [%rd1];
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
+  ret void
 }
\ No newline at end of file