MetalPerformancePrimitives iOS xcode26.0 b1
# MetalPerformancePrimitives.framework
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h 1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h 2025-05-28 02:28:43
@@ -0,0 +1,177 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsConvolution2d -----------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsConvolution2d__
+#define __MetalTensorOpsConvolution2d__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+#include "__impl/MPPTensorOpsBase.h"
+#include "__impl/MPPTensorOpsUtility.h"
+
+#pragma METAL internals : enable
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+// This API performs the 2d convolutions that occur in convolutional neural
+// networks. "2d" here refers to the two spatial dimensions, width x height,
+// even though the tensors consumed by this op are 4D. The sources/inputs to
+// the op's run method are:
+// Activation tensor with named NHWC layout
+//   N = batch (slowest moving dimension)
+//   H = height
+//   W = width
+//   C = input channels (fastest moving dimension)
+// Weights tensor with named HWIO layout
+//   H = kernel height
+//   W = kernel width
+//   I = input channels
+//   O = output channels
+// Destination tensor with NHWO layout
+//   N = batch (slowest moving dimension)
+//   H = height
+//   W = width
+//   O = output channels (fastest moving dimension)
+//
+// The destination can also be a cooperative tensor. See TensorOpsMatMul2d.h
+// for details on how to use a cooperative tensor, for example for bias add
+// and applying an activation before writing out the result. Currently the
+// only scope supported by the convolution2d op is the full threadgroup. See
+// TensorOpsMatMul2d.h for details on the scopes supported by tensor ops.
+
+enum class convolution2d_activation_layout
+{
+ nhwc,
+};
+
+enum class convolution2d_weights_layout
+{
+ hwio,
+};
+
+struct convolution2d_descriptor
+{
+ enum class mode
+ {
+ multiply,
+ multiply_accumulate,
+ };
+
+ // for nhwc, .x = output channel, .y = destination width, .z = destination
+ // height, .w = batch size
+ int4 destination_dimensions;
+ int4 source_dimensions;
+ int2 kernel_dimensions;
+ convolution2d_activation_layout activation_layout;
+ convolution2d_weights_layout weights_layout;
+ int2 strides;
+ int2 dilations;
+ int groups;
+ bool relaxed_precision;
+ mode conv2d_mode;
+
+ constexpr convolution2d_descriptor(
+ int4 _destination_dimensions, int4 _source_dimensions,
+ int2 _kernel_dimensions,
+ convolution2d_activation_layout _activation_layout =
+ convolution2d_activation_layout::nhwc,
+ convolution2d_weights_layout _weights_layout =
+ convolution2d_weights_layout::hwio,
+ int2 _strides = int2(1, 1), int2 _dilations = int2(1, 1), int _groups = 1,
+ bool _relaxed_precision = false,
+ mode _convolution2d_mode = mode::multiply) thread
+ : destination_dimensions(_destination_dimensions),
+ source_dimensions(_source_dimensions),
+ kernel_dimensions(_kernel_dimensions),
+ activation_layout(_activation_layout),
+ weights_layout(_weights_layout),
+ strides(_strides),
+ dilations(_dilations),
+ groups(_groups),
+ relaxed_precision(_relaxed_precision),
+ conv2d_mode(_convolution2d_mode)
+ {
+ }
+};
+
+enum class convolution2d_cooperative_operand
+{
+ destination,
+};
+
+#include "__impl/MPPTensorOpsConvolution2dImpl.h"
+
+template <convolution2d_descriptor Descriptor, typename Scope,
+ typename... ConvArgs>
+class convolution2d : __tensor_ops_detail::op
+{
+ static_assert(Descriptor.activation_layout ==
+ convolution2d_activation_layout::nhwc,
+ "only nhwc activation layout supported currently");
+ static_assert(Descriptor.weights_layout == convolution2d_weights_layout::hwio,
+ "only hwio weights layout supported currently");
+ static_assert(Descriptor.groups == 1,
+ "only group size 1 supported currently");
+
+private:
+ thread int2 __offset;
+
+public:
+ convolution2d() thread : __offset(0)
+ {
+ }
+
+ void set_offsets(int2 o) thread
+ {
+ __offset = o;
+ }
+
+ template <typename ActivationTensorType, typename WeightsTensorType,
+ typename DestinationTensorType, typename... RunArgs>
+ INLINE void run(thread ActivationTensorType &activation,
+ thread WeightsTensorType &weights,
+ thread DestinationTensorType &destination) const thread
+ {
+ convolution2d_descriptor d = Descriptor;
+ __convolution2d_detail::__run<Scope, ActivationTensorType,
+ WeightsTensorType, DestinationTensorType,
+ RunArgs...>(activation, weights, destination,
+ d, __offset);
+ }
+
+ template <typename ActivationOperandType, typename WeightsOperandType,
+ typename ElementType, typename CoordType = int,
+ typename... CoopArgs>
+ using cooperative_tensor_destination_t =
+ __convolution2d_detail::__cooperative_tensor_destination_t<
+ Descriptor, Scope, ActivationOperandType, WeightsOperandType,
+ ElementType, CoordType, CoopArgs...>;
+
+ template <typename ActivationOperandType, typename WeightsOperandType,
+ typename ElementType, typename CoordType = int,
+ typename... CoopArgs>
+ INLINE cooperative_tensor_destination_t<ActivationOperandType,
+ WeightsOperandType, ElementType,
+ CoordType, CoopArgs...>
+ get_destination_cooperative_tensor() const thread
+ {
+ return __convolution2d_detail::__get_destination_cooperative_tensor<
+ Descriptor, Scope, ActivationOperandType, WeightsOperandType,
+ ElementType, CoordType, CoopArgs...>();
+ }
+};
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#pragma METAL internals : disable
+
+#endif
+
+#endif // __MetalTensorOpsConvolution2d__
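
The convolution header above documents the tensor layouts and descriptor but ships no usage example. Below is a minimal sketch of what a kernel using this API might look like, based only on the declarations above. The `opscope_simdgroups<4>` scope (spelled to match the matmul2d examples further down), the 4D `tensor_handle` argument types, and all tile sizes are illustrative assumptions, not values taken from the header.

```metal
// Hypothetical sketch, not from the SDK headers.
#include <MetalPerformancePrimitives/MPPTensorOpsConvolution2d.h>

using namespace mpp::tensor_ops;

// Per the descriptor comment in the header, int4 packs dimensions
// fastest-first: destination = (O, W, H, N), source = (C, W, H, N).
// These sizes are made up for illustration (10x10 input, 3x3 kernel,
// stride 1 -> 8x8 output).
constexpr auto convDesc = convolution2d_descriptor(
    int4(32, 8, 8, 1),   // destination: O=32, W=8,  H=8,  N=1
    int4(16, 10, 10, 1), // source:      C=16, W=10, H=10, N=1
    int2(3, 3));         // kernel: 3x3; strides/dilations default to 1

kernel void simpleConv2d(
    tensor<device half, dextents<int32_t, 4>, tensor_handle> activation,
    tensor<device half, dextents<int32_t, 4>, tensor_handle> weights,
    tensor<device half, dextents<int32_t, 4>, tensor_handle> destination)
{
    // The header notes the full threadgroup is currently the only scope
    // convolution2d supports; 4 SIMD-groups here assumes a matching
    // 4-SIMD-group dispatch.
    convolution2d<convDesc, opscope_simdgroups<4>> convOp;
    convOp.run(activation, weights, destination);
}
```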
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2025-05-26 00:45:42
@@ -0,0 +1,495 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsMatMul2d ----------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+// This API performs the generalized matrix multiplication operation
+// C = A*B + C;
+// A and B can be tensor_handle, tensor_offset, or tensor_inline.
+// C can be tensor_handle, tensor_offset, tensor_inline, or cooperative_tensor.
+// The data type combinations supported by this operation are as follows:
+// A B C
+// int8_t int8_t int32_t
+// int8_t int8_t float
+// int8_t int8_t half
+// uint8_t int8_t int32_t
+// uint8_t int8_t float
+// uint8_t int8_t half
+// int8_t uint8_t int32_t
+// int8_t uint8_t float
+// int8_t uint8_t half
+// uint8_t uint8_t int32_t
+// uint8_t uint8_t float
+// uint8_t uint8_t half
+// half half float
+// half half half
+//
+// Basic usage is shown in the following example, which takes an M x K matrix
+// A of type half and a K x N matrix B of type half, both in device memory,
+// and produces an M x N matrix C of type float in device memory. It tiles
+// this matrix multiplication across threadgroups, where each threadgroup
+// computes a 64 x 32 tile of the output by multiplying a 64 x K tile of A
+// with a K x 32 tile of B. This compute kernel will be launched with a
+// dispatch grid of
+//
+// MTLSize threadgroups = MTLSizeMake((N + 31)/32, (M + 63)/64, 1);
+//
+// It uses 4 SIMD-groups per threadgroup.
+// The way to dispatch this compute kernel is
+//
+// id<MTLComputePipelineState> state = [device newComputePipelineState:...];
+// NSUInteger simdgroupWidth = [state threadExecutionWidth];
+// ...
+// [encoder dispatchThreadgroups:threadgroups
+//         threadsPerThreadgroup:MTLSizeMake(simdgroupWidth*4, 1, 1)];
+//
+// kernel void simpleMatMul(tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
+//                          tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
+//                          tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//   // descriptor to create a matmul operation that multiplies a 64xK tile
+//   // by a Kx32 tile, producing a 64x32 tile
+//   constexpr auto matmulDescriptor = matmul2d_descriptor(
+//       64,    // m, outer dimension of the local tile
+//       32,    // n, outer dimension of the local tile
+//       0,     // k, inner dimension. 0 means the operation will read K from
+//              // the input tensor:
+//              // K = A.extents().extent(0) or B.extents().extent(1) for NN
+//              // K = A.extents().extent(0) or B.extents().extent(0) for NT
+//              // and so on..
+//       false, // transpose_left = false for NN and NT, true for TN and TT
+//       false, // transpose_right = false for NN and TN, true for NT and TT
+//       false  // relaxed_precision = false; set it to true to allow the
+//              // implementation to sacrifice accuracy for performance
+//   );
+//
+//   // create the matmul op from the above descriptor with 4 SIMD-groups.
+//   // All 4 SIMD-groups in this threadgroup will execute this matmul
+//   // cooperatively. More on this scope below.
+//   matmul2d<matmulDescriptor, opscope_simdgroups<4>> matmulOp;
+//
+//   // The following three lines of code create the appropriate slice for
+//   // this threadgroup to work on.
+//   // E.g. A.offset below creates a tensor<device half, dextents<int32_t, 2>, tensor_offset>
+//   // which has the same extents as the original tensor A but its origin
+//   // shifted to (0, tgid.y*64), i.e.
+//   // mA[x,y] == A[x, tgid.y*64 + y]
+//
+//   auto mA = A.offset(0, tgid.y*64);
+//   auto mB = B.offset(tgid.x*32, 0);
+//   auto mC = C.offset(tgid.x*32, tgid.y*64);
+//
+//   // execute the operation. Assumes C is initialized to zero.
+//   matmulOp.run(mA, mB, mC);
+// }
+//
+// The above matrix multiplication implementation will do edge checking for
+// all threadgroups against the extents of the original tensor, although for
+// large enough matrices most threadgroups will be working on "inside" tiles,
+// requiring no bounds check. In high performance code we can avoid edge
+// checking for inside threadgroups and get better performance:
+//
+// kernel void matMul(tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
+//                    tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
+//                    tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                    constant uint& M, constant uint& N, constant uint& K,
+//                    uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//   // descriptor to create a matmul operation that multiplies a 64xK tile
+//   // by a Kx32 tile, producing a 64x32 tile
+//   constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+//                                                         32,
+//                                                         0,
+//                                                         false,
+//                                                         false,
+//                                                         false);
+//
+//   matmul2d<matmulDescriptor, opscope_simdgroups<4>> matmulOp;
+//
+//   // Tile lies fully inside the matrix in both outer dimensions M and N.
+//   if (tgid.y*64 + 63 < M && tgid.x*32 + 31 < N)
+//   {
+//     auto tA = A.static_slice<dynamic_extent, 64>(0, tgid.y*64);
+//     auto tB = B.static_slice<32, dynamic_extent>(tgid.x*32, 0);
+//     auto tC = C.static_slice<32, 64>(tgid.x*32, tgid.y*64);
+//
+//     matmulOp.run(tA, tB, tC);
+//   }
+//   else
+//   {
+//     auto tA = A.offset(0, tgid.y*64);
+//     auto tB = B.offset(tgid.x*32, 0);
+//     auto tC = C.offset(tgid.x*32, tgid.y*64);
+//
+//     matmulOp.run(tA, tB, tC);
+//   }
+// }
+//
+// The user can also take ownership of the loop over the reduction or
+// k-dimension by choosing an appropriate chunk size in k (called k-tile or
+// tilek). For the following example, we choose 16.
+//
+// kernel void matMulKLoop(tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
+//                         tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
+//                         tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                         constant uint& M, constant uint& N, constant uint& K,
+//                         uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//   // descriptor to create a matmul operation that multiplies 64x16 tiles
+//   // by 16x32 tiles, producing a 64x32 tile
+//   constexpr auto matmulDescriptor = matmul2d_descriptor(
+//       64,
+//       32,
+//       16,    // tilek = 16, we loop over K in chunks of 16 rather than
+//              // letting the matmul op's run method loop over K internally,
+//              // choosing tileK itself
+//       false,
+//       false,
+//       false);
+//
+//   matmul2d<matmulDescriptor, opscope_simdgroups<4>> matmulOp;
+//
+//   constexpr int tilek = 16;
+//
+//   // Tile lies fully inside the matrix in both outer dimensions M and N.
+//   if (tgid.y*64 + 63 < M && tgid.x*32 + 31 < N)
+//   {
+//     auto tC = C.static_slice<32, 64>(tgid.x*32, tgid.y*64);
+//     int k = 0;
+//     for (; k + tilek - 1 < K; k += tilek)
+//     {
+//       auto tA = A.static_slice<tilek, 64>(k, tgid.y*64);
+//       auto tB = B.static_slice<32, tilek>(tgid.x*32, k);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+//
+//     // handle the K remainder with dynamic extents
+//     auto tA = A.static_slice<dynamic_extent, 64>(k, tgid.y*64);
+//     auto tB = B.static_slice<32, dynamic_extent>(tgid.x*32, k);
+//     matmulOp.run(tA, tB, tC);
+//   }
+//   else
+//   {
+//     auto tA = A.offset(0, tgid.y*64);
+//     auto tB = B.offset(tgid.x*32, 0);
+//     auto tC = C.offset(tgid.x*32, tgid.y*64);
+//
+//     matmulOp.run(tA, tB, tC);
+//   }
+// }
+//
+// Oftentimes, we need to do some post processing on computed results before
+// storing them to device or threadgroup memory. For example, in machine
+// learning we need to apply an activation function to the computed value. One
+// can do GEMM as above, which writes the result to device memory, read the
+// value back, call the post processing function, and write again. This wastes
+// bandwidth, performance, and power. The user can instead apply post
+// processing in-register, where the GEMM output is computed using a
+// cooperative_tensor. Unlike tensor_handle, tensor_offset, and tensor_inline,
+// which are non-owning (they are wrappers around a resource in the device,
+// threadgroup, or thread address space), a cooperative_tensor owns thread
+// private data and divides the data of the entire tensor among the threads
+// (participating in the scope of the operation) in an implementation defined
+// manner. This thread private memory is allocated at construction of the
+// cooperative_tensor and deallocated when the cooperative_tensor goes out of
+// scope. The layout of a cooperative_tensor depends on the operation, the
+// data type, and the number of threads in the opscope with which the op was
+// created. Note that a cooperative_tensor created from an op is only valid
+// for threads that are part of the opscope on which the op was created.
+// Though the layout of a cooperative_tensor is implementation defined, we
+// provide accessor functions, as shown in the example below.
+//
+// kernel void simpleMatMulCooperative(tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
+//                                     tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
+//                                     tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                                     tensor<device half, dextents<int32_t, 2>, tensor_handle> bias,
+//                                     constant uint& M, constant uint& N, constant uint& K,
+//                                     uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//   constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+//                                                         32,
+//                                                         0,
+//                                                         false,
+//                                                         false,
+//                                                         false);
+//
+//   matmul2d<matmulDescriptor, opscope_simdgroups<4>> matmulOp;
+//
+//   auto mA = A.offset(0, tgid.y*64);
+//   auto mB = B.offset(tgid.x*32, 0);
+//   auto mC = C.offset(tgid.x*32, tgid.y*64);
+//
+//   // This creates a cooperative destination tensor of float element type.
+//   // Since the matmul op above was created with 4 SIMD-groups, the
+//   // cooperative tensor will divide its data among the threads of these
+//   // 4 SIMD-groups. The layout of data among lanes is implementation
+//   // defined, and not all threads, and not even all elements within a
+//   // thread, need be valid. We provide the valid-element check shown below,
+//   // which developers should use to guard their accesses to elements of a
+//   // cooperative_tensor.
+//
+//   auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//
+//   // Loop over all the cooperative_tensor elements owned by "this" thread
+//   // and initialize them to zero.
+//   // It is imperative for performance to include the "unroll pragma" so the
+//   // compiler fully unrolls the loop.
+//
+//   #pragma unroll full
+//   for (uint16_t i = 0; i < cT.capacity(); ++i) {
+//     if (cT.mask(i))
+//       cT[i] = 0;
+//   }
+//
+//   // execute the operation. All threads compute the matmul cooperatively
+//   // and the results are written to the cooperative_tensor.
+//   matmulOp.run(mA, mB, cT);
+//
+//   // create a cooperative bias tensor with the same layout as the
+//   // destination cooperative_tensor of the matmul
+//   auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//
+//   // load data from the bias tensor_handle into the biasT
+//   // cooperative_tensor using the layout and distribution of elements among
+//   // the threads of the scope on which the matmul was created.
+//   biasT.load(bias);
+//
+//   #pragma unroll full
+//   for (uint16_t i = 0; i < cT.capacity(); ++i) {
+//     if (cT.mask(i)) {
+//       // add bias
+//       cT[i] += biasT[i];
+//
+//       // get the 2-dimensional local coordinate of this thread's i-th
+//       // element in the destination local coordinate system (in this
+//       // example, the 32 x 64 tile).
+//       auto idx = cT.multidimensional_indices(i);
+//       cT[i] = foo(cT[i], idx); // do some operation based on coordinate values
+//     }
+//   }
+//
+//   // store to the tensor handle
+//   cT.store(mC);
+// }
+//
+// Note on scope of operation
+// ==========================
+// A tensor operation may be executed on a single thread entirely or
+// cooperatively among a set of SIMD groups. We call this set of threads the
+// "execution scope" of the tensor operation. A tensor op must be created with
+// its execution scope provided as a template argument. All the threads in
+// this execution scope must enter the run method, i.e. calls to run methods
+// must be "execution scope" uniform. Use the following types to configure the
+// execution modes of each operation:
+// metal::execution_thread - the operation will be run on a single thread.
+//                           Fragment shaders only support this execution
+//                           scope.
+// metal::execution_simdgroup - the operation will be run cooperatively by all
+//                           threads in this SIMD group.
+//                           May be used for finer control over tiling by
+//                           slicing tensors with SIMD IDs.
+// opscope_simdgroups<N> - the operation will be executed cooperatively by N
+//                           SIMD groups.
+//                           Must be used when all threads in a threadgroup
+//                           are cooperatively performing the operation.
+// It is undefined behavior if the number of SIMD groups dispatched does not
+// match the number of SIMD groups that the operation was configured with.
+//
+// Even though each thread in the execution scope can potentially enter and
+// exit the run method independently, the developer cannot assume that threads
+// in the execution scope are working completely independently, i.e. the
+// tensor operation's run implementation may need (for correctness or
+// performance) to synchronize among the threads in the execution scope it was
+// created with.
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef __MetalTensorOpsMatMul2d__
+#define __MetalTensorOpsMatMul2d__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+#include "__impl/MPPTensorOpsBase.h"
+#include "__impl/MPPTensorOpsTypes.h"
+#include <metal_numeric>
+
+#pragma METAL internals : enable
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+enum class matmul2d_cooperative_operand_index
+{
+ destination,
+};
+
+enum class reduction_operation
+{
+ sum,
+ max,
+ min,
+};
+
+struct matmul2d_descriptor
+{
+ enum class mode
+ {
+ multiply,
+ multiply_accumulate,
+ };
+
+ int m, n, k;
+ bool transpose_left, transpose_right;
+ bool relaxed_precision;
+ mode matmul_mode;
+
+public:
+ constexpr matmul2d_descriptor(int __m, int __n, int __k = dynamic_length_v<int>,
+ bool __transpose_left = false,
+ bool __transpose_right = false,
+ bool __relaxed_precision = false,
+ mode __matmul_mode = mode::multiply) thread
+ : m(__m),
+ n(__n),
+ k(__k),
+ transpose_left(__transpose_left),
+ transpose_right(__transpose_right),
+ relaxed_precision(__relaxed_precision),
+ matmul_mode(__matmul_mode)
+ {
+ }
+};
+
+template <typename ElementType>
+struct reduction_operation_identity
+{
+ static const constant ElementType sum_identity = (ElementType)0;
+ static const constant ElementType max_identity =
+ metal::numeric_limits<ElementType>::lowest;
+ static const constant ElementType min_identity =
+ metal::numeric_limits<ElementType>::max;
+};
+
+#include "__impl/MPPTensorOpsMatMul2dImpl.h"
+
+template <matmul2d_descriptor Descriptor, typename Scope, class... Args>
+class matmul2d : __tensor_ops_detail::op
+{
+ static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<Scope>,
+ "Scope template argument should be of op_scope type");
+
+public:
+ matmul2d() thread = default;
+
+ template <
+ typename LeftOperandType, typename RightOperandType,
+ typename DestinationOperandType,
+ typename V = __tensor_ops_detail::__enable_if_t<
+ (__tensor_ops_detail::__is_tensor_type_v<LeftOperandType> &&
+ __tensor_ops_detail::__is_tensor_type_v<RightOperandType> &&
+ (__tensor_ops_detail::__is_tensor_type_v<DestinationOperandType> ||
+ __tensor_ops_detail::__is_cooperative_tensor_type_v<
+ DestinationOperandType>))>,
+ typename... RunArgs>
+ INLINE void run(thread LeftOperandType &left, thread RightOperandType &right,
+ thread DestinationOperandType &destination) thread const
+ {
+
+ __mutmul2d_detail::__run<Descriptor, Scope, LeftOperandType,
+ RightOperandType, DestinationOperandType,
+ RunArgs...>(left, right, destination);
+ }
+
+ template <typename ElementType, typename CoordType, typename... CoopArgs>
+ using cooperative_tensor_destination_t =
+ __mutmul2d_detail::__cooperative_tensor_destination_t<
+ Descriptor, Scope, ElementType, CoordType, CoopArgs...>;
+
+ template <typename LeftOperandType, typename RightOperandType,
+ typename ElementType, typename CoordType = int,
+ typename U = __tensor_ops_detail::__enable_if_t<
+ __tensor_ops_detail::__is_tensor_type_v<LeftOperandType> &&
+ __tensor_ops_detail::__is_tensor_type_v<RightOperandType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+ __tensor_ops_detail::__is_integral_v<CoordType>>,
+ typename... CoopArgs>
+ INLINE cooperative_tensor_destination_t<ElementType, CoordType, CoopArgs...>
+ get_destination_cooperative_tensor() thread const
+ {
+
+ return __mutmul2d_detail::__get_destination_cooperative_tensor<
+ Descriptor, Scope, ElementType, CoordType, LeftOperandType,
+ RightOperandType, CoopArgs...>();
+ }
+};
+
+template <class ElementType, class Extents, class Layout>
+inline void reduce_rows(
+ thread metal::cooperative_tensor<ElementType, Extents, Layout> &source,
+ thread metal::cooperative_tensor<ElementType, Extents, Layout> &destination,
+ reduction_operation op = reduction_operation::sum,
+ ElementType identity =
+ reduction_operation_identity<ElementType>::sum_identity)
+{
+ __mutmul2d_detail::__reduce_rows<ElementType, Extents, Layout>(
+ source, destination, identity, op);
+}
+
+template <class ElementType, class Extents, class Layout>
+inline void reduce_columns(
+ thread metal::cooperative_tensor<ElementType, Extents, Layout> &source,
+ thread metal::cooperative_tensor<ElementType, Extents, Layout> &destination,
+ reduction_operation op = reduction_operation::sum,
+ ElementType identity =
+ reduction_operation_identity<ElementType>::sum_identity)
+{
+ __mutmul2d_detail::__reduce_columns<ElementType, Extents, Layout>(
+ source, destination, identity, op);
+}
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#pragma METAL internals : disable
+
+#endif
+
+#endif // __MetalTensorOpsMatMul2d__
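
Unlike the matmul examples embedded in the comments above, `reduce_rows` and `reduce_columns` come with no worked example. Below is a minimal sketch that reuses the 64x32 tiling and `opscope_simdgroups<4>` scope from those examples. It assumes the cooperative tensor returned by `get_destination_cooperative_tensor` is a valid source and destination for the reduction (same element type, extents, and layout, per the `reduce_rows` signature); how the reduced values map onto the output slice on store is likewise an assumption of this sketch.

```metal
// Hypothetical sketch, not from the SDK headers.
#include <MetalPerformancePrimitives/MPPTensorOpsMatMul2d.h>

using namespace mpp::tensor_ops;

kernel void matMulRowSums(
    tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
    tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
    tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
    uint2 tgid [[threadgroup_position_in_grid]])
{
    constexpr auto desc = matmul2d_descriptor(64, 32, 0, false, false, false);
    matmul2d<desc, opscope_simdgroups<4>> matmulOp;

    auto mA = A.offset(0, tgid.y*64);
    auto mB = B.offset(tgid.x*32, 0);
    auto mC = C.offset(tgid.x*32, tgid.y*64);

    // Accumulate the tile in registers, as in the cooperative example above.
    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA),
                                                          decltype(mB), float>();
    #pragma unroll full
    for (uint16_t i = 0; i < cT.capacity(); ++i)
        if (cT.mask(i))
            cT[i] = 0;

    matmulOp.run(mA, mB, cT);

    // Reduce along rows with the sum operation and its identity; a second
    // cooperative tensor of the same type serves as the destination.
    auto rT = matmulOp.get_destination_cooperative_tensor<decltype(mA),
                                                          decltype(mB), float>();
    reduce_rows(cT, rT, reduction_operation::sum,
                reduction_operation_identity<float>::sum_identity);

    // Store the reduced values back through the destination slice.
    rT.store(mC);
}
```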
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h 1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h 2025-05-23 06:26:11
@@ -0,0 +1,12 @@
+// -*- Metal -*-
+//===-- MetalPerformancePrimitives ------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalPerformancePrimitives__
+#define __MetalPerformancePrimitives__
+
+#include <MetalPerformancePrimitives/MPPTensorOpsConvolution2d.h>
+#include <MetalPerformancePrimitives/MPPTensorOpsMatMul2d.h>
+
+#endif
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h 1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h 2025-05-23 06:26:11
@@ -0,0 +1,28 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsBase ------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsBase__
+#define __MetalTensorOpsBase__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+namespace __tensor_ops_detail
+{
+class op
+{
+};
+} // namespace __tensor_ops_detail
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#endif
+#endif
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h 1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h 2025-05-23 06:26:11
@@ -0,0 +1,4845 @@
+
+
+// -*- Metal -*-
+//===-- MetalTensorOpsConvolution2dImpl ---------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsConvolution2dImpl__
+#define __MetalTensorOpsConvolution2dImpl__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace __convolution2d_detail
+{
+
+#ifndef EXTERNALLY_DEFINED_ATTR
+#define EXTERNALLY_DEFINED_ATTR \
+ __attribute__((section("air.externally_defined")))
+#endif
+
+#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
+#define INLINE __attribute__((__always_inline__))
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type,
+ int threads, thread int2 &offset);
+
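+// The cooperative variants below drop the destination descriptor-type
+// parameter: the destination is a cooperative tensor, presumably
+// distributed across the threads of the executing scope, so only its
+// opaque thread pointer is passed. A minimal illustrative sketch of a
+// call site, assuming a kernel that has already prepared `desc`,
+// `act`/`act_desc_type`, `wts`/`wts_desc_type`, a cooperative destination
+// `coop`, the participating `threads` count, and an `offset` (all
+// hypothetical names; only the entry point itself appears in this header):
+//
+//   __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+//       desc, act, act_desc_type, wts, wts_desc_type,
+//       (thread void *)&coop, threads, offset);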
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f32_dv_f32_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f32_tg_f32_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f32_dv_f32_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f32_tg_f32_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f16(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_i32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_f32(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_bf(
+ thread convolution2d_descriptor &desc, thread void *activation,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type,
+ thread void *weights,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+ thread void *destination, int threads, thread int2 &offset);
+
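+// The extern entry points above follow a fixed naming scheme:
+//   __tensorops_impl_convolution2d_op_run[_cooperative]_<act>_<wgt>_<dst>
+// where <act> and <wgt> are an address-space tag (dv = device,
+// tg = threadgroup) fused with an element-type tag (i8, ui8, f16, f32,
+// bf = bfloat). Cooperative variants tag <dst> with its element type
+// only (f16, f32, i32, bf) and take no destination tensor descriptor,
+// since the destination is a cooperative tensor; non-cooperative
+// variants prefix <dst> with dv/tg as well. For example,
+// __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32
+// handles an int8 activation in threadgroup memory, a uint8 weights
+// tensor in device memory, and a float cooperative destination.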
+template <typename scope, typename activation_type, typename weights_type,
+ typename destination_type, typename... run_args>
+void __run(thread activation_type &activation_tensor,
+ thread weights_type &weights_tensor,
+ thread destination_type &destination_tensor,
+ thread convolution2d_descriptor &__descriptor, int2 __offset)
+{
+ using activation_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<decltype(activation_tensor)>>;
+ using weights_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<decltype(weights_tensor)>>;
+ using destination_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<decltype(destination_tensor)>>;
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ static_assert(__tensor_ops_detail::__is_tensor_type_v<activation_tensor_type>,
+ "Activation must be a tensor");
+ static_assert(__tensor_ops_detail::__is_tensor_type_v<weights_tensor_type>,
+ "Weights must be a tensor");
+ static_assert(
+ __tensor_ops_detail::__is_tensor_type_v<destination_tensor_type> ||
+ __tensor_ops_detail::__is_cooperative_tensor_type_v<
+ destination_tensor_type>,
+ "Destination must be a tensor or cooperative tensor");
+
+ static_assert(__tensor_ops_detail::__get_rank<activation_tensor_type>() == 4,
+ "Activation must be rank 4");
+ static_assert(__tensor_ops_detail::__get_rank<weights_tensor_type>() == 4,
+ "Weights must be rank 4");
+ static_assert(__tensor_ops_detail::__get_rank<destination_tensor_type>() == 4,
+ "Destination must be rank 4");
+
+ static_assert(__tensor_ops_detail::__is_same_v<
+ typename activation_tensor_type::index_type, int>,
+ "Index type must be int");
+ static_assert(
+ __tensor_ops_detail::__is_same_v<typename weights_tensor_type::index_type,
+ int>,
+ "Index type must be int");
+ static_assert(__tensor_ops_detail::__is_same_v<
+ typename destination_tensor_type::index_type, int>,
+ "Index type must be int");
+
+ using activation_ptr_type = typename activation_tensor_type::data_handle_type;
+ using weights_ptr_type = typename weights_tensor_type::data_handle_type;
+ using destination_ptr_type =
+ typename destination_tensor_type::data_handle_type;
+
+ using activation_value_type = typename activation_tensor_type::value_type;
+ using weights_value_type = typename weights_tensor_type::value_type;
+ using destination_value_type = typename destination_tensor_type::value_type;
+
+ auto activation = (thread void *)(&activation_tensor);
+ auto weights = (thread void *)(&weights_tensor);
+ auto offset = __offset;
+
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ activation_desc_type =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ activation_tensor_type>();
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ weights_desc_type =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ weights_tensor_type>();
+
+ convolution2d_descriptor desc = __descriptor;
+
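+  // Dispatch: the destination is either a cooperative tensor or a plain
+  // tensor. In the cooperative case, per-thread storage is obtained
+  // through the reserved index and the extern entry point is selected at
+  // compile time from the activation/weights element types and address
+  // spaces; the plain-tensor case below additionally builds a destination
+  // descriptor and dispatches on the destination address space as well.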
+ if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v<
+ destination_tensor_type>)
+ {
+ thread void *destination =
+ &destination_tensor[__tensor_ops_detail::__tensor_ops_reserved_index];
+
+ if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<destination_value_type,
+ half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<destination_value_type,
+ bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+#endif
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ float> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ float> &&
+ __tensor_ops_detail::__is_same_v<destination_value_type,
+ float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_f32_dv_f32_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_f32_dv_f32_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_dv_f32_tg_f32_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_cooperative_tg_f32_tg_f32_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_value_type>,
+ "Unsupported type");
+ }
+ else
+ {
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ destination_tensor_type>();
+
+ thread void *destination = (thread void *)(&destination_tensor);
+
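+    // Plain-tensor destination: same element-type dispatch as the
+    // cooperative path, but each branch now enumerates all eight
+    // device/threadgroup combinations of activation, weights, and
+    // destination, and forwards destination_desc_type to the
+    // implementation.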
+ if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<destination_value_type,
+ half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ uint8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, int32_t>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_i32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f16(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<destination_value_type,
+ bfloat>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_bf(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ bfloat> &&
+ __tensor_ops_detail::__is_same_v<
+ destination_value_type, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+#endif
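+// (The bfloat specializations above are compiled only when the toolchain
+// defines __HAVE_BFLOAT__; otherwise the chain continues directly with the
+// float case below.)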
+ else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+ float> &&
+ __tensor_ops_detail::__is_same_v<weights_value_type,
+ float> &&
+ __tensor_ops_detail::__is_same_v<destination_value_type,
+ float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_dv_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ activation_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ weights_ptr_type> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_tg_f32(
+ desc, activation, activation_desc_type, weights, weights_desc_type,
+ destination, destination_desc_type, threads, offset);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_value_type>,
+ "Unsupported type");
+ }
+}
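+
+// Note on the dispatch above: each implementation entry point encodes, in
+// order, the activation, weights, and destination operands. The address-space
+// prefix is "dv" for device memory and "tg" for threadgroup memory, and the
+// type suffix is one of i8, ui8, f16, bf, f32, or i32. For example,
+// __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32 reads int8
+// activations from threadgroup memory and int8 weights from device memory,
+// and writes an int32 destination in threadgroup memory.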
+
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR size_t
+__tensorops_impl_conv2d_cooperative_destination_data_size(
+ const thread convolution2d_descriptor &descriptor,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR uint16_t
+__tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
+ const thread convolution2d_descriptor &descriptor,
+ __tensor_ops_detail::__const_thread_void_t, int threads);
+extern "C" TENSOROPS_EXPORT
+ EXTERNALLY_DEFINED_ATTR __tensor_ops_detail::__thread_void_t
+ __tensorops_impl_conv2d_cooperative_destination_tensor_elements(
+ __tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+ const thread convolution2d_descriptor &descriptor,
+ __tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_init(
+ __tensor_ops_detail::__thread_void_t,
+ const thread convolution2d_descriptor &,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ const thread convolution2d_descriptor &,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_move(
+ __tensor_ops_detail::__thread_void_t, __tensor_ops_detail::__thread_void_t,
+ const thread convolution2d_descriptor &,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_destory(
+ __tensor_ops_detail::__thread_void_t,
+ const thread convolution2d_descriptor &, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR bool
+__tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
+ const thread convolution2d_descriptor &,
+ __tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype, int threads);
+
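+// The entry points above back the cooperative destination tensor of the
+// convolution2d op: sizing (...data_size, ...num_elements), per-thread
+// element access (...elements, ...get_coordinate, ...is_valid_element), and
+// lifetime management (...init, ...copy, ...move, ...destory, matching the
+// exported spelling). A minimal usage sketch follows, assuming a hypothetical
+// get_destination_cooperative_tensor() accessor and an element API shaped
+// like the declarations above; neither name is declared in this header:
+//
+//   auto cd = op.get_destination_cooperative_tensor();   // assumed accessor
+//   op.run(activation, weights, cd);                     // accumulate into it
+//   for (uint16_t i = 0; i < cd.get_num_elements(); ++i) // assumed API
+//     if (cd.is_valid_element(i))
+//       cd[i] = max(cd[i], 0.0h);                        // e.g. fused ReLU
+//   cd.store(destination);                               // assumed store
+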
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
+ thread convolution2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
+ thread convolution2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+
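// For reference, the loader/storer entry points above follow a fixed naming
// scheme: dv/tg selects the address space of the non-cooperative tensor
// operand (device vs. threadgroup), and the trailing f32/i32/f16/bf selects
// the cooperative tensor's element type (float/int32_t/half/bfloat). The
// if-constexpr chains in __operand_layout::load/store below perform exactly
// this dispatch; e.g. a half-element cooperative tensor loading from a
// device-space source resolves to
// __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16.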
+template <convolution2d_descriptor descriptor,
+ convolution2d_cooperative_operand operand, typename scope,
+ typename activation_operand_type, typename weights_operand_type,
+ typename element_type, typename coord_type, typename... coop_args>
+struct __operand_layout
+{
+ static_assert(operand == convolution2d_cooperative_operand::destination,
+ "only destination can be cooperative tensor");
+ static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
+ __tensor_ops_detail::__is_same_v<element_type, half> ||
+#if __HAVE_BFLOAT__
+ __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
+#endif
+ __tensor_ops_detail::__is_same_v<element_type, int32_t>,
+ "cooperative tensor data type can only be one of "
+ "float/half/bfloat/int32_t");
+
+ static constant constexpr __tensor_ops_detail::__rank_t rank = 4;
+ using element_t = element_type;
+ using coord_t = coord_type;
+ using extent_t = metal::dextents<coord_t, rank>;
+ using thread_storage_t = thread void *;
+ using const_thread_storage_t = const thread void *;
+ using index_t = uint16_t;
+ using operand_layout_t =
+ __operand_layout<descriptor, operand, scope, activation_operand_type,
+ weights_operand_type, element_type, coord_type,
+ coop_args...>;
+ using cooperative_tensor_t =
+ metal::cooperative_tensor<element_type, extent_t, operand_layout_t>;
+
+ using a_type = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<activation_operand_type>>;
+ using w_type = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<weights_operand_type>>;
+
+ using a_elem_type = typename a_type::element_type;
+ using w_elem_type = typename w_type::element_type;
+
+ static size_t thread_storage_size()
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+ return __tensorops_impl_conv2d_cooperative_destination_data_size(
+ descriptor, d_data_type, a_data_type, w_data_type, threads);
+ }
+
+ static constexpr size_t thread_storage_align()
+ {
+ return alignof(element_t);
+ };
+
+ static uint16_t size(const_thread_storage_t storage)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ return __tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
+ descriptor, storage, threads);
+ }
+
+ static void construct(thread void *this_)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+ __tensorops_impl_conv2d_cooperative_destination_tensor_init(
+ this_, descriptor, d_data_type, a_data_type, w_data_type, threads);
+ }
+
+ static void copy_construct(thread void *this_, thread void *other)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+ __tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+ this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+ threads);
+ };
+
+ static void move_construct(thread void *this_, thread void *other)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+ __tensorops_impl_conv2d_cooperative_destination_tensor_move(
+ this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+ threads);
+ };
+
+ static void copy_assign(thread void *this_, thread void *other)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+ __tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+ this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+ threads);
+ };
+
+ static void move_assign(thread void *this_, thread void *other)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+ __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+ __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+ __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+ __tensorops_impl_conv2d_cooperative_destination_tensor_move(
+ this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+ threads);
+ };
+
+ // Destroys the per-thread object.
+ static void destroy(thread void *this_)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensorops_impl_conv2d_cooperative_destination_tensor_destory(
+ this_, descriptor, threads);
+ }
+
+ template <class ElemType, class Extents, class Descriptor, class... Tags>
+ static void load(thread_storage_t storage,
+ const thread metal::tensor<ElemType, Extents, Descriptor,
+ Tags...> &sourceT)
+ {
+ using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_type>,
+ "Source tensor datatype does not match cooperative tensor");
+ static_assert(Extents::rank() == 1 || Extents::rank() == 4,
+ "Source tensor must be rank 1 or 4");
+
+ int sourceRank = Extents::rank();
+
+ convolution2d_descriptor desc = descriptor;
+
+ using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+ using sourcePtrType = typename tensorType::data_handle_type;
+
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ tensorType>();
+
+ const thread void *source = (const thread void *)(&sourceT);
+
+ if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+#endif
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+ "Unsupported type");
+ };
+
+ template <class ElemType, class Extents, class Descriptor, class... Tags>
+ static void store(const_thread_storage_t storage,
+ const thread metal::tensor<ElemType, Extents, Descriptor,
+ Tags...> &destinationT)
+ {
+ using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_type>,
+ "Tensor datatype does not match cooperative tensor");
+ static_assert(Extents::rank() == 1 || Extents::rank() == rank,
+ "Tensor must be rank 1 or 4");
+
+ convolution2d_descriptor desc = descriptor;
+
+ using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+ using destination_ptr_type = typename tensorType::data_handle_type;
+
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destination_desc_type =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ tensorType>();
+
+ const thread void *destination = (const thread void *)(&destinationT);
+
+ if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
+ desc, storage, destination, destination_desc_type, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
+ desc, storage, destination, destination_desc_type, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
+ desc, storage, destination, destination_desc_type, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
+ desc, storage, destination, destination_desc_type, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
+ desc, storage, destination, destination_desc_type, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
+ desc, storage, destination, destination_desc_type, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
+ desc, storage, destination, destination_desc_type, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destination_ptr_type>)
+ __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
+ desc, storage, destination, destination_desc_type, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+ "Unsupported address space");
+ }
+#endif
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+ "Unsupported type");
+ };
+
+ static thread element_t *get_pointer_to(const_thread_storage_t storage,
+ index_t idx)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+ "unsupported data type");
+
+ return (thread element_t *)
+ __tensorops_impl_conv2d_cooperative_destination_tensor_elements(
+ (thread_storage_t)storage, idx, dataType, threads);
+ };
+
+ static bool mask(const_thread_storage_t storage, index_t idx)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+ "unsupported data type");
+
+ return __tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, threads);
+ }
+
+ template <typename index_t, __tensor_ops_detail::__rank_t rank>
+ static metal::array<index_t, rank>
+ multidimensional_indices(const_thread_storage_t storage, index_t idx)
+ {
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ static_assert(rank == 4, "multidimensional_indices returns 4D indices");
+
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+ "unsupported data type");
+
+ if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
+ {
+ ushort coords[4];
+ __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
+ threads);
+ return {coords[0], coords[1], coords[2], coords[3]};
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
+ {
+ short coords[4];
+ __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
+ threads);
+ return {coords[0], coords[1], coords[2], coords[3]};
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
+ {
+ uint coords[4];
+ __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
+ threads);
+ return {coords[0], coords[1], coords[2], coords[3]};
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
+ {
+ int coords[4];
+ __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
+ threads);
+ return {coords[0], coords[1], coords[2], coords[3]};
+ }
+ }
+};
+
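// A minimal sketch of the element-visit contract the layout above exposes:
// each thread walks its slice of the cooperative destination tensor, skips
// slots that mask() reports invalid, and can recover the 4-D coordinate of
// each element, e.g. to add a bias and apply an activation before storing.
// layout_t, storage, and bias are placeholders for this sketch, and c[0] is
// assumed to be the output-channel coordinate per the descriptor convention.
//
//   using layout_t =
//       __operand_layout<desc, convolution2d_cooperative_operand::destination,
//                        scope_t, act_tensor_t, wts_tensor_t, float, int>;
//   for (uint16_t i = 0, n = layout_t::size(storage); i < n; ++i)
//   {
//     if (!layout_t::mask(storage, i))
//       continue; // padding / invalid slot for this thread
//     metal::array<int, 4> c =
//         layout_t::multidimensional_indices<int, 4>(storage, i);
//     thread float *e = layout_t::get_pointer_to(storage, i);
//     *e = metal::max(*e + bias[c[0]], 0.0f); // bias add + ReLU
//   }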
+template <convolution2d_descriptor descriptor,
+ convolution2d_cooperative_operand operand, typename scope,
+ typename activation_operand_type, typename weights_operand_type,
+ typename element_type, typename coord_type, typename... coop_args>
+using __cooperative_tensor_t = typename __operand_layout<
+ descriptor, operand, scope, activation_operand_type, weights_operand_type,
+ element_type, coord_type, coop_args...>::cooperative_tensor_t;
+
+template <convolution2d_descriptor descriptor, typename scope,
+ typename activation_operand_type, typename weights_operand_type,
+ typename element_type, typename coord_type, typename... coop_args>
+using __cooperative_tensor_destination_t =
+ __cooperative_tensor_t<descriptor,
+ convolution2d_cooperative_operand::destination,
+ scope, activation_operand_type, weights_operand_type,
+ element_type, coord_type, coop_args...>;
+
+template <convolution2d_descriptor descriptor, typename scope,
+ typename activation_operand_type, typename weights_operand_type,
+ typename element_type, typename coord_type, typename... coop_args>
+__cooperative_tensor_destination_t<descriptor, scope, activation_operand_type,
+ weights_operand_type, element_type,
+ coord_type, coop_args...>
+__get_destination_cooperative_tensor()
+{
+ return __cooperative_tensor_destination_t<
+ descriptor, scope, activation_operand_type, weights_operand_type,
+ element_type, coord_type, coop_args...>();
+}
+
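// Usage sketch: the helper above simply default-constructs the destination
// cooperative tensor type, so (with hypothetical template arguments) a kernel
// would obtain one as
//
//   auto dst = __get_destination_cooperative_tensor<desc, scope_t,
//                                                   act_tensor_t,
//                                                   wts_tensor_t,
//                                                   float, int>();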
+#undef EXTERNALLY_DEFINED_ATTR
+
+} // namespace __convolution2d_detail
+
+#endif
+
+#endif // __TensorOpsConvolution2D__
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2025-05-23 06:26:11
@@ -0,0 +1,5131 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsMatMul2dImpl
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsMatMul2dImpl__
+#define __MetalTensorOpsMatMul2dImpl__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace __mutmul2d_detail
+{
+
+#ifndef EXTERNALLY_DEFINED_ATTR
+#define EXTERNALLY_DEFINED_ATTR \
+ __attribute__((section("air.externally_defined")))
+#endif
+
+#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
+#define INLINE __attribute__((__always_inline__))
+
+using __matmul2d_descriptor = matmul2d_descriptor;
+
+using __matmul2d_cooperative_operand_index = matmul2d_cooperative_operand_index;
+
+using __reduction_operation = reduction_operation;
+
+extern "C" EXTERNALLY_DEFINED_ATTR size_t
+__tensorops_impl_matmul2d_op_cooperative_destination_data_size(
+ const __matmul2d_descriptor descriptor, const int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_num_elements(
+ const __matmul2d_descriptor descriptor, const int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR thread void *
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_elements(
+ __tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+ const __matmul2d_descriptor descriptor,
+ __tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype, thread void *,
+ __tensor_ops_detail::__tensor_ops_datatype, const int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_init(
+ __tensor_ops_detail::__thread_void_t, __matmul2d_descriptor,
+ __tensor_ops_detail::__tensor_ops_datatype, const int);
+extern "C" EXTERNALLY_DEFINED_ATTR bool
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_is_valid_element(
+ const __matmul2d_descriptor descriptor,
+ __tensor_ops_detail::__thread_void_t, uint16_t,
+ __tensor_ops_detail::__tensor_ops_datatype, const int);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_i32(
+ thread __matmul2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *storage,
+ const thread void *source,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+ int sourceRank, int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
+ thread __matmul2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f16(
+ thread __matmul2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_i32(
+ thread __matmul2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_i32(
+ thread __matmul2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f32(
+ thread __matmul2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f32(
+ thread __matmul2d_descriptor &desc, const thread void *storage,
+ const thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+ int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
+ thread __matmul2d_descriptor &desc, const thread void *src,
+ thread void *dst, half identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
+ thread __matmul2d_descriptor &desc, const thread void *src,
+ thread void *dst, float identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_i32(
+ thread __matmul2d_descriptor &desc, const thread void *src,
+ thread void *dst, int identity, __reduction_operation op);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
+ thread __matmul2d_descriptor &desc, const thread void *src,
+ thread void *dst, half identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
+ thread __matmul2d_descriptor &desc, const thread void *src,
+ thread void *dst, float identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_i32(
+ thread __matmul2d_descriptor &desc, const thread void *src,
+ thread void *dst, int identity, __reduction_operation op);
+
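// The run entry points that follow encode their three operands in the symbol
// name as <left>_<right>_<destination> triples, each tagged with an address
// space (dv = device, tg = threadgroup) and an element type (f16/f32/i8).
// For example, __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32 reads a
// half left operand from device memory and an int8 right operand from
// threadgroup memory, and writes a float destination in device memory.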
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType,
+ int threads);
+
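The `_run_cooperative_` variants below differ from the block above in one visible way: the destination is passed as a bare `thread void *` with no descriptor-type argument. That is consistent with the destination being a cooperative tensor held in per-thread state, with the trailing type token presumably naming its element type; both readings are inferences from the signatures, not stated in this diff. A sketch under those assumptions (wrapper name illustrative):

```cpp
// Illustrative only: left is a threadgroup f16 tensor, right a device i8
// tensor, and the result lands in an f32 cooperative destination that
// carries no separate descriptor type.
static void run_coop_tg_f16_dv_i8_f32(
    thread __matmul2d_descriptor &desc,
    thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *cooperativeDestination,
    int threads)
{
  __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32(
      desc, left, leftDescType, right, rightDescType,
      cooperativeDestination, threads);
}
```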
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination, int threads);
+
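The `_run_single_thread_` variants below restore the destination descriptor but drop the trailing `threads` count, and they introduce the `th` token, plausibly the `thread` address space; both details are consistent with a single thread running the op on thread-local tensors. A sketch under that reading (wrapper name illustrative):

```cpp
// Illustrative only: all three tensors use the th (presumably `thread`)
// address space with f16 elements, and no thread count is passed.
static void run_single_thread_th_f16_all(
    thread __matmul2d_descriptor &desc,
    thread void *left,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
    thread void *right,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
    thread void *destination,
    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType)
{
  __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
      desc, left, leftDescType, right, rightDescType,
      destination, destinationDescType);
}
```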
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(
+ thread __matmul2d_descriptor &desc, thread void *left,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+ thread void *right,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+ thread void *destination,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType);
+
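+// Naming convention for the entry points above:
+// __tensorops_impl_matmul2d_op_run_single_thread_<left>_<right>_<dest>,
+// where each operand is encoded as <address space>_<element type>:
+// dv = device and th = thread address space (the cooperative load/store
+// variants additionally use tg = threadgroup), and f16/f32/i8/i32 name the
+// element types. The dispatch code in __run below selects the matching
+// entry point from the operands' value types and address spaces.
+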
+template <__matmul2d_descriptor descriptor,
+ __matmul2d_cooperative_operand_index operand_index, typename scope,
+ typename element_type, typename coord_type, typename... args>
+struct __operand_layout
+{
+
+ static_assert(operand_index ==
+ matmul2d_cooperative_operand_index::destination,
+ "only destination can be cooperative tensor");
+ static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
+ __tensor_ops_detail::__is_same_v<element_type, half> ||
+#if __HAVE_BFLOAT__
+ __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
+#endif
+ __tensor_ops_detail::__is_same_v<element_type, int32_t>,
+ "cooperative tensor data type can only be one of "
+ "float/half/bfloat/int32_t");
+
+ static constant constexpr __tensor_ops_detail::__rank_t rank = 2;
+ using element_t = element_type;
+ using coord_t = coord_type;
+ using extent_t = metal::dextents<coord_t, rank>;
+ using thread_storage_t = thread void *;
+ using const_thread_storage_t = const thread void *;
+ using index_t = uint16_t;
+ using operand_layout_t =
+ __operand_layout<descriptor, operand_index, scope, element_t, coord_t>;
+ using cooperative_tensor_t =
+ metal::cooperative_tensor<element_t, extent_t, operand_layout_t>;
+ using scope_t = scope;
+
+ static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+ "scope should be of type __tensorops_scope");
+
+ static constexpr constant bool is_matmul2d_cooperative_destination_layout =
+ true;
+
+ static constexpr constant __matmul2d_descriptor matmul2d_desc = descriptor;
+
+ // Returns the alignment of the storage allocated in each thread
+ // for this cooperative_tensor.
+ static constexpr size_t thread_storage_align()
+ {
+ return alignof(element_t);
+ };
+
+ // Copy-constructs from the cooperative_tensor `other`.
+ static void copy_construct(thread void *this_, thread void *other)
+ {
+ thread element_t *this_e = (thread element_t *)(this_);
+ thread element_t *other_e = (thread element_t *)(other);
+ for (size_t i = 0, e = size(this_); i != e; ++i)
+ {
+      this_e[i] = other_e[i];
+ }
+ };
+
+ // Move-constructs from the cooperative_tensor `other`.
+ static void move_construct(thread void *this_, thread void *other)
+ {
+ thread element_t *this_e = (thread element_t *)(this_);
+ thread element_t *other_e = this_e;
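+    // No per-element work is performed here; the per-thread storage is
+    // left in place.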
+ };
+
+ // Copy-assigns from the cooperative_tensor `other`.
+ static void copy_assign(thread void *this_, thread void *other)
+ {
+ thread element_t *this_e = (thread element_t *)(this_);
+ thread element_t *other_e = (thread element_t *)(other);
+ for (size_t i = 0, e = size(this_); i != e; ++i)
+ {
+      this_e[i] = other_e[i];
+ }
+ };
+
+ // Move-assigns from the cooperative_tensor `other`.
+ static void move_assign(thread void *this_, thread void *other)
+ {
+ thread element_t *this_e = (thread element_t *)(this_);
+ thread element_t *other_e = this_e;
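+    // No per-element work is performed here; the per-thread storage is
+    // left in place.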
+ };
+
+ // Destroys the per-thread object.
+ static void destroy(thread void *) {};
+
+ static size_t thread_storage_size()
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ return __tensorops_impl_matmul2d_op_cooperative_destination_data_size(
+ descriptor, threads);
+ }
+
+ template <class ElemType, class Extents, class Descriptor, class... Tags>
+ static void load(thread_storage_t storage,
+ const thread metal::tensor<ElemType, Extents, Descriptor,
+ Tags...> &sourceT)
+ {
+ using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+ static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_t>,
+ "Source tensor datatype does not match cooperative tensor");
+ static_assert(Extents::rank() == 1 || Extents::rank() == 2,
+ "Source tensor must be rank 1 or 2");
+
+ int sourceRank = Extents::rank();
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ __matmul2d_descriptor desc = descriptor;
+
+ using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+ using sourcePtrType = typename tensorType::data_handle_type;
+
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ tensorType>();
+
+ const thread void *source = (const thread void *)(&sourceT);
+
+ if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f16(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f16(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_i32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_i32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f32(
+ desc, storage, source, sourceDescType, sourceRank, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+ "Unsupported type");
+ };
+
+ template <class ElemType, class Extents, class Descriptor, class... Tags>
+ static void store(const_thread_storage_t storage,
+ const thread metal::tensor<ElemType, Extents, Descriptor,
+ Tags...> &destinationT)
+ {
+ using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+ static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_t>,
+ "Tensor datatype does not match cooperative tensor");
+ static_assert(Extents::rank() == 1 || Extents::rank() == rank,
+ "Tensor must be rank 1 or 2");
+
+ __matmul2d_descriptor desc = descriptor;
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+ using destinationPtrType = typename tensorType::data_handle_type;
+
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ tensorType>();
+
+ const thread void *destination = (const thread void *)(&destinationT);
+
+ if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
+ desc, storage, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f16(
+ desc, storage, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_i32(
+ desc, storage, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_i32(
+ desc, storage, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f32(
+ desc, storage, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f32(
+ desc, storage, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+ "Unsupported type");
+ };
+
+ static uint16_t size(const_thread_storage_t storage)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ return __tensorops_impl_matmul2d_op_cooperative_destination_tensor_num_elements(
+ descriptor, threads);
+ }
+
+ static thread element_t *get_pointer_to(const_thread_storage_t storage,
+ index_t idx)
+ {
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+ "unsupported data type");
+
+ return (thread element_t *)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_elements(
+ (thread_storage_t)storage, idx, dataType);
+ };
+
+ static bool mask(const_thread_storage_t storage, index_t idx)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+ "unsupported data type");
+
+ return __tensorops_impl_matmul2d_op_cooperative_destination_tensor_is_valid_element(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, threads);
+ }
+
+ template <typename index_t, __tensor_ops_detail::__rank_t rank = 2>
+ static metal::array<index_t, rank>
+ multidimensional_indices(const_thread_storage_t storage, index_t idx)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+ "unsupported data type");
+
+ if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
+ {
+ ushort coords[2];
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
+ threads);
+ return {coords[0], coords[1]};
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
+ {
+ short coords[2];
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
+ threads);
+ return {coords[0], coords[1]};
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
+ {
+ uint coords[2];
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
+ threads);
+ return {coords[0], coords[1]};
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
+ {
+ int coords[2];
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+ descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+ dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
+ threads);
+ return {coords[0], coords[1]};
+ }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<coord_t>,
+                    "unsupported coordinate type");
+  }
+
+ static void construct(thread_storage_t storage)
+ {
+ metal::execution_threads t = scope();
+ int threads = t.size();
+ __tensor_ops_detail::__tensor_ops_datatype dataType;
+ if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+ else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+ dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+ "unsupported data type");
+
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_init(
+ (__tensor_ops_detail::__thread_void_t)storage, descriptor, dataType,
+ threads);
+ }
+};
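+
+// Illustrative sketch (not part of this header): __operand_layout is the
+// layout type through which metal::cooperative_tensor enumerates the
+// destination elements owned by each thread. Assuming a constexpr
+// descriptor `desc`, per-thread storage `s`, and a float destination, an
+// elementwise epilogue could look like:
+//
+//   using layout_t = __operand_layout<desc,
+//       matmul2d_cooperative_operand_index::destination, scope, float, int>;
+//   for (uint16_t i = 0, e = layout_t::size(s); i != e; ++i)
+//     if (layout_t::mask(s, i))                        // element valid?
+//       *layout_t::get_pointer_to(s, i) =
+//           metal::max(*layout_t::get_pointer_to(s, i), 0.0f); // e.g. ReLU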
+
+template <__matmul2d_descriptor descriptor,
+ __matmul2d_cooperative_operand_index operand_index, typename scope,
+ typename element_type, typename coord_type, typename... args>
+using __cooperative_tensor_t =
+ typename __operand_layout<descriptor, operand_index, scope, element_type,
+ coord_type, args...>::cooperative_tensor_t;
+
+template <__matmul2d_descriptor descriptor, typename scope,
+ typename element_type, typename coord_type, typename... args>
+using __cooperative_tensor_destination_t =
+ __cooperative_tensor_t<descriptor,
+ matmul2d_cooperative_operand_index::destination,
+ scope, element_type, coord_type, args...>;
+
+template <__matmul2d_descriptor descriptor, typename scope,
+ typename element_type, typename coord_type, typename left_operand,
+ typename right_operand, typename... args>
+__cooperative_tensor_destination_t<descriptor, scope, element_type, coord_type,
+ args...>
+__get_destination_cooperative_tensor()
+{
+ static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+ "scope should be of type __tensorops_scope");
+ return __cooperative_tensor_destination_t<descriptor, scope, element_type,
+ coord_type, args...>();
+}
+
+template <__matmul2d_descriptor descriptor, typename scope,
+ typename left_operand, typename right_operand,
+ typename destination_operand, typename... args>
+void __run(thread left_operand &leftIn, thread right_operand &rightIn,
+ thread destination_operand &destinationT)
+{
+ using leftTensorType = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<decltype(leftIn)>>;
+ using rightTensorType = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<decltype(rightIn)>>;
+ using destinationTensorType = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<decltype(destinationT)>>;
+
+ metal::execution_threads t = scope();
+ int threads = t.size();
+
+ static_assert(__tensor_ops_detail::__is_tensor_type_v<leftTensorType>,
+ "Left operand must be a tensor");
+ static_assert(__tensor_ops_detail::__is_tensor_type_v<rightTensorType>,
+ "Right operand must be a tensor");
+ static_assert(
+ __tensor_ops_detail::__is_tensor_type_v<destinationTensorType> ||
+ __tensor_ops_detail::__is_cooperative_tensor_type_v<
+ destinationTensorType>,
+ "Destination operand must be a tensor or cooperative tensor");
+
+ static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+ "scope should be of type __tensorops_scope");
+
+ static_assert(__tensor_ops_detail::__get_rank<leftTensorType>() == 2,
+ "Operand must have rank 2");
+ static_assert(__tensor_ops_detail::__get_rank<rightTensorType>() == 2,
+ "Operand must have rank 2");
+ static_assert(__tensor_ops_detail::__get_rank<destinationTensorType>() == 2,
+ "Operand must have rank 2");
+
+ static_assert(
+ __tensor_ops_detail::__is_same_v<typename leftTensorType::index_type,
+ int>,
+ "Index type must be int");
+ static_assert(
+ __tensor_ops_detail::__is_same_v<typename rightTensorType::index_type,
+ int>,
+ "Index type must be int");
+ static_assert(__tensor_ops_detail::__is_same_v<
+ typename destinationTensorType::index_type, int>,
+ "Index type must be int");
+
+ using leftPtrType = typename leftTensorType::data_handle_type;
+ using rightPtrType = typename rightTensorType::data_handle_type;
+ using destinationPtrType = typename destinationTensorType::data_handle_type;
+
+ using leftValueType = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<
+ typename leftTensorType::element_type>>;
+ using rightValueType = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<
+ typename rightTensorType::element_type>>;
+ using destinationValueType = __tensor_ops_detail::__remove_addrspace_t<
+ __tensor_ops_detail::__remove_reference_t<
+ typename destinationTensorType::element_type>>;
+
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ leftTensorType>();
+ const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ rightTensorType>();
+
+ thread void *left = (thread void *)(&leftIn);
+ thread void *right = (thread void *)(&rightIn);
+
+ __matmul2d_descriptor desc = descriptor;
+
+ // single thread
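+  // Each supported (left, right, destination) combination of element type
+  // and address space below maps to exactly one of the extern "C" entry
+  // points declared above; anything else is rejected at compile time by
+  // the dependent static_asserts.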
+ if constexpr (__tensor_ops_detail::__is_same_v<scope, metal::execution_thread>)
+ {
+ if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
+ destinationTensorType>)
+ {
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ destinationTensorType>();
+
+ thread void *destination = (thread void *)(&destinationT);
+
+ if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType,
+ half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
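+ // half x int8 -> float, single-thread address-space dispatch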
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
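+ // float x half -> float, single-thread address-space dispatch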
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
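+ // float x float -> float, single-thread address-space dispatch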
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
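+ // float x int8 -> float, single-thread address-space dispatch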
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
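+ // int8 x half -> float, single-thread address-space dispatch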
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
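+ // int8 x float -> float, single-thread address-space dispatch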
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
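+ // int8 x int8 -> int32: integer operands accumulate into a 32-bit
+ // integer destination rather than a floating-point one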
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, int32_t>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else if constexpr (
+ __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationValueType>,
+ "Unsupported type");
+ }
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationTensorType>,
+ "destination cannot be cooperative tensor with cooperative group of "
+ "size 1");
+ }
+ else
+ {
+ // multiple threads: each variant below runs across the cooperating
+ // execution group and receives the thread count as its final argument
+ if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
+ destinationTensorType>)
+ {
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+ destinationDescType =
+ __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+ destinationTensorType>();
+
+ thread void *destination = (thread void *)(&destinationT);
+
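+ // half x half -> half: multi-thread variants dispatch over device and
+ // threadgroup operand address spaces (thread-local operands are handled
+ // only by the single-thread path above)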
+ if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType,
+ half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
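+ // half x int8 -> half, multi-thread address-space dispatch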
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
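+ // int8 x half -> half, multi-thread address-space dispatch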
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, half>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
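+ // half x half -> float, multi-thread address-space dispatch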
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(
+ desc, left, leftDescType, right, rightDescType, destination,
+ destinationDescType, threads);
+ else
+ static_assert(
+ __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
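+ // half x float -> float, multi-thread address-space dispatch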
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+ half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType,
+ float> &&
+ __tensor_ops_detail::__is_same_v<
+ destinationValueType, float>)
+ {
+ if constexpr (
+ __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspac