MetalPerformancePrimitives macOS xcode26.0 b1


# MetalPerformancePrimitives.framework

diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h	2025-05-28 02:28:43
@@ -0,0 +1,177 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsConvolution2d
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsConvolution2d__
+#define __MetalTensorOpsConvolution2d__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+#include "__impl/MPPTensorOpsBase.h"
+#include "__impl/MPPTensorOpsUtility.h"
+
+#pragma METAL internals : enable
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+// This API performs the 2D convolutions that occur in convolutional neural
+// networks. "2d" here stands for the two spatial dimensions, width x height,
+// even though the tensors consumed by this op are 4D. The sources/inputs to
+// the op's run method are
+//      Activation tensor with named NHWC layout
+//           N = batch (slowest moving dimension)
+//           H = height
+//           W = width
+//           C = input channels (fastest moving dimension)
+//      Weights tensor with named HWIO layout
+//           H = kernel height
+//           W = kernel width
+//           I = input channels
+//           O = output channels
+// Destination tensor has named NHWO layout
+//           N = batch (slowest moving dimension)
+//           H = height
+//           W = width
+//           O = output channels (fastest moving dimension)
+//
+// The destination can also be a cooperative tensor. See TensorOpsMatMul2d.h
+// for details on how to use a cooperative tensor, for example for bias add
+// and for applying an activation before writing out the result. Currently
+// the only scope supported by the convolution2d op is the full threadgroup.
+// See TensorOpsMatMul2d.h for details on the scopes supported by tensor ops.
+
+enum class convolution2d_activation_layout
+{
+  nhwc,
+};
+
+enum class convolution2d_weights_layout
+{
+  hwio,
+};
+
+struct convolution2d_descriptor
+{
+  enum class mode
+  {
+    multiply,
+    multiply_accumulate,
+  };
+
+  // for nhwc, .x = output channel, .y = destination width, .z = destination
+  // height, .w = batch size
+  int4 destination_dimensions;
+  int4 source_dimensions;
+  int2 kernel_dimensions;
+  convolution2d_activation_layout activation_layout;
+  convolution2d_weights_layout weights_layout;
+  int2 strides;
+  int2 dilations;
+  int groups;
+  bool relaxed_precision;
+  mode conv2d_mode;
+
+  constexpr convolution2d_descriptor(
+      int4 _destination_dimensions, int4 _source_dimensions,
+      int2 _kernel_dimensions,
+      convolution2d_activation_layout _activation_layout =
+          convolution2d_activation_layout::nhwc,
+      convolution2d_weights_layout _weights_layout =
+          convolution2d_weights_layout::hwio,
+      int2 _strides = int2(1, 1), int2 _dilations = int2(1, 1), int _groups = 1,
+      bool _relaxed_precision = false,
+      mode _convolution2d_mode = mode::multiply) thread
+      : destination_dimensions(_destination_dimensions),
+        source_dimensions(_source_dimensions),
+        kernel_dimensions(_kernel_dimensions),
+        activation_layout(_activation_layout),
+        weights_layout(_weights_layout),
+        strides(_strides),
+        dilations(_dilations),
+        groups(_groups),
+        relaxed_precision(_relaxed_precision),
+        conv2d_mode(_convolution2d_mode)
+  {
+  }
+};
+
+enum class convolution2d_cooperative_operand
+{
+  destination,
+};
+
+#include "__impl/MPPTensorOpsConvolution2dImpl.h"
+
+template <convolution2d_descriptor Descriptor, typename Scope,
+          typename... ConvArgs>
+class convolution2d : __tensor_ops_detail::op
+{
+  static_assert(Descriptor.activation_layout ==
+                    convolution2d_activation_layout::nhwc,
+                "only nhwc activation layout supported currently");
+  static_assert(Descriptor.weights_layout == convolution2d_weights_layout::hwio,
+                "only hwio weights layout supported currently");
+  static_assert(Descriptor.groups == 1,
+                "only group size 1 supported currently");
+
+private:
+  thread int2 __offset;
+
+public:
+  convolution2d() thread : __offset(0)
+  {
+  }
+
+  void set_offsets(int2 o) thread
+  {
+    __offset = o;
+  }
+
+  template <typename ActivationTensorType, typename WeightsTensorType,
+            typename DestinationTensorType, typename... RunArgs>
+  INLINE void run(thread ActivationTensorType &activation,
+                  thread WeightsTensorType &weights,
+                  thread DestinationTensorType &destination) const thread
+  {
+    convolution2d_descriptor d = Descriptor;
+    __convolution2d_detail::__run<Scope, ActivationTensorType,
+                                  WeightsTensorType, DestinationTensorType,
+                                  RunArgs...>(activation, weights, destination,
+                                              d, __offset);
+  }
+
+  template <typename ActivationOperandType, typename WeightsOperandType,
+            typename ElementType, typename CoordType = int,
+            typename... CoopArgs>
+  using cooperative_tensor_destination_t =
+      __convolution2d_detail::__cooperative_tensor_destination_t<
+          Descriptor, Scope, ActivationOperandType, WeightsOperandType,
+          ElementType, CoordType, CoopArgs...>;
+
+  template <typename ActivationOperandType, typename WeightsOperandType,
+            typename ElementType, typename CoordType = int,
+            typename... CoopArgs>
+  INLINE cooperative_tensor_destination_t<ActivationOperandType,
+                                          WeightsOperandType, ElementType,
+                                          CoordType, CoopArgs...>
+  get_destination_cooperative_tensor() const thread
+  {
+    return __convolution2d_detail::__get_destination_cooperative_tensor<
+        Descriptor, Scope, ActivationOperandType, WeightsOperandType,
+        ElementType, CoordType, CoopArgs...>();
+  }
+};
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#pragma METAL internals : disable
+
+#endif
+
+#endif // __MetalTensorOpsConvolution2d__
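
For orientation, here is a minimal sketch of how the pieces above might fit together, written in the style of the examples in MPPTensorOpsMatMul2d.h further down. The 4D extents type, the dimension packing of the source tensor, the kernel-dimension order, and the execution-scope spelling (`execution_simdgroups<4>`) are assumptions inferred from the header comments, not verified sample code:

```metal
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
using namespace mpp::tensor_ops;

kernel void simpleConv2d(
    tensor<device half,  dextents<int32_t, 4>, tensor_handle> activation,
    tensor<device half,  dextents<int32_t, 4>, tensor_handle> weights,
    tensor<device float, dextents<int32_t, 4>, tensor_handle> destination)
{
    // 3x3 valid convolution (stride 1, dilation 1, groups 1 -- the
    // descriptor defaults) of a 1x66x66x16 NHWC activation with a 3x3x16x32
    // HWIO weights tensor, producing a 1x64x64x32 NHWO destination.
    // Dimension packing follows the header comment: .x = channels (fastest
    // moving), .y = width, .z = height, .w = batch (slowest moving).
    constexpr auto convDesc = convolution2d_descriptor(
        int4(32, 64, 64, 1),  // destination dimensions: O, W, H, N
        int4(16, 66, 66, 1),  // source dimensions: C, W, H, N (packing assumed)
        int2(3, 3));          // kernel dimensions: width x height (order assumed)

    // Per the note in the header, convolution2d currently supports only the
    // full threadgroup as its scope; the exact scope type is an assumption.
    convolution2d<convDesc, execution_simdgroups<4>> convOp;
    convOp.run(activation, weights, destination);
}
```

Dispatch would mirror the matmul examples below, with the threadgroup sized to 4 SIMD-groups.
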
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2025-05-26 00:45:42
@@ -0,0 +1,495 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsMatMul2d
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+// This API performs the generalized matrix multiplication operation
+//             C = A*B + C
+// A and B can be tensor_handle, tensor_offset, or tensor_inline.
+// C can be tensor_handle, tensor_offset, tensor_inline, or cooperative_tensor.
+// The data type combinations supported by this operation are as follows:
+//   A           B         C
+//  int8_t     int8_t    int32_t
+//  int8_t     int8_t    float
+//  int8_t     int8_t    half
+//  uint8_t    int8_t    int32_t
+//  uint8_t    int8_t    float
+//  uint8_t    int8_t    half
+//  int8_t     uint8_t   int32_t
+//  int8_t     uint8_t   float
+//  int8_t     uint8_t   half
+//  uint8_t    uint8_t   int32_t
+//  uint8_t    uint8_t   float
+//  uint8_t    uint8_t   half
+//   half       half     float
+//   half       half     half
+//
+// Basic usage is shown in the following example, which takes an M x K matrix
+// A of type half and a K x N matrix B of type half, both in device memory,
+// and produces an M x N matrix C of type float in device memory. It tiles
+// the matrix multiplication across threadgroups, where each threadgroup
+// computes a 64 x 32 tile of the output by multiplying a 64 x K tile of A
+// with a K x 32 tile of B. This compute kernel will be launched with a
+// dispatch grid of
+//
+//        MTLSize threadgroups = MTLSizeMake((M + 63)/64, (N + 31)/32, 1);
+//
+// It uses 4 SIMD-groups per threadgroup. The way to dispatch this compute
+// kernel is
+//
+//    id<MTLComputePipelineState> state = [device newComputePipelineState:...];
+//    NSUInteger simdgroupWidth = [state threadExecutionWidth];
+//    ...
+//    [encoder dispatchThreadgroups:threadgroups
+//            threadsPerThreadgroup:MTLSizeMake(simdgroupWidth*4, 1, 1)];
+//
+// kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                          tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                          tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     // descriptor to create a matmul operation that multiplies a 64 x K
+//     // tile by a K x 32 tile, producing a 64 x 32 tile
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(
+//         64,    // m: outer dimension of the local tile
+//         32,    // n: outer dimension of the local tile
+//          0,    // k: inner dimension. 0 means the operation will read K
+//                // from the input tensors:
+//                //   K = A.extents().extent(0) or B.extents().extent(1) for NN
+//                //   K = A.extents().extent(0) or B.extents().extent(0) for NT
+//                //   and so on.
+//         false, // transpose_left: false for NN and NT, true for TN and TT
+//         false, // transpose_right: false for NN and TN, true for NT and TT
+//         false  // relaxed_precision: set it to true to allow the
+//                // implementation to sacrifice accuracy for performance
+//     );
+//
+//     // create a matmul op from the above descriptor with 4 SIMD-groups.
+//     // All 4 SIMD-groups in this threadgroup will execute this matmul
+//     // cooperatively. More on this scope below.
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     // The following three lines of code create the appropriate slice for
+//     // this threadgroup to work on. E.g. A.offset below creates a
+//     // tensor<device half, dextents<int32_t, 2>, tensor_offset> which has
+//     // the same extents as the original tensor A but with its origin
+//     // shifted to (0, tgid.y*64), i.e. mA[x,y] == A[x, tgid.y*64 + y]
+//     auto mA = A.offset(0, tgid.y*64);
+//     auto mB = B.offset(tgid.x*32, 0);
+//     auto mC = C.offset(tgid.x*32, tgid.y*64);
+//
+//     // execute the operation. Assumes C is initialized to zero.
+//     matmulOp.run(mA, mB, mC);
+// }
+//
+// The above matrix multiplication implementation will do edge checking for
+// all threadgroups against the extents of the original tensors, although for
+// large enough matrices most threadgroups will be working on "inside" tiles,
+// requiring no bounds check. In high performance code we can avoid the edge
+// checking for inside threadgroups and get better performance:
+//
+// kernel void matMul(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                    tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                    tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                    constant uint& M, constant uint& N, constant uint& K,
+//                    uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     // descriptor to create a matmul operation that multiplies a 64 x K
+//     // tile by a K x 32 tile, producing a 64 x 32 tile
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64, 32, 0,
+//                                                           false, false,
+//                                                           false);
+//
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     // Inside threadgroup in both outer dimensions M and N.
+//     if (tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
+//     {
+//       auto tA = A.static_slice<dynamic_extent, 64>(0, tgid.y*64);
+//       auto tB = B.static_slice<32, dynamic_extent>(tgid.x*32, 0);
+//       auto tC = C.static_slice<32, 64>(tgid.x*32, tgid.y*64);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+//     else
+//     {
+//       auto tA = A.offset(0, tgid.y*64);
+//       auto tB = B.offset(tgid.x*32, 0);
+//       auto tC = C.offset(tgid.x*32, tgid.y*64);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+// }
+//
+// The user can also take ownership of the loop over the reduction
+// (k-)dimension by choosing an appropriate chunk size in k (called k-tile or
+// tilek). In the following example, we choose 16.
+//
+// kernel void matMulKLoop(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                         tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                         tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                         constant uint& M, constant uint& N, constant uint& K,
+//                         uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     // descriptor to create a matmul operation that multiplies a 64 x 16
+//     // tile by a 16 x 32 tile, producing a 64 x 32 tile
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(
+//         64,
+//         32,
+//         16,    // tilek = 16: we loop over K in chunks of 16 rather than
+//                // letting the matmul op's run method loop over K
+//                // internally, choosing tileK itself
+//         false,
+//         false,
+//         false);
+//
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     constexpr int tilek = 16;
+//
+//     // Inside threadgroup in both outer dimensions M and N.
+//     if (tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
+//     {
+//       auto tC = C.static_slice<32, 64>(tgid.x*32, tgid.y*64);
+//       int k = 0;
+//       for (; k + tilek - 1 < K; k += tilek)
+//       {
+//         auto tA = A.static_slice<tilek, 64>(k, tgid.y*64);
+//         auto tB = B.static_slice<32, tilek>(tgid.x*32, k);
+//
+//         matmulOp.run(tA, tB, tC);
+//       }
+//
+//       // handle any remainder in K
+//       auto tA = A.static_slice<dynamic_extent, 64>(k, tgid.y*64);
+//       auto tB = B.static_slice<32, dynamic_extent>(tgid.x*32, k);
+//       matmulOp.run(tA, tB, tC);
+//     }
+//     else
+//     {
+//       auto tA = A.offset(0, tgid.y*64);
+//       auto tB = B.offset(tgid.x*32, 0);
+//       auto tC = C.offset(tgid.x*32, tgid.y*64);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+// }
+//
+// Often we need to do some post-processing on computed results before
+// storing them to device or threadgroup memory. For example, in machine
+// learning we need to apply an activation function to the computed value.
+// One can do the GEMM as above, write the result to device memory, read the
+// value back, call the post-processing function, and write it out again, but
+// this wastes bandwidth, performance, and power. Instead, the user can apply
+// the post-processing in-register, where the GEMM output is computed into a
+// cooperative_tensor. Unlike tensor_handle, tensor_offset, and tensor_inline,
+// which are non-owning (they are wrappers around a resource in the device,
+// threadgroup, or thread address space), a cooperative_tensor owns thread
+// private data and divides the data of the entire tensor among the threads
+// (participating in the scope of the operation) in an implementation-defined
+// manner. This thread private memory is allocated at construction of the
+// cooperative_tensor and deallocated when the cooperative_tensor goes out of
+// scope. The layout of a cooperative_tensor depends on the operation, the
+// data type, and the number of threads in the opscope with which the op was
+// created. Note that a cooperative_tensor created from an op is only valid
+// for threads that are part of the opscope with which the op was created.
+// Though the layout of a cooperative_tensor is implementation defined, we
+// provide accessor functions as shown in the example below.
+//
+// kernel void simpleMatMulCooperative(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                          tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                          tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                          tensor<device half,  dextents<int32_t, 2>, tensor_handle> bias,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64, 32, 0,
+//                                                           false, false,
+//                                                           false);
+//
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     auto mA = A.offset(0, tgid.y*64);
+//     auto mB = B.offset(tgid.x*32, 0);
+//     auto mC = C.offset(tgid.x*32, tgid.y*64);
+//
+//     // This creates a cooperative destination tensor of float element
+//     // type. Since the matmul op above was created with 4 SIMD-groups, the
+//     // cooperative tensor will divide its data among the threads of these
+//     // 4 SIMD-groups. The layout of data among lanes is implementation
+//     // defined, and not all threads, nor even all elements within a
+//     // thread, need be valid. We provide the valid-element check shown
+//     // below, which developers should use to guard their accesses to the
+//     // elements of the cooperative_tensor.
+//     auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//
+//     // Loop over the elements of the cooperative_tensor owned by "this"
+//     // thread and initialize them to zero. It is imperative for
+//     // performance to include the unroll pragma so the compiler fully
+//     // unrolls the loop.
+//     #pragma unroll full
+//     for (uint16_t i = 0; i < cT.capacity(); ++i) {
+//       if (cT.mask(i))
+//         cT[i] = 0;
+//     }
+//
+//     // execute the operation. All threads compute the matmul cooperatively
+//     // and the results are written to the cooperative_tensor.
+//     matmulOp.run(mA, mB, cT);
+//
+//     // create a cooperative bias tensor with the same layout as the
+//     // destination cooperative_tensor of the matmul
+//     auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//
+//     // load data from the bias tensor_handle into the biasT
+//     // cooperative_tensor, using the layout and distribution of elements
+//     // among the threads of the scope with which the matmul was created
+//     biasT.load(bias);
+//
+//     #pragma unroll full
+//     for (uint16_t i = 0; i < cT.capacity(); ++i) {
+//       if (cT.mask(i)) {
+//         // add bias
+//         cT[i] += biasT[i];
+//
+//         // get the 2-dimensional coordinate of this thread's i-th element
+//         // in the destination's local coordinate system (in this example a
+//         // 32 x 64 tile)
+//         auto idx = cT.multidimensional_indices(i);
+//         // do some operation based on the coordinate values
+//         cT[i] = foo(cT[i], idx);
+//       }
+//     }
+//
+//     // store to the tensor handle
+//     cT.store(mC);
+// }
+//
+// Note on the scope of an operation
+// =================================
+// A tensor operation may be executed entirely on a single thread or
+// cooperatively among a set of SIMD-groups. We call this set of threads the
+// "execution scope" of the tensor operation. A tensor op must be created
+// with its execution scope provided as a template argument. All the threads
+// in this execution scope must enter the run method, i.e. calls to run
+// methods must be "execution scope" uniform. Use the following types to
+// configure the execution modes of each operation:
+//
+// metal::execution_thread - the operation will be run on a single thread.
+//                           Fragment shaders only support this execution
+//                           scope.
+// metal::execution_simdgroup - the operation will be run cooperatively by
+//                              all threads in this SIMD-group. May be used
+//                              for finer control over tiling by slicing
+//                              tensors with SIMD IDs.
+// metal::execution_simdgroups<N> - the operation will be executed
+//                                  cooperatively by N SIMD-groups. Must be
+//                                  used when all threads in a threadgroup
+//                                  are cooperatively performing the
+//                                  operation.
+//
+// It is undefined behavior if the number of SIMD-groups dispatched does not
+// match the number of SIMD-groups that the operation was configured with.
+//
+// Even though each thread in the execution scope can potentially enter and
+// exit the run method independently, the developer cannot assume that the
+// threads in the execution scope are working completely independently, i.e.
+// the tensor operation's run implementation may need (for correctness or
+// performance) to synchronize among the threads in the execution scope it
+// was created with.
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef __MetalTensorOpsMatMul2d__
+#define __MetalTensorOpsMatMul2d__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+#include "__impl/MPPTensorOpsBase.h"
+#include "__impl/MPPTensorOpsTypes.h"
+#include <metal_numeric>
+
+#pragma METAL internals : enable
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+enum class matmul2d_cooperative_operand_index
+{
+  destination,
+};
+
+enum class reduction_operation
+{
+  sum,
+  max,
+  min,
+};
+
+struct matmul2d_descriptor
+{
+  enum class mode
+  {
+    multiply,
+    multiply_accumulate,
+  };
+
+  int m, n, k;
+  bool transpose_left, transpose_right;
+  bool relaxed_precision;
+  mode matmul_mode;
+
+public:
+  constexpr matmul2d_descriptor(int __m, int __n, int __k = dynamic_length_v<int>,
+                                bool __transpose_left = false,
+                                bool __transpose_right = false,
+                                bool __relaxed_precision = false,
+                                mode __matmul_mode = mode::multiply) thread
+      : m(__m),
+        n(__n),
+        k(__k),
+        transpose_left(__transpose_left),
+        transpose_right(__transpose_right),
+        relaxed_precision(__relaxed_precision),
+        matmul_mode(__matmul_mode)
+  {
+  }
+};
+
+template <typename ElementType>
+struct reduction_operation_identity
+{
+  static const constant ElementType sum_identity = (ElementType)0;
+  static const constant ElementType max_identity =
+      metal::numeric_limits<ElementType>::lowest;
+  static const constant ElementType min_identity =
+      metal::numeric_limits<ElementType>::max;
+};
+
+#include "__impl/MPPTensorOpsMatMul2dImpl.h"
+
+template <matmul2d_descriptor Descriptor, typename Scope, class... Args>
+class matmul2d : __tensor_ops_detail::op
+{
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<Scope>,
+                "Scope template argument should be of op_scope type");
+
+public:
+  matmul2d() thread = default;
+
+  template <
+      typename LeftOperandType, typename RightOperandType,
+      typename DestinationOperandType,
+      typename V = __tensor_ops_detail::__enable_if_t<
+          (__tensor_ops_detail::__is_tensor_type_v<LeftOperandType> &&
+           __tensor_ops_detail::__is_tensor_type_v<RightOperandType> &&
+           (__tensor_ops_detail::__is_tensor_type_v<DestinationOperandType> ||
+            __tensor_ops_detail::__is_cooperative_tensor_type_v<
+                DestinationOperandType>))>,
+      typename... RunArgs>
+  INLINE void run(thread LeftOperandType &left, thread RightOperandType &right,
+                  thread DestinationOperandType &destination) thread const
+  {
+
+    __mutmul2d_detail::__run<Descriptor, Scope, LeftOperandType,
+                             RightOperandType, DestinationOperandType,
+                             RunArgs...>(left, right, destination);
+  }
+
+  template <typename ElementType, typename CoordType, typename... CoopArgs>
+  using cooperative_tensor_destination_t =
+      __mutmul2d_detail::__cooperative_tensor_destination_t<
+          Descriptor, Scope, ElementType, CoordType, CoopArgs...>;
+
+  template <typename LeftOperandType, typename RightOperandType,
+            typename ElementType, typename CoordType = int,
+            typename U = __tensor_ops_detail::__enable_if_t<
+                __tensor_ops_detail::__is_tensor_type_v<LeftOperandType> &&
+                __tensor_ops_detail::__is_tensor_type_v<RightOperandType> &&
+                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+                __tensor_ops_detail::__is_integral_v<CoordType>>,
+            typename... CoopArgs>
+  INLINE cooperative_tensor_destination_t<ElementType, CoordType, CoopArgs...>
+  get_destination_cooperative_tensor() thread const
+  {
+
+    return __mutmul2d_detail::__get_destination_cooperative_tensor<
+        Descriptor, Scope, ElementType, CoordType, LeftOperandType,
+        RightOperandType, CoopArgs...>();
+  }
+};
+
+template <class ElementType, class Extents, class Layout>
+inline void reduce_rows(
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &source,
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &destination,
+    reduction_operation op = reduction_operation::sum,
+    ElementType identity =
+        reduction_operation_identity<ElementType>::sum_identity)
+{
+  __mutmul2d_detail::__reduce_rows<ElementType, Extents, Layout>(
+      source, destination, identity, op);
+}
+
+template <class ElementType, class Extents, class Layout>
+inline void reduce_columns(
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &source,
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &destination,
+    reduction_operation op = reduction_operation::sum,
+    ElementType identity =
+        reduction_operation_identity<ElementType>::sum_identity)
+{
+  __mutmul2d_detail::__reduce_columns<ElementType, Extents, Layout>(
+      source, destination, identity, op);
+}
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#pragma METAL internals : disable
+
+#endif
+
+#endif // __MetalTensorOpsMatMul2d__
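
The `reduce_rows` and `reduce_columns` helpers declared above operate on cooperative tensors but ship without a usage example. Here is a hedged sketch of a row-wise maximum over a matmul result. Creating the reduction destination with `get_destination_cooperative_tensor` and the zero-fill loop follow the cooperative-tensor example in the header comments; the assumption that the source and destination of `reduce_rows` can share that layout, and the `execution_simdgroups<4>` scope spelling, are not confirmed by these headers:

```metal
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
using namespace mpp::tensor_ops;

kernel void matMulRowMax(
    tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
    tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
    uint2 tgid [[threadgroup_position_in_grid]])
{
    constexpr auto desc = matmul2d_descriptor(64, 32, 0, false, false, false);
    matmul2d<desc, execution_simdgroups<4>> matmulOp;  // scope spelling assumed

    auto mA = A.offset(0, tgid.y*64);
    auto mB = B.offset(tgid.x*32, 0);

    // Accumulate the 64x32 tile into a cooperative tensor, as in the
    // simpleMatMulCooperative example in the header comments.
    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
    #pragma unroll full
    for (uint16_t i = 0; i < cT.capacity(); ++i)
        if (cT.mask(i))
            cT[i] = 0;
    matmulOp.run(mA, mB, cT);

    // Row-wise maximum: the identity passed in must match the reduction
    // operation, here max_identity (the lowest representable float).
    auto rowMax = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
    reduce_rows(cT, rowMax, reduction_operation::max,
                reduction_operation_identity<float>::max_identity);
    // ... rowMax can now be combined element-wise with cT, e.g. for a
    // numerically stable softmax, before storing the result.
}
```
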
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h	2025-05-26 00:45:42
@@ -0,0 +1,12 @@
+// -*- Metal -*-
+//===-- MetalPerformancePrimitives ------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalPerformancePrimitives__
+#define __MetalPerformancePrimitives__
+
+#include <MetalPerformancePrimitives/MPPTensorOpsConvolution2d.h>
+#include <MetalPerformancePrimitives/MPPTensorOpsMatMul2d.h>
+
+#endif
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h	2025-05-26 00:45:42
@@ -0,0 +1,28 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsBase ------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsBase__
+#define __MetalTensorOpsBase__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+namespace __tensor_ops_detail
+{
+class op
+{
+};
+} // namespace __tensor_ops_detail
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#endif
+#endif
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h	2025-05-23 06:26:11
@@ -0,0 +1,4845 @@
+
+
+// -*- Metal -*-
+//===-- MetalTensorOpsConvolution2dImpl
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsConvolution2dImpl__
+#define __MetalTensorOpsConvolution2dImpl__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace __convolution2d_detail
+{
+
+#ifndef EXTERNALLY_DEFINED_ATTR
+#define EXTERNALLY_DEFINED_ATTR \
+  __attribute__((section("air.externally_defined")))
+#endif
+
+#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
+#define INLINE __attribute__((__always_inline__))
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+
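The run variants above follow a fixed symbol-naming scheme: `__tensorops_impl_convolution2d_op_run_` followed by address-space/element-type token pairs for the activation, weights, and destination tensors (plausibly `dv` for device and `tg` for threadgroup, though the header never spells this out). A minimal host-side sketch, under those assumptions only, that regenerates one block of the declarations above in the order they appear:

```cpp
// Hypothetical sketch (not part of the SDK): reconstructs the symbol names of
// the int8-activation / uint8-weights block above, assuming "dv" and "tg"
// abbreviate the device and threadgroup address spaces and the trailing token
// names the destination element type.
#include <cstdio>

int main() {
    const char *spaces[] = {"dv", "tg"};              // address-space tokens
    const char *dest_types[] = {"f16", "f32", "i32"}; // destination types
    for (const char *ty_d : dest_types)       // destination type (outermost)
        for (const char *as_a : spaces)       // activation address space
            for (const char *as_w : spaces)   // weights address space
                for (const char *as_d : spaces) // destination address space
                    std::printf(
                        "__tensorops_impl_convolution2d_op_run_%s_i8_%s_ui8_%s_%s\n",
                        as_a, as_w, as_d, ty_d);
}
```

Note that the `_cooperative_` variants that follow drop the destination address-space token and the `destination_desc_type` parameter, taking only a bare destination pointer, so only the activation and weights keep the full token pair.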
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f32_dv_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f32_tg_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f32_dv_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f32_tg_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+
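
The suffix of each `__tensorops_impl_convolution2d_op_run_cooperative_*` symbol above encodes the variant it implements: each `dv`/`tg` token marks whether the following operand's data handle lives in the device or threadgroup address space, the first address-space/type pair describes the activation tensor, the second describes the weights tensor, and the trailing token is the destination element type. Judging from the dispatch code below, the type tokens map as i8 = int8_t, ui8 = uint8_t, f16 = half, f32 = float, i32 = int32_t, and bf presumably bfloat. For example, `__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32` is the entry point for a threadgroup-resident int8 activation, a device-resident uint8 weights tensor, and a float cooperative destination.

As a minimal sketch of how the `__run` template below is reached: the tensor handles `act`, `wts`, and `dst`, the `threadgroup_scope` type, and the dimension values are all assumptions for illustration and do not appear in this diff.

    // Hypothetical caller sketch, not part of the SDK header.
    // A valid 3x3 stride-1 convolution: 1x34x34x32 NHWC source ->
    // 1x32x32x64 NHWO destination (dimension packing follows the
    // descriptor comments at the top of this header).
    // 'threadgroup_scope' stands in for the full-threadgroup execution
    // scope type consumed as 'scope()' by __run; its real name is not
    // visible in this diff.
    mpp::tensor_ops::convolution2d_descriptor desc(
        int4(64, 32, 32, 1), // destination: {out channels, W, H, batch}
        int4(32, 34, 34, 1), // source:      {in channels, W, H, batch}
        int2(3, 3));         // kernel:      {W, H} (assumed ordering)
    mpp::tensor_ops::__run<threadgroup_scope>(act, wts, dst, desc,
                                              int2(0, 0));

In practice the public convolution2d op wrapping `__run` would be used rather than calling `__run` directly; the sketch only illustrates the dispatch path.
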
+template <typename scope, typename activation_type, typename weights_type,
+          typename destination_type, typename... run_args>
+void __run(thread activation_type &activation_tensor,
+           thread weights_type &weights_tensor,
+           thread destination_type &destination_tensor,
+           thread convolution2d_descriptor &__descriptor, int2 __offset)
+{
+  using activation_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(activation_tensor)>>;
+  using weights_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(weights_tensor)>>;
+  using destination_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(destination_tensor)>>;
+
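+  // The scope instance supplies the number of threads cooperating in
+  // this op (currently only a full-threadgroup scope is supported,
+  // per the note at the top of this header).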
+  metal::execution_threads t = scope();
+  int threads = t.size();
+
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<activation_tensor_type>,
+                "Activation must be a tensor");
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<weights_tensor_type>,
+                "Weights must be a tensor");
+  static_assert(
+      __tensor_ops_detail::__is_tensor_type_v<destination_tensor_type> ||
+          __tensor_ops_detail::__is_cooperative_tensor_type_v<
+              destination_tensor_type>,
+      "Destination must be a tensor or cooperative tensor");
+
+  static_assert(__tensor_ops_detail::__get_rank<activation_tensor_type>() == 4,
+                "Activation must be rank 4");
+  static_assert(__tensor_ops_detail::__get_rank<weights_tensor_type>() == 4,
+                "Weights must be rank 4");
+  static_assert(__tensor_ops_detail::__get_rank<destination_tensor_type>() == 4,
+                "Destination must be rank 4");
+
+  static_assert(__tensor_ops_detail::__is_same_v<
+                    typename activation_tensor_type::index_type, int>,
+                "Index type must be int");
+  static_assert(
+      __tensor_ops_detail::__is_same_v<typename weights_tensor_type::index_type,
+                                       int>,
+      "Index type must be int");
+  static_assert(__tensor_ops_detail::__is_same_v<
+                    typename destination_tensor_type::index_type, int>,
+                "Index type must be int");
+
+  using activation_ptr_type = typename activation_tensor_type::data_handle_type;
+  using weights_ptr_type = typename weights_tensor_type::data_handle_type;
+  using destination_ptr_type =
+      typename destination_tensor_type::data_handle_type;
+
+  using activation_value_type = typename activation_tensor_type::value_type;
+  using weights_value_type = typename weights_tensor_type::value_type;
+  using destination_value_type = typename destination_tensor_type::value_type;
+
+  auto activation = (thread void *)(&activation_tensor);
+  auto weights = (thread void *)(&weights_tensor);
+  auto offset = __offset;
+
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+      activation_desc_type =
+          __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+              activation_tensor_type>();
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+      weights_desc_type =
+          __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+              weights_tensor_type>();
+
+  convolution2d_descriptor desc = __descriptor;
+
+  if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v<
+                    destination_tensor_type>)
+  {
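+    // For a cooperative-tensor destination, the address of its
+    // reserved element is passed to the impl entry points, apparently
+    // acting as an opaque handle.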
+    thread void *destination =
+        &destination_tensor[__tensor_ops_detail::__tensor_ops_reserved_index];
+
+    if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                   half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
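+    // Each branch below repeats the same four device/threadgroup
+    // address-space combinations for the next supported
+    // (activation, weights, destination) element-type triple.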
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
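+    // bfloat variants exist only when the toolchain defines __HAVE_BFLOAT__;
+    // otherwise these branches drop out of the constexpr chain entirely.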
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#endif
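+    // Full-precision path: float activations and weights always write a
+    // float cooperative destination.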
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f32_dv_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f32_dv_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f32_tg_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f32_tg_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else
+      static_assert(
+          __tensor_ops_detail::__assert_false_v<destination_value_type>,
+          "Unsupported type");
+  }
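+  // Non-cooperative path: the destination is an ordinary tensor, so its
+  // descriptor type and address space join the dispatch, and every intrinsic
+  // takes an additional destination descriptor argument.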
+  else
+  {
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type =
+            __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                destination_tensor_type>();
+
+    thread void *destination = (thread void *)(&destination_tensor);
+
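+    // For each supported type combination, the chain below fans out over all
+    // eight device/threadgroup placements of activation, weights, and
+    // destination (dv = device, tg = threadgroup in the intrinsic names).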
+    if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                   half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
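+    // int8 x int8 also accumulates into float and int32 destinations, with
+    // the same eight-way address-space fan-out.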
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
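+    // Mixed-signedness is supported in both orders: uint8 activations with
+    // int8 weights (above) and int8 activations with uint8 weights (below).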
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
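+    // uint8 activation, uint8 weights -> half destination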
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
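+    // uint8 activation, uint8 weights -> float destination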
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
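+    // uint8 activation, uint8 weights -> int32_t destination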
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
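+    // half activation, half weights -> half destination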
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
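+    // half activation, half weights -> float destination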
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#if __HAVE_BFLOAT__
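+    // bfloat activation, bfloat weights -> bfloat destination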
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        bfloat>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
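+    // bfloat activation, bfloat weights -> float destination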
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#endif
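+    // float activation, float weights -> float destination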
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else
+      static_assert(
+          __tensor_ops_detail::__assert_false_v<destination_value_type>,
+          "Unsupported type");
+  }
+}
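+
+// ---------------------------------------------------------------------------
+// [Editorial sketch, not part of the SDK header] The if-constexpr ladder
+// above covers, in this stretch, int8 or uint8 activations with uint8
+// weights producing int32_t (and, for uint8, also half/float), half
+// producing half/float, bfloat producing bfloat/float when __HAVE_BFLOAT__,
+// and float producing float, across every device ("dv") / threadgroup ("tg")
+// placement of the three operands. A minimal calling kernel could look like
+// the sketch below; the op spelling `convolution2d<desc, ...>` and the scope
+// type are assumptions inferred from this header and TensorOpsMatMul2d.h,
+// not verified API.
+//
+//   using namespace mpp::tensor_ops;
+//   // 3x3 valid convolution: 18x18x16 NHWC source -> 16x16x32 NHWO dest.
+//   constexpr convolution2d_descriptor desc(int4(32, 16, 16, 1),
+//                                           int4(16, 18, 18, 1),
+//                                           int2(3, 3));
+//   kernel void conv2d_f16(tensor<device half,  dextents<int, 4>> act,
+//                          tensor<device half,  dextents<int, 4>> wts,
+//                          tensor<device float, dextents<int, 4>> dst)
+//   {
+//     convolution2d<desc, metal::execution_threadgroup> op; // hypothetical
+//     op.run(act, wts, dst); // resolves to ..._run_dv_f16_dv_f16_dv_f32
+//   }
+// ---------------------------------------------------------------------------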
+
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR size_t
+__tensorops_impl_conv2d_cooperative_destination_data_size(
+    const thread convolution2d_descriptor &descriptor,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR uint16_t
+__tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
+    const thread convolution2d_descriptor &descriptor,
+    __tensor_ops_detail::__const_thread_void_t, int threads);
+extern "C" TENSOROPS_EXPORT
+    EXTERNALLY_DEFINED_ATTR __tensor_ops_detail::__thread_void_t
+    __tensorops_impl_conv2d_cooperative_destination_tensor_elements(
+        __tensor_ops_detail::__thread_void_t, uint16_t,
+        __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+    const thread convolution2d_descriptor &descriptor,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_init(
+    __tensor_ops_detail::__thread_void_t,
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_move(
+    __tensor_ops_detail::__thread_void_t, __tensor_ops_detail::__thread_void_t,
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_destory(
+    __tensor_ops_detail::__thread_void_t,
+    const thread convolution2d_descriptor &, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR bool
+__tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
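+// ---------------------------------------------------------------------------
+// [Editorial sketch, not part of the SDK header] The entry points above back
+// the lifetime and element queries of a cooperative destination tensor; the
+// __operand_layout struct further below wires them up roughly as follows
+// (names abbreviated to their suffixes, arguments ordered as declared above):
+//
+//   size_t bytes = ...data_size(desc, d_dt, a_dt, w_dt, threads);
+//   // caller supplies `bytes` bytes of per-thread storage, then:
+//   ...tensor_init(storage, desc, d_dt, a_dt, w_dt, threads);
+//   uint16_t n = ...tensor_num_elements(desc, storage, threads);
+//   for (uint16_t i = 0; i < n; ++i)
+//     if (...tensor_is_valid_element(desc, storage, i, d_dt, threads))
+//       ...tensor_get_coordinate(desc, storage, i, d_dt, coords, coord_dt,
+//                                threads);
+//   ...tensor_destory(storage, desc, threads); // spelling as declared above
+// ---------------------------------------------------------------------------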
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
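+// [Editorial note, not part of the SDK header] The eight load entry points
+// above vary only in source address space (dv = device, tg = threadgroup)
+// and element type (f32/i32/f16/bf); __operand_layout::load below selects
+// one with the same if-constexpr pattern the run() dispatch uses.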
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
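+// [Editorial note, not part of the SDK header] The store entry points mirror
+// the loads, with the same dv/tg and element-type suffixes, but read from
+// const storage and take no rank argument: loads accept rank-1 or rank-4
+// sources, while stores are described solely by the destination descriptor.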
+
+template <convolution2d_descriptor descriptor,
+          convolution2d_cooperative_operand operand, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+struct __operand_layout
+{
+  static_assert(operand == convolution2d_cooperative_operand::destination,
+                "only destination can be cooperative tensor");
+  static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
+                    __tensor_ops_detail::__is_same_v<element_type, half> ||
+#if __HAVE_BFLOAT__
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
+#endif
+                    __tensor_ops_detail::__is_same_v<element_type, int32_t>,
+                "cooperative tensor data type can only be one of "
+                "float/half/bfloat/int32_t");
+
+  static constant constexpr __tensor_ops_detail::__rank_t rank = 4;
+  using element_t = element_type;
+  using coord_t = coord_type;
+  using extent_t = metal::dextents<coord_t, rank>;
+  using thread_storage_t = thread void *;
+  using const_thread_storage_t = const thread void *;
+  using index_t = uint16_t;
+  using operand_layout_t =
+      __operand_layout<descriptor, operand, scope, activation_operand_type,
+                       weights_operand_type, element_type, coord_type,
+                       coop_args...>;
+  using cooperative_tensor_t =
+      metal::cooperative_tensor<element_type, extent_t, operand_layout_t>;
+
+  using a_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<activation_operand_type>>;
+  using w_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<weights_operand_type>>;
+
+  using a_elem_type = typename a_type::element_type;
+  using w_elem_type = typename w_type::element_type;
+
+  static size_t thread_storage_size()
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+    return __tensorops_impl_conv2d_cooperative_destination_data_size(
+        descriptor, d_data_type, a_data_type, w_data_type, threads);
+  }
+
+  static constexpr size_t thread_storage_align()
+  {
+    return alignof(element_t);
+  };
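+  // [Editorial note, not part of the SDK header] thread_storage_size() and
+  // thread_storage_align() together describe the opaque per-thread backing
+  // store for this layout: the size is a runtime query that depends on the
+  // descriptor, all three element types, and the thread count of `scope`,
+  // while the alignment is simply that of the destination element type.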
+
+  static uint16_t size(const_thread_storage_t storage)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    return __tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
+        descriptor, storage, threads);
+  }
+
+  static void construct(thread void *this_)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_init(
+        this_, descriptor, d_data_type, a_data_type, w_data_type, threads);
+  }
+
+  static void copy_construct(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  static void move_construct(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_move(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  static void copy_assign(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  static void move_assign(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_move(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  // Destroys the per-thread object.
+  static void destroy(thread void *this_)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensorops_impl_conv2d_cooperative_destination_tensor_destory(
+        this_, descriptor, threads);
+  }
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void load(thread_storage_t storage,
+                   const thread metal::tensor<ElemType, Extents, Descriptor,
+                                              Tags...> &sourceT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_type>,
+                  "Source tensor datatype does not match cooperative tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == 4,
+                  "Source tensor must be rank 1 or 4");
+
+    int sourceRank = Extents::rank();
+
+    convolution2d_descriptor desc = descriptor;
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using sourcePtrType = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType =
+        __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+            tensorType>();
+
+    const thread void *source = (const thread void *)(&sourceT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void store(const_thread_storage_t storage,
+                    const thread metal::tensor<ElemType, Extents, Descriptor,
+                                               Tags...> &destinationT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_type>,
+                  "Tensor datatype does not match cooperative tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == rank,
+                  "Tensor must be rank 1 or 4");
+
+    convolution2d_descriptor desc = descriptor;
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using destination_ptr_type = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type =
+            __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                tensorType>();
+
+    const thread void *destination = (const thread void *)(&destinationT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
+  static thread element_t *get_pointer_to(const_thread_storage_t storage,
+                                          index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+                    "unsupported data type");
+
+    return (thread element_t *)
+        __tensorops_impl_conv2d_cooperative_destination_tensor_elements(
+            (thread_storage_t)storage, idx, dataType, threads);
+  };
+
+  static bool mask(const_thread_storage_t storage, index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+                    "unsupported data type");
+
+    return __tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
+        descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+        dataType, threads);
+  }
+
+  template <typename index_t, __tensor_ops_detail::__rank_t rank>
+  static metal::array<index_t, rank>
+  multidimensional_indices(const_thread_storage_t storage, index_t idx)
+  {
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    static_assert(rank == 4, "multidimensional_indices returns 4D indices");
+
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+                    "unsupported data type");
+
+    if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
+    {
+      ushort coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
+    {
+      short coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
+    {
+      uint coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
+    {
+      int coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<coord_t>,
+                    "unsupported coordinate type");
+  }
+};
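+
+// Illustrative sketch (editorial, not SDK text): the static hooks above,
+// size(), mask(), get_pointer_to(), and multidimensional_indices(), give the
+// cooperative destination tensor its element-wise view. Assuming an
+// instantiated layout `layout_t` and its per-thread storage `st` (hypothetical
+// names), an epilogue such as a ReLU could be written against them roughly as:
+//
+//   for (uint16_t i = 0, n = layout_t::size(st); i < n; ++i)
+//   {
+//     if (layout_t::mask(st, i))           // does this thread hold element i?
+//     {
+//       thread auto *e = layout_t::get_pointer_to(st, i);
+//       *e = metal::max(*e, element_t(0)); // hypothetical activation
+//     }
+//   }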
+
+template <convolution2d_descriptor descriptor,
+          convolution2d_cooperative_operand operand, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+using __cooperative_tensor_t = typename __operand_layout<
+    descriptor, operand, scope, activation_operand_type, weights_operand_type,
+    element_type, coord_type, coop_args...>::cooperative_tensor_t;
+
+template <convolution2d_descriptor descriptor, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+using __cooperative_tensor_destination_t =
+    __cooperative_tensor_t<descriptor,
+                           convolution2d_cooperative_operand::destination,
+                           scope, activation_operand_type, weights_operand_type,
+                           element_type, coord_type, coop_args...>;
+
+template <convolution2d_descriptor descriptor, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+__cooperative_tensor_destination_t<descriptor, scope, activation_operand_type,
+                                   weights_operand_type, element_type,
+                                   coord_type, coop_args...>
+__get_destination_cooperative_tensor()
+{
+  return __cooperative_tensor_destination_t<
+      descriptor, scope, activation_operand_type, weights_operand_type,
+      element_type, coord_type, coop_args...>();
+}
+
+#undef EXTERNALLY_DEFINED_ATTR
+
+} // namespace __convolution2d_detail
+
+#endif
+
+#endif // __MetalTensorOpsConvolution2d__
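
The load/store members in the convolution header above, and the matmul implementation header that follows, fan out to C entry points whose names encode each operand's address space (`dv` for device, `tg` for threadgroup) and element type (`f16`, `f32`, `i32`, `bf`). Below is a minimal, standalone C++17 sketch of that dispatch shape, with stand-in names, since the real `__tensorops_impl_*` symbols are resolved externally:

```cpp
#include <cstdio>
#include <type_traits>

// Stand-ins for Metal's address-space tags and half type; assumptions chosen
// only so this sketch compiles as plain C++17.
struct device_ptr {};
struct threadgroup_ptr {};
struct half_t {};

// Mirrors the header's __assert_false_v idiom for discarded-branch asserts.
template <class> inline constexpr bool assert_false_v = false;

// Stand-ins for the __tensorops_impl_*_load_{dv,tg}_{f16,f32,i32} entry
// points, which are externally defined in the real framework.
void impl_load_dv_f16() { std::puts("load_dv_f16"); }
void impl_load_tg_f16() { std::puts("load_tg_f16"); }
void impl_load_dv_f32() { std::puts("load_dv_f32"); }
void impl_load_tg_f32() { std::puts("load_tg_f32"); }
void impl_load_dv_i32() { std::puts("load_dv_i32"); }
void impl_load_tg_i32() { std::puts("load_tg_i32"); }

// One generic front end picks the suffixed entry point at compile time; this
// is the same shape as the if-constexpr chains in load()/store() above.
template <class Elem, class Ptr>
void load_dispatch()
{
  constexpr bool dv = std::is_same_v<Ptr, device_ptr>;
  if constexpr (std::is_same_v<Elem, half_t>)
    dv ? impl_load_dv_f16() : impl_load_tg_f16();
  else if constexpr (std::is_same_v<Elem, float>)
    dv ? impl_load_dv_f32() : impl_load_tg_f32();
  else if constexpr (std::is_same_v<Elem, int>)
    dv ? impl_load_dv_i32() : impl_load_tg_i32();
  else
    static_assert(assert_false_v<Elem>, "Unsupported type");
}

int main()
{
  load_dispatch<float, device_ptr>();       // prints load_dv_f32
  load_dispatch<half_t, threadgroup_ptr>(); // prints load_tg_f16
}
```

The same pattern scales to the full cross product of operand placements and element types declared in the implementation header below.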
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2025-05-23 06:26:11
@@ -0,0 +1,5131 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsMatMul2dImpl
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsMatMul2dImpl__
+#define __MetalTensorOpsMatMul2dImpl__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace __mutmul2d_detail
+{
+
+#ifndef EXTERNALLY_DEFINED_ATTR
+#define EXTERNALLY_DEFINED_ATTR \
+  __attribute__((section("air.externally_defined")))
+#endif
+
+#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
+#define INLINE __attribute__((__always_inline__))
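+
+// Note (editorial): the __tensorops_impl_* functions declared below are
+// prototypes only; the air.externally_defined section attribute marks them to
+// be resolved against the Metal tensor-ops runtime library when a pipeline
+// that uses this op is compiled.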
+
+using __matmul2d_descriptor = matmul2d_descriptor;
+
+using __matmul2d_cooperative_operand_index = matmul2d_cooperative_operand_index;
+
+using __reduction_operation = reduction_operation;
+
+extern "C" EXTERNALLY_DEFINED_ATTR size_t
+__tensorops_impl_matmul2d_op_cooperative_destination_data_size(
+    const __matmul2d_descriptor descriptor, const int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_num_elements(
+    const __matmul2d_descriptor descriptor, const int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR thread void *
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_elements(
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+    const __matmul2d_descriptor descriptor,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype, thread void *,
+    __tensor_ops_detail::__tensor_ops_datatype, const int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_init(
+    __tensor_ops_detail::__thread_void_t, __matmul2d_descriptor,
+    __tensor_ops_detail::__tensor_ops_datatype, const int);
+extern "C" EXTERNALLY_DEFINED_ATTR bool
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_is_valid_element(
+    const __matmul2d_descriptor descriptor,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype, const int);
+
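+// (Editorial summary of the naming scheme.) Each suffix pair below encodes an
+// operand's address space (dv = device, tg = threadgroup) and element type
+// (f16, f32, i8, i32, and, where supported, bf). Load/store variants name the
+// tensor being copied from or to; for the run variants the operand order is
+// left, right, destination, so __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32
+// takes a threadgroup f16 left tensor, a device i8 right tensor, and a device
+// f32 destination.
+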
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f16(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_i32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_i32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, half identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, float identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_i32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, int identity, __reduction_operation op);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, half identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, float identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_i32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, int identity, __reduction_operation op);
+
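+// Illustrative sketch (editorial): these reductions fold a cooperative
+// destination tensor along rows or columns, seeded with a caller-supplied
+// identity value. A row sum over an f32 destination would look roughly like:
+//
+//   __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
+//       desc, srcStorage, dstStorage, /*identity*/ 0.0f,
+//       /*op*/ addOp); // a reduction_operation value; the enumerators are
+//                      // defined elsewhere in these headers
+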
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+
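
The _cooperative_ entry points that follow differ from the block above in one way: the destination pointer is passed without a descriptor-type tag, consistent with the destination being a cooperative tensor rather than a descriptor-described one. The symbol suffixes encode each operand as an address-space tag plus an element type; the tags are presumably dv = device and tg = threadgroup, with th = thread appearing only in the single-thread variants further below. A small sketch, under those naming assumptions, that reconstructs one of the declared symbols:

    // Hypothetical sketch (ours) of the suffix scheme these entry points
    // appear to follow: <leftAS>_<leftTy>_<rightAS>_<rightTy>_<destTy>
    // for the cooperative variants.
    #include <iostream>
    #include <string>

    enum class addr_space { device_mem, threadgroup_mem, thread_mem };

    static std::string tag(addr_space a) {
      switch (a) {
      case addr_space::device_mem:      return "dv";
      case addr_space::threadgroup_mem: return "tg";
      case addr_space::thread_mem:      return "th";
      }
      return "";
    }

    static std::string cooperative_symbol(addr_space leftAS, const std::string &leftTy,
                                          addr_space rightAS, const std::string &rightTy,
                                          const std::string &destTy) {
      return "__tensorops_impl_matmul2d_op_run_cooperative_" +
             tag(leftAS) + "_" + leftTy + "_" + tag(rightAS) + "_" + rightTy +
             "_" + destTy;
    }

    int main() {
      // Prints one of the names declared below:
      // __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32
      std::cout << cooperative_symbol(addr_space::device_mem, "f16",
                                      addr_space::threadgroup_mem, "f16", "f32")
                << "\n";
    }
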
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+
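
The _single_thread_ entry points below keep all three descriptor-type parameters but drop the trailing int threads argument, and they are the only family whose suffixes use the th operand tag. A hypothetical dispatch sketch (the scope enum and helper here are illustrative, not SDK API) of how a wrapper might choose among the three families:

    // Hypothetical sketch (ours): picking an entry-point family by scope.
    #include <stdexcept>

    enum class exec_scope { threadgroup_scope, cooperative_dest, single_thread };

    const char *family_prefix(exec_scope s) {
      switch (s) {
      case exec_scope::threadgroup_scope:
        // Full signature: three descriptor-tagged operands plus a thread count.
        return "__tensorops_impl_matmul2d_op_run_";
      case exec_scope::cooperative_dest:
        // Destination is a cooperative tensor, so it carries no descriptor tag.
        return "__tensorops_impl_matmul2d_op_run_cooperative_";
      case exec_scope::single_thread:
        // One executing thread, so the trailing threads parameter is gone.
        return "__tensorops_impl_matmul2d_op_run_single_thread_";
      }
      throw std::logic_error("unreachable");
    }
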
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
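+
+// Naming convention for the run specializations above:
+//   __tensorops_impl_matmul2d_op_run_single_thread_<l>_<lt>_<r>_<rt>_<d>_<dt>
+// where <l>/<r>/<d> encode the address space of the left/right/destination
+// operand (dv = device, th = thread) and <lt>/<rt>/<dt> encode its element
+// type (f16 = half, f32 = float, i8 = int8_t, i32 = int32_t).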
+
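+// __operand_layout describes the per-thread storage layout of a cooperative
+// destination tensor for a matmul2d op; metal::cooperative_tensor calls these
+// hooks for construction, copying, load/store, and coordinate queries.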
+template <__matmul2d_descriptor descriptor,
+          __matmul2d_cooperative_operand_index operand_index, typename scope,
+          typename element_type, typename coord_type, typename... args>
+struct __operand_layout
+{
+
+  static_assert(operand_index ==
+                    matmul2d_cooperative_operand_index::destination,
+                "only destination can be cooperative tensor");
+  static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
+                    __tensor_ops_detail::__is_same_v<element_type, half> ||
+#if __HAVE_BFLOAT__
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
+#endif
+                    __tensor_ops_detail::__is_same_v<element_type, int32_t>,
+                "cooperative tensor data type can only be one of "
+                "float/half/bfloat/int32_t");
+
+  static constant constexpr __tensor_ops_detail::__rank_t rank = 2;
+  using element_t = element_type;
+  using coord_t = coord_type;
+  using extent_t = metal::dextents<coord_t, rank>;
+  using thread_storage_t = thread void *;
+  using const_thread_storage_t = const thread void *;
+  using index_t = uint16_t;
+  using operand_layout_t =
+      __operand_layout<descriptor, operand_index, scope, element_t, coord_t>;
+  using cooperative_tensor_t =
+      metal::cooperative_tensor<element_t, extent_t, operand_layout_t>;
+  using scope_t = scope;
+
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+                "scope should be of type __tensorops_scope");
+
+  static constexpr constant bool is_matmul2d_cooperative_destination_layout =
+      true;
+
+  static constexpr constant __matmul2d_descriptor matmul2d_desc = descriptor;
+
+  // Returns the alignment of the storage allocated in each thread
+  // for this cooperative_tensor.
+  static constexpr size_t thread_storage_align()
+  {
+    return alignof(element_t);
+  };
+
+  // Copy-constructs from the cooperative_tensor `other`.
+  static void copy_construct(thread void *this_, thread void *other)
+  {
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Move-constructs from the cooperative_tensor `other`.
+  static void move_construct(thread void *this_, thread void *other)
+  {
+    // Elements are trivially copyable, so a move is an element-wise copy.
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Copy-assigns from the cooperative_tensor `other`.
+  static void copy_assign(thread void *this_, thread void *other)
+  {
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Move-assigns from the cooperative_tensor `other`.
+  static void move_assign(thread void *this_, thread void *other)
+  {
+    // Elements are trivially copyable, so a move is an element-wise copy.
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Destroys the per-thread object.
+  static void destroy(thread void *) {};
+
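+  // Returns the per-thread storage size in bytes for this cooperative_tensor.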
+  static size_t thread_storage_size()
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    return __tensorops_impl_matmul2d_op_cooperative_destination_data_size(
+        descriptor, threads);
+  }
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void load(thread_storage_t storage,
+                   const thread metal::tensor<ElemType, Extents, Descriptor,
+                                              Tags...> &sourceT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_t>,
+                  "Source tensor datatype does not match cooperative tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == 2,
+                  "Source tensor must be rank 1 or 2");
+
+    int sourceRank = Extents::rank();
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    __matmul2d_descriptor desc = descriptor;
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using sourcePtrType = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType =
+        __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+            tensorType>();
+
+    const thread void *source = (const thread void *)(&sourceT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void store(const_thread_storage_t storage,
+                    const thread metal::tensor<ElemType, Extents, Descriptor,
+                                               Tags...> &destinationT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_t>,
+                  "Destination tensor datatype does not match cooperative "
+                  "tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == rank,
+                  "Destination tensor must be rank 1 or 2");
+
+    __matmul2d_descriptor desc = descriptor;
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using destinationPtrType = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType =
+            __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                tensorType>();
+
+    const thread void *destination = (const thread void *)(&destinationT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
+            desc, storage, destination, destinationDescType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f16(
+            desc, storage, destination, destinationDescType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_i32(
+            desc, storage, destination, destinationDescType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_i32(
+            desc, storage, destination, destinationDescType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f32(
+            desc, storage, destination, destinationDescType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f32(
+            desc, storage, destination, destinationDescType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
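+  // Returns the number of destination elements owned by the calling thread.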
+  static uint16_t size(const_thread_storage_t storage)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    return __tensorops_impl_matmul2d_op_cooperative_destination_tensor_num_elements(
+        descriptor, threads);
+  }
+
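+  // Returns a pointer to the idx-th element in this thread's storage.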
+  static thread element_t *get_pointer_to(const_thread_storage_t storage,
+                                          index_t idx)
+  {
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    return (thread element_t *)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_elements(
+            (thread_storage_t)storage, idx, dataType);
+  };
+
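+  // Returns true when the idx-th element maps to a valid element of the
+  // destination tensor.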
+  static bool mask(const_thread_storage_t storage, index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    return __tensorops_impl_matmul2d_op_cooperative_destination_tensor_is_valid_element(
+        descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+        dataType, threads);
+  }
+
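+  // Returns the 2-d destination coordinates of the idx-th element in this
+  // thread's storage.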
+  template <typename index_t, __tensor_ops_detail::__rank_t rank = 2>
+  static metal::array<index_t, rank>
+  multidimensional_indices(const_thread_storage_t storage, index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
+    {
+      ushort coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
+    {
+      short coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
+    {
+      uint coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
+    {
+      int coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<coord_t>,
+                    "unsupported coordinate type");
+  }
+
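+  // Initializes the per-thread storage for this cooperative_tensor.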
+  static void construct(thread_storage_t storage)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    __tensorops_impl_matmul2d_op_cooperative_destination_tensor_init(
+        (__tensor_ops_detail::__thread_void_t)storage, descriptor, dataType,
+        threads);
+  }
+};
+
+template <__matmul2d_descriptor descriptor,
+          __matmul2d_cooperative_operand_index operand_index, typename scope,
+          typename element_type, typename coord_type, typename... args>
+using __cooperative_tensor_t =
+    typename __operand_layout<descriptor, operand_index, scope, element_type,
+                              coord_type, args...>::cooperative_tensor_t;
+
+template <__matmul2d_descriptor descriptor, typename scope,
+          typename element_type, typename coord_type, typename... args>
+using __cooperative_tensor_destination_t =
+    __cooperative_tensor_t<descriptor,
+                           matmul2d_cooperative_operand_index::destination,
+                           scope, element_type, coord_type, args...>;
+
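+// Creates an empty cooperative destination tensor for the given descriptor,
+// scope, and element/coordinate types.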
+template <__matmul2d_descriptor descriptor, typename scope,
+          typename element_type, typename coord_type, typename left_operand,
+          typename right_operand, typename... args>
+__cooperative_tensor_destination_t<descriptor, scope, element_type, coord_type,
+                                   args...>
+__get_destination_cooperative_tensor()
+{
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+                "scope should be of type __tensorops_scope");
+  return __cooperative_tensor_destination_t<descriptor, scope, element_type,
+                                            coord_type, args...>();
+}
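+
+// A minimal usage sketch of a cooperative destination tensor (illustrative
+// only; the public matmul2d op and the member spellings below are assumptions
+// inferred from the internal hooks in this header, not verbatim API):
+//
+//   constexpr matmul2d_descriptor desc(/*M*/ 32, /*N*/ 32, /*K*/ 16);
+//   matmul2d<desc, execution_simdgroups<4>> op;
+//   auto d = op.get_destination_cooperative_tensor();
+//   op.run(a, b, d);                        // accumulate into registers
+//   for (uint16_t i = 0; i < d.size(); ++i) // elementwise epilogue
+//     if (d.mask(i))
+//       d[i] = metal::max(d[i], 0.0f);      // e.g. fuse a ReLU
+//   d.store(c);                             // write results to memory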
+
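+// Dispatches a matmul2d run to the extern "C" specialization matching the
+// operands' element types and address spaces; the destination may be a
+// regular tensor or a cooperative tensor.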
+template <__matmul2d_descriptor descriptor, typename scope,
+          typename left_operand, typename right_operand,
+          typename destination_operand, typename... args>
+void __run(thread left_operand &leftIn, thread right_operand &rightIn,
+           thread destination_operand &destinationT)
+{
+  using leftTensorType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(leftIn)>>;
+  using rightTensorType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(rightIn)>>;
+  using destinationTensorType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(destinationT)>>;
+
+  metal::execution_threads t = scope();
+  int threads = t.size();
+
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<leftTensorType>,
+                "Left operand must be a tensor");
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<rightTensorType>,
+                "Right operand must be a tensor");
+  static_assert(
+      __tensor_ops_detail::__is_tensor_type_v<destinationTensorType> ||
+          __tensor_ops_detail::__is_cooperative_tensor_type_v<
+              destinationTensorType>,
+      "Destination operand must be a tensor or cooperative tensor");
+
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+                "scope should be of type __tensorops_scope");
+
+  static_assert(__tensor_ops_detail::__get_rank<leftTensorType>() == 2,
+                "Operand must have rank 2");
+  static_assert(__tensor_ops_detail::__get_rank<rightTensorType>() == 2,
+                "Operand must have rank 2");
+  static_assert(__tensor_ops_detail::__get_rank<destinationTensorType>() == 2,
+                "Operand must have rank 2");
+
+  static_assert(
+      __tensor_ops_detail::__is_same_v<typename leftTensorType::index_type,
+                                       int>,
+      "Index type must be int");
+  static_assert(
+      __tensor_ops_detail::__is_same_v<typename rightTensorType::index_type,
+                                       int>,
+      "Index type must be int");
+  static_assert(__tensor_ops_detail::__is_same_v<
+                    typename destinationTensorType::index_type, int>,
+                "Index type must be int");
+
+  using leftPtrType = typename leftTensorType::data_handle_type;
+  using rightPtrType = typename rightTensorType::data_handle_type;
+  using destinationPtrType = typename destinationTensorType::data_handle_type;
+
+  using leftValueType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<
+          typename leftTensorType::element_type>>;
+  using rightValueType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<
+          typename rightTensorType::element_type>>;
+  using destinationValueType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<
+          typename destinationTensorType::element_type>>;
+
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType =
+      __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+          leftTensorType>();
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType =
+      __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+          rightTensorType>();
+
+  thread void *left = (thread void *)(&leftIn);
+  thread void *right = (thread void *)(&rightIn);
+
+  __matmul2d_descriptor desc = descriptor;
+
+  // Single-thread scope: select the specialization for the operands' element
+  // types and address spaces.
+  if constexpr (__tensor_ops_detail::__is_same_v<scope, metal::execution_thread>)
+  {
+    if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
+                      destinationTensorType>)
+    {
+      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+          destinationDescType =
+              __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                  destinationTensorType>();
+
+      thread void *destination = (thread void *)(&destinationT);
+
+      if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<destinationValueType,
+                                                     half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
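+      // f16 x f32 -> f32: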
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
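+      // f16 x i8 -> f32: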
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
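+      // f32 x f16 -> f32: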
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
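+      // f32 x f32 -> f32: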
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
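+      // f32 x i8 -> f32: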
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
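+      // i8 x f16 -> f32: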
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
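+      // i8 x f32 -> f32: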
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
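+      // i8 x i8 -> i32 (integer accumulation):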
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, int32_t>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destinationValueType>,
+            "Unsupported type");
+    }
+    else
+      static_assert(
+          __tensor_ops_detail::__assert_false_v<destinationTensorType>,
+          "destination cannot be cooperative tensor with cooperative group of "
+          "size 1");
+  }
+  else
+  {
+    // multiple threads
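+    // Annotation: in the multi-thread path, operands may also live in
+    // threadgroup memory (tg in the specialization names), and the
+    // execution-group handle `threads` is forwarded to each implementation.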
+    if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
+                      destinationTensorType>)
+    {
+      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+          destinationDescType =
+              __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                  destinationTensorType>();
+
+      thread void *destination = (thread void *)(&destinationT);
+
+      if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<destinationValueType,
+                                                     half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
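+      // f16 x i8 -> f16: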
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
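+      // i8 x f16 -> f16: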
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
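+      // f16 x f16 -> f32: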
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
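+      // f16 x f32 -> f32: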
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
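The branches above are the element-type and address-space dispatch behind the matmul2d op's `run` method. For orientation, a kernel-side caller might look like the sketch below; the `matmul2d_descriptor(M, N, K)` constructor arguments and the `execution_thread` scope name are assumptions for illustration, not taken verbatim from this diff.

```c++
// Minimal sketch, not verbatim SDK code: the descriptor arguments and the
// execution scope name (execution_thread) are assumptions.
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>

using namespace mpp::tensor_ops;

// Hypothetical 8x8x8 multiply; a constexpr descriptor is used as a
// non-type template parameter, mirroring how the op is specialized above.
constexpr matmul2d_descriptor gemm_desc(/*M*/ 8, /*N*/ 8, /*K*/ 8);

template <typename LeftTensor, typename RightTensor, typename DestTensor>
inline void run_gemm(LeftTensor left, RightTensor right, DestTensor dest)
{
  // With f16 left/right tensors in device memory and an f32 destination,
  // run() would resolve to the dv_f16_dv_f16_dv_f32 branch above.
  matmul2d<gemm_desc, mpp::execution_thread> op;
  op.run(left, right, dest);
}
```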