MetalPerformancePrimitives macOS xcode26.0 b1


# MetalPerformancePrimitives.framework

diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsConvolution2d.h	2025-05-28 02:28:43
@@ -0,0 +1,177 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsConvolution2d
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsConvolution2d__
+#define __MetalTensorOpsConvolution2d__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+#include "__impl/MPPTensorOpsBase.h"
+#include "__impl/MPPTensorOpsUtility.h"
+
+#pragma METAL internals : enable
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+// This API performs the 2D convolutions that occur in convolutional neural
+// networks. "2d" here stands for the two spatial dimensions, width x height,
+// even though the tensors consumed by this op are 4D. The sources/inputs to
+// the op's run method are
+//      Activation tensor with named NHWC layout
+//           N = batch (slowest moving dimension)
+//           H = height
+//           W = width
+//           C = input channels (fastest moving dimension)
+//      Weights tensor with named HWIO layout
+//           H = kernel height
+//           W = kernel width
+//           I = input channels
+//           O = output channels
+// Destination tensor has named NHWO layout
+//           N = batch (slowest moving dimension)
+//           H = height
+//           W = width
+//           O = output channels (fastest moving dimension)
+//
+// The destination can also be a cooperative tensor. See TensorOpsMatMul2d.h
+// for details on how to use a cooperative tensor, for example for bias add
+// and for applying an activation before writing out the result. Currently
+// the only scope supported by the convolution2d op is the full threadgroup.
+// See TensorOpsMatMul2d.h for details on the scopes supported by tensor ops.
+
+enum class convolution2d_activation_layout
+{
+  nhwc,
+};
+
+enum class convolution2d_weights_layout
+{
+  hwio,
+};
+
+struct convolution2d_descriptor
+{
+  enum class mode
+  {
+    multiply,
+    multiply_accumulate,
+  };
+
+  // for nhwc, .x = output channel, .y = destination width, .z = destination
+  // height, .w = batch size
+  int4 destination_dimensions;
+  int4 source_dimensions;
+  int2 kernel_dimensions;
+  convolution2d_activation_layout activation_layout;
+  convolution2d_weights_layout weights_layout;
+  int2 strides;
+  int2 dilations;
+  int groups;
+  bool relaxed_precision;
+  mode conv2d_mode;
+
+  constexpr convolution2d_descriptor(
+      int4 _destination_dimensions, int4 _source_dimensions,
+      int2 _kernel_dimensions,
+      convolution2d_activation_layout _activation_layout =
+          convolution2d_activation_layout::nhwc,
+      convolution2d_weights_layout _weights_layout =
+          convolution2d_weights_layout::hwio,
+      int2 _strides = int2(1, 1), int2 _dilations = int2(1, 1), int _groups = 1,
+      bool _relaxed_precision = false,
+      mode _convolution2d_mode = mode::multiply) thread
+      : destination_dimensions(_destination_dimensions),
+        source_dimensions(_source_dimensions),
+        kernel_dimensions(_kernel_dimensions),
+        activation_layout(_activation_layout),
+        weights_layout(_weights_layout),
+        strides(_strides),
+        dilations(_dilations),
+        groups(_groups),
+        relaxed_precision(_relaxed_precision),
+        conv2d_mode(_convolution2d_mode)
+  {
+  }
+};
+
+enum class convolution2d_cooperative_operand
+{
+  destination,
+};
+
+#include "__impl/MPPTensorOpsConvolution2dImpl.h"
+
+template <convolution2d_descriptor Descriptor, typename Scope,
+          typename... ConvArgs>
+class convolution2d : __tensor_ops_detail::op
+{
+  static_assert(Descriptor.activation_layout ==
+                    convolution2d_activation_layout::nhwc,
+                "only nhwc activation layout supported currently");
+  static_assert(Descriptor.weights_layout == convolution2d_weights_layout::hwio,
+                "only hwio weights layout supported currently");
+  static_assert(Descriptor.groups == 1,
+                "only group size 1 supported currently");
+
+private:
+  thread int2 __offset;
+
+public:
+  convolution2d() thread : __offset(0)
+  {
+  }
+
+  void set_offsets(int2 o) thread
+  {
+    __offset = o;
+  }
+
+  template <typename ActivationTensorType, typename WeightsTensorType,
+            typename DestinationTensorType, typename... RunArgs>
+  INLINE void run(thread ActivationTensorType &activation,
+                  thread WeightsTensorType &weights,
+                  thread DestinationTensorType &destination) const thread
+  {
+    convolution2d_descriptor d = Descriptor;
+    __convolution2d_detail::__run<Scope, ActivationTensorType,
+                                  WeightsTensorType, DestinationTensorType,
+                                  RunArgs...>(activation, weights, destination,
+                                              d, __offset);
+  }
+
+  template <typename ActivationOperandType, typename WeightsOperandType,
+            typename ElementType, typename CoordType = int,
+            typename... CoopArgs>
+  using cooperative_tensor_destination_t =
+      __convolution2d_detail::__cooperative_tensor_destination_t<
+          Descriptor, Scope, ActivationOperandType, WeightsOperandType,
+          ElementType, CoordType, CoopArgs...>;
+
+  template <typename ActivationOperandType, typename WeightsOperandType,
+            typename ElementType, typename CoordType = int,
+            typename... CoopArgs>
+  INLINE cooperative_tensor_destination_t<ActivationOperandType,
+                                          WeightsOperandType, ElementType,
+                                          CoordType, CoopArgs...>
+  get_destination_cooperative_tensor() const thread
+  {
+    return __convolution2d_detail::__get_destination_cooperative_tensor<
+        Descriptor, Scope, ActivationOperandType, WeightsOperandType,
+        ElementType, CoordType, CoopArgs...>();
+  }
+};
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#pragma METAL internals : disable
+
+#endif
+
+#endif // __MetalTensorOpsConvolution2d__
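
For orientation, here is a minimal sketch of how the pieces above might fit together, written in the style of the examples in MPPTensorOpsMatMul2d.h further down. The 4D extents type, the dimension packing of the source tensor, the kernel-dimension order, and the execution-scope spelling (`execution_simdgroups<4>`) are assumptions inferred from the header comments, not verified sample code:

```metal
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
using namespace mpp::tensor_ops;

kernel void simpleConv2d(
    tensor<device half,  dextents<int32_t, 4>, tensor_handle> activation,
    tensor<device half,  dextents<int32_t, 4>, tensor_handle> weights,
    tensor<device float, dextents<int32_t, 4>, tensor_handle> destination)
{
    // 3x3 valid convolution (stride 1, dilation 1, groups 1 -- the
    // descriptor defaults) of a 1x66x66x16 NHWC activation with a 3x3x16x32
    // HWIO weights tensor, producing a 1x64x64x32 NHWO destination.
    // Dimension packing follows the header comment: .x = channels (fastest
    // moving), .y = width, .z = height, .w = batch (slowest moving).
    constexpr auto convDesc = convolution2d_descriptor(
        int4(32, 64, 64, 1),  // destination dimensions: O, W, H, N
        int4(16, 66, 66, 1),  // source dimensions: C, W, H, N (packing assumed)
        int2(3, 3));          // kernel dimensions: width x height (order assumed)

    // Per the note in the header, convolution2d currently supports only the
    // full threadgroup as its scope; the exact scope type is an assumption.
    convolution2d<convDesc, execution_simdgroups<4>> convOp;
    convOp.run(activation, weights, destination);
}
```

Dispatch would mirror the matmul examples below, with the threadgroup sized to 4 SIMD-groups.
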
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2025-05-26 00:45:42
@@ -0,0 +1,495 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsMatMul2d
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+// This API performs the generalized matrix multiplication operation
+//             C = A*B + C
+// A and B can be tensor_handle, tensor_offset, or tensor_inline.
+// C can be tensor_handle, tensor_offset, tensor_inline, or cooperative_tensor.
+// The data type combinations supported by this operation are as follows:
+//   A           B         C
+//  int8_t     int8_t    int32_t
+//  int8_t     int8_t    float
+//  int8_t     int8_t    half
+//  uint8_t    int8_t    int32_t
+//  uint8_t    int8_t    float
+//  uint8_t    int8_t    half
+//  int8_t     uint8_t   int32_t
+//  int8_t     uint8_t   float
+//  int8_t     uint8_t   half
+//  uint8_t    uint8_t   int32_t
+//  uint8_t    uint8_t   float
+//  uint8_t    uint8_t   half
+//   half       half     float
+//   half       half     half
+//
+// Basic usage is shown in the following example, which takes an M x K matrix
+// A of type half and a K x N matrix B of type half, both in device memory,
+// and produces an M x N matrix C of type float in device memory. It tiles
+// the matrix multiplication across threadgroups, where each threadgroup
+// computes a 64 x 32 tile of the output by multiplying a 64 x K tile of A
+// with a K x 32 tile of B. This compute kernel will be launched with a
+// dispatch grid of
+//
+//        MTLSize threadgroups = MTLSizeMake((M + 63)/64, (N + 31)/32, 1);
+//
+// It uses 4 SIMD-groups per threadgroup. The way to dispatch this compute
+// kernel is
+//
+//    id<MTLComputePipelineState> state = [device newComputePipelineState:...];
+//    NSUInteger simdgroupWidth = [state threadExecutionWidth];
+//    ...
+//    [encoder dispatchThreadgroups:threadgroups
+//            threadsPerThreadgroup:MTLSizeMake(simdgroupWidth*4, 1, 1)];
+//
+// kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                          tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                          tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     // descriptor to create a matmul operation that multiplies a 64 x K
+//     // tile by a K x 32 tile, producing a 64 x 32 tile
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(
+//         64,    // m: outer dimension of the local tile
+//         32,    // n: outer dimension of the local tile
+//          0,    // k: inner dimension. 0 means the operation will read K
+//                // from the input tensors:
+//                //   K = A.extents().extent(0) or B.extents().extent(1) for NN
+//                //   K = A.extents().extent(0) or B.extents().extent(0) for NT
+//                //   and so on.
+//         false, // transpose_left: false for NN and NT, true for TN and TT
+//         false, // transpose_right: false for NN and TN, true for NT and TT
+//         false  // relaxed_precision: set it to true to allow the
+//                // implementation to sacrifice accuracy for performance
+//     );
+//
+//     // create a matmul op from the above descriptor with 4 SIMD-groups.
+//     // All 4 SIMD-groups in this threadgroup will execute this matmul
+//     // cooperatively. More on this scope below.
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     // The following three lines of code create the appropriate slice for
+//     // this threadgroup to work on. E.g. A.offset below creates a
+//     // tensor<device half, dextents<int32_t, 2>, tensor_offset> which has
+//     // the same extents as the original tensor A but with its origin
+//     // shifted to (0, tgid.y*64), i.e. mA[x,y] == A[x, tgid.y*64 + y]
+//     auto mA = A.offset(0, tgid.y*64);
+//     auto mB = B.offset(tgid.x*32, 0);
+//     auto mC = C.offset(tgid.x*32, tgid.y*64);
+//
+//     // execute the operation. Assumes C is initialized to zero.
+//     matmulOp.run(mA, mB, mC);
+// }
+//
+// The above matrix multiplication implementation will do edge checking for
+// all threadgroups against the extents of the original tensors, although for
+// large enough matrices most threadgroups will be working on "inside" tiles,
+// requiring no bounds check. In high performance code we can avoid the edge
+// checking for inside threadgroups and get better performance:
+//
+// kernel void matMul(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                    tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                    tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                    constant uint& M, constant uint& N, constant uint& K,
+//                    uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     // descriptor to create a matmul operation that multiplies a 64 x K
+//     // tile by a K x 32 tile, producing a 64 x 32 tile
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64, 32, 0,
+//                                                           false, false,
+//                                                           false);
+//
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     // Inside threadgroup in both outer dimensions M and N.
+//     if (tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
+//     {
+//       auto tA = A.static_slice<dynamic_extent, 64>(0, tgid.y*64);
+//       auto tB = B.static_slice<32, dynamic_extent>(tgid.x*32, 0);
+//       auto tC = C.static_slice<32, 64>(tgid.x*32, tgid.y*64);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+//     else
+//     {
+//       auto tA = A.offset(0, tgid.y*64);
+//       auto tB = B.offset(tgid.x*32, 0);
+//       auto tC = C.offset(tgid.x*32, tgid.y*64);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+// }
+//
+// The user can also take ownership of the loop over the reduction
+// (k-)dimension by choosing an appropriate chunk size in k (called k-tile or
+// tilek). In the following example, we choose 16.
+//
+// kernel void matMulKLoop(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                         tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                         tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                         constant uint& M, constant uint& N, constant uint& K,
+//                         uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     // descriptor to create a matmul operation that multiplies a 64 x 16
+//     // tile by a 16 x 32 tile, producing a 64 x 32 tile
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(
+//         64,
+//         32,
+//         16,    // tilek = 16: we loop over K in chunks of 16 rather than
+//                // letting the matmul op's run method loop over K
+//                // internally, choosing tileK itself
+//         false,
+//         false,
+//         false);
+//
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     constexpr int tilek = 16;
+//
+//     // Inside threadgroup in both outer dimensions M and N.
+//     if (tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
+//     {
+//       auto tC = C.static_slice<32, 64>(tgid.x*32, tgid.y*64);
+//       int k = 0;
+//       for (; k + tilek - 1 < K; k += tilek)
+//       {
+//         auto tA = A.static_slice<tilek, 64>(k, tgid.y*64);
+//         auto tB = B.static_slice<32, tilek>(tgid.x*32, k);
+//
+//         matmulOp.run(tA, tB, tC);
+//       }
+//
+//       // handle any remainder in K
+//       auto tA = A.static_slice<dynamic_extent, 64>(k, tgid.y*64);
+//       auto tB = B.static_slice<32, dynamic_extent>(tgid.x*32, k);
+//       matmulOp.run(tA, tB, tC);
+//     }
+//     else
+//     {
+//       auto tA = A.offset(0, tgid.y*64);
+//       auto tB = B.offset(tgid.x*32, 0);
+//       auto tC = C.offset(tgid.x*32, tgid.y*64);
+//
+//       matmulOp.run(tA, tB, tC);
+//     }
+// }
+//
+// Often we need to do some post-processing on computed results before
+// storing them to device or threadgroup memory. For example, in machine
+// learning we need to apply an activation function to the computed value.
+// One can do the GEMM as above, write the result to device memory, read the
+// value back, call the post-processing function, and write it out again, but
+// this wastes bandwidth, performance, and power. Instead, the user can apply
+// the post-processing in-register, where the GEMM output is computed into a
+// cooperative_tensor. Unlike tensor_handle, tensor_offset, and tensor_inline,
+// which are non-owning (they are wrappers around a resource in the device,
+// threadgroup, or thread address space), a cooperative_tensor owns thread
+// private data and divides the data of the entire tensor among the threads
+// (participating in the scope of the operation) in an implementation-defined
+// manner. This thread private memory is allocated at construction of the
+// cooperative_tensor and deallocated when the cooperative_tensor goes out of
+// scope. The layout of a cooperative_tensor depends on the operation, the
+// data type, and the number of threads in the opscope with which the op was
+// created. Note that a cooperative_tensor created from an op is only valid
+// for threads that are part of the opscope with which the op was created.
+// Though the layout of a cooperative_tensor is implementation defined, we
+// provide accessor functions as shown in the example below.
+//
+// kernel void simpleMatMulCooperative(tensor<device half,  dextents<int32_t, 2>, tensor_handle> A,
+//                          tensor<device half,  dextents<int32_t, 2>, tensor_handle> B,
+//                          tensor<device float, dextents<int32_t, 2>, tensor_handle> C,
+//                          tensor<device half,  dextents<int32_t, 2>, tensor_handle> bias,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
+// {
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64, 32, 0,
+//                                                           false, false,
+//                                                           false);
+//
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
+//
+//     auto mA = A.offset(0, tgid.y*64);
+//     auto mB = B.offset(tgid.x*32, 0);
+//     auto mC = C.offset(tgid.x*32, tgid.y*64);
+//
+//     // This creates a cooperative destination tensor of float element
+//     // type. Since the matmul op above was created with 4 SIMD-groups, the
+//     // cooperative tensor will divide its data among the threads of these
+//     // 4 SIMD-groups. The layout of data among lanes is implementation
+//     // defined, and not all threads, nor even all elements within a
+//     // thread, need be valid. We provide the valid-element check shown
+//     // below, which developers should use to guard their accesses to the
+//     // elements of the cooperative_tensor.
+//     auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//
+//     // Loop over the elements of the cooperative_tensor owned by "this"
+//     // thread and initialize them to zero. It is imperative for
+//     // performance to include the unroll pragma so the compiler fully
+//     // unrolls the loop.
+//     #pragma unroll full
+//     for (uint16_t i = 0; i < cT.capacity(); ++i) {
+//       if (cT.mask(i))
+//         cT[i] = 0;
+//     }
+//
+//     // execute the operation. All threads compute the matmul cooperatively
+//     // and the results are written to the cooperative_tensor.
+//     matmulOp.run(mA, mB, cT);
+//
+//     // create a cooperative bias tensor with the same layout as the
+//     // destination cooperative_tensor of the matmul
+//     auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
+//
+//     // load data from the bias tensor_handle into the biasT
+//     // cooperative_tensor, using the layout and distribution of elements
+//     // among the threads of the scope with which the matmul was created
+//     biasT.load(bias);
+//
+//     #pragma unroll full
+//     for (uint16_t i = 0; i < cT.capacity(); ++i) {
+//       if (cT.mask(i)) {
+//         // add bias
+//         cT[i] += biasT[i];
+//
+//         // get the 2-dimensional coordinate of this thread's i-th element
+//         // in the destination's local coordinate system (in this example a
+//         // 32 x 64 tile)
+//         auto idx = cT.multidimensional_indices(i);
+//         // do some operation based on the coordinate values
+//         cT[i] = foo(cT[i], idx);
+//       }
+//     }
+//
+//     // store to the tensor handle
+//     cT.store(mC);
+// }
+//
+// Note on the scope of an operation
+// =================================
+// A tensor operation may be executed entirely on a single thread or
+// cooperatively among a set of SIMD-groups. We call this set of threads the
+// "execution scope" of the tensor operation. A tensor op must be created
+// with its execution scope provided as a template argument. All the threads
+// in this execution scope must enter the run method, i.e. calls to run
+// methods must be "execution scope" uniform. Use the following types to
+// configure the execution modes of each operation:
+//
+// metal::execution_thread - the operation will be run on a single thread.
+//                           Fragment shaders only support this execution
+//                           scope.
+// metal::execution_simdgroup - the operation will be run cooperatively by
+//                              all threads in this SIMD-group. May be used
+//                              for finer control over tiling by slicing
+//                              tensors with SIMD IDs.
+// metal::execution_simdgroups<N> - the operation will be executed
+//                                  cooperatively by N SIMD-groups. Must be
+//                                  used when all threads in a threadgroup
+//                                  are cooperatively performing the
+//                                  operation.
+//
+// It is undefined behavior if the number of SIMD-groups dispatched does not
+// match the number of SIMD-groups that the operation was configured with.
+//
+// Even though each thread in the execution scope can potentially enter and
+// exit the run method independently, the developer cannot assume that the
+// threads in the execution scope are working completely independently, i.e.
+// the tensor operation's run implementation may need (for correctness or
+// performance) to synchronize among the threads in the execution scope it
+// was created with.
+//
+//
+//===----------------------------------------------------------------------===//
+#ifndef __MetalTensorOpsMatMul2d__
+#define __MetalTensorOpsMatMul2d__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+#include "__impl/MPPTensorOpsBase.h"
+#include "__impl/MPPTensorOpsTypes.h"
+#include <metal_numeric>
+
+#pragma METAL internals : enable
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+enum class matmul2d_cooperative_operand_index
+{
+  destination,
+};
+
+enum class reduction_operation
+{
+  sum,
+  max,
+  min,
+};
+
+struct matmul2d_descriptor
+{
+  enum class mode
+  {
+    multiply,
+    multiply_accumulate,
+  };
+
+  int m, n, k;
+  bool transpose_left, transpose_right;
+  bool relaxed_precision;
+  mode matmul_mode;
+
+public:
+  constexpr matmul2d_descriptor(int __m, int __n, int __k = dynamic_length_v<int>,
+                                bool __transpose_left = false,
+                                bool __transpose_right = false,
+                                bool __relaxed_precision = false,
+                                mode __matmul_mode = mode::multiply) thread
+      : m(__m),
+        n(__n),
+        k(__k),
+        transpose_left(__transpose_left),
+        transpose_right(__transpose_right),
+        relaxed_precision(__relaxed_precision),
+        matmul_mode(__matmul_mode)
+  {
+  }
+};
+
+template <typename ElementType>
+struct reduction_operation_identity
+{
+  static const constant ElementType sum_identity = (ElementType)0;
+  static const constant ElementType max_identity =
+      metal::numeric_limits<ElementType>::lowest;
+  static const constant ElementType min_identity =
+      metal::numeric_limits<ElementType>::max;
+};
+
+#include "__impl/MPPTensorOpsMatMul2dImpl.h"
+
+template <matmul2d_descriptor Descriptor, typename Scope, class... Args>
+class matmul2d : __tensor_ops_detail::op
+{
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<Scope>,
+                "Scope template argument should be of op_scope type");
+
+public:
+  matmul2d() thread = default;
+
+  template <
+      typename LeftOperandType, typename RightOperandType,
+      typename DestinationOperandType,
+      typename V = __tensor_ops_detail::__enable_if_t<
+          (__tensor_ops_detail::__is_tensor_type_v<LeftOperandType> &&
+           __tensor_ops_detail::__is_tensor_type_v<RightOperandType> &&
+           (__tensor_ops_detail::__is_tensor_type_v<DestinationOperandType> ||
+            __tensor_ops_detail::__is_cooperative_tensor_type_v<
+                DestinationOperandType>))>,
+      typename... RunArgs>
+  INLINE void run(thread LeftOperandType &left, thread RightOperandType &right,
+                  thread DestinationOperandType &destination) thread const
+  {
+
+    __mutmul2d_detail::__run<Descriptor, Scope, LeftOperandType,
+                             RightOperandType, DestinationOperandType,
+                             RunArgs...>(left, right, destination);
+  }
+
+  template <typename ElementType, typename CoordType, typename... CoopArgs>
+  using cooperative_tensor_destination_t =
+      __mutmul2d_detail::__cooperative_tensor_destination_t<
+          Descriptor, Scope, ElementType, CoordType, CoopArgs...>;
+
+  template <typename LeftOperandType, typename RightOperandType,
+            typename ElementType, typename CoordType = int,
+            typename U = __tensor_ops_detail::__enable_if_t<
+                __tensor_ops_detail::__is_tensor_type_v<LeftOperandType> &&
+                __tensor_ops_detail::__is_tensor_type_v<RightOperandType> &&
+                __tensor_ops_detail::__is_thread_addrspace_v<ElementType> &&
+                __tensor_ops_detail::__is_integral_v<CoordType>>,
+            typename... CoopArgs>
+  INLINE cooperative_tensor_destination_t<ElementType, CoordType, CoopArgs...>
+  get_destination_cooperative_tensor() thread const
+  {
+
+    return __mutmul2d_detail::__get_destination_cooperative_tensor<
+        Descriptor, Scope, ElementType, CoordType, LeftOperandType,
+        RightOperandType, CoopArgs...>();
+  }
+};
+
+template <class ElementType, class Extents, class Layout>
+inline void reduce_rows(
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &source,
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &destination,
+    reduction_operation op = reduction_operation::sum,
+    ElementType identity =
+        reduction_operation_identity<ElementType>::sum_identity)
+{
+  __mutmul2d_detail::__reduce_rows<ElementType, Extents, Layout>(
+      source, destination, identity, op);
+}
+
+template <class ElementType, class Extents, class Layout>
+inline void reduce_columns(
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &source,
+    thread metal::cooperative_tensor<ElementType, Extents, Layout> &destination,
+    reduction_operation op = reduction_operation::sum,
+    ElementType identity =
+        reduction_operation_identity<ElementType>::sum_identity)
+{
+  __mutmul2d_detail::__reduce_columns<ElementType, Extents, Layout>(
+      source, destination, identity, op);
+}
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#pragma METAL internals : disable
+
+#endif
+
+#endif // __MetalTensorOpsMatMul2d__
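
The `reduce_rows` and `reduce_columns` helpers declared above operate on cooperative tensors but ship without a usage example. Here is a hedged sketch of a row-wise maximum over a matmul result. Creating the reduction destination with `get_destination_cooperative_tensor` and the zero-fill loop follow the cooperative-tensor example in the header comments; the assumption that the source and destination of `reduce_rows` can share that layout, and the `execution_simdgroups<4>` scope spelling, are not confirmed by these headers:

```metal
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
using namespace mpp::tensor_ops;

kernel void matMulRowMax(
    tensor<device half, dextents<int32_t, 2>, tensor_handle> A,
    tensor<device half, dextents<int32_t, 2>, tensor_handle> B,
    uint2 tgid [[threadgroup_position_in_grid]])
{
    constexpr auto desc = matmul2d_descriptor(64, 32, 0, false, false, false);
    matmul2d<desc, execution_simdgroups<4>> matmulOp;  // scope spelling assumed

    auto mA = A.offset(0, tgid.y*64);
    auto mB = B.offset(tgid.x*32, 0);

    // Accumulate the 64x32 tile into a cooperative tensor, as in the
    // simpleMatMulCooperative example in the header comments.
    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
    #pragma unroll full
    for (uint16_t i = 0; i < cT.capacity(); ++i)
        if (cT.mask(i))
            cT[i] = 0;
    matmulOp.run(mA, mB, cT);

    // Row-wise maximum: the identity passed in must match the reduction
    // operation, here max_identity (the lowest representable float).
    auto rowMax = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
    reduce_rows(cT, rowMax, reduction_operation::max,
                reduction_operation_identity<float>::max_identity);
    // ... rowMax can now be combined element-wise with cT, e.g. for a
    // numerically stable softmax, before storing the result.
}
```
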
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MetalPerformancePrimitives.h	2025-05-26 00:45:42
@@ -0,0 +1,12 @@
+// -*- Metal -*-
+//===-- MetalPerformancePrimitives ------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalPerformancePrimitives__
+#define __MetalPerformancePrimitives__
+
+#include <MetalPerformancePrimitives/MPPTensorOpsConvolution2d.h>
+#include <MetalPerformancePrimitives/MPPTensorOpsMatMul2d.h>
+
+#endif
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsBase.h	2025-05-26 00:45:42
@@ -0,0 +1,28 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsBase ------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsBase__
+#define __MetalTensorOpsBase__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace mpp
+{
+namespace tensor_ops
+{
+
+namespace __tensor_ops_detail
+{
+class op
+{
+};
+} // namespace __tensor_ops_detail
+
+} // namespace tensor_ops
+} // namespace mpp
+
+#endif
+#endif
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsConvolution2dImpl.h	2025-05-23 06:26:11
@@ -0,0 +1,4845 @@
+
+
+// -*- Metal -*-
+//===-- MetalTensorOpsConvolution2dImpl
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsConvolution2dImpl__
+#define __MetalTensorOpsConvolution2dImpl__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace __convolution2d_detail
+{
+
+#ifndef EXTERNALLY_DEFINED_ATTR
+#define EXTERNALLY_DEFINED_ATTR \
+  __attribute__((section("air.externally_defined")))
+#endif
+
+#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
+#define INLINE __attribute__((__always_inline__))
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type,
+    int threads, thread int2 &offset);
+
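The run variants above follow a fixed symbol-naming scheme: `__tensorops_impl_convolution2d_op_run_` followed by address-space/element-type token pairs for the activation, weights, and destination tensors (plausibly `dv` for device and `tg` for threadgroup, though the header never spells this out). A minimal host-side sketch, under those assumptions only, that regenerates one block of the declarations above in the order they appear:

```cpp
// Hypothetical sketch (not part of the SDK): reconstructs the symbol names of
// the int8-activation / uint8-weights block above, assuming "dv" and "tg"
// abbreviate the device and threadgroup address spaces and the trailing token
// names the destination element type.
#include <cstdio>

int main() {
    const char *spaces[] = {"dv", "tg"};              // address-space tokens
    const char *dest_types[] = {"f16", "f32", "i32"}; // destination types
    for (const char *ty_d : dest_types)       // destination type (outermost)
        for (const char *as_a : spaces)       // activation address space
            for (const char *as_w : spaces)   // weights address space
                for (const char *as_d : spaces) // destination address space
                    std::printf(
                        "__tensorops_impl_convolution2d_op_run_%s_i8_%s_ui8_%s_%s\n",
                        as_a, as_w, as_d, ty_d);
}
```

Note that the `_cooperative_` variants that follow drop the destination address-space token and the `destination_desc_type` parameter, taking only a bare destination pointer, so only the activation and weights keep the full token pair.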
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f32_dv_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_f32_tg_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f32_dv_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_f32_tg_f32_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f16(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_i32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_f32(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_bf(
+    thread convolution2d_descriptor &desc, thread void *activation,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        activation_desc_type,
+    thread void *weights,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type weights_desc_type,
+    thread void *destination, int threads, thread int2 &offset);
+
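
The suffix of each `__tensorops_impl_convolution2d_op_run_cooperative_*` symbol above encodes the variant it implements: each `dv`/`tg` token marks whether the following operand's data handle lives in the device or threadgroup address space, the first address-space/type pair describes the activation tensor, the second describes the weights tensor, and the trailing token is the destination element type. Judging from the dispatch code below, the type tokens map as i8 = int8_t, ui8 = uint8_t, f16 = half, f32 = float, i32 = int32_t, and bf presumably bfloat. For example, `__tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32` is the entry point for a threadgroup-resident int8 activation, a device-resident uint8 weights tensor, and a float cooperative destination.

As a minimal sketch of how the `__run` template below is reached: the tensor handles `act`, `wts`, and `dst`, the `threadgroup_scope` type, and the dimension values are all assumptions for illustration and do not appear in this diff.

    // Hypothetical caller sketch, not part of the SDK header.
    // A valid 3x3 stride-1 convolution: 1x34x34x32 NHWC source ->
    // 1x32x32x64 NHWO destination (dimension packing follows the
    // descriptor comments at the top of this header).
    // 'threadgroup_scope' stands in for the full-threadgroup execution
    // scope type consumed as 'scope()' by __run; its real name is not
    // visible in this diff.
    mpp::tensor_ops::convolution2d_descriptor desc(
        int4(64, 32, 32, 1), // destination: {out channels, W, H, batch}
        int4(32, 34, 34, 1), // source:      {in channels, W, H, batch}
        int2(3, 3));         // kernel:      {W, H} (assumed ordering)
    mpp::tensor_ops::__run<threadgroup_scope>(act, wts, dst, desc,
                                              int2(0, 0));

In practice the public convolution2d op wrapping `__run` would be used rather than calling `__run` directly; the sketch only illustrates the dispatch path.
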
+template <typename scope, typename activation_type, typename weights_type,
+          typename destination_type, typename... run_args>
+void __run(thread activation_type &activation_tensor,
+           thread weights_type &weights_tensor,
+           thread destination_type &destination_tensor,
+           thread convolution2d_descriptor &__descriptor, int2 __offset)
+{
+  using activation_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(activation_tensor)>>;
+  using weights_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(weights_tensor)>>;
+  using destination_tensor_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(destination_tensor)>>;
+
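+  // The scope instance supplies the number of threads cooperating in
+  // this op (currently only a full-threadgroup scope is supported,
+  // per the note at the top of this header).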
+  metal::execution_threads t = scope();
+  int threads = t.size();
+
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<activation_tensor_type>,
+                "Activation must be a tensor");
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<weights_tensor_type>,
+                "Weights must be a tensor");
+  static_assert(
+      __tensor_ops_detail::__is_tensor_type_v<destination_tensor_type> ||
+          __tensor_ops_detail::__is_cooperative_tensor_type_v<
+              destination_tensor_type>,
+      "Destination must be a tensor or cooperative tensor");
+
+  static_assert(__tensor_ops_detail::__get_rank<activation_tensor_type>() == 4,
+                "Activation must be rank 4");
+  static_assert(__tensor_ops_detail::__get_rank<weights_tensor_type>() == 4,
+                "Weights must be rank 4");
+  static_assert(__tensor_ops_detail::__get_rank<destination_tensor_type>() == 4,
+                "Destination must be rank 4");
+
+  static_assert(__tensor_ops_detail::__is_same_v<
+                    typename activation_tensor_type::index_type, int>,
+                "Index type must be int");
+  static_assert(
+      __tensor_ops_detail::__is_same_v<typename weights_tensor_type::index_type,
+                                       int>,
+      "Index type must be int");
+  static_assert(__tensor_ops_detail::__is_same_v<
+                    typename destination_tensor_type::index_type, int>,
+                "Index type must be int");
+
+  using activation_ptr_type = typename activation_tensor_type::data_handle_type;
+  using weights_ptr_type = typename weights_tensor_type::data_handle_type;
+  using destination_ptr_type =
+      typename destination_tensor_type::data_handle_type;
+
+  using activation_value_type = typename activation_tensor_type::value_type;
+  using weights_value_type = typename weights_tensor_type::value_type;
+  using destination_value_type = typename destination_tensor_type::value_type;
+
+  auto activation = (thread void *)(&activation_tensor);
+  auto weights = (thread void *)(&weights_tensor);
+  auto offset = __offset;
+
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+      activation_desc_type =
+          __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+              activation_tensor_type>();
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+      weights_desc_type =
+          __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+              weights_tensor_type>();
+
+  convolution2d_descriptor desc = __descriptor;
+
+  if constexpr (__tensor_ops_detail::__is_cooperative_tensor_type_v<
+                    destination_tensor_type>)
+  {
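+    // For a cooperative-tensor destination, the address of its
+    // reserved element is passed to the impl entry points, apparently
+    // acting as an opaque handle.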
+    thread void *destination =
+        &destination_tensor[__tensor_ops_detail::__tensor_ops_reserved_index];
+
+    if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                   half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
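+    // Each branch below repeats the same four device/threadgroup
+    // address-space combinations for the next supported
+    // (activation, weights, destination) element-type triple.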
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_i8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_i8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_i8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_dv_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_ui8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_ui8_tg_ui8_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_dv_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_dv_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f16_tg_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f16_tg_f16_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
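+    // bfloat variants exist only when the toolchain defines __HAVE_BFLOAT__;
+    // otherwise these branches drop out of the constexpr chain entirely.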
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_dv_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_dv_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_bf_tg_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_bf_tg_bf_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#endif
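+    // Full-precision path: float activations and weights always write a
+    // float cooperative destination.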
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        activation_ptr_type> &&
+                    __tensor_ops_detail::__is_device_addrspace_v<
+                        weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f32_dv_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f32_dv_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_dv_f32_tg_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_cooperative_tg_f32_tg_f32_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else
+      static_assert(
+          __tensor_ops_detail::__assert_false_v<destination_value_type>,
+          "Unsupported type");
+  }
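+  // Non-cooperative path: the destination is an ordinary tensor, so its
+  // descriptor type and address space join the dispatch, and every intrinsic
+  // takes an additional destination descriptor argument.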
+  else
+  {
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type =
+            __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                destination_tensor_type>();
+
+    thread void *destination = (thread void *)(&destination_tensor);
+
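+    // For each supported type combination, the chain below fans out over all
+    // eight device/threadgroup placements of activation, weights, and
+    // destination (dv = device, tg = threadgroup in the intrinsic names).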
+    if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                   int8_t> &&
+                  __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                   half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
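+    // int8 x int8 also accumulates into float and int32 destinations, with
+    // the same eight-way address-space fan-out.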
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_i8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
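+    // Mixed-signedness is supported in both orders: uint8 activations with
+    // int8 weights (above) and int8 activations with uint8 weights (below).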
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        int8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_i8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_i8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
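+    // uint8 activation, uint8 weights -> half destination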
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
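+    // uint8 activation, uint8 weights -> float destination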
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
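+    // uint8 activation, uint8 weights -> int32_t destination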
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        uint8_t> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, int32_t>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_dv_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_dv_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_ui8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_ui8_tg_ui8_tg_i32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
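+    // half activation, half weights -> half destination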
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, half>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f16(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
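+    // half activation, half weights -> float destination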
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        half> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_dv_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_dv_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f16_tg_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f16_tg_f16_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#if __HAVE_BFLOAT__
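+    // bfloat activation, bfloat weights -> bfloat destination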
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        bfloat>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_bf(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
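+    // bfloat activation, bfloat weights -> float destination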
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        bfloat> &&
+                       __tensor_ops_detail::__is_same_v<
+                           destination_value_type, float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_dv_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_dv_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_bf_tg_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_bf_tg_bf_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#endif
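+    // float activation, float weights -> float destination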
+    else if constexpr (__tensor_ops_detail::__is_same_v<activation_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<weights_value_type,
+                                                        float> &&
+                       __tensor_ops_detail::__is_same_v<destination_value_type,
+                                                        float>)
+    {
+      if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (
+          __tensor_ops_detail::__is_device_addrspace_v<activation_ptr_type> &&
+          __tensor_ops_detail::__is_threadgroup_addrspace_v<weights_ptr_type> &&
+          __tensor_ops_detail::__is_device_addrspace_v<destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_dv_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_dv_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_device_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_dv_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_dv_f32_tg_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             activation_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             weights_ptr_type> &&
+                         __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_run_tg_f32_tg_f32_tg_f32(
+            desc, activation, activation_desc_type, weights, weights_desc_type,
+            destination, destination_desc_type, threads, offset);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else
+      static_assert(
+          __tensor_ops_detail::__assert_false_v<destination_value_type>,
+          "Unsupported type");
+  }
+}
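+
+// ---------------------------------------------------------------------------
+// [Editorial sketch, not part of the SDK header] The if-constexpr ladder
+// above covers, in this stretch, int8 or uint8 activations with uint8
+// weights producing int32_t (and, for uint8, also half/float), half
+// producing half/float, bfloat producing bfloat/float when __HAVE_BFLOAT__,
+// and float producing float, across every device ("dv") / threadgroup ("tg")
+// placement of the three operands. A minimal calling kernel could look like
+// the sketch below; the op spelling `convolution2d<desc, ...>` and the scope
+// type are assumptions inferred from this header and TensorOpsMatMul2d.h,
+// not verified API.
+//
+//   using namespace mpp::tensor_ops;
+//   // 3x3 valid convolution: 18x18x16 NHWC source -> 16x16x32 NHWO dest.
+//   constexpr convolution2d_descriptor desc(int4(32, 16, 16, 1),
+//                                           int4(16, 18, 18, 1),
+//                                           int2(3, 3));
+//   kernel void conv2d_f16(tensor<device half,  dextents<int, 4>> act,
+//                          tensor<device half,  dextents<int, 4>> wts,
+//                          tensor<device float, dextents<int, 4>> dst)
+//   {
+//     convolution2d<desc, metal::execution_threadgroup> op; // hypothetical
+//     op.run(act, wts, dst); // resolves to ..._run_dv_f16_dv_f16_dv_f32
+//   }
+// ---------------------------------------------------------------------------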
+
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR size_t
+__tensorops_impl_conv2d_cooperative_destination_data_size(
+    const thread convolution2d_descriptor &descriptor,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR uint16_t
+__tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
+    const thread convolution2d_descriptor &descriptor,
+    __tensor_ops_detail::__const_thread_void_t, int threads);
+extern "C" TENSOROPS_EXPORT
+    EXTERNALLY_DEFINED_ATTR __tensor_ops_detail::__thread_void_t
+    __tensorops_impl_conv2d_cooperative_destination_tensor_elements(
+        __tensor_ops_detail::__thread_void_t, uint16_t,
+        __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+    const thread convolution2d_descriptor &descriptor,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_init(
+    __tensor_ops_detail::__thread_void_t,
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_move(
+    __tensor_ops_detail::__thread_void_t, __tensor_ops_detail::__thread_void_t,
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_conv2d_cooperative_destination_tensor_destory(
+    __tensor_ops_detail::__thread_void_t,
+    const thread convolution2d_descriptor &, int threads);
+extern "C" TENSOROPS_EXPORT EXTERNALLY_DEFINED_ATTR bool
+__tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
+    const thread convolution2d_descriptor &,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype, int threads);
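+// ---------------------------------------------------------------------------
+// [Editorial sketch, not part of the SDK header] The entry points above back
+// the lifetime and element queries of a cooperative destination tensor; the
+// __operand_layout struct further below wires them up roughly as follows
+// (names abbreviated to their suffixes, arguments ordered as declared above):
+//
+//   size_t bytes = ...data_size(desc, d_dt, a_dt, w_dt, threads);
+//   // caller supplies `bytes` bytes of per-thread storage, then:
+//   ...tensor_init(storage, desc, d_dt, a_dt, w_dt, threads);
+//   uint16_t n = ...tensor_num_elements(desc, storage, threads);
+//   for (uint16_t i = 0; i < n; ++i)
+//     if (...tensor_is_valid_element(desc, storage, i, d_dt, threads))
+//       ...tensor_get_coordinate(desc, storage, i, d_dt, coords, coord_dt,
+//                                threads);
+//   ...tensor_destory(storage, desc, threads); // spelling as declared above
+// ---------------------------------------------------------------------------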
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
+    thread convolution2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
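+// [Editorial note, not part of the SDK header] The eight load entry points
+// above vary only in source address space (dv = device, tg = threadgroup)
+// and element type (f32/i32/f16/bf); __operand_layout::load below selects
+// one with the same if-constexpr pattern the run() dispatch uses.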
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
+    thread convolution2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
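+// [Editorial note, not part of the SDK header] The store entry points mirror
+// the loads, with the same dv/tg and element-type suffixes, but read from
+// const storage and take no rank argument: loads accept rank-1 or rank-4
+// sources, while stores are described solely by the destination descriptor.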
+
+template <convolution2d_descriptor descriptor,
+          convolution2d_cooperative_operand operand, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+struct __operand_layout
+{
+  static_assert(operand == convolution2d_cooperative_operand::destination,
+                "only destination can be cooperative tensor");
+  static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
+                    __tensor_ops_detail::__is_same_v<element_type, half> ||
+#if __HAVE_BFLOAT__
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
+#endif
+                    __tensor_ops_detail::__is_same_v<element_type, int32_t>,
+                "cooperative tensor data type can only be one of "
+                "float/half/bfloat/int32_t");
+
+  static constant constexpr __tensor_ops_detail::__rank_t rank = 4;
+  using element_t = element_type;
+  using coord_t = coord_type;
+  using extent_t = metal::dextents<coord_t, rank>;
+  using thread_storage_t = thread void *;
+  using const_thread_storage_t = const thread void *;
+  using index_t = uint16_t;
+  using operand_layout_t =
+      __operand_layout<descriptor, operand, scope, activation_operand_type,
+                       weights_operand_type, element_type, coord_type,
+                       coop_args...>;
+  using cooperative_tensor_t =
+      metal::cooperative_tensor<element_type, extent_t, operand_layout_t>;
+
+  using a_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<activation_operand_type>>;
+  using w_type = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<weights_operand_type>>;
+
+  using a_elem_type = typename a_type::element_type;
+  using w_elem_type = typename w_type::element_type;
+
+  static size_t thread_storage_size()
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+    return __tensorops_impl_conv2d_cooperative_destination_data_size(
+        descriptor, d_data_type, a_data_type, w_data_type, threads);
+  }
+
+  static constexpr size_t thread_storage_align()
+  {
+    return alignof(element_t);
+  };
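+  // [Editorial note, not part of the SDK header] thread_storage_size() and
+  // thread_storage_align() together describe the opaque per-thread backing
+  // store for this layout: the size is a runtime query that depends on the
+  // descriptor, all three element types, and the thread count of `scope`,
+  // while the alignment is simply that of the destination element type.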
+
+  static uint16_t size(const_thread_storage_t storage)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    return __tensorops_impl_conv2d_cooperative_destination_tensor_num_elements(
+        descriptor, storage, threads);
+  }
+
+  static void construct(thread void *this_)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_init(
+        this_, descriptor, d_data_type, a_data_type, w_data_type, threads);
+  }
+
+  static void copy_construct(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  static void move_construct(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_move(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  static void copy_assign(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_copy(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  static void move_assign(thread void *this_, thread void *other)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype d_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<element_t>::value;
+    __tensor_ops_detail::__tensor_ops_datatype a_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<a_elem_type>::value;
+    __tensor_ops_detail::__tensor_ops_datatype w_data_type =
+        __tensor_ops_detail::__type_to_tensor_ops_datatype<w_elem_type>::value;
+
+    __tensorops_impl_conv2d_cooperative_destination_tensor_move(
+        this_, other, descriptor, d_data_type, a_data_type, w_data_type,
+        threads);
+  };
+
+  // Destroys the per-thread object.
+  static void destroy(thread void *this_)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensorops_impl_conv2d_cooperative_destination_tensor_destory(
+        this_, descriptor, threads);
+  }
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void load(thread_storage_t storage,
+                   const thread metal::tensor<ElemType, Extents, Descriptor,
+                                              Tags...> &sourceT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_type>,
+                  "Source tensor datatype does not match cooperative tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == 4,
+                  "Source tensor must be rank 1 or 4");
+
+    int sourceRank = Extents::rank();
+
+    convolution2d_descriptor desc = descriptor;
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using sourcePtrType = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType =
+        __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+            tensorType>();
+
+    const thread void *source = (const thread void *)(&sourceT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_dv_bf(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_load_tg_bf(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void store(const_thread_storage_t storage,
+                    const thread metal::tensor<ElemType, Extents, Descriptor,
+                                               Tags...> &destinationT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_type>,
+                  "Tensor datatype does not match cooperative tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == rank,
+                  "Tensor must be rank 1 or 4");
+
+    convolution2d_descriptor desc = descriptor;
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using destination_ptr_type = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destination_desc_type =
+            __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                tensorType>();
+
+    const thread void *destination = (const thread void *)(&destinationT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f16(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f16(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_i32(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_i32(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_f32(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_f32(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_dv_bf(
+            desc, storage, destination, destination_desc_type, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destination_ptr_type>)
+        __tensorops_impl_convolution2d_op_cooperative_destination_tensor_store_tg_bf(
+            desc, storage, destination, destination_desc_type, threads);
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destination_ptr_type>,
+            "Unsupported address space");
+    }
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
+  static thread element_t *get_pointer_to(const_thread_storage_t storage,
+                                          index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+                    "unsupported data type");
+
+    return (thread element_t *)
+        __tensorops_impl_conv2d_cooperative_destination_tensor_elements(
+            (thread_storage_t)storage, idx, dataType, threads);
+  };
+
+  static bool mask(const_thread_storage_t storage, index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+                    "unsupported data type");
+
+    return __tensorops_impl_conv2d_cooperative_destination_tensor_is_valid_element(
+        descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+        dataType, threads);
+  }
+
+  template <typename index_t, __tensor_ops_detail::__rank_t rank>
+  static metal::array<index_t, rank>
+  multidimensional_indices(const_thread_storage_t storage, index_t idx)
+  {
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    static_assert(rank == 4, "multidimensional_indices returns 4D indices");
+
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_type>,
+                    "unsupported data type");
+
+    if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
+    {
+      ushort coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
+    {
+      short coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
+    {
+      uint coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
+    {
+      int coords[4];
+      __tensorops_impl_conv2d_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
+          threads);
+      return {coords[0], coords[1], coords[2], coords[3]};
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<coord_t>,
+                    "unsupported coordinate type");
+  }
+};
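+
+// Illustrative sketch (editorial, not SDK text): the static hooks above,
+// size(), mask(), get_pointer_to(), and multidimensional_indices(), give the
+// cooperative destination tensor its element-wise view. Assuming an
+// instantiated layout `layout_t` and its per-thread storage `st` (hypothetical
+// names), an epilogue such as a ReLU could be written against them roughly as:
+//
+//   for (uint16_t i = 0, n = layout_t::size(st); i < n; ++i)
+//   {
+//     if (layout_t::mask(st, i))           // does this thread hold element i?
+//     {
+//       thread auto *e = layout_t::get_pointer_to(st, i);
+//       *e = metal::max(*e, element_t(0)); // hypothetical activation
+//     }
+//   }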
+
+template <convolution2d_descriptor descriptor,
+          convolution2d_cooperative_operand operand, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+using __cooperative_tensor_t = typename __operand_layout<
+    descriptor, operand, scope, activation_operand_type, weights_operand_type,
+    element_type, coord_type, coop_args...>::cooperative_tensor_t;
+
+template <convolution2d_descriptor descriptor, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+using __cooperative_tensor_destination_t =
+    __cooperative_tensor_t<descriptor,
+                           convolution2d_cooperative_operand::destination,
+                           scope, activation_operand_type, weights_operand_type,
+                           element_type, coord_type, coop_args...>;
+
+template <convolution2d_descriptor descriptor, typename scope,
+          typename activation_operand_type, typename weights_operand_type,
+          typename element_type, typename coord_type, typename... coop_args>
+__cooperative_tensor_destination_t<descriptor, scope, activation_operand_type,
+                                   weights_operand_type, element_type,
+                                   coord_type, coop_args...>
+__get_destination_cooperative_tensor()
+{
+  return __cooperative_tensor_destination_t<
+      descriptor, scope, activation_operand_type, weights_operand_type,
+      element_type, coord_type, coop_args...>();
+}
+
+#undef EXTERNALLY_DEFINED_ATTR
+
+} // namespace __convolution2d_detail
+
+#endif
+
+#endif // __MetalTensorOpsConvolution2d__
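
The load/store members in the convolution header above, and the matmul implementation header that follows, fan out to C entry points whose names encode each operand's address space (`dv` for device, `tg` for threadgroup) and element type (`f16`, `f32`, `i32`, `bf`). Below is a minimal, standalone C++17 sketch of that dispatch shape, with stand-in names, since the real `__tensorops_impl_*` symbols are resolved externally:

```cpp
#include <cstdio>
#include <type_traits>

// Stand-ins for Metal's address-space tags and half type; assumptions chosen
// only so this sketch compiles as plain C++17.
struct device_ptr {};
struct threadgroup_ptr {};
struct half_t {};

// Mirrors the header's __assert_false_v idiom for discarded-branch asserts.
template <class> inline constexpr bool assert_false_v = false;

// Stand-ins for the __tensorops_impl_*_load_{dv,tg}_{f16,f32,i32} entry
// points, which are externally defined in the real framework.
void impl_load_dv_f16() { std::puts("load_dv_f16"); }
void impl_load_tg_f16() { std::puts("load_tg_f16"); }
void impl_load_dv_f32() { std::puts("load_dv_f32"); }
void impl_load_tg_f32() { std::puts("load_tg_f32"); }
void impl_load_dv_i32() { std::puts("load_dv_i32"); }
void impl_load_tg_i32() { std::puts("load_tg_i32"); }

// One generic front end picks the suffixed entry point at compile time; this
// is the same shape as the if-constexpr chains in load()/store() above.
template <class Elem, class Ptr>
void load_dispatch()
{
  constexpr bool dv = std::is_same_v<Ptr, device_ptr>;
  if constexpr (std::is_same_v<Elem, half_t>)
    dv ? impl_load_dv_f16() : impl_load_tg_f16();
  else if constexpr (std::is_same_v<Elem, float>)
    dv ? impl_load_dv_f32() : impl_load_tg_f32();
  else if constexpr (std::is_same_v<Elem, int>)
    dv ? impl_load_dv_i32() : impl_load_tg_i32();
  else
    static_assert(assert_false_v<Elem>, "Unsupported type");
}

int main()
{
  load_dispatch<float, device_ptr>();       // prints load_dv_f32
  load_dispatch<half_t, threadgroup_ptr>(); // prints load_tg_f16
}
```

The same pattern scales to the full cross product of operand placements and element types declared in the implementation header below.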
diff -ruN /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_16.4.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	1969-12-31 19:00:00
+++ /Applications/Xcode_26.0.0-beta.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2025-05-23 06:26:11
@@ -0,0 +1,5131 @@
+
+// -*- Metal -*-
+//===-- MetalTensorOpsMatMul2dImpl
+//------------------------------------------------------===//
+// Copyright (c) 2025 Apple Inc. All rights reserved
+//===----------------------------------------------------------------------===//
+
+#ifndef __MetalTensorOpsMatMul2dImpl__
+#define __MetalTensorOpsMatMul2dImpl__
+
+#if defined(__METAL_VERSION__) && defined(__HAVE_TENSOR__)
+
+namespace __mutmul2d_detail
+{
+
+#ifndef EXTERNALLY_DEFINED_ATTR
+#define EXTERNALLY_DEFINED_ATTR \
+  __attribute__((section("air.externally_defined")))
+#endif
+
+#define TENSOROPS_EXPORT [[gnu::visibility("default")]]
+#define INLINE __attribute__((__always_inline__))
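+
+// Note (editorial): the __tensorops_impl_* functions declared below are
+// prototypes only; the air.externally_defined section attribute marks them to
+// be resolved against the Metal tensor-ops runtime library when a pipeline
+// that uses this op is compiled.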
+
+using __matmul2d_descriptor = matmul2d_descriptor;
+
+using __matmul2d_cooperative_operand_index = matmul2d_cooperative_operand_index;
+
+using __reduction_operation = reduction_operation;
+
+extern "C" EXTERNALLY_DEFINED_ATTR size_t
+__tensorops_impl_matmul2d_op_cooperative_destination_data_size(
+    const __matmul2d_descriptor descriptor, const int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR uint16_t
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_num_elements(
+    const __matmul2d_descriptor descriptor, const int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR thread void *
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_elements(
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+    const __matmul2d_descriptor descriptor,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype, thread void *,
+    __tensor_ops_detail::__tensor_ops_datatype, const int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_init(
+    __tensor_ops_detail::__thread_void_t, __matmul2d_descriptor,
+    __tensor_ops_detail::__tensor_ops_datatype, const int);
+extern "C" EXTERNALLY_DEFINED_ATTR bool
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_is_valid_element(
+    const __matmul2d_descriptor descriptor,
+    __tensor_ops_detail::__thread_void_t, uint16_t,
+    __tensor_ops_detail::__tensor_ops_datatype, const int);
+
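+// (Editorial summary of the naming scheme.) Each suffix pair below encodes an
+// operand's address space (dv = device, tg = threadgroup) and element type
+// (f16, f32, i8, i32, and, where supported, bf). Load/store variants name the
+// tensor being copied from or to; for the run variants the operand order is
+// left, right, destination, so __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32
+// takes a threadgroup f16 left tensor, a device i8 right tensor, and a device
+// f32 destination.
+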
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *storage,
+    const thread void *source,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType,
+    int sourceRank, int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f16(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_i32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_i32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f32(
+    thread __matmul2d_descriptor &desc, const thread void *storage,
+    const thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destDescType,
+    int threads);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, half identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, float identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_i32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, int identity, __reduction_operation op);
+
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, half identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, float identity, __reduction_operation op);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_i32(
+    thread __matmul2d_descriptor &desc, const thread void *src,
+    thread void *dst, int identity, __reduction_operation op);
+
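+// Illustrative sketch (editorial): these reductions fold a cooperative
+// destination tensor along rows or columns, seeded with a caller-supplied
+// identity value. A row sum over an f32 destination would look roughly like:
+//
+//   __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
+//       desc, srcStorage, dstStorage, /*identity*/ 0.0f,
+//       /*op*/ addOp); // a reduction_operation value; the enumerators are
+//                      // defined elsewhere in these headers
+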
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_f32_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_dv_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_f32_tg_i8_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_f32_tg_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_dv_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_dv_i8_tg_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_dv_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_tg_i8_tg_i8_tg_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType,
+    int threads);
+
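
The _cooperative_ entry points that follow differ from the block above in one way: the destination pointer is passed without a descriptor-type tag, consistent with the destination being a cooperative tensor rather than a descriptor-described one. The symbol suffixes encode each operand as an address-space tag plus an element type; the tags are presumably dv = device and tg = threadgroup, with th = thread appearing only in the single-thread variants further below. A small sketch, under those naming assumptions, that reconstructs one of the declared symbols:

    // Hypothetical sketch (ours) of the suffix scheme these entry points
    // appear to follow: <leftAS>_<leftTy>_<rightAS>_<rightTy>_<destTy>
    // for the cooperative variants.
    #include <iostream>
    #include <string>

    enum class addr_space { device_mem, threadgroup_mem, thread_mem };

    static std::string tag(addr_space a) {
      switch (a) {
      case addr_space::device_mem:      return "dv";
      case addr_space::threadgroup_mem: return "tg";
      case addr_space::thread_mem:      return "th";
      }
      return "";
    }

    static std::string cooperative_symbol(addr_space leftAS, const std::string &leftTy,
                                          addr_space rightAS, const std::string &rightTy,
                                          const std::string &destTy) {
      return "__tensorops_impl_matmul2d_op_run_cooperative_" +
             tag(leftAS) + "_" + leftTy + "_" + tag(rightAS) + "_" + rightTy +
             "_" + destTy;
    }

    int main() {
      // Prints one of the names declared below:
      // __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32
      std::cout << cooperative_symbol(addr_space::device_mem, "f16",
                                      addr_space::threadgroup_mem, "f16", "f32")
                << "\n";
    }
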
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_i8_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f16_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_f32_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_i8_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination, int threads);
+
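
The _single_thread_ entry points below keep all three descriptor-type parameters but drop the trailing int threads argument, and they are the only family whose suffixes use the th operand tag. A hypothetical dispatch sketch (the scope enum and helper here are illustrative, not SDK API) of how a wrapper might choose among the three families:

    // Hypothetical sketch (ours): picking an entry-point family by scope.
    #include <stdexcept>

    enum class exec_scope { threadgroup_scope, cooperative_dest, single_thread };

    const char *family_prefix(exec_scope s) {
      switch (s) {
      case exec_scope::threadgroup_scope:
        // Full signature: three descriptor-tagged operands plus a thread count.
        return "__tensorops_impl_matmul2d_op_run_";
      case exec_scope::cooperative_dest:
        // Destination is a cooperative tensor, so it carries no descriptor tag.
        return "__tensorops_impl_matmul2d_op_run_cooperative_";
      case exec_scope::single_thread:
        // One executing thread, so the trailing threads parameter is gone.
        return "__tensorops_impl_matmul2d_op_run_single_thread_";
      }
      throw std::logic_error("unreachable");
    }
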
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(
+    thread __matmul2d_descriptor &desc, thread void *left,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType,
+    thread void *right,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType,
+    thread void *destination,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType);
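+
+// Naming convention for the run specializations above:
+//   __tensorops_impl_matmul2d_op_run_single_thread_<l>_<lt>_<r>_<rt>_<d>_<dt>
+// where <l>/<r>/<d> encode the address space of the left/right/destination
+// operand (dv = device, th = thread) and <lt>/<rt>/<dt> encode its element
+// type (f16 = half, f32 = float, i8 = int8_t, i32 = int32_t).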
+
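+// __operand_layout describes the per-thread storage layout of a cooperative
+// destination tensor for a matmul2d op; metal::cooperative_tensor calls these
+// hooks for construction, copying, load/store, and coordinate queries.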
+template <__matmul2d_descriptor descriptor,
+          __matmul2d_cooperative_operand_index operand_index, typename scope,
+          typename element_type, typename coord_type, typename... args>
+struct __operand_layout
+{
+
+  static_assert(operand_index ==
+                    matmul2d_cooperative_operand_index::destination,
+                "only destination can be cooperative tensor");
+  static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
+                    __tensor_ops_detail::__is_same_v<element_type, half> ||
+#if __HAVE_BFLOAT__
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
+#endif
+                    __tensor_ops_detail::__is_same_v<element_type, int32_t>,
+                "cooperative tensor data type can only be one of "
+                "float/half/bfloat/int32_t");
+
+  static constant constexpr __tensor_ops_detail::__rank_t rank = 2;
+  using element_t = element_type;
+  using coord_t = coord_type;
+  using extent_t = metal::dextents<coord_t, rank>;
+  using thread_storage_t = thread void *;
+  using const_thread_storage_t = const thread void *;
+  using index_t = uint16_t;
+  using operand_layout_t =
+      __operand_layout<descriptor, operand_index, scope, element_t, coord_t>;
+  using cooperative_tensor_t =
+      metal::cooperative_tensor<element_t, extent_t, operand_layout_t>;
+  using scope_t = scope;
+
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+                "scope should be of type __tensorops_scope");
+
+  static constexpr constant bool is_matmul2d_cooperative_destination_layout =
+      true;
+
+  static constexpr constant __matmul2d_descriptor matmul2d_desc = descriptor;
+
+  // Returns the alignment of the storage allocated in each thread
+  // for this cooperative_tensor.
+  static constexpr size_t thread_storage_align()
+  {
+    return alignof(element_t);
+  };
+
+  // Copy-constructs from the cooperative_tensor `other`.
+  static void copy_construct(thread void *this_, thread void *other)
+  {
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Move-constructs from the cooperative_tensor `other`.
+  static void move_construct(thread void *this_, thread void *other)
+  {
+    // Elements are trivially copyable, so a move is an element-wise copy.
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Copy-assigns from the cooperative_tensor `other`.
+  static void copy_assign(thread void *this_, thread void *other)
+  {
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Move-assigns from the cooperative_tensor `other`.
+  static void move_assign(thread void *this_, thread void *other)
+  {
+    // Elements are trivially copyable, so a move is an element-wise copy.
+    thread element_t *this_e = (thread element_t *)(this_);
+    thread element_t *other_e = (thread element_t *)(other);
+    for (size_t i = 0, e = size(this_); i != e; ++i)
+    {
+      this_e[i] = other_e[i];
+    }
+  };
+
+  // Destroys the per-thread object.
+  static void destroy(thread void *) {};
+
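+  // Returns the per-thread storage size in bytes for this cooperative_tensor.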
+  static size_t thread_storage_size()
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    return __tensorops_impl_matmul2d_op_cooperative_destination_data_size(
+        descriptor, threads);
+  }
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void load(thread_storage_t storage,
+                   const thread metal::tensor<ElemType, Extents, Descriptor,
+                                              Tags...> &sourceT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_t>,
+                  "Source tensor datatype does not match cooperative tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == 2,
+                  "Source tensor must be rank 1 or 2");
+
+    int sourceRank = Extents::rank();
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    __matmul2d_descriptor desc = descriptor;
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using sourcePtrType = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type sourceDescType =
+        __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+            tensorType>();
+
+    const thread void *source = (const thread void *)(&sourceT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f16(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_i32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_f32(
+            desc, storage, source, sourceDescType, sourceRank, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
+  template <class ElemType, class Extents, class Descriptor, class... Tags>
+  static void store(const_thread_storage_t storage,
+                    const thread metal::tensor<ElemType, Extents, Descriptor,
+                                               Tags...> &destinationT)
+  {
+    using elem_t = __tensor_ops_detail::__remove_addrspace_t<ElemType>;
+
+    static_assert(__tensor_ops_detail::__is_same_v<elem_t, element_t>,
+                  "Destination tensor datatype does not match cooperative "
+                  "tensor");
+    static_assert(Extents::rank() == 1 || Extents::rank() == rank,
+                  "Destination tensor must be rank 1 or 2");
+
+    __matmul2d_descriptor desc = descriptor;
+
+    metal::execution_threads t = scope();
+    int threads = t.size();
+
+    using tensorType = metal::tensor<ElemType, Extents, Descriptor, Tags...>;
+
+    using destinationPtrType = typename tensorType::data_handle_type;
+
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+        destinationDescType =
+            __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                tensorType>();
+
+    const thread void *destination = (const thread void *)(&destinationT);
+
+    if constexpr (__tensor_ops_detail::__is_same_v<elem_t, half>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
+            desc, storage, destination, destinationDescType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f16(
+            desc, storage, destination, destinationDescType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, int32_t>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_i32(
+            desc, storage, destination, destinationDescType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_i32(
+            desc, storage, destination, destinationDescType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, float>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f32(
+            desc, storage, destination, destinationDescType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_f32(
+            desc, storage, destination, destinationDescType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
+                    "Unsupported type");
+  };
+
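+  // Returns the number of destination elements owned by the calling thread.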
+  static uint16_t size(const_thread_storage_t storage)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    return __tensorops_impl_matmul2d_op_cooperative_destination_tensor_num_elements(
+        descriptor, threads);
+  }
+
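+  // Returns a pointer to the idx-th element in this thread's storage.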
+  static thread element_t *get_pointer_to(const_thread_storage_t storage,
+                                          index_t idx)
+  {
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    return (thread element_t *)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_elements(
+            (thread_storage_t)storage, idx, dataType);
+  };
+
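+  // Returns true when the idx-th element maps to a valid element of the
+  // destination tensor.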
+  static bool mask(const_thread_storage_t storage, index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    return __tensorops_impl_matmul2d_op_cooperative_destination_tensor_is_valid_element(
+        descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+        dataType, threads);
+  }
+
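+  // Returns the 2-d destination coordinates of the idx-th element in this
+  // thread's storage.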
+  template <typename index_t, __tensor_ops_detail::__rank_t rank = 2>
+  static metal::array<index_t, rank>
+  multidimensional_indices(const_thread_storage_t storage, index_t idx)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    if constexpr (__tensor_ops_detail::__is_same_v<coord_t, ushort>)
+    {
+      ushort coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint16,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, short>)
+    {
+      short coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int16,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, uint>)
+    {
+      uint coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_uint32,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else if constexpr (__tensor_ops_detail::__is_same_v<coord_t, int>)
+    {
+      int coords[2];
+      __tensorops_impl_matmul2d_op_cooperative_destination_tensor_get_coordinate(
+          descriptor, (__tensor_ops_detail::__thread_void_t)storage, idx,
+          dataType, coords, __tensor_ops_detail::__tensor_ops_datatype_int32,
+          threads);
+      return {coords[0], coords[1]};
+    }
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<coord_t>,
+                    "unsupported coordinate type");
+  }
+
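+  // Initializes the per-thread storage for this cooperative_tensor.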
+  static void construct(thread_storage_t storage)
+  {
+    metal::execution_threads t = scope();
+    int threads = t.size();
+    __tensor_ops_detail::__tensor_ops_datatype dataType;
+    if constexpr (__tensor_ops_detail::__is_same_v<element_t, float>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float32;
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, half>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_float16;
+#if __HAVE_BFLOAT__
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, bfloat>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_bfloat16;
+#endif
+    else if constexpr (__tensor_ops_detail::__is_same_v<element_t, int32_t>)
+      dataType = __tensor_ops_detail::__tensor_ops_datatype_int32;
+    else
+      static_assert(__tensor_ops_detail::__assert_false_v<element_t>,
+                    "unsupported data type");
+
+    __tensorops_impl_matmul2d_op_cooperative_destination_tensor_init(
+        (__tensor_ops_detail::__thread_void_t)storage, descriptor, dataType,
+        threads);
+  }
+};
+
+template <__matmul2d_descriptor descriptor,
+          __matmul2d_cooperative_operand_index operand_index, typename scope,
+          typename element_type, typename coord_type, typename... args>
+using __cooperative_tensor_t =
+    typename __operand_layout<descriptor, operand_index, scope, element_type,
+                              coord_type, args...>::cooperative_tensor_t;
+
+template <__matmul2d_descriptor descriptor, typename scope,
+          typename element_type, typename coord_type, typename... args>
+using __cooperative_tensor_destination_t =
+    __cooperative_tensor_t<descriptor,
+                           matmul2d_cooperative_operand_index::destination,
+                           scope, element_type, coord_type, args...>;
+
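+// Creates an empty cooperative destination tensor for the given descriptor,
+// scope, and element/coordinate types.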
+template <__matmul2d_descriptor descriptor, typename scope,
+          typename element_type, typename coord_type, typename left_operand,
+          typename right_operand, typename... args>
+__cooperative_tensor_destination_t<descriptor, scope, element_type, coord_type,
+                                   args...>
+__get_destination_cooperative_tensor()
+{
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+                "scope should be of type __tensorops_scope");
+  return __cooperative_tensor_destination_t<descriptor, scope, element_type,
+                                            coord_type, args...>();
+}
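+
+// A minimal usage sketch of a cooperative destination tensor (illustrative
+// only; the public matmul2d op and the member spellings below are assumptions
+// inferred from the internal hooks in this header, not verbatim API):
+//
+//   constexpr matmul2d_descriptor desc(/*M*/ 32, /*N*/ 32, /*K*/ 16);
+//   matmul2d<desc, execution_simdgroups<4>> op;
+//   auto d = op.get_destination_cooperative_tensor();
+//   op.run(a, b, d);                        // accumulate into registers
+//   for (uint16_t i = 0; i < d.size(); ++i) // elementwise epilogue
+//     if (d.mask(i))
+//       d[i] = metal::max(d[i], 0.0f);      // e.g. fuse a ReLU
+//   d.store(c);                             // write results to memory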
+
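+// Dispatches a matmul2d run to the extern "C" specialization matching the
+// operands' element types and address spaces; the destination may be a
+// regular tensor or a cooperative tensor.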
+template <__matmul2d_descriptor descriptor, typename scope,
+          typename left_operand, typename right_operand,
+          typename destination_operand, typename... args>
+void __run(thread left_operand &leftIn, thread right_operand &rightIn,
+           thread destination_operand &destinationT)
+{
+  using leftTensorType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(leftIn)>>;
+  using rightTensorType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(rightIn)>>;
+  using destinationTensorType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<decltype(destinationT)>>;
+
+  metal::execution_threads t = scope();
+  int threads = t.size();
+
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<leftTensorType>,
+                "Left operand must be a tensor");
+  static_assert(__tensor_ops_detail::__is_tensor_type_v<rightTensorType>,
+                "Right operand must be a tensor");
+  static_assert(
+      __tensor_ops_detail::__is_tensor_type_v<destinationTensorType> ||
+          __tensor_ops_detail::__is_cooperative_tensor_type_v<
+              destinationTensorType>,
+      "Destination operand must be a tensor or cooperative tensor");
+
+  static_assert(__tensor_ops_detail::__is_tensorops_execution_scope_v<scope>,
+                "scope should be of type __tensorops_scope");
+
+  static_assert(__tensor_ops_detail::__get_rank<leftTensorType>() == 2,
+                "Operand must have rank 2");
+  static_assert(__tensor_ops_detail::__get_rank<rightTensorType>() == 2,
+                "Operand must have rank 2");
+  static_assert(__tensor_ops_detail::__get_rank<destinationTensorType>() == 2,
+                "Operand must have rank 2");
+
+  static_assert(
+      __tensor_ops_detail::__is_same_v<typename leftTensorType::index_type,
+                                       int>,
+      "Index type must be int");
+  static_assert(
+      __tensor_ops_detail::__is_same_v<typename rightTensorType::index_type,
+                                       int>,
+      "Index type must be int");
+  static_assert(__tensor_ops_detail::__is_same_v<
+                    typename destinationTensorType::index_type, int>,
+                "Index type must be int");
+
+  using leftPtrType = typename leftTensorType::data_handle_type;
+  using rightPtrType = typename rightTensorType::data_handle_type;
+  using destinationPtrType = typename destinationTensorType::data_handle_type;
+
+  using leftValueType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<
+          typename leftTensorType::element_type>>;
+  using rightValueType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<
+          typename rightTensorType::element_type>>;
+  using destinationValueType = __tensor_ops_detail::__remove_addrspace_t<
+      __tensor_ops_detail::__remove_reference_t<
+          typename destinationTensorType::element_type>>;
+
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType =
+      __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+          leftTensorType>();
+  const __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType =
+      __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+          rightTensorType>();
+
+  thread void *left = (thread void *)(&leftIn);
+  thread void *right = (thread void *)(&rightIn);
+
+  __matmul2d_descriptor desc = descriptor;
+
+  // Single-thread scope: select the specialization for the operands' element
+  // types and address spaces.
+  if constexpr (__tensor_ops_detail::__is_same_v<scope, metal::execution_thread>)
+  {
+    if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
+                      destinationTensorType>)
+    {
+      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+          destinationDescType =
+              __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                  destinationTensorType>();
+
+      thread void *destination = (thread void *)(&destinationT);
+
+      if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<destinationValueType,
+                                                     half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
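+      // f16 x f32 -> f32: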
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
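+      // f16 x i8 -> f32: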
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
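+      // f32 x f16 -> f32: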
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
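+      // f32 x f32 -> f32: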
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
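+      // f32 x i8 -> f32: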
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_i8_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
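+      // i8 x f16 -> f32: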
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f16_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
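+      // i8 x f32 -> f32: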
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_f32_th_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
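+      // i8 x i8 -> i32 (integer accumulation):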
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, int32_t>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_dv_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else if constexpr (
+            __tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_i8_th_i32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
+      else
+        static_assert(
+            __tensor_ops_detail::__assert_false_v<destinationValueType>,
+            "Unsupported type");
+    }
+    else
+      static_assert(
+          __tensor_ops_detail::__assert_false_v<destinationTensorType>,
+          "destination cannot be cooperative tensor with cooperative group of "
+          "size 1");
+  }
+  else
+  {
+    // multiple threads
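+    // Annotation: in the multi-thread path, operands may also live in
+    // threadgroup memory (tg in the specialization names), and the
+    // execution-group handle `threads` is forwarded to each implementation.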
+    if constexpr (!__tensor_ops_detail::__is_cooperative_tensor_type_v<
+                      destinationTensorType>)
+    {
+      __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
+          destinationDescType =
+              __tensor_ops_detail::__tensor_type_to_tensor_descriptor_type<
+                  destinationTensorType>();
+
+      thread void *destination = (thread void *)(&destinationT);
+
+      if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                    __tensor_ops_detail::__is_same_v<destinationValueType,
+                                                     half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
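+      // f16 x i8 -> f16: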
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_i8_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
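+      // i8 x f16 -> f16: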
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, half>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_dv_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_f16_tg_f16(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
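+      // f16 x f16 -> f32: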
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<
+                               destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_f16_tg_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
+        else
+          static_assert(
+              __tensor_ops_detail::__assert_false_v<destinationPtrType>,
+              "Unsupported address space");
+      }
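+      // f16 x f32 -> f32: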
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType,
+                                                          half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType,
+                                                          float> &&
+                         __tensor_ops_detail::__is_same_v<
+                             destinationValueType, float>)
+      {
+        if constexpr (
+            __tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+            __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_f32_dv_f32(
+              desc, left, leftDescType, right, rightDescType, destination,
+              destinationDescType, threads);
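The branches above are the element-type and address-space dispatch behind the matmul2d op's `run` method. For orientation, a kernel-side caller might look like the sketch below; the `matmul2d_descriptor(M, N, K)` constructor arguments and the `execution_thread` scope name are assumptions for illustration, not taken verbatim from this diff.

```c++
// Minimal sketch, not verbatim SDK code: the descriptor arguments and the
// execution scope name (execution_thread) are assumptions.
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>

using namespace mpp::tensor_ops;

// Hypothetical 8x8x8 multiply; a constexpr descriptor is used as a
// non-type template parameter, mirroring how the op is specialized above.
constexpr matmul2d_descriptor gemm_desc(/*M*/ 8, /*N*/ 8, /*K*/ 8);

template <typename LeftTensor, typename RightTensor, typename DestTensor>
inline void run_gemm(LeftTensor left, RightTensor right, DestTensor dest)
{
  // With f16 left/right tensors in device memory and an f32 destination,
  // run() would resolve to the dv_f16_dv_f16_dv_f32 branch above.
  matmul2d<gemm_desc, mpp::execution_thread> op;
  op.run(left, right, dest);
}
```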