NVIDIA · cliffburdick · Feb 22, 2025 · Feb 21, 2025 · Feb 21, 2025
diff --git a/include/matx/transforms/matmul/matmul_cusparse.h b/include/matx/transforms/matmul/matmul_cusparse.h
@@ -7,8 +7,8 @@
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
-// 1. Redistributions of source code must retain the above copyright notice, this
-//    list of conditions and the following disclaimer.
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
 //
 // 2. Redistributions in binary form must reproduce the above copyright notice,
 //    this list of conditions and the following disclaimer in the documentation
@@ -20,14 +20,15 @@
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 /////////////////////////////////////////////////////////////////////////////////
 
 #pragma once
@@ -97,10 +98,9 @@ class MatMulCUSPARSEHandle_t {
                   std::is_same_v<TC, cuda::std::complex<double>>) {
       salpha_ = {alpha, 0};
       sbeta_ = {beta, 0};
-    }
-    else if constexpr (std::is_same_v<TC, float> ||
-                       std::is_same_v<TC, double>) {
-      salpha_ = alpha;;
+    } else if constexpr (std::is_same_v<TC, float> ||
+                         std::is_same_v<TC, double>) {
+      salpha_ = alpha;
       sbeta_ = beta;
     } else {
       MATX_THROW(matxNotSupported, "SpMM currently only supports uniform FP");
@@ -150,9 +150,9 @@ class MatMulCUSPARSEHandle_t {
     // Allocate a workspace for SpMM.
     const cusparseSpMMAlg_t algo = CUSPARSE_SPMM_ALG_DEFAULT;
     const cudaDataType comptp = dtc; // TODO: support separate comp type?!
-    ret = cusparseSpMM_bufferSize(handle_, params_.opA, params_.opB,
-                                  &salpha_, matA_, matB_, &sbeta_,
-                                  matC_, comptp, algo, &workspaceSize_);
+    ret = cusparseSpMM_bufferSize(handle_, params_.opA, params_.opB, &salpha_,
+                                  matA_, matB_, &sbeta_, matC_, comptp, algo,
+                                  &workspaceSize_);
     MATX_ASSERT(ret == CUSPARSE_STATUS_SUCCESS, matxMatMulError);
     if (workspaceSize_) {
       matxAlloc((void **)&workspace_, workspaceSize_, MATX_DEVICE_MEMORY);
@@ -203,8 +203,8 @@ class MatMulCUSPARSEHandle_t {
     const cusparseSpMMAlg_t algo = CUSPARSE_SPMM_ALG_DEFAULT;
     const cudaDataType comptp = MatXTypeToCudaType<TC>(); // TODO: see above
     [[maybe_unused]] cusparseStatus_t ret =
-        cusparseSpMM(handle_, params_.opA, params_.opB, &salpha_, matA_,
-                     matB_, &sbeta_, matC_, comptp, algo, workspace_);
+        cusparseSpMM(handle_, params_.opA, params_.opB, &salpha_, matA_, matB_,
+                     &sbeta_, matC_, comptp, algo, workspace_);
     MATX_ASSERT(ret == CUSPARSE_STATUS_SUCCESS, matxMatMulError);
   }
 
@@ -255,49 +255,69 @@ using gemm_cusparse_cache_t =
     std::unordered_map<MatMulCUSPARSEParams_t, std::any,
                        MatMulCUSPARSEParamsKeyHash, MatMulCUSPARSEParamsKeyEq>;
 
+template <typename Op>
+__MATX_INLINE__ auto getCuSparseSupportedTensor(const Op &in,
+                                                cudaStream_t stream) {
+  // Tensor views qualify only with a unit innermost stride; others always do.
+  const auto unitStride = [&]() {
+    if constexpr (!is_tensor_view_v<Op>) { return true; }
+    else { return in.Stride(Op::Rank() - 1) == 1; }
+  };
+  return GetSupportedTensor(in, unitStride, MATX_ASYNC_DEVICE_MEMORY, stream);
+}
+
 } // end namespace detail
 
 template <typename TensorTypeC, typename TensorTypeA, typename TensorTypeB>
-void sparse_matmul_impl(TensorTypeC &c, const TensorTypeA &a, const TensorTypeB &b,
-                        const cudaExecutor &exec, float alpha = 1.0,
-                        float beta = 0.0) {
+void sparse_matmul_impl(TensorTypeC &C, const TensorTypeA &a,
+                        const TensorTypeB &B, const cudaExecutor &exec,
+                        float alpha = 1.0, float beta = 0.0) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
   const auto stream = exec.getStream();
 
-  using TA = typename TensorTypeA::value_type;
-  using TB = typename TensorTypeB::value_type;
-  using TC = typename TensorTypeC::value_type;
+  // Transform into supported form.
+  auto b = getCuSparseSupportedTensor(B, stream);
+  auto c = getCuSparseSupportedTensor(C, stream);
+  if (!is_matx_transform_op<TensorTypeB>() && !b.isSameView(B)) {
+    (b = B).run(stream);
+  }
+
+  using atype = TensorTypeA;
+  using btype = decltype(b);
+  using ctype = decltype(c);
+
+  using TA = typename atype::value_type;
+  using TB = typename btype::value_type;
+  using TC = typename ctype::value_type;
 
-  static constexpr int RANKA = TensorTypeA::Rank();
-  static constexpr int RANKB = TensorTypeB::Rank();
-  static constexpr int RANKC = TensorTypeC::Rank();
+  static constexpr int RANKA = atype::Rank();
+  static constexpr int RANKB = btype::Rank();
+  static constexpr int RANKC = ctype::Rank();
 
   // Restrictions.
   static_assert(RANKA == 2 && RANKB == 2 && RANKC == 2,
                 "tensors must have rank-2");
-  static_assert(std::is_same_v<TC, TA> &&
-		std::is_same_v<TC, TB>,
+  static_assert(std::is_same_v<TC, TA> && std::is_same_v<TC, TB>,
                 "tensors must have the same data type");
   // TODO: allow MIXED-PRECISION computation!
-  static_assert(std::is_same_v<TC, float> ||
-                std::is_same_v<TC, double> ||
-                std::is_same_v<TC, cuda::std::complex<float>> ||
-                std::is_same_v<TC, cuda::std::complex<double>>,
+  static_assert(std::is_same_v<TC, float> || std::is_same_v<TC, double> ||
+                    std::is_same_v<TC, cuda::std::complex<float>> ||
+                    std::is_same_v<TC, cuda::std::complex<double>>,
                 "unsupported data type");
-  MATX_ASSERT(
-       a.Size(RANKA - 1) == b.Size(RANKB - 2) &&
-       c.Size(RANKC - 1) == b.Size(RANKB - 1) &&
-       c.Size(RANKC - 2) == a.Size(RANKA - 2), matxInvalidSize);
-  MATX_ASSERT(b.Stride(RANKB - 1) == 1 &&
-              c.Stride(RANKC - 1) == 1, matxInvalidParameter);
+  MATX_ASSERT(a.Size(RANKA - 1) == b.Size(RANKB - 2) &&
+                  c.Size(RANKC - 1) == b.Size(RANKB - 1) &&
+                  c.Size(RANKC - 2) == a.Size(RANKA - 2),
+              matxInvalidSize);
+  MATX_ASSERT(b.Stride(RANKB - 1) == 1 && c.Stride(RANKC - 1) == 1,
+              matxInvalidParameter);
 
   // Get parameters required by these tensors (for caching).
   auto params =
-      detail::MatMulCUSPARSEHandle_t<TensorTypeC, TensorTypeA, TensorTypeB>::GetGemmParams(
+      detail::MatMulCUSPARSEHandle_t<ctype, atype, btype>::GetGemmParams(
           c, a, b, stream, alpha, beta);
 
   // Lookup and cache.
-  using cache_val_type = detail::MatMulCUSPARSEHandle_t<TensorTypeC, TensorTypeA, TensorTypeB>;
+  using cache_val_type = detail::MatMulCUSPARSEHandle_t<ctype, atype, btype>;
   detail::GetCache().LookupAndExec<detail::gemm_cusparse_cache_t>(
       detail::GetCacheIdFromType<detail::gemm_cusparse_cache_t>(), params,
       [&]() {
@@ -306,6 +326,11 @@ void sparse_matmul_impl(TensorTypeC &c, const TensorTypeA &a, const TensorTypeB
       [&](std::shared_ptr<cache_val_type> cache_type) {
         cache_type->Exec(c, a, b);
       });
+
+  // Copy transformed output back.
+  if (!c.isSameView(C)) {
+    (C = c).run(stream);
+  }
 }
 
 } // end namespace matx
diff --git a/test/00_sparse/Matmul.cu b/test/00_sparse/Matmul.cu
@@ -20,14 +20,15 @@
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
 /////////////////////////////////////////////////////////////////////////////////
 
 #include "assert.h"
@@ -135,7 +136,24 @@ TYPED_TEST(MatmulSparseTestsAll, MatmulCOO) {
       else {
         ASSERT_NEAR(O(i, j), E(i, j), this->thresh);
       }
+    }
+  }
+
+  // Allow transforming output.
+  auto TO = make_tensor<TestType>({n, m});
+  (transpose(TO) = matmul(S, B)).run(exec);
 
+  // Verify result.
+  exec.sync();
+  for (index_t i = 0; i < m; i++) {
+    for (index_t j = 0; j < n; j++) {
+      if constexpr (is_complex_v<TestType>) {
+        ASSERT_NEAR(TO(j, i).real(), E(i, j).real(), this->thresh);
+        ASSERT_NEAR(TO(j, i).imag(), E(i,j ).imag(), this->thresh);
+      }
+      else {
+        ASSERT_NEAR(TO(j, i), E(i, j), this->thresh);
+      }
     }
   }
 
@@ -177,7 +195,6 @@ TYPED_TEST(MatmulSparseTestsAll, MatmulCSR) {
       else {
         ASSERT_NEAR(O(i, j), E(i, j), this->thresh);
       }
-
     }
   }
 
@@ -219,7 +236,26 @@ TYPED_TEST(MatmulSparseTestsAll, MatmulCSC) {
       else {
         ASSERT_NEAR(O(i, j), E(i, j), this->thresh);
       }
+    }
+  }
+
+  // Allow dense computations (pre-/post-matmul).
+  TestType C3 = static_cast<TestType>(3);
+  TestType C5 = static_cast<TestType>(5);
+  (B = (B - C3)).run(exec);
+  (O = matmul(S, B + C3) + C5).run(exec);
 
+  // Verify result.
+  exec.sync();
+  for (index_t i = 0; i < m; i++) {
+    for (index_t j = 0; j < n; j++) {
+      if constexpr (is_complex_v<TestType>) {
+        ASSERT_NEAR((O(i, j) - C5).real(), E(i, j).real(), this->thresh);
+        ASSERT_NEAR((O(i, j) - C5).imag(), E(i,j ).imag(), this->thresh);
+      }
+      else {
+        ASSERT_NEAR(O(i, j) - C5, E(i, j), this->thresh);
+      }
     }
   }