NVIDIA
diff --git a/‎CMakeLists.txt‎
Lines changed: 11 additions & 5 deletions b/‎CMakeLists.txt‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎examples/black_scholes.cu‎
Lines changed: 1 addition & 1 deletion b/‎examples/black_scholes.cu‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/convolution.cu‎
Lines changed: 0 additions & 1 deletion b/‎examples/convolution.cu‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎examples/mvdr_beamformer.h‎
Lines changed: 3 additions & 3 deletions b/‎examples/mvdr_beamformer.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎examples/recursive_filter.cu‎
Lines changed: 0 additions & 3 deletions b/‎examples/recursive_filter.cu‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎include/matx/core/half.h‎
Lines changed: 2 additions & 2 deletions b/‎include/matx/core/half.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎include/matx/core/half_complex.h‎
Lines changed: 3 additions & 3 deletions b/‎include/matx/core/half_complex.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎include/matx/core/operator_utils.h‎
Lines changed: 0 additions & 2 deletions b/‎include/matx/core/operator_utils.h‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎include/matx/core/print.h‎
Lines changed: 1 addition & 1 deletion b/‎include/matx/core/print.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/matx/core/storage.h‎
Lines changed: 2 additions & 2 deletions b/‎include/matx/core/storage.h‎
Lines changed: 2 additions & 2 deletions
@@ -134,8 +134,10 @@ rapids_cpm_cccl(
 
 target_link_libraries(matx INTERFACE CCCL::CCCL)
 
-# Set flags for compiling tests faster
-set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+# Set flags for compiling tests faster (only for nvcc)
+if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+    set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+endif()
 
 # Hack because CMake doesn't have short-circuit evaluation
 if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -165,9 +167,13 @@ if (NOT ${IS_NVCPP} GREATER -1)
     endif()
 endif()
 
-
-
-set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+# Select CUDA warning flags according to the CUDA compiler in use.
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+    # STATUS must be the first bare argument of message(); wrapping the
+    # arguments in an extra pair of parentheses turns it into plain text.
+    message(STATUS "Using Clang compiler")
+    # Workaround for clang bug: https://github.com/llvm/llvm-project/issues/58491
+    set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Wno-unused-command-line-argument>)
+else()
+    # Any non-Clang CUDA compiler keeps the original warnings-as-errors flags.
+    set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+endif()
 set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)
 
 # CUTLASS slows down compile times when used, so leave it as optional for now
 
@@ -61,7 +61,7 @@ private:
 
 public:
   BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T)
-      : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T)  {}
+      : out_(out), V_(V), S_(S), K_(K), r_(r), T_(T)  {}
 
   __device__ inline void operator()(index_t idx)
   {
 
@@ -39,7 +39,6 @@ using namespace matx;
 int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
 {
   MATX_ENTER_HANDLER();
-  typedef cuda::std::complex<float> complex;
 
   uint32_t iterations = 10;
   constexpr index_t numSamples = 1638400;
 
@@ -164,9 +164,9 @@ class MVDRBeamformer {
   auto GetCovMatInvView() { return invCovMatView; }
 
 private:
-  index_t num_beams_;
-  index_t num_el_;
-  index_t data_len_;
+  [[maybe_unused]] index_t num_beams_;
+  [[maybe_unused]] index_t num_el_;
+  [[maybe_unused]] index_t data_len_;
   index_t snap_len_;
   cuda::std::complex<float> load_coeff_ = {0.1f, 0.f};
 
 
@@ -40,8 +40,6 @@ using namespace matx;
 int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
 {
   MATX_ENTER_HANDLER();
-  using complex = cuda::std::complex<float>;
-
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);
 
@@ -70,7 +68,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
 
-  using OutType = float;
   using InType = float;
   using FilterType = float;
 
 
@@ -417,7 +417,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator==(const T &lhs,
                                                     const matxHalf<T> &rhs)
 {
   matxHalf<T> tmp{lhs};
-  return lhs == tmp;
+  return rhs == tmp;
 }
 
 /**
@@ -464,7 +464,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator!=(const T &lhs,
                                                     const matxHalf<T> &rhs)
 {
   matxHalf<T> tmp{lhs};
-  return !(lhs == tmp);
+  return !(rhs == tmp);
 }
 
 /**
 
@@ -515,7 +515,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
 operator==(const T &lhs, const matxHalfComplex<T> &rhs)
 {
   matxHalfComplex<T> tmp{lhs};
-  return lhs == tmp;
+  return rhs == tmp;
 }
 
 /**
@@ -562,7 +562,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
 operator!=(const T &lhs, const matxHalfComplex<T> &rhs)
 {
   matxHalfComplex<T> tmp{lhs};
-  return !(lhs == tmp);
+  return !(rhs == tmp);
 }
 
 
@@ -853,7 +853,7 @@ pow(const T &x, const matxHalfComplex<T> &y)
 {
+  // Promote the half-precision complex exponent y to complex<float> so the
+  // power is evaluated in single precision.
   cuda::std::complex<float> tmp{static_cast<float>(y.real()),
                                 static_cast<float>(y.imag())};
-  tmp = cuda::std::pow(y, pow);
+  // Compute x^y: real base promoted to float, complex exponent taken from tmp.
+  // (Passing `pow` itself as the exponent named this function, not a value.)
+  tmp = cuda::std::pow(static_cast<float>(x), tmp);
   return {static_cast<T>(tmp.real()), static_cast<T>(tmp.imag())};
 }
 
 
@@ -132,8 +132,6 @@ namespace matx {
 
     template <typename Op, typename ValidFunc>
     __MATX_INLINE__ auto GetSupportedTensor(const Op &in, const ValidFunc &fn, matxMemorySpace_t space, cudaStream_t stream = 0) {
-      constexpr int RANK = Op::Rank();
-
       if constexpr (is_matx_transform_op<Op>()) {
         // We can assume that if a transform is passed to the input then PreRun has already completed
         // on the transform and we can use the internal pointer
 
@@ -653,7 +653,7 @@ namespace matx {
    */
   template <typename Op, typename... Args,
             std::enable_if_t<(Op::Rank() > 0 && sizeof...(Args) == 0), bool> = true>
-  void fprint(FILE* fp, const Op &op, Args... dims) {
+  void fprint(FILE* fp, const Op &op, [[maybe_unused]] Args... dims) {
     cuda::std::array<int, Op::Rank()> arr = {0};
     auto tp = cuda::std::tuple_cat(arr);
     cuda::std::apply([&](auto &&...args) { fprint(fp, op, args...); }, tp);
 
@@ -406,7 +406,7 @@ namespace matx
      */
     void SetData(T *const data) noexcept
     {
       // Rebind the stored handle to the caller's pointer. The empty lambda is a
       // no-op deleter, so this storage never frees `data`
       // (assumes data_ is a shared_ptr-like handle; TODO confirm).
-      data_.reset(data_, [](auto){});
+      data_.reset(data, [](auto){});
     }
 
     /**
@@ -423,7 +423,7 @@ namespace matx
      * 
      * @param size Size in bytes to allocate
      */
-    __MATX_INLINE__ T* allocate(size_t size)
+    __MATX_INLINE__ T* allocate([[maybe_unused]] size_t size)
     {
       // This storage type never allocates: `size` is ignored and the call only
       // reports matxInvalidParameter through MATX_THROW.
       MATX_THROW(matxInvalidParameter, "Cannot call allocate on a smart pointer storage type");
     }
Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ private:`
`61`	`61`
`62`	`62`	`public:`
`63`	`63`	`BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T)`
`64`		`- : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {}`
	`64`	`+ : out_(out), V_(V), S_(S), K_(K), r_(r), T_(T) {}`
`65`	`65`
`66`	`66`	`__device__ inline void operator()(index_t idx)`
`67`	`67`	`{`
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,6 @@ using namespace matx;`
`39`	`39`	`int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)`
`40`	`40`	`{`
`41`	`41`	`MATX_ENTER_HANDLER();`
`42`		`- typedef cuda::std::complex<float> complex;`
`43`	`42`
`44`	`43`	`uint32_t iterations = 10;`
`45`	`44`	`constexpr index_t numSamples = 1638400;`
Original file line number	Diff line number	Diff line change
`@@ -417,7 +417,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator==(const T &lhs,`
`417`	`417`	`const matxHalf<T> &rhs)`
`418`	`418`	`{`
`419`	`419`	`matxHalf<T> tmp{lhs};`
`420`		`- return lhs == tmp;`
	`420`	`+ return rhs == tmp;`
`421`	`421`	`}`
`422`	`422`
`423`	`423`	`/**`
`@@ -464,7 +464,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator!=(const T &lhs,`
`464`	`464`	`const matxHalf<T> &rhs)`
`465`	`465`	`{`
`466`	`466`	`matxHalf<T> tmp{lhs};`
`467`		`- return !(lhs == tmp);`
	`467`	`+ return !(rhs == tmp);`
`468`	`468`	`}`
`469`	`469`
`470`	`470`	`/**`
Original file line number	Diff line number	Diff line change
`@@ -515,7 +515,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool`
`515`	`515`	`operator==(const T &lhs, const matxHalfComplex<T> &rhs)`
`516`	`516`	`{`
`517`	`517`	`matxHalfComplex<T> tmp{lhs};`
`518`		`- return lhs == tmp;`
	`518`	`+ return rhs == tmp;`
`519`	`519`	`}`
`520`	`520`
`521`	`521`	`/**`
`@@ -562,7 +562,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool`
`562`	`562`	`operator!=(const T &lhs, const matxHalfComplex<T> &rhs)`
`563`	`563`	`{`
`564`	`564`	`matxHalfComplex<T> tmp{lhs};`
`565`		`- return !(lhs == tmp);`
	`565`	`+ return !(rhs == tmp);`
`566`	`566`	`}`
`567`	`567`
`568`	`568`
`@@ -853,7 +853,7 @@ pow(const T &x, const matxHalfComplex<T> &y)`
`853`	`853`	`{`
`854`	`854`	`cuda::std::complex<float> tmp{static_cast<float>(y.real()),`
`855`	`855`	`static_cast<float>(y.imag())};`
`856`		`- tmp = cuda::std::pow(y, pow);`
	`856`	`+ tmp = cuda::std::pow(x, pow);`
`857`	`857`	`return {static_cast<T>(tmp.real()), static_cast<T>(tmp.imag())};`
`858`	`858`	`}`
`859`	`859`
Original file line number	Diff line number	Diff line change
`@@ -406,7 +406,7 @@ namespace matx`
`406`	`406`	`*/`
`407`	`407`	`void SetData(T *const data) noexcept`
`408`	`408`	`{`
`409`		`- data_.reset(data_, [](auto){});`
	`409`	`+ data_.reset(data, [](auto){});`
`410`	`410`	`}`
`411`	`411`
`412`	`412`	`/**`
`@@ -423,7 +423,7 @@ namespace matx`
`423`	`423`	`*`
`424`	`424`	`* @param size Size in bytes to allocate`
`425`	`425`	`*/`
`426`		`- __MATX_INLINE__ T* allocate(size_t size)`
	`426`	`+ __MATX_INLINE__ T* allocate([[maybe_unused]] size_t size)`
`427`	`427`	`{`
`428`	`428`	`MATX_THROW(matxInvalidParameter, "Cannot call allocate on a smart pointer storage type");`
`429`	`429`	`}`