Fixing warnings issued by clang-19, both host and device #825

Merged: 1 commit, Jan 16, 2025
16 changes: 11 additions & 5 deletions CMakeLists.txt
@@ -132,8 +132,10 @@ rapids_cpm_cccl(

target_link_libraries(matx INTERFACE CCCL::CCCL)

-# Set flags for compiling tests faster
-set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+# Set flags for compiling tests faster (only for nvcc)
+if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+endif()

# Hack because CMake doesn't have short circuit evaluation
if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -163,9 +165,13 @@ if (NOT ${IS_NVCPP} GREATER -1)
endif()
endif()

-
-
-set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+message(STATUS "Using Clang compiler")
+# Workaround for clang bug: https://github.com/llvm/llvm-project/issues/58491
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Wno-unused-command-line-argument>)
+else()
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+endif()
set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)

# CUTLASS slows down compile times when used, so leave it as optional for now
2 changes: 1 addition & 1 deletion examples/black_scholes.cu
@@ -61,7 +61,7 @@ private:

public:
BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T)
-: out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {}
+: out_(out), V_(V), S_(S), K_(K), r_(r), T_(T) {}

__device__ inline void operator()(index_t idx)
{
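Note on the change above: reordering the initializer list silences clang's -Wreorder-ctor, which fires when members appear in the initializer list in a different order than they are declared; initialization always follows declaration order, so a mismatched list is misleading (and dangerous if one member's initializer reads another). A minimal sketch of the pattern, using a made-up Params class rather than the MatX code:

// reorder.cpp — compile with: clang++ -std=c++17 -Wall -c reorder.cpp
struct Params {
  int a_;
  int b_;

  // Buggy form: clang warns "field 'b_' will be initialized after field 'a_'"
  // [-Wreorder-ctor], because a_ is actually initialized first regardless.
  // Params(int a, int b) : b_(b), a_(a) {}

  // Fixed: the written order matches the declaration (and actual) order.
  Params(int a, int b) : a_(a), b_(b) {}
};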
1 change: 0 additions & 1 deletion examples/convolution.cu
@@ -39,7 +39,6 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
-typedef cuda::std::complex<float> complex;

uint32_t iterations = 10;
constexpr index_t numSamples = 1638400;
6 changes: 3 additions & 3 deletions examples/mvdr_beamformer.h
@@ -164,9 +164,9 @@ class MVDRBeamformer {
auto GetCovMatInvView() { return invCovMatView; }

private:
-index_t num_beams_;
-index_t num_el_;
-index_t data_len_;
+[[maybe_unused]] index_t num_beams_;
+[[maybe_unused]] index_t num_el_;
+[[maybe_unused]] index_t data_len_;
index_t snap_len_;
cuda::std::complex<float> load_coeff_ = {0.1f, 0.f};

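The [[maybe_unused]] attribute here addresses clang's -Wunused-private-field: these members are stored but apparently never read, and clang warns about private fields with no uses. A minimal sketch with an illustrative class (not the MatX one):

// unused_field.cpp — compile with: clang++ -std=c++17 -Wunused-private-field -c unused_field.cpp
class Config {
public:
  explicit Config(int verbosity, int reserved)
      : verbosity_(verbosity), reserved_(reserved) {}
  int Verbosity() const { return verbosity_; }

private:
  int verbosity_;
  // Stored but never read; without the attribute clang reports:
  // warning: private field 'reserved_' is not used [-Wunused-private-field]
  [[maybe_unused]] int reserved_;
};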
3 changes: 0 additions & 3 deletions examples/recursive_filter.cu
@@ -40,8 +40,6 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
-using complex = cuda::std::complex<float>;
-
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

@@ -70,7 +68,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaEventCreate(&start);
cudaEventCreate(&stop);

-using OutType = float;
using InType = float;
using FilterType = float;

4 changes: 2 additions & 2 deletions include/matx/core/half.h
@@ -417,7 +417,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator==(const T &lhs,
const matxHalf<T> &rhs)
{
matxHalf<T> tmp{lhs};
-return lhs == tmp;
+return rhs == tmp;
}

/**
@@ -464,7 +464,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator!=(const T &lhs,
const matxHalf<T> &rhs)
{
matxHalf<T> tmp{lhs};
-return !(lhs == tmp);
+return !(rhs == tmp);
}

/**
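Both hunks above fix unintended recursion, presumably what clang-19's -Winfinite-recursion caught: inside operator==(const T&, const matxHalf<T>&), the expression lhs == tmp again compares a T against a matxHalf<T>, so the operator calls itself until the stack overflows; comparing rhs == tmp instead dispatches to the matxHalf-vs-matxHalf overload. A minimal sketch of the same trap with a made-up wrapper type:

// recursion.cpp — compile with: clang++ -std=c++17 -Winfinite-recursion -c recursion.cpp
template <typename T> struct Wrap { T v; };

template <typename T>
bool operator==(const Wrap<T> &a, const Wrap<T> &b) { return a.v == b.v; }

template <typename T>
bool operator==(const T &lhs, const Wrap<T> &rhs) {
  Wrap<T> tmp{lhs};
  // Bug: lhs is a T and tmp is a Wrap<T>, so this re-enters this same
  // overload: warning: all paths through this function will call itself
  // [-Winfinite-recursion]
  // return lhs == tmp;
  return rhs == tmp;  // fixed: Wrap<T> == Wrap<T> uses the overload above
}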
6 changes: 3 additions & 3 deletions include/matx/core/half_complex.h
@@ -515,7 +515,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
operator==(const T &lhs, const matxHalfComplex<T> &rhs)
{
matxHalfComplex<T> tmp{lhs};
-return lhs == tmp;
+return rhs == tmp;
}

/**
@@ -562,7 +562,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
operator!=(const T &lhs, const matxHalfComplex<T> &rhs)
{
matxHalfComplex<T> tmp{lhs};
-return !(lhs == tmp);
+return !(rhs == tmp);
}


@@ -853,7 +853,7 @@ pow(const T &x, const matxHalfComplex<T> &y)
{
cuda::std::complex<float> tmp{static_cast<float>(y.real()),
static_cast<float>(y.imag())};
-tmp = cuda::std::pow(y, tmp);
+tmp = cuda::std::pow(x, tmp);
return {static_cast<T>(tmp.real()), static_cast<T>(tmp.imag())};
}

2 changes: 0 additions & 2 deletions include/matx/core/operator_utils.h
@@ -132,8 +132,6 @@ namespace matx {

template <typename Op, typename ValidFunc>
__MATX_INLINE__ auto GetSupportedTensor(const Op &in, const ValidFunc &fn, matxMemorySpace_t space, cudaStream_t stream = 0) {
-constexpr int RANK = Op::Rank();
-
if constexpr (is_matx_transform_op<Op>()) {
// We can assume that if a transform is passed to the input then PreRun has already completed
// on the transform and we can use the internal pointer
2 changes: 1 addition & 1 deletion include/matx/core/print.h
@@ -653,7 +653,7 @@ namespace matx {
*/
template <typename Op, typename... Args,
std::enable_if_t<(Op::Rank() > 0 && sizeof...(Args) == 0), bool> = true>
-void fprint(FILE* fp, const Op &op, Args... dims) {
+void fprint(FILE* fp, const Op &op, [[maybe_unused]] Args... dims) {
cuda::std::array<int, Op::Rank()> arr = {0};
auto tp = cuda::std::tuple_cat(arr);
cuda::std::apply([&](auto &&...args) { fprint(fp, op, args...); }, tp);
4 changes: 2 additions & 2 deletions include/matx/core/storage.h
@@ -406,7 +406,7 @@ namespace matx
*/
void SetData(T *const data) noexcept
{
-data_.reset(data_, [](auto){});
+data_.reset(data, [](auto){});
}

/**
@@ -423,7 +423,7 @@
*
* @param size Size in bytes to allocate
*/
-__MATX_INLINE__ T* allocate(size_t size)
+__MATX_INLINE__ T* allocate([[maybe_unused]] size_t size)
{
MATX_THROW(matxInvalidParameter, "Cannot call allocate on a smart pointer storage type");
}
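The SetData change is a real bug fix, not just warning hygiene: the old line passed the member data_ to its own reset() instead of the incoming data parameter, so the storage was never retargeted at the caller's pointer. A minimal sketch of the intended pattern — non-owning storage that wraps a caller-owned pointer in a shared_ptr with a no-op deleter; NonOwningStorage is an illustrative stand-in, not MatX's actual storage class:

// non_owning.cpp — compile with: clang++ -std=c++17 non_owning.cpp
#include <cstdio>
#include <memory>

template <typename T>
class NonOwningStorage {
public:
  // Point at caller-owned memory; the no-op deleter means it is never freed here.
  void SetData(T *const data) noexcept {
    data_.reset(data, [](auto) {});  // 'data' (the parameter), not 'data_'
  }
  T *Get() const noexcept { return data_.get(); }

private:
  std::shared_ptr<T> data_;
};

int main() {
  float buf[4] = {1.f, 2.f, 3.f, 4.f};
  NonOwningStorage<float> s;
  s.SetData(buf);
  std::printf("%f\n", static_cast<double>(s.Get()[0]));  // prints 1.000000
}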
6 changes: 3 additions & 3 deletions include/matx/core/tensor.h
@@ -967,7 +967,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
Reset(T *const data, T *const ldata) noexcept
{
storage_.SetData(data);
-this->SetData(data);
+this->SetData(ldata);
}


@@ -1074,7 +1074,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {

__MATX_INLINE__ __MATX_HOST__ bool IsManagedPointer() {
bool managed;
-const CUresult retval = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)this->Data());
+[[maybe_unused]] const CUresult retval = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)this->Data());
MATX_ASSERT(retval == CUDA_SUCCESS, matxNotSupported);
return managed;
}
@@ -1453,7 +1453,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {

// Determine where this memory resides
auto kind = GetPointerKind(this->Data());
-auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->Data()));
+[[maybe_unused]] auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->Data()));
MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");
if (kind == MATX_INVALID_MEMORY) {
if (mem_type == CU_MEMORYTYPE_DEVICE) {
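Here [[maybe_unused]] is applied to variables whose only consumer is an assertion macro. Assuming MATX_ASSERT and MATX_ASSERT_STR_EXP compile to nothing in release builds (an inference from the diff, not verified against the macro definitions), the variables become unused in those builds and clang warns. The standard assert shows the same pattern:

// assert_only.cpp — compile with: clang++ -std=c++17 -Wall -DNDEBUG -c assert_only.cpp
#include <cassert>

int Validate(int fd) {
  // With -DNDEBUG, assert() expands to nothing; without the attribute clang
  // reports: warning: unused variable 'rc' [-Wunused-variable]
  [[maybe_unused]] const int rc = (fd >= 0) ? 0 : -1;
  assert(rc == 0);
  return fd;
}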
2 changes: 1 addition & 1 deletion include/matx/core/tensor_utils.h
@@ -155,7 +155,7 @@ namespace matx
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto BlockToIdx(const Op &op, index_t abs, int nb_dims) {
using l_stride_type = index_t;
using l_shape_type = index_t;
-constexpr int RANK = op.Rank();
+constexpr int RANK = Op::Rank();
cuda::std::array<l_shape_type, RANK> indices{0};

for (int idx = 0; idx < RANK - nb_dims; idx++) {
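The switch from op.Rank() to Op::Rank() matters because RANK is declared constexpr: a reference parameter like op is not usable in a constant expression (a restriction only relaxed by C++23's P2280), so stricter compilers reject the instance form even when Rank() itself is a static constexpr member, while nvcc had accepted it. Calling through the type is the portable spelling. A minimal sketch with an illustrative Tensor type:

// static_call.cpp — compile with: clang++ -std=c++17 -c static_call.cpp
template <int R> struct Tensor {
  static constexpr int Rank() { return R; }
};

template <typename Op>
int FirstLoopBound(const Op &op) {
  // constexpr int RANK = op.Rank();  // may be rejected: 'op' is a reference
  //                                  // parameter, not a constant expression
  constexpr int RANK = Op::Rank();    // portable: call through the type
  (void)op;
  return RANK;
}

int main() { return FirstLoopBound(Tensor<3>{}) - 3; }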
6 changes: 3 additions & 3 deletions include/matx/core/utils.h
@@ -134,9 +134,9 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &
//__half2 Y = make_half2(y.real(), y.imag());
//__half2 Z = make_half2(z.real(), z.imag());

-const __half2 &X = *reinterpret_cast<const __half2*>(&x);
-const __half2 &Y = *reinterpret_cast<const __half2*>(&y);
-const __half2 &Z = *reinterpret_cast<const __half2*>(&z);
+[[maybe_unused]] const __half2 &X = *reinterpret_cast<const __half2*>(&x);
+[[maybe_unused]] const __half2 &Y = *reinterpret_cast<const __half2*>(&y);
+[[maybe_unused]] const __half2 &Z = *reinterpret_cast<const __half2*>(&z);

#if 1
#ifdef __CUDA_ARCH__
4 changes: 3 additions & 1 deletion include/matx/executors/host.h
@@ -58,15 +58,17 @@ enum class ThreadsMode {

struct HostExecParams {
HostExecParams(int threads = 1) : threads_(threads) {}
-HostExecParams(cpu_set_t cpu_set) : cpu_set_(cpu_set), threads_(1) {
+HostExecParams(cpu_set_t cpu_set) : threads_(1), cpu_set_(cpu_set) {
MATX_ASSERT_STR(false, matxNotSupported, "CPU affinity not supported yet");
}

int GetNumThreads() const { return threads_; }

private:
int threads_;
+MATX_IGNORE_WARNING_PUSH_CLANG("-Wunused-private-field")
cpu_set_t cpu_set_ {0};
+MATX_IGNORE_WARNING_POP_CLANG
};

/**
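Two clang fixes share this hunk: the initializer list is reordered to match the declaration order (threads_ precedes cpu_set_), and the intentionally unused cpu_set_ field is bracketed with MatX's suppression macros. Assuming those macros expand to clang diagnostic pragmas, which their names suggest but the diff does not show, the underlying mechanism is:

// pragma_suppress.cpp — compile with: clang++ -std=c++17 -Wunused-private-field -c pragma_suppress.cpp
class Scheduler {
public:
  explicit Scheduler(int threads) : threads_(threads) {}
  int Threads() const { return threads_; }

private:
  int threads_;
// Push the diagnostic state, disable one warning for the lines in between,
// then pop so the suppression cannot leak into the rest of the file.
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-private-field"
#endif
  int reserved_{0};  // kept for future use; intentionally unused for now
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
};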
4 changes: 2 additions & 2 deletions include/matx/generators/chirp.h
@@ -67,8 +67,8 @@ namespace matx
inline __MATX_HOST__ __MATX_DEVICE__ Chirp(SpaceOp sop, FreqType f0, space_type t1, FreqType f1, ChirpMethod method) :
sop_(sop),
f0_(f0),
-f1_(f1),
t1_(t1),
+f1_(f1),
method_(method)
{}

Expand Down Expand Up @@ -109,8 +109,8 @@ namespace matx
inline __MATX_HOST__ __MATX_DEVICE__ ComplexChirp(SpaceOp sop, FreqType f0, space_type t1, FreqType f1, ChirpMethod method) :
sop_(sop),
f0_(f0),
-t1_(t1),
f1_(f1),
+t1_(t1),
method_(method)
{}

10 changes: 5 additions & 5 deletions include/matx/kernels/channelize_poly.cuh
@@ -290,22 +290,22 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
if (outdims[OutElemRank] <= last_elem) {
const filter_t *h = h_start;
output_t accum { 0 };
-const int first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
+const uint32_t first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
// The footprint of samples involved in the convolution may wrap from the end
// to the beginning of smem_input. The prologue below handles the samples from
// the current tail to the end of smem_input and the epilogue starts back at the
// beginning of smem_input.
-const int prologue_count = (first_end - cached_input_ind_tail + 1);
-const int epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
+const uint32_t prologue_count = (first_end - cached_input_ind_tail + 1);
+const uint32_t epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan);
// Apply the filter h in reverse order below to flip the filter for convolution
-for (int k = 0; k < prologue_count; k++) {
+for (uint32_t k = 0; k < prologue_count; k++) {
accum += (*h) * (*sample);
sample += num_channels;
h -= num_channels;
}
sample = smem_input + (num_channels - 1 - chan);
-for (int k = 0; k < epilogue_count; k++) {
+for (uint32_t k = 0; k < epilogue_count; k++) {
accum += (*h) * (*sample);
sample += num_channels;
h -= num_channels;
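Changing these counters from int to uint32_t resolves clang's -Wsign-compare: the loop bounds derive from unsigned quantities, and comparing a signed int against an unsigned value triggers an implicit conversion that misbehaves for negative values. A minimal sketch of the warning and the fix, with made-up names:

// sign_compare.cpp — compile with: clang++ -std=c++17 -Wsign-compare -c sign_compare.cpp
#include <cstdint>

uint32_t SumPrologue(const uint32_t *samples, uint32_t prologue_count) {
  uint32_t accum = 0;
  // for (int k = 0; k < prologue_count; k++)  // clang: comparison of integers
  //                                           // of different signs [-Wsign-compare]
  for (uint32_t k = 0; k < prologue_count; k++) {  // match the bound's type
    accum += samples[k];
  }
  return accum;
}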