Fixing warnings issued by clang-19, both host and device #825

Merged: 1 commit, Jan 16, 2025
16 changes: 11 additions & 5 deletions CMakeLists.txt
@@ -132,8 +132,10 @@ rapids_cpm_cccl(

target_link_libraries(matx INTERFACE CCCL::CCCL)

-# Set flags for compiling tests faster
-set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+# Set flags for compiling tests faster (only for nvcc)
+if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
+endif()

# Hack because CMake doesn't have short circuit evaluation
if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -163,9 +165,13 @@ if (NOT ${IS_NVCPP} GREATER -1)
endif()
endif()

-
-
-set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
+message(STATUS "Using Clang compiler")
+# Workaround for clang bug: https://github.com/llvm/llvm-project/issues/58491
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Wno-unused-command-line-argument>)
+else()
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+endif()
set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)

# CUTLASS slows down compile times when used, so leave it as optional for now
2 changes: 1 addition & 1 deletion examples/black_scholes.cu
@@ -61,7 +61,7 @@ private:

public:
BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T)
-: out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {}
+: out_(out), V_(V), S_(S), K_(K), r_(r), T_(T) {}

__device__ inline void operator()(index_t idx)
{
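Note on the change above: reordering the initializer list silences clang's -Wreorder-ctor, which fires when members appear in the initializer list in a different order than they are declared; initialization always follows declaration order, so a mismatched list is misleading (and dangerous if one member's initializer reads another). A minimal sketch of the pattern, using a made-up Params class rather than the MatX code:

// reorder.cpp — compile with: clang++ -std=c++17 -Wall -c reorder.cpp
struct Params {
  int a_;
  int b_;

  // Buggy form: clang warns "field 'b_' will be initialized after field 'a_'"
  // [-Wreorder-ctor], because a_ is actually initialized first regardless.
  // Params(int a, int b) : b_(b), a_(a) {}

  // Fixed: the written order matches the declaration (and actual) order.
  Params(int a, int b) : a_(a), b_(b) {}
};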
1 change: 0 additions & 1 deletion examples/convolution.cu
@@ -39,7 +39,6 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
-typedef cuda::std::complex<float> complex;

uint32_t iterations = 10;
constexpr index_t numSamples = 1638400;
6 changes: 3 additions & 3 deletions examples/mvdr_beamformer.h
@@ -164,9 +164,9 @@ class MVDRBeamformer {
auto GetCovMatInvView() { return invCovMatView; }

private:
-index_t num_beams_;
-index_t num_el_;
-index_t data_len_;
+[[maybe_unused]] index_t num_beams_;
+[[maybe_unused]] index_t num_el_;
+[[maybe_unused]] index_t data_len_;
index_t snap_len_;
cuda::std::complex<float> load_coeff_ = {0.1f, 0.f};

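The [[maybe_unused]] attribute here addresses clang's -Wunused-private-field: these members are stored but apparently never read, and clang warns about private fields with no uses. A minimal sketch with an illustrative class (not the MatX one):

// unused_field.cpp — compile with: clang++ -std=c++17 -Wunused-private-field -c unused_field.cpp
class Config {
public:
  explicit Config(int verbosity, int reserved)
      : verbosity_(verbosity), reserved_(reserved) {}
  int Verbosity() const { return verbosity_; }

private:
  int verbosity_;
  // Stored but never read; without the attribute clang reports:
  // warning: private field 'reserved_' is not used [-Wunused-private-field]
  [[maybe_unused]] int reserved_;
};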
3 changes: 0 additions & 3 deletions examples/recursive_filter.cu
@@ -40,8 +40,6 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
-using complex = cuda::std::complex<float>;
-
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

@@ -70,7 +68,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaEventCreate(&start);
cudaEventCreate(&stop);

-using OutType = float;
using InType = float;
using FilterType = float;

4 changes: 2 additions & 2 deletions include/matx/core/half.h
@@ -417,7 +417,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator==(const T &lhs,
const matxHalf<T> &rhs)
{
matxHalf<T> tmp{lhs};
-return lhs == tmp;
+return rhs == tmp;
}

/**
@@ -464,7 +464,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator!=(const T &lhs,
const matxHalf<T> &rhs)
{
matxHalf<T> tmp{lhs};
-return !(lhs == tmp);
+return !(rhs == tmp);
}

/**
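Both hunks above fix unintended recursion, presumably what clang-19's -Winfinite-recursion caught: inside operator==(const T&, const matxHalf<T>&), the expression lhs == tmp again compares a T against a matxHalf<T>, so the operator calls itself until the stack overflows; comparing rhs == tmp instead dispatches to the matxHalf-vs-matxHalf overload. A minimal sketch of the same trap with a made-up wrapper type:

// recursion.cpp — compile with: clang++ -std=c++17 -Winfinite-recursion -c recursion.cpp
template <typename T> struct Wrap { T v; };

template <typename T>
bool operator==(const Wrap<T> &a, const Wrap<T> &b) { return a.v == b.v; }

template <typename T>
bool operator==(const T &lhs, const Wrap<T> &rhs) {
  Wrap<T> tmp{lhs};
  // Bug: lhs is a T and tmp is a Wrap<T>, so this re-enters this same
  // overload: warning: all paths through this function will call itself
  // [-Winfinite-recursion]
  // return lhs == tmp;
  return rhs == tmp;  // fixed: Wrap<T> == Wrap<T> uses the overload above
}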
6 changes: 3 additions & 3 deletions include/matx/core/half_complex.h
@@ -515,7 +515,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
operator==(const T &lhs, const matxHalfComplex<T> &rhs)
{
matxHalfComplex<T> tmp{lhs};
-return lhs == tmp;
+return rhs == tmp;
}

/**
@@ -562,7 +562,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
operator!=(const T &lhs, const matxHalfComplex<T> &rhs)
{
matxHalfComplex<T> tmp{lhs};
-return !(lhs == tmp);
+return !(rhs == tmp);
}


@@ -853,7 +853,7 @@ pow(const T &x, const matxHalfComplex<T> &y)
{
cuda::std::complex<float> tmp{static_cast<float>(y.real()),
static_cast<float>(y.imag())};
-tmp = cuda::std::pow(y, tmp);
+tmp = cuda::std::pow(x, tmp);
return {static_cast<T>(tmp.real()), static_cast<T>(tmp.imag())};
}

2 changes: 0 additions & 2 deletions include/matx/core/operator_utils.h
@@ -132,8 +132,6 @@ namespace matx {

template <typename Op, typename ValidFunc>
__MATX_INLINE__ auto GetSupportedTensor(const Op &in, const ValidFunc &fn, matxMemorySpace_t space, cudaStream_t stream = 0) {
-constexpr int RANK = Op::Rank();
-
if constexpr (is_matx_transform_op<Op>()) {
// We can assume that if a transform is passed to the input then PreRun has already completed
// on the transform and we can use the internal pointer
2 changes: 1 addition & 1 deletion include/matx/core/print.h
@@ -653,7 +653,7 @@ namespace matx {
*/
template <typename Op, typename... Args,
std::enable_if_t<(Op::Rank() > 0 && sizeof...(Args) == 0), bool> = true>
-void fprint(FILE* fp, const Op &op, Args... dims) {
+void fprint(FILE* fp, const Op &op, [[maybe_unused]] Args... dims) {
cuda::std::array<int, Op::Rank()> arr = {0};
auto tp = cuda::std::tuple_cat(arr);
cuda::std::apply([&](auto &&...args) { fprint(fp, op, args...); }, tp);
4 changes: 2 additions & 2 deletions include/matx/core/storage.h
@@ -406,7 +406,7 @@ namespace matx
*/
void SetData(T *const data) noexcept
{
-data_.reset(data_, [](auto){});
+data_.reset(data, [](auto){});
}

/**
@@ -423,7 +423,7 @@
*
* @param size Size in bytes to allocate
*/
-__MATX_INLINE__ T* allocate(size_t size)
+__MATX_INLINE__ T* allocate([[maybe_unused]] size_t size)
{
MATX_THROW(matxInvalidParameter, "Cannot call allocate on a smart pointer storage type");
}
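The SetData change is a real bug fix, not just warning hygiene: the old line passed the member data_ to its own reset() instead of the incoming data parameter, so the storage was never retargeted at the caller's pointer. A minimal sketch of the intended pattern — non-owning storage that wraps a caller-owned pointer in a shared_ptr with a no-op deleter; NonOwningStorage is an illustrative stand-in, not MatX's actual storage class:

// non_owning.cpp — compile with: clang++ -std=c++17 non_owning.cpp
#include <cstdio>
#include <memory>

template <typename T>
class NonOwningStorage {
public:
  // Point at caller-owned memory; the no-op deleter means it is never freed here.
  void SetData(T *const data) noexcept {
    data_.reset(data, [](auto) {});  // 'data' (the parameter), not 'data_'
  }
  T *Get() const noexcept { return data_.get(); }

private:
  std::shared_ptr<T> data_;
};

int main() {
  float buf[4] = {1.f, 2.f, 3.f, 4.f};
  NonOwningStorage<float> s;
  s.SetData(buf);
  std::printf("%f\n", static_cast<double>(s.Get()[0]));  // prints 1.000000
}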
6 changes: 3 additions & 3 deletions include/matx/core/tensor.h
@@ -967,7 +967,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
Reset(T *const data, T *const ldata) noexcept
{
storage_.SetData(data);
-this->SetData(data);
+this->SetData(ldata);
}


@@ -1074,7 +1074,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {

__MATX_INLINE__ __MATX_HOST__ bool IsManagedPointer() {
bool managed;
-const CUresult retval = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)this->Data());
+[[maybe_unused]] const CUresult retval = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)this->Data());
MATX_ASSERT(retval == CUDA_SUCCESS, matxNotSupported);
return managed;
}
@@ -1453,7 +1453,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {

// Determine where this memory resides
auto kind = GetPointerKind(this->Data());
-auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->Data()));
+[[maybe_unused]] auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->Data()));
MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");
if (kind == MATX_INVALID_MEMORY) {
if (mem_type == CU_MEMORYTYPE_DEVICE) {
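Here [[maybe_unused]] is applied to variables whose only consumer is an assertion macro. Assuming MATX_ASSERT and MATX_ASSERT_STR_EXP compile to nothing in release builds (an inference from the diff, not verified against the macro definitions), the variables become unused in those builds and clang warns. The standard assert shows the same pattern:

// assert_only.cpp — compile with: clang++ -std=c++17 -Wall -DNDEBUG -c assert_only.cpp
#include <cassert>

int Validate(int fd) {
  // With -DNDEBUG, assert() expands to nothing; without the attribute clang
  // reports: warning: unused variable 'rc' [-Wunused-variable]
  [[maybe_unused]] const int rc = (fd >= 0) ? 0 : -1;
  assert(rc == 0);
  return fd;
}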
2 changes: 1 addition & 1 deletion include/matx/core/tensor_utils.h
@@ -155,7 +155,7 @@ namespace matx
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto BlockToIdx(const Op &op, index_t abs, int nb_dims) {
using l_stride_type = index_t;
using l_shape_type = index_t;
-constexpr int RANK = op.Rank();
+constexpr int RANK = Op::Rank();
cuda::std::array<l_shape_type, RANK> indices{0};

for (int idx = 0; idx < RANK - nb_dims; idx++) {
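The switch from op.Rank() to Op::Rank() matters because RANK is declared constexpr: a reference parameter like op is not usable in a constant expression (a restriction only relaxed by C++23's P2280), so stricter compilers reject the instance form even when Rank() itself is a static constexpr member, while nvcc had accepted it. Calling through the type is the portable spelling. A minimal sketch with an illustrative Tensor type:

// static_call.cpp — compile with: clang++ -std=c++17 -c static_call.cpp
template <int R> struct Tensor {
  static constexpr int Rank() { return R; }
};

template <typename Op>
int FirstLoopBound(const Op &op) {
  // constexpr int RANK = op.Rank();  // may be rejected: 'op' is a reference
  //                                  // parameter, not a constant expression
  constexpr int RANK = Op::Rank();    // portable: call through the type
  (void)op;
  return RANK;
}

int main() { return FirstLoopBound(Tensor<3>{}) - 3; }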
6 changes: 3 additions & 3 deletions include/matx/core/utils.h
@@ -134,9 +134,9 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &
//__half2 Y = make_half2(y.real(), y.imag());
//__half2 Z = make_half2(z.real(), z.imag());

-const __half2 &X = *reinterpret_cast<const __half2*>(&x);
-const __half2 &Y = *reinterpret_cast<const __half2*>(&y);
-const __half2 &Z = *reinterpret_cast<const __half2*>(&z);
+[[maybe_unused]] const __half2 &X = *reinterpret_cast<const __half2*>(&x);
+[[maybe_unused]] const __half2 &Y = *reinterpret_cast<const __half2*>(&y);
+[[maybe_unused]] const __half2 &Z = *reinterpret_cast<const __half2*>(&z);

#if 1
#ifdef __CUDA_ARCH__
4 changes: 3 additions & 1 deletion include/matx/executors/host.h
@@ -58,15 +58,17 @@ enum class ThreadsMode {

struct HostExecParams {
HostExecParams(int threads = 1) : threads_(threads) {}
-HostExecParams(cpu_set_t cpu_set) : cpu_set_(cpu_set), threads_(1) {
+HostExecParams(cpu_set_t cpu_set) : threads_(1), cpu_set_(cpu_set) {
MATX_ASSERT_STR(false, matxNotSupported, "CPU affinity not supported yet");
}

int GetNumThreads() const { return threads_; }

private:
int threads_;
+MATX_IGNORE_WARNING_PUSH_CLANG("-Wunused-private-field")
cpu_set_t cpu_set_ {0};
+MATX_IGNORE_WARNING_POP_CLANG
};

/**
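Two clang fixes share this hunk: the initializer list is reordered to match the declaration order (threads_ precedes cpu_set_), and the intentionally unused cpu_set_ field is bracketed with MatX's suppression macros. Assuming those macros expand to clang diagnostic pragmas, which their names suggest but the diff does not show, the underlying mechanism is:

// pragma_suppress.cpp — compile with: clang++ -std=c++17 -Wunused-private-field -c pragma_suppress.cpp
class Scheduler {
public:
  explicit Scheduler(int threads) : threads_(threads) {}
  int Threads() const { return threads_; }

private:
  int threads_;
// Push the diagnostic state, disable one warning for the lines in between,
// then pop so the suppression cannot leak into the rest of the file.
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-private-field"
#endif
  int reserved_{0};  // kept for future use; intentionally unused for now
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
};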
4 changes: 2 additions & 2 deletions include/matx/generators/chirp.h
@@ -67,8 +67,8 @@ namespace matx
inline __MATX_HOST__ __MATX_DEVICE__ Chirp(SpaceOp sop, FreqType f0, space_type t1, FreqType f1, ChirpMethod method) :
sop_(sop),
f0_(f0),
-f1_(f1),
t1_(t1),
+f1_(f1),
method_(method)
{}

Expand Down Expand Up @@ -109,8 +109,8 @@ namespace matx
inline __MATX_HOST__ __MATX_DEVICE__ ComplexChirp(SpaceOp sop, FreqType f0, space_type t1, FreqType f1, ChirpMethod method) :
sop_(sop),
f0_(f0),
-t1_(t1),
f1_(f1),
+t1_(t1),
method_(method)
{}

10 changes: 5 additions & 5 deletions include/matx/kernels/channelize_poly.cuh
@@ -290,22 +290,22 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
if (outdims[OutElemRank] <= last_elem) {
const filter_t *h = h_start;
output_t accum { 0 };
-const int first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
+const uint32_t first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
// The footprint of samples involved in the convolution may wrap from the end
// to the beginning of smem_input. The prologue below handles the samples from
// the current tail to the end of smem_input and the epilogue starts back at the
// beginning of smem_input.
-const int prologue_count = (first_end - cached_input_ind_tail + 1);
-const int epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
+const uint32_t prologue_count = (first_end - cached_input_ind_tail + 1);
+const uint32_t epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan);
// Apply the filter h in reverse order below to flip the filter for convolution
-for (int k = 0; k < prologue_count; k++) {
+for (uint32_t k = 0; k < prologue_count; k++) {
accum += (*h) * (*sample);
sample += num_channels;
h -= num_channels;
}
sample = smem_input + (num_channels - 1 - chan);
-for (int k = 0; k < epilogue_count; k++) {
+for (uint32_t k = 0; k < epilogue_count; k++) {
accum += (*h) * (*sample);
sample += num_channels;
h -= num_channels;
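Changing these counters from int to uint32_t resolves clang's -Wsign-compare: the loop bounds derive from unsigned quantities, and comparing a signed int against an unsigned value triggers an implicit conversion that misbehaves for negative values. A minimal sketch of the warning and the fix, with made-up names:

// sign_compare.cpp — compile with: clang++ -std=c++17 -Wsign-compare -c sign_compare.cpp
#include <cstdint>

uint32_t SumPrologue(const uint32_t *samples, uint32_t prologue_count) {
  uint32_t accum = 0;
  // for (int k = 0; k < prologue_count; k++)  // clang: comparison of integers
  //                                           // of different signs [-Wsign-compare]
  for (uint32_t k = 0; k < prologue_count; k++) {  // match the bound's type
    accum += samples[k];
  }
  return accum;
}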