NVIDIA · alliepiper · Oct 29, 2025 · Oct 15, 2025 · Oct 25, 2025 · Oct 26, 2025
@@ -1,7 +1,8 @@
 cccl_get_c2h()
 
 function(cccl_c_parallel_add_test target_name_var source)
-  string(REGEX REPLACE "test_([^.]*)" "cccl.c.parallel.test.\\1" target_name "${source}")
+  get_filename_component(target_name "${source}" NAME_WE)
+  string(REGEX REPLACE "test_([^.]*)" "cccl.c.parallel.test.\\1" target_name "${target_name}")
   set(target_name_var ${target_name} PARENT_SCOPE)
 
   add_executable(${target_name} "${source}")

@@ -109,7 +109,7 @@ void AlgorithmExecute(std::optional<BuildCache>& cache, const std::optional<KeyT
   constexpr int device_id = 0;
   const auto& build_info  = BuildInformation<device_id>::init();
 
-  BuildResultT build;
+  BuildResultT build{};
 
   bool found               = false;
   const bool cache_and_key = bool(cache) && bool(lookup_key);

@@ -8,7 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <array>
 #include <cstdint>
+#include <vector>
 
 #include <cuda_runtime.h>
 
@@ -206,7 +208,7 @@ C2H_TEST("DeviceHistogram::HistogramEven API usage", "[histogram][device]")
   using counter_t = int;
 
   int num_samples = 10;
-  std::vector<float> d_samples{2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5};
+  std::vector<float> d_samples{2.2f, 6.1f, 7.1f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f, 6.1f, 999.5f};
 
   int num_rows = 1;
 

@@ -386,7 +386,7 @@ struct DeviceMergeSort_SortPairs_Iterators_Fixture_Tag;
 C2H_TEST("DeviceMergeSort::SortPairs works with input iterators", "[merge_sort]")
 {
   using key_t         = int;
-  using item_t        = int;
+  using int_item_t    = int;
   const int num_items = GENERATE_COPY(take(2, random(1, 1000000)), values({500, 1000000, 2000000}));
 
   operation_t op = make_operation("op", get_merge_sort_op(get_type_info<key_t>().type));
@@ -396,28 +396,28 @@ C2H_TEST("DeviceMergeSort::SortPairs works with input iterators", "[merge_sort]"
     make_random_access_iterator<key_t>(iterator_kind::INPUT, "int", "item");
 
   std::vector<key_t> input_keys = make_shuffled_sequence<key_t>(num_items);
-  std::vector<item_t> input_items(num_items);
+  std::vector<int_item_t> input_items(num_items);
   std::transform(input_keys.begin(), input_keys.end(), input_items.begin(), [](key_t key) {
-    return static_cast<item_t>(key);
+    return static_cast<int_item_t>(key);
   });
 
-  std::vector<key_t> expected_keys   = input_keys;
-  std::vector<item_t> expected_items = input_items;
+  std::vector<key_t> expected_keys       = input_keys;
+  std::vector<int_item_t> expected_items = input_items;
 
   pointer_t<key_t> input_keys_ptr(input_keys);
   input_keys_it.state.data = input_keys_ptr.ptr;
   pointer_t<key_t> input_items_ptr(input_items);
   input_items_it.state.data = input_items_ptr.ptr;
 
   auto& build_cache    = get_cache<DeviceMergeSort_SortPairs_Iterators_Fixture_Tag>();
-  const auto& test_key = make_key<key_t, item_t>();
+  const auto& test_key = make_key<key_t, int_item_t>();
 
   merge_sort(input_keys_it, input_items_it, input_keys_ptr, input_items_ptr, num_items, op, build_cache, test_key);
 
   std::sort(expected_keys.begin(), expected_keys.end());
   std::sort(expected_items.begin(), expected_items.end());
   REQUIRE(expected_keys == std::vector<key_t>(input_keys_ptr));
-  REQUIRE(expected_items == std::vector<item_t>(input_items_ptr));
+  REQUIRE(expected_items == std::vector<int_item_t>(input_items_ptr));
 }
 
 // These tests with output iterators are currently failing https://github.com/NVIDIA/cccl/issues/3722

@@ -314,7 +314,7 @@ C2H_TEST("DeviceRadixSort::SortPairs works", "[radix_sort]", test_params_tuple)
   if (is_descending)
   {
     std::sort(expected_keys.begin(), expected_keys.end(), std::greater<KeyT>());
-    std::sort(expected_items.begin(), expected_items.end(), std::greater<KeyT>());
+    std::sort(expected_items.begin(), expected_items.end(), std::greater<ItemT>());
   }
   else
   {

@@ -269,7 +269,7 @@ C2H_TEST("Reduce works with input iterators", "[reduce]")
   reduce(input_it, output_it, num_items, op, init, build_cache, test_key);
 
   const int output   = output_it[0];
-  const int expected = init.value + num_items * (num_items - 1) / 2;
+  const int expected = init.value + static_cast<int>(num_items * (num_items - 1) / 2);
   REQUIRE(output == expected);
 }
 

@@ -526,7 +526,7 @@ C2H_TEST("SegmentedReduce works with input iterators", "[segmented_reduce]")
     auto inp_ = generate<int>(n_elems);
     for (auto&& el : inp_)
     {
-      host_input.push_back(el);
+      host_input.push_back(static_cast<ValueT>(el));
     }
   }
   std::vector<ValueT> host_output(n_cols, 0);
@@ -604,27 +604,33 @@ C2H_TEST("SegmentedReduce works with input iterators", "[segmented_reduce]")
   REQUIRE(host_actual == host_output);
 }
 
-struct SegmentedReduce_SumOverRows_FloatingPointTypes_Fixture_Tag;
-C2H_TEST_LIST("segmented_reduce can work with floating point types",
-              "[segmented_reduce]",
+using fp_test_types = c2h::type_list<
 #if _CCCL_HAS_NVFP16()
-              __half,
+  __half,
 #endif
-              float,
-              double)
+  float,
+  double>;
+struct SegmentedReduce_SumOverRows_FloatingPointTypes_Fixture_Tag;
+C2H_TEST("segmented_reduce can work with floating point types", "[segmented_reduce]", fp_test_types)
 {
+  using T = c2h::get<0, TestType>;
+
   constexpr std::size_t n_rows = 13;
   constexpr std::size_t n_cols = 12;
 
   constexpr std::size_t n_elems  = n_rows * n_cols;
   constexpr std::size_t row_size = n_cols;
 
   const std::vector<int> int_input = generate<int>(n_elems);
-  const std::vector<TestType> input(int_input.begin(), int_input.end());
-  std::vector<TestType> output(n_rows, 0);
+  // Suppress harmless conversion warnings on MSVC
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4244)
+  const std::vector<T> input(int_input.begin(), int_input.end());
+  _CCCL_DIAG_POP
+  std::vector<T> output(n_rows, 0);
 
-  pointer_t<TestType> input_ptr(input); // copy from host to device
-  pointer_t<TestType> output_ptr(output); // copy from host to device
+  pointer_t<T> input_ptr(input); // copy from host to device
+  pointer_t<T> output_ptr(output); // copy from host to device
 
   using SizeT                                     = unsigned long long;
   static constexpr std::string_view index_ty_name = "unsigned long long";
@@ -659,11 +665,11 @@ C2H_TEST_LIST("segmented_reduce can work with floating point types",
   end_offset_it.state.linear_id = 1;
   end_offset_it.state.row_size  = row_size;
 
-  operation_t op = make_operation("op", get_reduce_op(get_type_info<TestType>().type));
-  value_t<TestType> init{0};
+  operation_t op = make_operation("op", get_reduce_op(get_type_info<T>().type));
+  value_t<T> init{0};
 
   auto& build_cache    = get_cache<SegmentedReduce_SumOverRows_FloatingPointTypes_Fixture_Tag>();
-  const auto& test_key = make_key<TestType>();
+  const auto& test_key = make_key<T>();
 
   segmented_reduce(input_ptr, output_ptr, n_rows, start_offset_it, end_offset_it, op, init, build_cache, test_key);
 
@@ -675,7 +681,7 @@ C2H_TEST_LIST("segmented_reduce can work with floating point types",
     std::size_t row_offset = i * row_size;
     host_output_it[i]      = std::reduce(host_input_it + row_offset, host_input_it + (row_offset + n_cols));
   }
-  REQUIRE(output == std::vector<TestType>(output_ptr));
+  REQUIRE(output == std::vector<T>(output_ptr));
 }
 
 template <typename ValueT>

@@ -91,13 +91,18 @@ template <typename T>
 struct three_way_partition_result_t
 {
   three_way_partition_result_t() = delete;
-  explicit three_way_partition_result_t(int num_items)
+  explicit three_way_partition_result_t(std::size_t num_items)
       : first_part(num_items)
       , second_part(num_items)
       , unselected(num_items)
   {}
   explicit three_way_partition_result_t(
-    std::vector<T> first, std::vector<T> second, std::vector<T> unselected, int n_first, int n_second, int n_unselected)
+    std::vector<T> first,
+    std::vector<T> second,
+    std::vector<T> unselected,
+    std::size_t n_first,
+    std::size_t n_second,
+    std::size_t n_unselected)
       : first_part(std::move(first))
       , second_part(std::move(second))
       , unselected(std::move(unselected))
@@ -110,9 +115,9 @@ struct three_way_partition_result_t
   std::vector<T> second_part;
   std::vector<T> unselected;
 
-  int num_items_in_first_part{};
-  int num_items_in_second_part{};
-  int num_unselected_items{};
+  std::size_t num_items_in_first_part{};
+  std::size_t num_items_in_second_part{};
+  std::size_t num_unselected_items{};
 
   bool operator==(const three_way_partition_result_t<T>& other) const
   {
@@ -500,7 +505,7 @@ C2H_TEST("ThreeWayPartition works with iterators", "[three_way_partition]")
   REQUIRE(first_part_output == std_result.first_part);
   REQUIRE(second_part_output == std_result.second_part);
   REQUIRE(unselected_output == std_result.unselected);
-  REQUIRE(num_selected[0] == std_result.num_items_in_first_part);
-  REQUIRE(num_selected[1] == std_result.num_items_in_second_part);
-  REQUIRE(static_cast<int>(num_items) - num_selected[0] - num_selected[1] == std_result.num_unselected_items);
+  REQUIRE(static_cast<std::size_t>(num_selected[0]) == std_result.num_items_in_first_part);
+  REQUIRE(static_cast<std::size_t>(num_selected[1]) == std_result.num_items_in_second_part);
+  REQUIRE(num_items - static_cast<std::size_t>(num_selected[0] + num_selected[1]) == std_result.num_unselected_items);
 }
@@ -109,7 +109,7 @@ C2H_TEST("Transform generates UBLKCP on SM90", "[transform][ublkcp]")
     return;
   }
 
-  cccl_device_transform_build_result_t build;
+  cccl_device_transform_build_result_t build{};
   operation_t op = make_operation("op", get_unary_op(get_type_info<int>().type));
   REQUIRE(
     CUDA_SUCCESS
@@ -254,10 +254,12 @@ C2H_TEST("Transform works with integral types with well-known operations", "[tra
   unary_transform(input_ptr, output_ptr, num_items, op, build_cache, test_key);
 
   std::vector<T> expected(num_items, 0);
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus on unsigned type
   std::transform(input.begin(), input.end(), expected.begin(), [](const T& x) {
     return -x;
   });
-
+  _CCCL_DIAG_POP
   if (num_items > 0)
   {
     REQUIRE(expected == std::vector<T>(output_ptr));
@@ -536,7 +538,11 @@ C2H_TEST("Transform works with floating point types", "[transform]", floating_po
   const std::size_t num_items      = GENERATE(0, 42, take(4, random(1 << 12, 1 << 16)));
   operation_t op                   = make_operation("op", get_unary_op(get_type_info<T>().type));
   const std::vector<int> int_input = generate<int>(num_items);
+  // Suppress harmless conversion warnings on MSVC
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4244)
   const std::vector<T> input(int_input.begin(), int_input.end());
+  _CCCL_DIAG_POP
   const std::vector<T> output(num_items, 0);
   pointer_t<T> input_ptr(input);
   pointer_t<T> output_ptr(output);

@@ -218,7 +218,11 @@ C2H_TEST("DeviceSelect::UniqueByKey works with floating point types", "[unique_b
 
   operation_t op                   = make_operation("op", get_unique_by_key_op(get_type_info<key_t>().type));
   const std::vector<int> int_input = generate<int>(num_items);
+  // Suppress harmless conversion warnings on MSVC
+  _CCCL_DIAG_PUSH
+  _CCCL_DIAG_SUPPRESS_MSVC(4244)
   const std::vector<key_t> input_keys(int_input.begin(), int_input.end());
+  _CCCL_DIAG_POP
   std::vector<item_t> input_values = generate<item_t>(num_items);
 
   pointer_t<key_t> input_keys_it(input_keys);

@@ -19,6 +19,7 @@
 #include <format>
 #include <fstream>
 #include <memory>
+#include <numeric>
 #include <random>
 #include <string>
 #include <tuple>
@@ -49,9 +50,9 @@ inline std::string inspect_sass(const void* cubin, size_t cubin_size)
   temp_in_file.close();
 
   std::string command = "nvdisasm -gi ";
-  command += temp_in_filename;
+  command += temp_in_filename.string();
   command += " > ";
-  command += temp_out_filename;
+  command += temp_out_filename.string();
 
   int exec_code = std::system(command.c_str());
 
@@ -114,12 +115,19 @@ inline std::string compile(const std::string& source)
 template <class T>
 std::vector<T> generate(std::size_t num_items)
 {
+  // Add support for 8-bit ints, otherwise MSVC fails with:
+  // error C2338: static_assert failed:
+  //   'invalid template argument for uniform_int_distribution:
+  //     N4950 [rand.req.genl]/1.5 requires one of
+  //       short, int, long, long long,
+  //       unsigned short, unsigned int, unsigned long, or unsigned long long'
+  using dist_type = std::conditional_t<sizeof(T) == 1, short, T>;
   std::random_device rnd_device;
   std::mt19937 mersenne_engine{rnd_device()}; // Generates random integers
-  std::uniform_int_distribution<T> dist{T{1}, T{42}};
+  std::uniform_int_distribution<dist_type> dist{dist_type{1}, dist_type{42}};
   std::vector<T> vec(num_items);
   std::generate(vec.begin(), vec.end(), [&]() {
-    return dist(mersenne_engine);
+    return static_cast<T>(dist(mersenne_engine));
   });
   return vec;
 }

@@ -28,7 +28,9 @@ target_include_directories(cccl.c2h PUBLIC "${C2H_SOURCE_DIR}/include")
 target_link_libraries(cccl.c2h PUBLIC
   CCCL::CCCL
   Catch2::Catch2
+  cccl.compiler_interface_cpp17
 )
+cccl_configure_target(cccl.c2h DIALECT 17)
 
 if (C2H_ENABLE_CURAND)
   target_link_libraries(cccl.c2h PRIVATE CUDA::curand)

@@ -83,12 +83,11 @@ VEC_SPECIALIZATION(int4);
 
 VEC_SPECIALIZATION(long2);
 VEC_SPECIALIZATION(long3);
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-VEC_SPECIALIZATION(long4);
-_CCCL_SUPPRESS_DEPRECATED_POP
 #  if _CCCL_CTK_AT_LEAST(13, 0)
 VEC_SPECIALIZATION(long4_16a);
 VEC_SPECIALIZATION(long4_32a);
+#  else
+VEC_SPECIALIZATION(long4);
 #  endif // _CCCL_CTK_AT_LEAST(13, 0)
 
 // VEC_SPECIALIZATION(ulong2);
@@ -97,22 +96,20 @@ VEC_SPECIALIZATION(long4_32a);
 
 VEC_SPECIALIZATION(longlong2);
 VEC_SPECIALIZATION(longlong3);
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-VEC_SPECIALIZATION(longlong4);
-_CCCL_SUPPRESS_DEPRECATED_POP
 #  if _CCCL_CTK_AT_LEAST(13, 0)
 VEC_SPECIALIZATION(longlong4_16a);
 VEC_SPECIALIZATION(longlong4_32a);
+#  else
+VEC_SPECIALIZATION(longlong4);
 #  endif // _CCCL_CTK_AT_LEAST(13, 0)
 
 VEC_SPECIALIZATION(ulonglong2);
 // VEC_SPECIALIZATION(ulonglong3);
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-VEC_SPECIALIZATION(ulonglong4);
-_CCCL_SUPPRESS_DEPRECATED_POP
 #  if _CCCL_CTK_AT_LEAST(13, 0)
 VEC_SPECIALIZATION(ulonglong4_16a);
 VEC_SPECIALIZATION(ulonglong4_32a);
+#  else
+VEC_SPECIALIZATION(ulonglong4);
 #  endif // _CCCL_CTK_AT_LEAST(13, 0)
 
 VEC_SPECIALIZATION(float2);
@@ -121,12 +118,11 @@ VEC_SPECIALIZATION(float4);
 
 VEC_SPECIALIZATION(double2);
 VEC_SPECIALIZATION(double3);
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-VEC_SPECIALIZATION(double4);
-_CCCL_SUPPRESS_DEPRECATED_POP
 #  if _CCCL_CTK_AT_LEAST(13, 0)
 VEC_SPECIALIZATION(double4_16a);
 VEC_SPECIALIZATION(double4_32a);
+#  else
+VEC_SPECIALIZATION(double4);
 #  endif // _CCCL_CTK_AT_LEAST(13, 0)
 
 #  if TEST_HALF_T()
@@ -167,8 +163,3 @@ VEC_GEN_MOD_SPECIALIZATION(ulonglong4, unsigned long long);
 VEC_GEN_MOD_SPECIALIZATION(ushort4, unsigned short);
 #endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
 } // namespace c2h::detail
-
-// Suppress deprecation warnings for use of vector types in the `*cudafe1.stub.c` file
-#if _CCCL_CTK_AT_LEAST(13, 0) && _CCCL_COMPILER(CLANG)
-_CCCL_SUPPRESS_DEPRECATED_PUSH
-#endif // _CCCL_CTK_AT_LEAST(13, 0) && _CCCL_COMPILER(CLANG)