2525#include < cudf/detail/gather.hpp>
2626#include < cudf/detail/offsets_iterator_factory.cuh>
2727#include < cudf/detail/utilities/integer_utils.hpp>
28+ #include < cudf/detail/utilities/vector_factories.hpp>
2829#include < cudf/detail/valid_if.cuh>
2930#include < cudf/filling.hpp>
31+ #include < cudf/lists/combine.hpp>
3032#include < cudf/null_mask.hpp>
3133#include < cudf/scalar/scalar_factories.hpp>
3234#include < cudf/strings/combine.hpp>
35+ #include < cudf/strings/convert/convert_integers.hpp>
3336#include < cudf/strings/detail/strings_children.cuh>
3437#include < cudf/table/table.hpp>
3538#include < cudf/types.hpp>
3639#include < cudf/utilities/default_stream.hpp>
3740#include < cudf/utilities/error.hpp>
3841#include < cudf/utilities/memory_resource.hpp>
42+ #include < cudf/utilities/traits.hpp>
3943
4044#include < rmm/device_buffer.hpp>
4145#include < rmm/device_uvector.hpp>
5660#include < thrust/random/uniform_int_distribution.h>
5761#include < thrust/random/uniform_real_distribution.h>
5862#include < thrust/scan.h>
63+ #include < thrust/sequence.h>
5964#include < thrust/tabulate.h>
6065#include < thrust/transform.h>
6166#include < thrust/tuple.h>
@@ -450,79 +455,6 @@ rmm::device_uvector<cudf::size_type> sample_indices_with_run_length(cudf::size_t
450455 }
451456}
452457
453- /* *
454- * @brief Creates a column with random content of type @ref T.
455- *
456- * @param profile Parameters for the random generator
457- * @param engine Pseudo-random engine
458- * @param num_rows Size of the output column
459- *
460- * @tparam T Data type of the output column
461- * @return Column filled with random data
462- */
463- template <typename T>
464- std::unique_ptr<cudf::column> create_random_column (data_profile const & profile,
465- thrust::minstd_rand& engine,
466- cudf::size_type num_rows)
467- {
468- // Bernoulli distribution
469- auto valid_dist = random_value_fn<bool >(
470- distribution_params<bool >{1 . - profile.get_null_probability ().value_or (0 )});
471- auto value_dist = random_value_fn<T>{profile.get_distribution_params <T>()};
472-
473- using DeviceType = cudf::device_storage_type_t <T>;
474- cudf::data_type const dtype = [&]() {
475- if constexpr (cudf::is_fixed_point<T>())
476- return cudf::data_type{cudf::type_to_id<T>(), value_dist.get_scale (engine)};
477- else
478- return cudf::data_type{cudf::type_to_id<T>()};
479- }();
480-
481- // Distribution for picking elements from the array of samples
482- auto const avg_run_len = profile.get_avg_run_length ();
483- rmm::device_uvector<DeviceType> data (0 , cudf::get_default_stream ());
484- rmm::device_uvector<bool > null_mask (0 , cudf::get_default_stream ());
485-
486- if (profile.get_cardinality () == 0 and avg_run_len == 1 ) {
487- data = value_dist (engine, num_rows);
488- null_mask = valid_dist (engine, num_rows);
489- } else {
490- auto const cardinality = [profile_cardinality = profile.get_cardinality (), num_rows] {
491- return (profile_cardinality == 0 or profile_cardinality > num_rows) ? num_rows
492- : profile_cardinality;
493- }();
494- rmm::device_uvector<bool > samples_null_mask = valid_dist (engine, cardinality);
495- rmm::device_uvector<DeviceType> samples = value_dist (engine, cardinality);
496-
497- // generate n samples and gather.
498- auto const sample_indices =
499- sample_indices_with_run_length (avg_run_len, cardinality, num_rows, engine);
500- data = rmm::device_uvector<DeviceType>(num_rows, cudf::get_default_stream ());
501- null_mask = rmm::device_uvector<bool >(num_rows, cudf::get_default_stream ());
502- thrust::gather (
503- thrust::device, sample_indices.begin (), sample_indices.end (), samples.begin (), data.begin ());
504- thrust::gather (thrust::device,
505- sample_indices.begin (),
506- sample_indices.end (),
507- samples_null_mask.begin (),
508- null_mask.begin ());
509- }
510-
511- auto [result_bitmask, null_count] =
512- cudf::detail::valid_if (null_mask.begin (),
513- null_mask.end (),
514- cuda::std::identity{},
515- cudf::get_default_stream (),
516- cudf::get_current_device_resource_ref ());
517-
518- return std::make_unique<cudf::column>(
519- dtype,
520- num_rows,
521- data.release (),
522- profile.get_null_probability ().has_value () ? std::move (result_bitmask) : rmm::device_buffer{},
523- profile.get_null_probability ().has_value () ? null_count : 0 );
524- }
525-
526458struct valid_or_zero {
527459 template <typename T>
528460 __device__ T operator ()(thrust::tuple<T, bool > len_valid) const
@@ -607,6 +539,131 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
607539 num_rows, std::move (offsets), chars.release (), null_count, std::move (result_bitmask));
608540}
609541
542+ /* *
543+ * @brief Functor to dispatch create_random_column calls.
544+ */
545+ struct create_rand_col_fn {
546+ public:
547+ template <typename T>
548+ std::unique_ptr<cudf::column> operator ()(data_profile const & profile,
549+ thrust::minstd_rand& engine,
550+ cudf::size_type num_rows)
551+ {
552+ if (profile.get_cardinality () == 0 || profile.get_cardinality () >= num_rows) {
553+ return create_distinct_rows_column<T>(profile, engine, num_rows);
554+ }
555+ return create_random_column<T>(profile, engine, num_rows);
556+ }
557+ };
558+
559+ /* *
560+ * @brief Creates a column with random content of type @ref T.
561+ *
562+ * @param profile Parameters for the random generator
563+ * @param engine Pseudo-random engine
564+ * @param num_rows Size of the output column
565+ *
566+ * @tparam T Data type of the output column
567+ * @return Column filled with random data
568+ */
569+ template <typename T>
570+ std::unique_ptr<cudf::column> create_random_column (data_profile const & profile,
571+ thrust::minstd_rand& engine,
572+ cudf::size_type num_rows)
573+ {
574+ // Bernoulli distribution
575+ auto valid_dist = random_value_fn<bool >(
576+ distribution_params<bool >{1 . - profile.get_null_probability ().value_or (0 )});
577+ auto value_dist = random_value_fn<T>{profile.get_distribution_params <T>()};
578+
579+ using DeviceType = cudf::device_storage_type_t <T>;
580+ cudf::data_type const dtype = [&]() {
581+ if constexpr (cudf::is_fixed_point<T>())
582+ return cudf::data_type{cudf::type_to_id<T>(), value_dist.get_scale (engine)};
583+ else
584+ return cudf::data_type{cudf::type_to_id<T>()};
585+ }();
586+
587+ // Distribution for picking elements from the array of samples
588+ auto const avg_run_len = profile.get_avg_run_length ();
589+ rmm::device_uvector<DeviceType> data (0 , cudf::get_default_stream ());
590+ rmm::device_uvector<bool > null_mask (0 , cudf::get_default_stream ());
591+
592+ if (profile.get_cardinality () == 0 and avg_run_len == 1 ) {
593+ data = value_dist (engine, num_rows);
594+ null_mask = valid_dist (engine, num_rows);
595+ } else {
596+ auto const cardinality = [profile_cardinality = profile.get_cardinality (), num_rows] {
597+ return (profile_cardinality == 0 or profile_cardinality > num_rows) ? num_rows
598+ : profile_cardinality;
599+ }();
600+ rmm::device_uvector<bool > samples_null_mask = valid_dist (engine, cardinality);
601+ rmm::device_uvector<DeviceType> samples = value_dist (engine, cardinality);
602+
603+ // generate n samples and gather.
604+ auto const sample_indices =
605+ sample_indices_with_run_length (avg_run_len, cardinality, num_rows, engine);
606+ data = rmm::device_uvector<DeviceType>(num_rows, cudf::get_default_stream ());
607+ null_mask = rmm::device_uvector<bool >(num_rows, cudf::get_default_stream ());
608+ thrust::gather (
609+ thrust::device, sample_indices.begin (), sample_indices.end (), samples.begin (), data.begin ());
610+ thrust::gather (thrust::device,
611+ sample_indices.begin (),
612+ sample_indices.end (),
613+ samples_null_mask.begin (),
614+ null_mask.begin ());
615+ }
616+
617+ auto [result_bitmask, null_count] =
618+ cudf::detail::valid_if (null_mask.begin (),
619+ null_mask.end (),
620+ cuda::std::identity{},
621+ cudf::get_default_stream (),
622+ cudf::get_current_device_resource_ref ());
623+
624+ return std::make_unique<cudf::column>(
625+ dtype,
626+ num_rows,
627+ data.release (),
628+ profile.get_null_probability ().has_value () ? std::move (result_bitmask) : rmm::device_buffer{},
629+ profile.get_null_probability ().has_value () ? null_count : 0 );
630+ }
631+
632+ template <typename T>
633+ std::unique_ptr<cudf::column> create_distinct_rows_column (data_profile const & profile,
634+ thrust::minstd_rand& engine,
635+ cudf::size_type num_rows)
636+ {
637+ using DeviceType = cudf::device_storage_type_t <T>;
638+
639+ // Bernoulli distribution
640+ auto valid_dist = random_value_fn<bool >(
641+ distribution_params<bool >{1 . - profile.get_null_probability ().value_or (0 )});
642+
643+ cudf::data_type const dtype = [&]() {
644+ if constexpr (cudf::is_fixed_point<T>())
645+ return cudf::data_type{cudf::type_to_id<T>(), 0 };
646+ else
647+ return cudf::data_type{cudf::type_to_id<T>()};
648+ }();
649+
650+ auto init = cudf::make_default_constructed_scalar (dtype);
651+ auto col = cudf::sequence (num_rows, *init);
652+
653+ rmm::device_uvector<bool > null_mask (0 , cudf::get_default_stream ());
654+ null_mask = valid_dist (engine, num_rows);
655+ auto [result_bitmask, null_count] =
656+ cudf::detail::valid_if (null_mask.begin (),
657+ null_mask.end (),
658+ cuda::std::identity{},
659+ cudf::get_default_stream (),
660+ cudf::get_current_device_resource_ref ());
661+
662+ col->set_null_mask (std::move (result_bitmask), null_count);
663+
664+ return std::move (cudf::sample (cudf::table_view ({col->view ()}), num_rows)->release ()[0 ]);
665+ }
666+
610667/* *
611668 * @brief Creates a string column with random content.
612669 *
@@ -637,6 +694,18 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
637694 return std::move (str_table->release ()[0 ]);
638695}
639696
697+ template <>
698+ std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::string_view>(
699+ data_profile const & profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
700+ {
701+ auto col = create_random_column<cudf::string_view>(profile, engine, num_rows);
702+ auto int_col = cudf::sequence (
703+ num_rows, *cudf::make_default_constructed_scalar (cudf::data_type{cudf::type_id::INT32}));
704+ auto int2strcol = cudf::strings::from_integers (int_col->view ());
705+ auto concat_col = cudf::strings::concatenate (cudf::table_view ({col->view (), int2strcol->view ()}));
706+ return std::move (cudf::sample (cudf::table_view ({concat_col->view ()}), num_rows)->release ()[0 ]);
707+ }
708+
640709template <>
641710std::unique_ptr<cudf::column> create_random_column<cudf::dictionary32>(data_profile const & profile,
642711 thrust::minstd_rand& engine,
@@ -645,19 +714,12 @@ std::unique_ptr<cudf::column> create_random_column<cudf::dictionary32>(data_prof
645714 CUDF_FAIL (" not implemented yet" );
646715}
647716
648- /* *
649- * @brief Functor to dispatch create_random_column calls.
650- */
651- struct create_rand_col_fn {
652- public:
653- template <typename T>
654- std::unique_ptr<cudf::column> operator ()(data_profile const & profile,
655- thrust::minstd_rand& engine,
656- cudf::size_type num_rows)
657- {
658- return create_random_column<T>(profile, engine, num_rows);
659- }
660- };
717+ template <>
718+ std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::dictionary32>(
719+ data_profile const & profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
720+ {
721+ CUDF_FAIL (" not implemented yet" );
722+ }
661723
662724template <>
663725std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profile const & profile,
@@ -724,6 +786,29 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
724786 CUDF_FAIL (" Reached unreachable code in struct column creation" );
725787}
726788
789+ template <>
790+ std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::struct_view>(
791+ data_profile const & profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
792+ {
793+ auto const dist_params = profile.get_distribution_params <cudf::struct_view>();
794+ auto col = create_random_column<cudf::struct_view>(profile, engine, num_rows);
795+ std::vector<std::unique_ptr<cudf::column>> children;
796+ children.push_back (cudf::sequence (
797+ num_rows, *cudf::make_default_constructed_scalar (cudf::data_type{cudf::type_id::INT32})));
798+ for (int lvl = dist_params.max_depth ; lvl > 1 ; --lvl) {
799+ std::vector<std::unique_ptr<cudf::column>> parents;
800+ parents.push_back (
801+ cudf::create_structs_hierarchy (num_rows, std::move (children), 0 , rmm::device_buffer{}));
802+ std::swap (parents, children);
803+ }
804+ auto const null_count = col->null_count ();
805+ auto col_contents = col->release ();
806+ col_contents.children .push_back (std::move (children[0 ]));
807+ auto structs_col = cudf::create_structs_hierarchy (
808+ num_rows, std::move (col_contents.children ), null_count, std::move (*col_contents.null_mask ));
809+ return std::move (cudf::sample (cudf::table_view ({structs_col->view ()}), num_rows)->release ()[0 ]);
810+ }
811+
727812template <typename T>
728813struct clamp_down {
729814 T max;
@@ -800,6 +885,26 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
800885 return list_column; // return the top-level column
801886}
802887
888+ template <>
889+ std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::list_view>(
890+ data_profile const & profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
891+ {
892+ auto const dist_params = profile.get_distribution_params <cudf::list_view>();
893+ auto col = create_random_column<cudf::list_view>(profile, engine, num_rows);
894+ auto child_column = cudf::sequence (
895+ num_rows, *cudf::make_default_constructed_scalar (cudf::data_type{cudf::type_id::INT32}));
896+ for (int lvl = dist_params.max_depth ; lvl > 0 ; --lvl) {
897+ auto offsets_column = cudf::sequence (
898+ num_rows + 1 , *cudf::make_default_constructed_scalar (cudf::data_type{cudf::type_id::INT32}));
899+ auto list_column = cudf::make_lists_column (
900+ num_rows, std::move (offsets_column), std::move (child_column), 0 , rmm::device_buffer{});
901+ std::swap (child_column, list_column);
902+ }
903+ auto lists_col =
904+ cudf::lists::concatenate_rows (cudf::table_view ({col->view (), child_column->view ()}));
905+ return std::move (cudf::sample (cudf::table_view ({lists_col->view ()}), num_rows)->release ()[0 ]);
906+ }
907+
803908using columns_vector = std::vector<std::unique_ptr<cudf::column>>;
804909
805910/* *
0 commit comments