Skip to content

Commit a7650d3

Browse files
authored
Add data types axis to joins benchmarks (#19281)
Addresses #19280. This PR adds INT, FLOAT, STRING, LIST and STRUCT data types, as well as the number of keys to be joined, as parameter axes to the join benchmarks. To generate build and probe tables with the specified number of columns and column types: 1. Create a distinct-rows table of size `numrows + 1` by passing a cardinality of zero in the random table generator's profile. 2. Using the existing random number utilities, create a gather map of indices in `[0 ... numrows - 1]` based on the selected multiplicity. This map is used to generate the build table. 3. For the probe table, generate another gather map based on the selectivity fraction `s` that is passed. This results in a fraction `s` of the probe table gather map having entries in `[0 ... numrows - 1]` and the remaining entries being equal to `numrows`. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: #19281
1 parent 42469ad commit a7650d3

File tree

10 files changed

+642
-358
lines changed

10 files changed

+642
-358
lines changed

cpp/benchmarks/common/generate_input.cu

Lines changed: 191 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,21 @@
2525
#include <cudf/detail/gather.hpp>
2626
#include <cudf/detail/offsets_iterator_factory.cuh>
2727
#include <cudf/detail/utilities/integer_utils.hpp>
28+
#include <cudf/detail/utilities/vector_factories.hpp>
2829
#include <cudf/detail/valid_if.cuh>
2930
#include <cudf/filling.hpp>
31+
#include <cudf/lists/combine.hpp>
3032
#include <cudf/null_mask.hpp>
3133
#include <cudf/scalar/scalar_factories.hpp>
3234
#include <cudf/strings/combine.hpp>
35+
#include <cudf/strings/convert/convert_integers.hpp>
3336
#include <cudf/strings/detail/strings_children.cuh>
3437
#include <cudf/table/table.hpp>
3538
#include <cudf/types.hpp>
3639
#include <cudf/utilities/default_stream.hpp>
3740
#include <cudf/utilities/error.hpp>
3841
#include <cudf/utilities/memory_resource.hpp>
42+
#include <cudf/utilities/traits.hpp>
3943

4044
#include <rmm/device_buffer.hpp>
4145
#include <rmm/device_uvector.hpp>
@@ -56,6 +60,7 @@
5660
#include <thrust/random/uniform_int_distribution.h>
5761
#include <thrust/random/uniform_real_distribution.h>
5862
#include <thrust/scan.h>
63+
#include <thrust/sequence.h>
5964
#include <thrust/tabulate.h>
6065
#include <thrust/transform.h>
6166
#include <thrust/tuple.h>
@@ -450,79 +455,6 @@ rmm::device_uvector<cudf::size_type> sample_indices_with_run_length(cudf::size_t
450455
}
451456
}
452457

453-
/**
454-
* @brief Creates a column with random content of type @ref T.
455-
*
456-
* @param profile Parameters for the random generator
457-
* @param engine Pseudo-random engine
458-
* @param num_rows Size of the output column
459-
*
460-
* @tparam T Data type of the output column
461-
* @return Column filled with random data
462-
*/
463-
template <typename T>
464-
std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
465-
thrust::minstd_rand& engine,
466-
cudf::size_type num_rows)
467-
{
468-
// Bernoulli distribution
469-
auto valid_dist = random_value_fn<bool>(
470-
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});
471-
auto value_dist = random_value_fn<T>{profile.get_distribution_params<T>()};
472-
473-
using DeviceType = cudf::device_storage_type_t<T>;
474-
cudf::data_type const dtype = [&]() {
475-
if constexpr (cudf::is_fixed_point<T>())
476-
return cudf::data_type{cudf::type_to_id<T>(), value_dist.get_scale(engine)};
477-
else
478-
return cudf::data_type{cudf::type_to_id<T>()};
479-
}();
480-
481-
// Distribution for picking elements from the array of samples
482-
auto const avg_run_len = profile.get_avg_run_length();
483-
rmm::device_uvector<DeviceType> data(0, cudf::get_default_stream());
484-
rmm::device_uvector<bool> null_mask(0, cudf::get_default_stream());
485-
486-
if (profile.get_cardinality() == 0 and avg_run_len == 1) {
487-
data = value_dist(engine, num_rows);
488-
null_mask = valid_dist(engine, num_rows);
489-
} else {
490-
auto const cardinality = [profile_cardinality = profile.get_cardinality(), num_rows] {
491-
return (profile_cardinality == 0 or profile_cardinality > num_rows) ? num_rows
492-
: profile_cardinality;
493-
}();
494-
rmm::device_uvector<bool> samples_null_mask = valid_dist(engine, cardinality);
495-
rmm::device_uvector<DeviceType> samples = value_dist(engine, cardinality);
496-
497-
// generate n samples and gather.
498-
auto const sample_indices =
499-
sample_indices_with_run_length(avg_run_len, cardinality, num_rows, engine);
500-
data = rmm::device_uvector<DeviceType>(num_rows, cudf::get_default_stream());
501-
null_mask = rmm::device_uvector<bool>(num_rows, cudf::get_default_stream());
502-
thrust::gather(
503-
thrust::device, sample_indices.begin(), sample_indices.end(), samples.begin(), data.begin());
504-
thrust::gather(thrust::device,
505-
sample_indices.begin(),
506-
sample_indices.end(),
507-
samples_null_mask.begin(),
508-
null_mask.begin());
509-
}
510-
511-
auto [result_bitmask, null_count] =
512-
cudf::detail::valid_if(null_mask.begin(),
513-
null_mask.end(),
514-
cuda::std::identity{},
515-
cudf::get_default_stream(),
516-
cudf::get_current_device_resource_ref());
517-
518-
return std::make_unique<cudf::column>(
519-
dtype,
520-
num_rows,
521-
data.release(),
522-
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{},
523-
profile.get_null_probability().has_value() ? null_count : 0);
524-
}
525-
526458
struct valid_or_zero {
527459
template <typename T>
528460
__device__ T operator()(thrust::tuple<T, bool> len_valid) const
@@ -607,6 +539,131 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
607539
num_rows, std::move(offsets), chars.release(), null_count, std::move(result_bitmask));
608540
}
609541

542+
/**
543+
* @brief Functor to dispatch create_random_column calls.
*
* The call operator is templated on the element type `T` and chooses between two generators:
* `create_distinct_rows_column<T>` when `profile.get_cardinality()` is zero or is at least
* `num_rows`, and `create_random_column<T>` otherwise.
*
* @param profile Parameters for the random generator
* @param engine Pseudo-random engine
* @param num_rows Size of the output column
* @return Column produced by the selected generator
544+
*/
545+
struct create_rand_col_fn {
546+
public:
547+
template <typename T>
548+
std::unique_ptr<cudf::column> operator()(data_profile const& profile,
549+
thrust::minstd_rand& engine,
550+
cudf::size_type num_rows)
551+
{
552+
// A cardinality of zero, or one that is not smaller than the row count, selects the
// distinct-rows generator instead of the sampling-based one.
// NOTE(review): both generators are defined below this functor; presumably they are declared
// earlier in the file -- confirm.
if (profile.get_cardinality() == 0 || profile.get_cardinality() >= num_rows) {
553+
return create_distinct_rows_column<T>(profile, engine, num_rows);
554+
}
555+
return create_random_column<T>(profile, engine, num_rows);
556+
}
557+
};
558+
559+
/**
560+
* @brief Creates a column with random content of type @ref T.
561+
*
* Values are either drawn directly for every row, or drawn into a smaller pool of samples
* that is then gathered by index according to the profile's cardinality and average run
* length. A null mask is attached only when the profile has a null probability set.
*
562+
* @param profile Parameters for the random generator
563+
* @param engine Pseudo-random engine
564+
* @param num_rows Size of the output column
565+
*
566+
* @tparam T Data type of the output column
567+
* @return Column filled with random data
568+
*/
569+
template <typename T>
570+
std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
571+
thrust::minstd_rand& engine,
572+
cudf::size_type num_rows)
573+
{
574+
// Bernoulli distribution
575+
auto valid_dist = random_value_fn<bool>(
576+
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});
577+
auto value_dist = random_value_fn<T>{profile.get_distribution_params<T>()};
578+
579+
using DeviceType = cudf::device_storage_type_t<T>;
580+
// Fixed-point types additionally need a scale, which is obtained from the value distribution.
cudf::data_type const dtype = [&]() {
581+
if constexpr (cudf::is_fixed_point<T>())
582+
return cudf::data_type{cudf::type_to_id<T>(), value_dist.get_scale(engine)};
583+
else
584+
return cudf::data_type{cudf::type_to_id<T>()};
585+
}();
586+
587+
// Distribution for picking elements from the array of samples
588+
auto const avg_run_len = profile.get_avg_run_length();
589+
rmm::device_uvector<DeviceType> data(0, cudf::get_default_stream());
590+
rmm::device_uvector<bool> null_mask(0, cudf::get_default_stream());
591+
592+
// Direct path: with cardinality 0 and an average run length of 1, every row gets its own
// freshly drawn value and validity flag.
if (profile.get_cardinality() == 0 and avg_run_len == 1) {
593+
data = value_dist(engine, num_rows);
594+
null_mask = valid_dist(engine, num_rows);
595+
} else {
596+
// Size of the sample pool: the profile cardinality, replaced by num_rows when the
// cardinality is zero or larger than num_rows.
auto const cardinality = [profile_cardinality = profile.get_cardinality(), num_rows] {
597+
return (profile_cardinality == 0 or profile_cardinality > num_rows) ? num_rows
598+
: profile_cardinality;
599+
}();
600+
rmm::device_uvector<bool> samples_null_mask = valid_dist(engine, cardinality);
601+
rmm::device_uvector<DeviceType> samples = value_dist(engine, cardinality);
602+
603+
// generate n samples and gather.
604+
auto const sample_indices =
605+
sample_indices_with_run_length(avg_run_len, cardinality, num_rows, engine);
606+
data = rmm::device_uvector<DeviceType>(num_rows, cudf::get_default_stream());
607+
null_mask = rmm::device_uvector<bool>(num_rows, cudf::get_default_stream());
608+
// The same index map is applied to values and validity flags so each output row keeps the
// validity of the sample it was copied from.
thrust::gather(
609+
thrust::device, sample_indices.begin(), sample_indices.end(), samples.begin(), data.begin());
610+
thrust::gather(thrust::device,
611+
sample_indices.begin(),
612+
sample_indices.end(),
613+
samples_null_mask.begin(),
614+
null_mask.begin());
615+
}
616+
617+
// Convert the per-row boolean validity flags into a bitmask and a null count.
auto [result_bitmask, null_count] =
618+
cudf::detail::valid_if(null_mask.begin(),
619+
null_mask.end(),
620+
cuda::std::identity{},
621+
cudf::get_default_stream(),
622+
cudf::get_current_device_resource_ref());
623+
624+
// The bitmask and null count are only used when the profile has a null probability set;
// otherwise the column is built with an empty mask buffer and a null count of 0.
return std::make_unique<cudf::column>(
625+
dtype,
626+
num_rows,
627+
data.release(),
628+
profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{},
629+
profile.get_null_probability().has_value() ? null_count : 0);
630+
}
631+
632+
/**
* @brief Creates a column of type @ref T from a generated sequence rather than random draws.
*
* Builds a `cudf::sequence` of `num_rows` elements starting from a default-constructed scalar
* of the output type, attaches a randomly generated null mask, and returns the first column
* of `cudf::sample` applied to that column with a sample size of `num_rows`.
*
* NOTE(review): `cudf::sample` is called without an explicit replacement policy or seed;
* presumably this yields a permutation of the sequence -- confirm against the cudf::sample
* documentation.
* NOTE(review): `cudf::sequence` is called for every `T` routed here; confirm it supports all
* element types this template is instantiated with (e.g. bool, timestamps, fixed-point).
*
* @param profile Parameters for the random generator (only the null probability is read here)
* @param engine Pseudo-random engine (used for the validity flags only)
* @param num_rows Size of the output column
*
* @tparam T Data type of the output column
* @return Column of `num_rows` rows built from the sequence
*/
template <typename T>
633+
std::unique_ptr<cudf::column> create_distinct_rows_column(data_profile const& profile,
634+
thrust::minstd_rand& engine,
635+
cudf::size_type num_rows)
636+
{
637+
// NOTE(review): this alias is not referenced anywhere else in this function.
using DeviceType = cudf::device_storage_type_t<T>;
638+
639+
// Bernoulli distribution
640+
auto valid_dist = random_value_fn<bool>(
641+
distribution_params<bool>{1. - profile.get_null_probability().value_or(0)});
642+
643+
// Fixed-point types are given a fixed scale of 0 here (no scale is drawn from a distribution).
cudf::data_type const dtype = [&]() {
644+
if constexpr (cudf::is_fixed_point<T>())
645+
return cudf::data_type{cudf::type_to_id<T>(), 0};
646+
else
647+
return cudf::data_type{cudf::type_to_id<T>()};
648+
}();
649+
650+
// NOTE(review): the profile's value distribution, cardinality and average run length are not
// consulted in this function; values come only from the sequence below.
auto init = cudf::make_default_constructed_scalar(dtype);
651+
auto col = cudf::sequence(num_rows, *init);
652+
653+
rmm::device_uvector<bool> null_mask(0, cudf::get_default_stream());
654+
null_mask = valid_dist(engine, num_rows);
655+
auto [result_bitmask, null_count] =
656+
cudf::detail::valid_if(null_mask.begin(),
657+
null_mask.end(),
658+
cuda::std::identity{},
659+
cudf::get_default_stream(),
660+
cudf::get_current_device_resource_ref());
661+
662+
// The mask is attached unconditionally, i.e. also when the profile has no null probability
// set (in which case the validity probability used above is 1).
col->set_null_mask(std::move(result_bitmask), null_count);
663+
664+
return std::move(cudf::sample(cudf::table_view({col->view()}), num_rows)->release()[0]);
665+
}
666+
610667
/**
611668
* @brief Creates a string column with random content.
612669
*
@@ -637,6 +694,18 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
637694
return std::move(str_table->release()[0]);
638695
}
639696

697+
/**
* @brief String specialization of create_distinct_rows_column.
*
* Generates a random string column with `create_random_column<cudf::string_view>`, converts an
* INT32 sequence of `num_rows` elements to strings, concatenates the two columns row-wise, and
* returns the first column of `cudf::sample` applied to the result with a sample size of
* `num_rows`.
*
* NOTE(review): presumably the appended sequence value is what makes each row distinct, and
* `cudf::sample` shuffles the rows -- confirm against the cudf documentation.
*
* @param profile Parameters for the random generator
* @param engine Pseudo-random engine
* @param num_rows Size of the output column
* @return String column of `num_rows` rows
*/
template <>
698+
std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::string_view>(
699+
data_profile const& profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
700+
{
701+
auto col = create_random_column<cudf::string_view>(profile, engine, num_rows);
702+
// One INT32 value per row, starting from a default-constructed INT32 scalar.
auto int_col = cudf::sequence(
703+
num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32}));
704+
auto int2strcol = cudf::strings::from_integers(int_col->view());
705+
// Row-wise concatenation of the random string with the stringified sequence value.
// NOTE(review): how null rows of `col` propagate through concatenate depends on its default
// arguments -- confirm.
auto concat_col = cudf::strings::concatenate(cudf::table_view({col->view(), int2strcol->view()}));
706+
return std::move(cudf::sample(cudf::table_view({concat_col->view()}), num_rows)->release()[0]);
707+
}
708+
640709
template <>
641710
std::unique_ptr<cudf::column> create_random_column<cudf::dictionary32>(data_profile const& profile,
642711
thrust::minstd_rand& engine,
@@ -645,19 +714,12 @@ std::unique_ptr<cudf::column> create_random_column<cudf::dictionary32>(data_prof
645714
CUDF_FAIL("not implemented yet");
646715
}
647716

648-
/**
649-
* @brief Functor to dispatch create_random_column calls.
650-
*/
651-
struct create_rand_col_fn {
652-
public:
653-
template <typename T>
654-
std::unique_ptr<cudf::column> operator()(data_profile const& profile,
655-
thrust::minstd_rand& engine,
656-
cudf::size_type num_rows)
657-
{
658-
return create_random_column<T>(profile, engine, num_rows);
659-
}
660-
};
717+
/**
* @brief Dictionary specialization of create_distinct_rows_column.
*
* Not implemented: the body unconditionally invokes `CUDF_FAIL`, so none of the parameters
* are used.
*/
template <>
718+
std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::dictionary32>(
719+
data_profile const& profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
720+
{
721+
CUDF_FAIL("not implemented yet");
722+
}
661723

662724
template <>
663725
std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profile const& profile,
@@ -724,6 +786,29 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
724786
CUDF_FAIL("Reached unreachable code in struct column creation");
725787
}
726788

789+
/**
* @brief Struct specialization of create_distinct_rows_column.
*
* Generates a random struct column with `create_random_column<cudf::struct_view>`, then
* appends one extra child built from an INT32 sequence of `num_rows` elements. The extra
* child is wrapped in `max_depth - 1` levels of structs (each with no null mask) before being
* appended. The result is rebuilt with the random column's null mask and null count, and the
* first column of `cudf::sample` applied to it (sample size `num_rows`) is returned.
*
* NOTE(review): presumably the sequence child is what makes each row distinct, and
* `cudf::sample` shuffles the rows -- confirm against the cudf documentation.
*
* @param profile Parameters for the random generator
* @param engine Pseudo-random engine
* @param num_rows Size of the output column
* @return Struct column of `num_rows` rows
*/
template <>
790+
std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::struct_view>(
791+
data_profile const& profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
792+
{
793+
auto const dist_params = profile.get_distribution_params<cudf::struct_view>();
794+
auto col = create_random_column<cudf::struct_view>(profile, engine, num_rows);
795+
std::vector<std::unique_ptr<cudf::column>> children;
796+
children.push_back(cudf::sequence(
797+
num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})));
798+
// Each iteration wraps the current single child in a new struct level; the loop runs
// max_depth - 1 times, and not at all when max_depth is 1 or less.
for (int lvl = dist_params.max_depth; lvl > 1; --lvl) {
799+
std::vector<std::unique_ptr<cudf::column>> parents;
800+
parents.push_back(
801+
cudf::create_structs_hierarchy(num_rows, std::move(children), 0, rmm::device_buffer{}));
802+
std::swap(parents, children);
803+
}
804+
// The null count is read before release(), which hands over the column's contents.
auto const null_count = col->null_count();
805+
auto col_contents = col->release();
806+
col_contents.children.push_back(std::move(children[0]));
807+
// NOTE(review): col_contents.null_mask is dereferenced without a null check; presumably
// release() always provides a (possibly empty) buffer -- confirm.
auto structs_col = cudf::create_structs_hierarchy(
808+
num_rows, std::move(col_contents.children), null_count, std::move(*col_contents.null_mask));
809+
return std::move(cudf::sample(cudf::table_view({structs_col->view()}), num_rows)->release()[0]);
810+
}
811+
727812
template <typename T>
728813
struct clamp_down {
729814
T max;
@@ -800,6 +885,26 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
800885
return list_column; // return the top-level column
801886
}
802887

888+
/**
* @brief List specialization of create_distinct_rows_column.
*
* Generates a random list column with `create_random_column<cudf::list_view>`, builds a second
* list column from an INT32 sequence of `num_rows` elements wrapped in `max_depth` list levels,
* concatenates the two columns row-wise with `cudf::lists::concatenate_rows`, and returns the
* first column of `cudf::sample` applied to the result with a sample size of `num_rows`.
*
* NOTE(review): presumably the appended sequence element is what makes each row distinct, and
* `cudf::sample` shuffles the rows -- confirm against the cudf documentation.
* NOTE(review): the sequence leaf is always INT32; presumably concatenate_rows requires it to
* match the nesting and leaf type of the random column -- confirm against the profile in use.
*
* @param profile Parameters for the random generator
* @param engine Pseudo-random engine
* @param num_rows Size of the output column
* @return List column of `num_rows` rows
*/
template <>
889+
std::unique_ptr<cudf::column> create_distinct_rows_column<cudf::list_view>(
890+
data_profile const& profile, thrust::minstd_rand& engine, cudf::size_type num_rows)
891+
{
892+
auto const dist_params = profile.get_distribution_params<cudf::list_view>();
893+
auto col = create_random_column<cudf::list_view>(profile, engine, num_rows);
894+
auto child_column = cudf::sequence(
895+
num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32}));
896+
// Each iteration adds one list level around the current column. The offsets are a sequence
// of num_rows + 1 values, so (assuming the sequence starts at 0 with step 1 -- confirm) every
// row holds exactly one element of the level below.
for (int lvl = dist_params.max_depth; lvl > 0; --lvl) {
897+
auto offsets_column = cudf::sequence(
898+
num_rows + 1, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32}));
899+
auto list_column = cudf::make_lists_column(
900+
num_rows, std::move(offsets_column), std::move(child_column), 0, rmm::device_buffer{});
901+
std::swap(child_column, list_column);
902+
}
903+
// After the loop `child_column` holds the fully wrapped list column; it is appended row-wise
// to the random lists.
auto lists_col =
904+
cudf::lists::concatenate_rows(cudf::table_view({col->view(), child_column->view()}));
905+
return std::move(cudf::sample(cudf::table_view({lists_col->view()}), num_rows)->release()[0]);
906+
}
907+
803908
using columns_vector = std::vector<std::unique_ptr<cudf::column>>;
804909

805910
/**

0 commit comments

Comments
 (0)