Skip to content

Commit b3ba226

Browse files
authored
GH-39603: [R] Error: Cannot convert Dictionary Array of type dictionary<values=large_string, indices=uint32, ordered=0> to R (#49710)
### Rationale for this change Dictionary Arrays with specific type indices caused error ### What changes are included in this PR? Allows those type indices and add check to ensure no overflow ### Are these changes tested? Yes ### Are there any user-facing changes? No ### AI Usage I used AI to create the changes but manually reviewed myself afterwards * GitHub Issue: #39603 Authored-by: Nic Crane <thisisnic@gmail.com> Signed-off-by: Nic Crane <thisisnic@gmail.com>
1 parent f8184d5 commit b3ba226

4 files changed

Lines changed: 36 additions & 12 deletions

File tree

cpp/src/arrow/util/converter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,9 @@ struct MakeConverterImpl {
238238
DICTIONARY_CASE(FloatType);
239239
DICTIONARY_CASE(DoubleType);
240240
DICTIONARY_CASE(BinaryType);
241+
DICTIONARY_CASE(LargeBinaryType);
241242
DICTIONARY_CASE(StringType);
243+
DICTIONARY_CASE(LargeStringType);
242244
DICTIONARY_CASE(FixedSizeBinaryType);
243245
#undef DICTIONARY_CASE
244246
default:

r/src/array_to_vector.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -595,7 +595,9 @@ class Converter_Dictionary : public Converter {
595595
case Type::UINT16:
596596
case Type::INT16:
597597
case Type::INT32:
598-
// TODO: also add int64, uint32, uint64 downcasts, if possible
598+
case Type::UINT32:
599+
case Type::INT64:
600+
case Type::UINT64:
599601
break;
600602
default:
601603
cpp11::stop("Cannot convert Dictionary Array of type `%s` to R",
@@ -612,6 +614,16 @@ class Converter_Dictionary : public Converter {
612614
dictionary_ = CreateEmptyArray(dict_type.value_type());
613615
}
614616
}
617+
618+
// R factors store their codes in 32-bit integers, so dictionary arrays with
619+
// more levels than that cannot be represented safely.
620+
if (dictionary_->length() > std::numeric_limits<int>::max()) {
621+
const auto& dict_type = checked_cast<const DictionaryType&>(*chunked_array->type());
622+
cpp11::stop(
623+
"Cannot convert Dictionary Array of type `%s` to R: dictionary has "
624+
"more levels than an R factor can represent",
625+
dict_type.ToString().c_str());
626+
}
615627
}
616628

617629
SEXP Allocate(R_xlen_t n) const {
@@ -653,6 +665,15 @@ class Converter_Dictionary : public Converter {
653665
case Type::INT32:
654666
return Ingest_some_nulls_Impl<arrow::Int32Type>(data, array, start, n,
655667
chunk_index);
668+
case Type::UINT32:
669+
return Ingest_some_nulls_Impl<arrow::UInt32Type>(data, array, start, n,
670+
chunk_index);
671+
case Type::INT64:
672+
return Ingest_some_nulls_Impl<arrow::Int64Type>(data, array, start, n,
673+
chunk_index);
674+
case Type::UINT64:
675+
return Ingest_some_nulls_Impl<arrow::UInt64Type>(data, array, start, n,
676+
chunk_index);
656677
default:
657678
break;
658679
}
@@ -704,7 +725,8 @@ class Converter_Dictionary : public Converter {
704725
// TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;)
705726
// Alternative: preserve the logical type of the dictionary values
706727
// (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
707-
if (dictionary_->type_id() != Type::STRING) {
728+
if (dictionary_->type_id() != Type::STRING &&
729+
dictionary_->type_id() != Type::LARGE_STRING) {
708730
cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
709731
}
710732

r/src/r_to_arrow.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,8 +1029,8 @@ class RDictionaryConverter<ValueType, enable_if_has_string_view<ValueType>>
10291029

10301030
// first we need to handle the levels
10311031
SEXP levels = Rf_getAttrib(x, R_LevelsSymbol);
1032-
auto memo_chunked_chunked_array =
1033-
arrow::r::vec_to_arrow_ChunkedArray(levels, utf8(), false);
1032+
auto memo_chunked_chunked_array = arrow::r::vec_to_arrow_ChunkedArray(
1033+
levels, this->dict_type_->value_type(), false);
10341034
for (const auto& chunk : memo_chunked_chunked_array->chunks()) {
10351035
RETURN_NOT_OK(this->value_builder_->InsertMemoValues(*chunk));
10361036
}

r/tests/testthat/test-Table.R

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -357,18 +357,18 @@ test_that("Table handles null type (ARROW-7064)", {
357357

358358
test_that("Can create table with specific dictionary types", {
359359
fact <- example_data[, "fct"]
360-
int_types <- c(int8(), int16(), int32(), int64())
361-
# TODO: test uint types when format allows
362-
# uint_types <- c(uint8(), uint16(), uint32(), uint64()) # nolint
363-
for (i in int_types) {
360+
index_types <- c(int8(), int16(), int32(), int64(), uint8(), uint16(), uint32(), uint64())
361+
for (i in index_types) {
364362
sch <- schema(fct = dictionary(i, utf8()))
365363
tab <- Table$create(fact, schema = sch)
366364
expect_equal(sch, tab$schema)
367-
if (i != int64()) {
368-
# TODO: same downcast to int32 as we do for int64() type elsewhere
369-
expect_equal_data_frame(tab, fact)
370-
}
365+
expect_equal_data_frame(tab, fact)
371366
}
367+
368+
# large_utf8 values exercise the non-ALTREP conversion path
369+
tab_large <- Table$create(fact, schema = schema(fct = dictionary(uint32(), large_utf8())))
370+
expect_equal(tab_large$schema, schema(fct = dictionary(uint32(), large_utf8())))
371+
expect_equal_data_frame(tab_large, fact)
372372
})
373373

374374
test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {

0 commit comments

Comments
 (0)