-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-45523: [R] Implement Utf8View type bindings #49712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
94275d7
dfdad81
52fc2b2
3af2dd0
7f6b26a
887beda
3c6820e
fc6dec6
21e7014
1842e7e
2b56912
74c32cc
c3b63c7
ee58f8e
809f578
d095d1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>> | |
| } else { | ||
| ARROW_RETURN_NOT_OK( | ||
| PyValue::Convert(this->value_type_, this->options_, value, view_)); | ||
| return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size)); | ||
| return this->value_builder_->Append( | ||
| std::string_view(view_.bytes, static_cast<size_t>(view_.size))); | ||
|
Comment on lines
+829
to
+830
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The old code passed |
||
| } | ||
|
pitrou marked this conversation as resolved.
|
||
| } | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -203,6 +203,13 @@ Utf8 <- R6Class( | |
| code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow") | ||
| ) | ||
| ) | ||
| StringView <- R6Class( | ||
| "StringView", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason for calling it |
||
| inherit = DataType, | ||
| public = list( | ||
| code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow") | ||
| ) | ||
| ) | ||
| LargeUtf8 <- R6Class( | ||
| "LargeUtf8", | ||
| inherit = DataType, | ||
|
|
@@ -505,6 +512,10 @@ bool <- boolean | |
| #' @export | ||
| utf8 <- function() Utf8__initialize() | ||
|
|
||
| #' @rdname data-type | ||
| #' @export | ||
| string_view <- function() StringView__initialize() | ||
|
|
||
| #' @rdname data-type | ||
| #' @export | ||
| large_utf8 <- function() LargeUtf8__initialize() | ||
|
|
@@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) { | |
| boolean = "bool", | ||
| bool = "bool", | ||
| utf8 = "string", | ||
| utf8_view = "string_view", | ||
| string_view = "string_view", | ||
| large_utf8 = "large_string", | ||
| large_string = "large_string", | ||
| binary = "binary", | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -290,26 +290,32 @@ struct Converter_String : public Converter { | |
|
|
||
| Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array, | ||
| R_xlen_t start, R_xlen_t n, size_t chunk_index) const { | ||
| auto p_offset = array->data()->GetValues<int32_t>(1); | ||
| if (!p_offset) { | ||
| return Status::Invalid("Invalid offset buffer"); | ||
| } | ||
| auto p_strings = array->data()->GetValues<char>(2, *p_offset); | ||
| if (!p_strings) { | ||
| // There is an offset buffer, but the data buffer is null | ||
| // There is at least one value in the array and not all the values are null | ||
| // That means all values are either empty strings or nulls so there is nothing to do | ||
|
|
||
| if (array->null_count()) { | ||
| arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), | ||
| array->offset(), n); | ||
| for (int i = 0; i < n; i++, null_reader.Next()) { | ||
| if (null_reader.IsNotSet()) { | ||
| SET_STRING_ELT(data, start + i, NA_STRING); | ||
| // StringViewArray uses a different memory layout (views + data buffers) rather | ||
| // than offsets, so skip the offset-based fast path and fall through to the | ||
| // GetView()-based element loop below. | ||
| if (array->type_id() != Type::STRING_VIEW) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can use |
||
| auto p_offset = array->data()->GetValues<int32_t>(1); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this doesn't work for LargeBinary and LargeString, which have 64-bit offsets, right? |
||
| if (!p_offset) { | ||
| return Status::Invalid("Invalid offset buffer"); | ||
| } | ||
| auto p_strings = array->data()->GetValues<char>(2, *p_offset); | ||
| if (!p_strings) { | ||
| // There is an offset buffer, but the data buffer is null | ||
| // There is at least one value in the array and not all the values are null | ||
| // That means all values are either empty strings or nulls so there is nothing to | ||
| // do | ||
|
|
||
| if (array->null_count()) { | ||
| arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), | ||
| array->offset(), n); | ||
| for (int i = 0; i < n; i++, null_reader.Next()) { | ||
| if (null_reader.IsNotSet()) { | ||
| SET_STRING_ELT(data, start + i, NA_STRING); | ||
| } | ||
| } | ||
| } | ||
| return Status::OK(); | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| StringArrayType* string_array = static_cast<StringArrayType*>(array.get()); | ||
|
|
@@ -726,7 +732,8 @@ class Converter_Dictionary : public Converter { | |
| // Alternative: preserve the logical type of the dictionary values | ||
| // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) | ||
| if (dictionary_->type_id() != Type::STRING && | ||
| dictionary_->type_id() != Type::LARGE_STRING) { | ||
| dictionary_->type_id() != Type::LARGE_STRING && | ||
| dictionary_->type_id() != Type::STRING_VIEW) { | ||
| cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); | ||
| } | ||
|
|
||
|
|
@@ -1263,6 +1270,10 @@ std::shared_ptr<Converter> Converter::Make( | |
| return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>( | ||
| chunked_array); | ||
|
|
||
| case Type::STRING_VIEW: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not handle BINARY_VIEW above as well? |
||
| return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>( | ||
| chunked_array); | ||
|
|
||
| case Type::DICTIONARY: | ||
| return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array); | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get( | |
|
|
||
| case Type::STRING: | ||
| return "Utf8"; | ||
| case Type::STRING_VIEW: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not |
||
| return "StringView"; | ||
| case Type::LARGE_STRING: | ||
| return "LargeUtf8"; | ||
|
|
||
|
|
@@ -165,6 +167,9 @@ std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean() | |
| // [[arrow::export]] | ||
| std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); } | ||
|
|
||
| // [[arrow::export]] | ||
| std::shared_ptr<arrow::DataType> StringView__initialize() { return arrow::utf8_view(); } | ||
|
|
||
| // [[arrow::export]] | ||
| std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -910,6 +910,43 @@ class RPrimitiveConverter<T, enable_if_string_like<T>> | |
| } | ||
| }; | ||
|
|
||
| template <typename T> | ||
| class RPrimitiveConverter<T, enable_if_string_view<T>> | ||
| : public PrimitiveConverter<T, RConverter> { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems there's a lot in common between this and the regular |
||
| public: | ||
| Status Extend(SEXP x, int64_t size, int64_t offset = 0) override { | ||
| RVectorType rtype = GetVectorType(x); | ||
| if (rtype != STRING) { | ||
| return Status::Invalid("Expecting a character vector"); | ||
| } | ||
| return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset); | ||
| } | ||
|
|
||
| void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override { | ||
| auto task = [this, values, size]() { return this->Extend(values, size); }; | ||
| tasks.Append(false, std::move(task)); | ||
| } | ||
|
|
||
| private: | ||
| Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) { | ||
| RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset)); | ||
|
|
||
| // Use safe Append (not UnsafeAppend) because StringView's heap builder | ||
| // has a 2GB-per-block limit that prevents bulk data pre-reservation. | ||
| const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset; | ||
| for (R_xlen_t i = offset; i < size; i++, ++p_strings) { | ||
| SEXP si = *p_strings; | ||
| if (si == NA_STRING) { | ||
| this->primitive_builder_->UnsafeAppendNull(); | ||
| } else { | ||
| RETURN_NOT_OK(this->primitive_builder_->Append(CHAR(si), LENGTH(si))); | ||
| } | ||
| } | ||
|
|
||
| return Status::OK(); | ||
| } | ||
| }; | ||
|
thisisnic marked this conversation as resolved.
|
||
|
|
||
| template <typename T> | ||
| class RPrimitiveConverter<T, enable_if_t<is_duration_type<T>::value>> | ||
| : public PrimitiveConverter<T, RConverter> { | ||
|
|
@@ -1062,7 +1099,13 @@ struct RConverterTrait< | |
| }; | ||
|
|
||
| template <typename T> | ||
| struct RConverterTrait<T, enable_if_binary_view_like<T>> { | ||
| struct RConverterTrait<T, enable_if_string_view<T>> { | ||
| using type = RPrimitiveConverter<T>; | ||
| }; | ||
|
|
||
| template <typename T> | ||
| struct RConverterTrait<T, enable_if_t<is_binary_view_like_type<T>::value && | ||
| !is_string_view_type<T>::value>> { | ||
| // not implemented | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not implement it too? It should be reasonably easy given you already the implementations for Binary and StringView. |
||
| }; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -203,6 +203,8 @@ test_that("Array supports character vectors (ARROW-3339)", { | |||||||
| # with NA | ||||||||
| expect_array_roundtrip(c("itsy", NA, "spider"), utf8()) | ||||||||
| expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8()) | ||||||||
|
|
||||||||
| expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as = string_view()) | ||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. String view arrays have a different representation for "short" and "long" strings (i.e. > 12 characters), I suggest exercising that:
Suggested change
|
||||||||
| }) | ||||||||
|
|
||||||||
| test_that("Character vectors > 2GB become large_utf8", { | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -371,6 +371,28 @@ test_that("Can create table with specific dictionary types", { | |
| expect_equal_data_frame(tab_large, fact) | ||
| }) | ||
|
|
||
| test_that("Table converts dictionary arrays with wider index types back to R", { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test does not seem related, is it just adding a test that was overlooked before? |
||
| fact <- example_data[, "fct"] | ||
| for (i in list(uint32(), int64(), uint64())) { | ||
| sch <- schema(fct = dictionary(i, utf8())) | ||
| tab <- Table$create(fact, schema = sch) | ||
| expect_equal(tab$schema, sch) | ||
| expect_equal_data_frame(tab, fact) | ||
| } | ||
| }) | ||
|
|
||
| test_that("Table converts dictionary arrays with string_view values", { | ||
| expected <- data.frame(foo = factor(c("x", NA, "x"))) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would also suggest testing with longer strings here. |
||
| tab <- Table$create(expected, schema = schema(foo = dictionary(uint32(), string_view()))) | ||
| expect_equal_data_frame(tab, expected) | ||
| }) | ||
|
|
||
| test_that("Table round-trips string_view columns", { | ||
| expected <- data.frame(x = c("hello", NA, "")) | ||
| tab <- Table$create(expected, schema = schema(x = string_view())) | ||
| expect_equal_data_frame(tab, expected) | ||
| }) | ||
|
|
||
| test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { | ||
| b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) | ||
| b2 <- record_batch(f = factor(c("c"), levels = c("c", "d"))) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not also
BinaryViewType?