-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-45523: [R] Implement Utf8View type bindings #49712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 11 commits
94275d7
dfdad81
52fc2b2
3af2dd0
7f6b26a
887beda
3c6820e
fc6dec6
21e7014
1842e7e
2b56912
74c32cc
c3b63c7
ee58f8e
809f578
d095d1b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>> | |
| } else { | ||
| ARROW_RETURN_NOT_OK( | ||
| PyValue::Convert(this->value_type_, this->options_, value, view_)); | ||
| return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size)); | ||
| return this->value_builder_->Append( | ||
| std::string_view(view_.bytes, static_cast<size_t>(view_.size))); | ||
|
Comment on lines
+829
to
+830
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The old code passed |
||
| } | ||
|
pitrou marked this conversation as resolved.
|
||
| } | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -203,6 +203,13 @@ Utf8 <- R6Class( | |
| code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow") | ||
| ) | ||
| ) | ||
| StringView <- R6Class( | ||
| "StringView", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason for calling it |
||
| inherit = DataType, | ||
| public = list( | ||
| code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow") | ||
| ) | ||
| ) | ||
| LargeUtf8 <- R6Class( | ||
| "LargeUtf8", | ||
| inherit = DataType, | ||
|
|
@@ -505,6 +512,10 @@ bool <- boolean | |
| #' @export | ||
| utf8 <- function() Utf8__initialize() | ||
|
|
||
| #' @rdname data-type | ||
| #' @export | ||
| string_view <- function() StringView__initialize() | ||
|
|
||
| #' @rdname data-type | ||
| #' @export | ||
| large_utf8 <- function() LargeUtf8__initialize() | ||
|
|
@@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) { | |
| boolean = "bool", | ||
| bool = "bool", | ||
| utf8 = "string", | ||
| utf8_view = "string_view", | ||
| string_view = "string_view", | ||
| large_utf8 = "large_string", | ||
| large_string = "large_string", | ||
| binary = "binary", | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -290,26 +290,31 @@ struct Converter_String : public Converter { | |
|
|
||
| Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array, | ||
| R_xlen_t start, R_xlen_t n, size_t chunk_index) const { | ||
| auto p_offset = array->data()->GetValues<int32_t>(1); | ||
| if (!p_offset) { | ||
| return Status::Invalid("Invalid offset buffer"); | ||
| } | ||
| auto p_strings = array->data()->GetValues<char>(2, *p_offset); | ||
| if (!p_strings) { | ||
| // There is an offset buffer, but the data buffer is null | ||
| // There is at least one value in the array and not all the values are null | ||
| // That means all values are either empty strings or nulls so there is nothing to do | ||
|
|
||
| if (array->null_count()) { | ||
| arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), | ||
| array->offset(), n); | ||
| for (int i = 0; i < n; i++, null_reader.Next()) { | ||
| if (null_reader.IsNotSet()) { | ||
| SET_STRING_ELT(data, start + i, NA_STRING); | ||
| // StringViewArray uses a different memory layout (views + data buffers) rather | ||
| // than offsets, so skip the offset-based fast path and fall through to GetString(). | ||
| if constexpr (!std::is_same_v<StringArrayType, arrow::StringViewArray>) { | ||
|
thisisnic marked this conversation as resolved.
Outdated
|
||
| auto p_offset = array->data()->GetValues<int32_t>(1); | ||
|
thisisnic marked this conversation as resolved.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this doesn't work for LargeBinary and LargeString, which have 64-bit offsets, right? |
||
| if (!p_offset) { | ||
| return Status::Invalid("Invalid offset buffer"); | ||
| } | ||
| auto p_strings = array->data()->GetValues<char>(2, *p_offset); | ||
| if (!p_strings) { | ||
| // There is an offset buffer, but the data buffer is null | ||
| // There is at least one value in the array and not all the values are null | ||
| // That means all values are either empty strings or nulls so there is nothing to | ||
| // do | ||
|
|
||
| if (array->null_count()) { | ||
| arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), | ||
| array->offset(), n); | ||
| for (int i = 0; i < n; i++, null_reader.Next()) { | ||
| if (null_reader.IsNotSet()) { | ||
| SET_STRING_ELT(data, start + i, NA_STRING); | ||
| } | ||
| } | ||
| } | ||
| return Status::OK(); | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| StringArrayType* string_array = static_cast<StringArrayType*>(array.get()); | ||
|
|
@@ -726,7 +731,9 @@ class Converter_Dictionary : public Converter { | |
| // Alternative: preserve the logical type of the dictionary values | ||
| // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) | ||
| if (dictionary_->type_id() != Type::STRING && | ||
| dictionary_->type_id() != Type::LARGE_STRING) { | ||
| dictionary_->type_id() != Type::LARGE_STRING && | ||
| dictionary_->type_id() != Type::STRING_VIEW | ||
| ) { | ||
| cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); | ||
| } | ||
|
|
||
|
|
@@ -1263,6 +1270,10 @@ std::shared_ptr<Converter> Converter::Make( | |
| return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>( | ||
| chunked_array); | ||
|
|
||
| case Type::STRING_VIEW: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not handle BINARY_VIEW above as well? |
||
| return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>( | ||
| chunked_array); | ||
|
|
||
| case Type::DICTIONARY: | ||
| return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array); | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get( | |
|
|
||
| case Type::STRING: | ||
| return "Utf8"; | ||
| case Type::STRING_VIEW: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not |
||
| return "StringView"; | ||
| case Type::LARGE_STRING: | ||
| return "LargeUtf8"; | ||
|
|
||
|
|
@@ -165,6 +167,9 @@ std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean() | |
| // [[arrow::export]] | ||
| std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); } | ||
|
|
||
| // [[arrow::export]] | ||
| std::shared_ptr<arrow::DataType> StringView__initialize() { return arrow::utf8_view(); } | ||
|
|
||
| // [[arrow::export]] | ||
| std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -910,6 +910,49 @@ class RPrimitiveConverter<T, enable_if_string_like<T>> | |
| } | ||
| }; | ||
|
|
||
| template <typename T> | ||
| class RPrimitiveConverter<T, enable_if_string_view<T>> | ||
| : public PrimitiveConverter<T, RConverter> { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems there's a lot in common between this and the regular |
||
| public: | ||
| Status Extend(SEXP x, int64_t size, int64_t offset = 0) override { | ||
| RVectorType rtype = GetVectorType(x); | ||
| if (rtype != STRING) { | ||
| return Status::Invalid("Expecting a character vector"); | ||
| } | ||
| return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset); | ||
| } | ||
|
|
||
| void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override { | ||
| auto task = [this, values, size]() { return this->Extend(values, size); }; | ||
| tasks.Append(false, std::move(task)); | ||
| } | ||
|
|
||
| private: | ||
| Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) { | ||
| RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset)); | ||
| const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset; | ||
|
|
||
| int64_t total_length = 0; | ||
| for (R_xlen_t i = offset; i < size; i++, ++p_strings) { | ||
| SEXP si = *p_strings; | ||
| total_length += si == NA_STRING ? 0 : LENGTH(si); | ||
| } | ||
| RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length)); | ||
|
|
||
| p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset; | ||
| for (R_xlen_t i = offset; i < size; i++, ++p_strings) { | ||
| SEXP si = *p_strings; | ||
| if (si == NA_STRING) { | ||
| this->primitive_builder_->UnsafeAppendNull(); | ||
| } else { | ||
|
thisisnic marked this conversation as resolved.
Outdated
thisisnic marked this conversation as resolved.
Outdated
|
||
| this->primitive_builder_->UnsafeAppend(CHAR(si), LENGTH(si)); | ||
| } | ||
| } | ||
|
|
||
| return Status::OK(); | ||
| } | ||
| }; | ||
|
thisisnic marked this conversation as resolved.
|
||
|
|
||
| template <typename T> | ||
| class RPrimitiveConverter<T, enable_if_t<is_duration_type<T>::value>> | ||
| : public PrimitiveConverter<T, RConverter> { | ||
|
|
@@ -1062,7 +1105,13 @@ struct RConverterTrait< | |
| }; | ||
|
|
||
| template <typename T> | ||
| struct RConverterTrait<T, enable_if_binary_view_like<T>> { | ||
| struct RConverterTrait<T, enable_if_string_view<T>> { | ||
| using type = RPrimitiveConverter<T>; | ||
| }; | ||
|
|
||
| template <typename T> | ||
| struct RConverterTrait<T, enable_if_t<is_binary_view_like_type<T>::value && | ||
| !is_string_view_type<T>::value>> { | ||
| // not implemented | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not implement it too? It should be reasonably easy given you already the implementations for Binary and StringView. |
||
| }; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not also
BinaryViewType?