-
Notifications
You must be signed in to change notification settings - Fork 4.2k
GH-45523: [R] Implement Utf8View type bindings #49712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
877f9b6
cc42de4
55a356c
bd8ebf9
32b8256
aa14e21
e266993
a691979
9569a03
419450f
b153aaa
0568f15
fae4f21
1ff1e94
464976e
bd09948
f8e0f1f
415f1c5
8e29ee5
9792316
dabaca9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>> | |
| } else { | ||
| ARROW_RETURN_NOT_OK( | ||
| PyValue::Convert(this->value_type_, this->options_, value, view_)); | ||
| return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size)); | ||
| return this->value_builder_->Append( | ||
| std::string_view(view_.bytes, static_cast<size_t>(view_.size))); | ||
|
Comment on lines
+829
to
+830
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The old code passed |
||
| } | ||
|
pitrou marked this conversation as resolved.
|
||
| } | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -290,26 +290,32 @@ struct Converter_String : public Converter { | |
|
|
||
| Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array, | ||
| R_xlen_t start, R_xlen_t n, size_t chunk_index) const { | ||
| auto p_offset = array->data()->GetValues<int32_t>(1); | ||
| if (!p_offset) { | ||
| return Status::Invalid("Invalid offset buffer"); | ||
| } | ||
| auto p_strings = array->data()->GetValues<char>(2, *p_offset); | ||
| if (!p_strings) { | ||
| // There is an offset buffer, but the data buffer is null | ||
| // There is at least one value in the array and not all the values are null | ||
| // That means all values are either empty strings or nulls so there is nothing to do | ||
|
|
||
| if (array->null_count()) { | ||
| arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), | ||
| array->offset(), n); | ||
| for (int i = 0; i < n; i++, null_reader.Next()) { | ||
| if (null_reader.IsNotSet()) { | ||
| SET_STRING_ELT(data, start + i, NA_STRING); | ||
| // BinaryView/StringView arrays use a different memory layout (views + data buffers) | ||
| // rather than offsets, so skip the offset-based fast path and fall through to the | ||
| // GetView()-based element loop below. | ||
| if (!is_binary_view_like(array->type_id())) { | ||
| auto p_offset = array->data()->GetValues<int32_t>(1); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this doesn't work for LargeBinary and LargeString, which have 64-bit offsets, right? |
||
| if (!p_offset) { | ||
| return Status::Invalid("Invalid offset buffer"); | ||
| } | ||
| auto p_strings = array->data()->GetValues<char>(2, *p_offset); | ||
| if (!p_strings) { | ||
| // There is an offset buffer, but the data buffer is null | ||
| // There is at least one value in the array and not all the values are null | ||
| // That means all values are either empty strings or nulls so there is nothing to | ||
| // do | ||
|
|
||
| if (array->null_count()) { | ||
| arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), | ||
| array->offset(), n); | ||
| for (int i = 0; i < n; i++, null_reader.Next()) { | ||
| if (null_reader.IsNotSet()) { | ||
| SET_STRING_ELT(data, start + i, NA_STRING); | ||
| } | ||
| } | ||
| } | ||
| return Status::OK(); | ||
| } | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| StringArrayType* string_array = static_cast<StringArrayType*>(array.get()); | ||
|
|
@@ -497,6 +503,44 @@ class Converter_Binary : public Converter { | |
| virtual bool Parallel() const { return false; } | ||
| }; | ||
|
|
||
| class Converter_BinaryView : public Converter { | ||
| public: | ||
| explicit Converter_BinaryView(const std::shared_ptr<ChunkedArray>& chunked_array) | ||
| : Converter(chunked_array) {} | ||
|
|
||
| SEXP Allocate(R_xlen_t n) const { | ||
| SEXP res = PROTECT(Rf_allocVector(VECSXP, n)); | ||
| Rf_classgets(res, data::classes_arrow_binary); | ||
| UNPROTECT(1); | ||
| return res; | ||
| } | ||
|
|
||
| Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array, | ||
| R_xlen_t start, R_xlen_t n, size_t chunk_index) const { | ||
| const BinaryViewArray* binary_array = | ||
| checked_cast<const BinaryViewArray*>(array.get()); | ||
|
|
||
| auto ingest_one = [&](R_xlen_t i) { | ||
| auto value = binary_array->GetView(i); | ||
| SEXP raw = PROTECT(Rf_allocVector(RAWSXP, value.size())); | ||
| std::copy(value.data(), value.data() + value.size(), RAW(raw)); | ||
|
|
||
| SET_VECTOR_ELT(data, i + start, raw); | ||
| UNPROTECT(1); | ||
|
|
||
| return Status::OK(); | ||
| }; | ||
|
|
||
| return IngestSome(array, n, ingest_one); | ||
| } | ||
|
|
||
| virtual bool Parallel() const { return false; } | ||
| }; | ||
|
|
||
| class Converter_FixedSizeBinary : public Converter { | ||
| public: | ||
| explicit Converter_FixedSizeBinary(const std::shared_ptr<ChunkedArray>& chunked_array, | ||
|
|
@@ -726,7 +770,8 @@ class Converter_Dictionary : public Converter { | |
| // Alternative: preserve the logical type of the dictionary values | ||
| // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) | ||
| if (dictionary_->type_id() != Type::STRING && | ||
| dictionary_->type_id() != Type::LARGE_STRING) { | ||
| dictionary_->type_id() != Type::LARGE_STRING && | ||
| dictionary_->type_id() != Type::STRING_VIEW) { | ||
| cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels"); | ||
| } | ||
|
|
||
|
|
@@ -1263,6 +1308,13 @@ std::shared_ptr<Converter> Converter::Make( | |
| return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>( | ||
| chunked_array); | ||
|
|
||
| case Type::STRING_VIEW: | ||
|
thisisnic marked this conversation as resolved.
|
||
| return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>( | ||
| chunked_array); | ||
|
|
||
| case Type::BINARY_VIEW: | ||
| return std::make_shared<arrow::r::Converter_BinaryView>(chunked_array); | ||
|
|
||
| case Type::DICTIONARY: | ||
| return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array); | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.