Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/util/converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ struct MakeConverterImpl {
DICTIONARY_CASE(LargeBinaryType);
DICTIONARY_CASE(StringType);
DICTIONARY_CASE(LargeStringType);
DICTIONARY_CASE(StringViewType);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not also BinaryViewType?

DICTIONARY_CASE(FixedSizeBinaryType);
#undef DICTIONARY_CASE
default:
Expand Down
3 changes: 2 additions & 1 deletion python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->value_type_, this->options_, value, view_));
return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
return this->value_builder_->Append(
std::string_view(view_.bytes, static_cast<size_t>(view_.size)));
Comment on lines +829 to +830

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old code passed (const char*, int32_t) which matches StringBuilder::Append but not StringViewBuilder::Append (which takes int64_t). Switching to std::string_view works for both builder types.

}
Comment thread
pitrou marked this conversation as resolved.
}

Expand Down
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ export(set_io_thread_count)
export(show_exec_plan)
export(starts_with)
export(string)
export(string_view)
export(struct)
export(time32)
export(time64)
Expand Down
21 changes: 12 additions & 9 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions r/R/type.R
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ Utf8 <- R6Class(
code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow")
)
)
StringView <- R6Class(
"StringView",

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for calling it StringView instead of Utf8View (like the existing Utf8 and LargeUtf8)?

inherit = DataType,
public = list(
code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow")
)
)
LargeUtf8 <- R6Class(
"LargeUtf8",
inherit = DataType,
Expand Down Expand Up @@ -505,6 +512,10 @@ bool <- boolean
#' @export
utf8 <- function() Utf8__initialize()

# Constructor for the string-view data type: simply forwards to
# StringView__initialize(), which is not defined in this hunk.
#' @rdname data-type
#' @export
string_view <- function() {
  StringView__initialize()
}

# Constructor for the large UTF-8 data type: simply forwards to
# LargeUtf8__initialize(), which is not defined in this hunk.
#' @rdname data-type
#' @export
large_utf8 <- function() {
  LargeUtf8__initialize()
}
Expand Down Expand Up @@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) {
boolean = "bool",
bool = "bool",
utf8 = "string",
utf8_view = "string_view",
string_view = "string_view",
large_utf8 = "large_string",
large_string = "large_string",
binary = "binary",
Expand Down
3 changes: 3 additions & 0 deletions r/man/data-type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 29 additions & 18 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,26 +290,31 @@ struct Converter_String : public Converter {

Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
auto p_offset = array->data()->GetValues<int32_t>(1);
if (!p_offset) {
return Status::Invalid("Invalid offset buffer");
}
auto p_strings = array->data()->GetValues<char>(2, *p_offset);
if (!p_strings) {
// There is an offset buffer, but the data buffer is null
// There is at least one value in the array and not all the values are null
// That means all values are either empty strings or nulls so there is nothing to do

if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
SET_STRING_ELT(data, start + i, NA_STRING);
// StringViewArray uses a different memory layout (views + data buffers) rather
// than offsets, so skip the offset-based fast path and fall through to GetString().
if constexpr (!std::is_same_v<StringArrayType, arrow::StringViewArray>) {
Comment thread
thisisnic marked this conversation as resolved.
Outdated
auto p_offset = array->data()->GetValues<int32_t>(1);
Comment thread
thisisnic marked this conversation as resolved.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this doesn't work for LargeBinary and LargeString, which have 64-bit offsets, right?

if (!p_offset) {
return Status::Invalid("Invalid offset buffer");
}
auto p_strings = array->data()->GetValues<char>(2, *p_offset);
if (!p_strings) {
// There is an offset buffer, but the data buffer is null
// There is at least one value in the array and not all the values are null
// That means all values are either empty strings or nulls so there is nothing to
// do

if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
}
return Status::OK();
}
return Status::OK();
}

StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
Expand Down Expand Up @@ -726,7 +731,9 @@ class Converter_Dictionary : public Converter {
// Alternative: preserve the logical type of the dictionary values
// (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
if (dictionary_->type_id() != Type::STRING &&
dictionary_->type_id() != Type::LARGE_STRING) {
dictionary_->type_id() != Type::LARGE_STRING &&
dictionary_->type_id() != Type::STRING_VIEW
) {
cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
}

Expand Down Expand Up @@ -1263,6 +1270,10 @@ std::shared_ptr<Converter> Converter::Make(
return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
chunked_array);

case Type::STRING_VIEW:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not handle BINARY_VIEW above as well?

return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>(
chunked_array);

case Type::DICTIONARY:
return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array);

Expand Down
45 changes: 26 additions & 19 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions r/src/datatype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get(

case Type::STRING:
return "Utf8";
case Type::STRING_VIEW:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not BINARY_VIEW below as well?

return "StringView";
case Type::LARGE_STRING:
return "LargeUtf8";

Expand Down Expand Up @@ -165,6 +167,9 @@ std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean()
// Returns the Arrow data type produced by arrow::utf8().
// NOTE(review): the [[arrow::export]] marker below presumably feeds the generated
// arrowExports sources, so it is kept directly above the signature -- confirm.
// [[arrow::export]]
std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); }

// Returns the Arrow data type produced by arrow::utf8_view().
// NOTE(review): the [[arrow::export]] marker below presumably feeds the generated
// arrowExports sources, so it is kept directly above the signature -- confirm.
// [[arrow::export]]
std::shared_ptr<arrow::DataType> StringView__initialize() { return arrow::utf8_view(); }

// Returns the Arrow data type produced by arrow::large_utf8().
// NOTE(review): the [[arrow::export]] marker below presumably feeds the generated
// arrowExports sources, so it is kept directly above the signature -- confirm.
// [[arrow::export]]
std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); }

Expand Down
51 changes: 50 additions & 1 deletion r/src/r_to_arrow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,49 @@ class RPrimitiveConverter<T, enable_if_string_like<T>>
}
};

template <typename T>
class RPrimitiveConverter<T, enable_if_string_view<T>>
: public PrimitiveConverter<T, RConverter> {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems there's a lot in common between this and the regular String converter (only UnsafeAppendUtf8Strings differs, AFAICT). Perhaps it's worth factoring the shared parts out into a common base class?

public:
// Appends elements of the R vector `x` to the builder.
// Returns Status::Invalid when GetVectorType(x) is not STRING; otherwise passes
// the result of arrow::r::utf8_strings(x), together with `size` and `offset`,
// to UnsafeAppendUtf8Strings() and returns its status.
Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
  if (GetVectorType(x) != STRING) {
    return Status::Invalid("Expecting a character vector");
  }
  return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset);
}

// Queues a task on `tasks` that calls Extend(values, size) with the default
// offset when it runs. The closure copies `this` and `values`, so both must
// still be valid at that point.
// NOTE(review): the leading `false` presumably marks the task as not eligible
// for parallel execution -- confirm against RTasks::Append.
void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
  tasks.Append(false, [this, values, size]() { return this->Extend(values, size); });
}

private:
// Appends the elements s[offset] .. s[size - 1] to the builder in two passes:
// the first pass sums LENGTH() over every non-NA element so the builder can be
// sized up front, the second pass appends each element (or a null for NA_STRING).
// NOTE(review): both loops run while i < size, so `size` acts as an end index
// rather than a count whenever offset != 0 -- confirm this matches the callers.
Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) {
  RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset));

  // Read-only pointer to the vector's elements; indexed directly by i below.
  const SEXP* elements = reinterpret_cast<const SEXP*>(DATAPTR_RO(s));

  // Pass 1: total number of bytes across the non-NA elements.
  int64_t total_length = 0;
  for (R_xlen_t i = offset; i < size; ++i) {
    SEXP item = elements[i];
    if (item != NA_STRING) {
      total_length += LENGTH(item);
    }
  }
  RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length));

  // Pass 2: append. Presumably the Unsafe* calls rely on the two reservations
  // made above, so the order of the passes must be kept.
  for (R_xlen_t i = offset; i < size; ++i) {
    SEXP item = elements[i];
    if (item != NA_STRING) {
      this->primitive_builder_->UnsafeAppend(CHAR(item), LENGTH(item));
    } else {
      this->primitive_builder_->UnsafeAppendNull();
    }
  }

  return Status::OK();
}
};
Comment thread
thisisnic marked this conversation as resolved.

template <typename T>
class RPrimitiveConverter<T, enable_if_t<is_duration_type<T>::value>>
: public PrimitiveConverter<T, RConverter> {
Expand Down Expand Up @@ -1062,7 +1105,13 @@ struct RConverterTrait<
};

template <typename T>
struct RConverterTrait<T, enable_if_binary_view_like<T>> {
struct RConverterTrait<T, enable_if_string_view<T>> {
using type = RPrimitiveConverter<T>;
};

template <typename T>
struct RConverterTrait<T, enable_if_t<is_binary_view_like_type<T>::value &&
!is_string_view_type<T>::value>> {
// not implemented

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not implement it too? It should be reasonably easy given you already have the implementations for Binary and StringView.

};

Expand Down
8 changes: 8 additions & 0 deletions r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,14 @@ test_that("Array supports character vectors (ARROW-3339)", {
# with NA
expect_array_roundtrip(c("itsy", NA, "spider"), utf8())
expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8())

# string_view
expect_array_roundtrip(c("itsy", "bitsy", "spider"), string_view(), as = string_view())
expect_array_roundtrip(c("itsy", NA, "spider"), string_view(), as = string_view())

# string_view with empty strings
expect_array_roundtrip(c("", "bitsy", ""), string_view(), as = string_view())
expect_array_roundtrip(c("", NA, ""), string_view(), as = string_view())
})

test_that("Character vectors > 2GB become large_utf8", {
Expand Down
Loading
Loading