Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/src/arrow/util/converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,8 @@ struct MakeConverterImpl {
DICTIONARY_CASE(LargeBinaryType);
DICTIONARY_CASE(StringType);
DICTIONARY_CASE(LargeStringType);
DICTIONARY_CASE(StringViewType);
Comment thread
thisisnic marked this conversation as resolved.
DICTIONARY_CASE(BinaryViewType);
DICTIONARY_CASE(FixedSizeBinaryType);
#undef DICTIONARY_CASE
default:
Expand Down
3 changes: 2 additions & 1 deletion python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
} else {
ARROW_RETURN_NOT_OK(
PyValue::Convert(this->value_type_, this->options_, value, view_));
return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
return this->value_builder_->Append(
std::string_view(view_.bytes, static_cast<size_t>(view_.size)));
Comment on lines +829 to +830

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old code passed (const char*, int32_t) which matches StringBuilder::Append but not StringViewBuilder::Append (which takes int64_t). Switching to std::string_view works for both builder types.

}
Comment thread
pitrou marked this conversation as resolved.
}

Expand Down
3 changes: 3 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ export(as_record_batch)
export(as_record_batch_reader)
export(as_schema)
export(binary)
export(binary_view)
export(bool)
export(boolean)
export(buffer)
Expand Down Expand Up @@ -397,6 +398,7 @@ export(set_io_thread_count)
export(show_exec_plan)
export(starts_with)
export(string)
export(string_view)
export(struct)
export(time32)
export(time64)
Expand All @@ -411,6 +413,7 @@ export(uint8)
export(unify_schemas)
export(unregister_extension_type)
export(utf8)
export(utf8_view)
export(value_counts)
export(vctrs_extension_array)
export(vctrs_extension_type)
Expand Down
8 changes: 8 additions & 0 deletions r/R/arrowExports.R

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions r/R/type.R
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ Utf8 <- R6Class(
code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow")
)
)
# R6 class named "Utf8View" that inherits from DataType.
Utf8View <- R6Class(
"Utf8View",
inherit = DataType,
public = list(
# Builds a call to `utf8_view()` via call2(). `.ns` is "arrow" when `namespace`
# is TRUE; otherwise the else-less `if` yields NULL.
code = function(namespace = FALSE) call2("utf8_view", .ns = if (namespace) "arrow")
)
)
LargeUtf8 <- R6Class(
"LargeUtf8",
inherit = DataType,
Expand All @@ -224,6 +231,13 @@ LargeBinary <- R6Class(
code = function(namespace = FALSE) call2("large_binary", .ns = if (namespace) "arrow")
)
)
# R6 class named "BinaryView" that inherits from DataType.
BinaryView <- R6Class(
"BinaryView",
inherit = DataType,
public = list(
# Builds a call to `binary_view()` via call2(). `.ns` is "arrow" when `namespace`
# is TRUE; otherwise the else-less `if` yields NULL.
code = function(namespace = FALSE) call2("binary_view", .ns = if (namespace) "arrow")
)
)
FixedSizeBinary <- R6Class(
"FixedSizeBinary",
inherit = FixedWidthType,
Expand Down Expand Up @@ -505,6 +519,14 @@ bool <- boolean
#' @export
utf8 <- function() Utf8__initialize()

# Zero-argument constructor; returns whatever Utf8View__initialize() produces.
#' @rdname data-type
#' @export
utf8_view <- function() Utf8View__initialize()

# Alias: `string_view` is bound to the same function object as `utf8_view`.
#' @rdname data-type
#' @export
string_view <- utf8_view

# Zero-argument constructor; returns whatever LargeUtf8__initialize() produces.
#' @rdname data-type
#' @export
large_utf8 <- function() LargeUtf8__initialize()
Expand All @@ -517,6 +539,10 @@ binary <- function() Binary__initialize()
#' @export
large_binary <- function() LargeBinary__initialize()

# Zero-argument constructor; returns whatever BinaryView__initialize() produces.
#' @rdname data-type
#' @export
binary_view <- function() BinaryView__initialize()

# Constructor taking `byte_width`, which is forwarded unchanged to
# FixedSizeBinary__initialize().
#' @rdname data-type
#' @export
fixed_size_binary <- function(byte_width) FixedSizeBinary__initialize(byte_width)
Expand Down Expand Up @@ -806,9 +832,12 @@ canonical_type_str <- function(type_str) {
boolean = "bool",
bool = "bool",
utf8 = "string",
utf8_view = "string_view",
string_view = "string_view",
large_utf8 = "large_string",
large_string = "large_string",
binary = "binary",
binary_view = "binary_view",
large_binary = "large_binary",
fixed_size_binary = "fixed_size_binary",
string = "string",
Expand Down
9 changes: 9 additions & 0 deletions r/man/data-type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

88 changes: 70 additions & 18 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,26 +290,32 @@ struct Converter_String : public Converter {

Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
auto p_offset = array->data()->GetValues<int32_t>(1);
if (!p_offset) {
return Status::Invalid("Invalid offset buffer");
}
auto p_strings = array->data()->GetValues<char>(2, *p_offset);
if (!p_strings) {
// There is an offset buffer, but the data buffer is null
// There is at least one value in the array and not all the values are null
// That means all values are either empty strings or nulls so there is nothing to do

if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
SET_STRING_ELT(data, start + i, NA_STRING);
// BinaryView/StringView arrays use a different memory layout (views + data buffers)
// rather than offsets, so skip the offset-based fast path and fall through to the
// GetView()-based element loop below.
if (!is_binary_view_like(array->type_id())) {
auto p_offset = array->data()->GetValues<int32_t>(1);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this doesn't work for LargeBinary and LargeString, which have 64-bit offsets, right?

if (!p_offset) {
return Status::Invalid("Invalid offset buffer");
}
auto p_strings = array->data()->GetValues<char>(2, *p_offset);
if (!p_strings) {
// There is an offset buffer, but the data buffer is null
// There is at least one value in the array and not all the values are null
// That means all values are either empty strings or nulls so there is nothing to
// do

if (array->null_count()) {
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsNotSet()) {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
}
return Status::OK();
}
return Status::OK();
}

StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
Expand Down Expand Up @@ -497,6 +503,44 @@ class Converter_Binary : public Converter {
virtual bool Parallel() const { return false; }
};

// Converter for BinaryView chunked arrays. Each ingested element is copied into its
// own RAWSXP, and the RAWSXPs are stored in a VECSXP that carries the
// data::classes_arrow_binary class attribute.
class Converter_BinaryView : public Converter {
 public:
  explicit Converter_BinaryView(const std::shared_ptr<ChunkedArray>& chunked_array)
      : Converter(chunked_array) {}

  // Allocates the output VECSXP of length `n` and sets its class attribute.
  SEXP Allocate(R_xlen_t n) const {
    SEXP list = PROTECT(Rf_allocVector(VECSXP, n));
    Rf_classgets(list, data::classes_arrow_binary);
    UNPROTECT(1);
    return list;
  }

  // Writes nothing for an all-null range: the slots keep whatever Allocate() left
  // in them.
  Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
    return Status::OK();
  }

  // Copies elements of `array` into `data`, offset by `start`. `chunk_index` is
  // unused here.
  Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
                           R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
    const auto* view_array = checked_cast<const BinaryViewArray*>(array.get());

    // Copies element `idx` into a freshly allocated RAWSXP and stores it at
    // data[start + idx]. The RAWSXP is protected only until the list owns it.
    auto copy_element = [&](R_xlen_t idx) {
      const auto bytes = view_array->GetView(idx);
      SEXP payload = PROTECT(Rf_allocVector(RAWSXP, bytes.size()));
      std::copy_n(bytes.data(), bytes.size(), RAW(payload));

      SET_VECTOR_ELT(data, start + idx, payload);
      UNPROTECT(1);

      return Status::OK();
    };

    // NOTE(review): IngestSome is defined elsewhere; presumably it invokes the
    // callback only for non-null slots -- confirm against its definition.
    return IngestSome(array, n, copy_element);
  }

  virtual bool Parallel() const { return false; }
};

class Converter_FixedSizeBinary : public Converter {
public:
explicit Converter_FixedSizeBinary(const std::shared_ptr<ChunkedArray>& chunked_array,
Expand Down Expand Up @@ -726,7 +770,8 @@ class Converter_Dictionary : public Converter {
// Alternative: preserve the logical type of the dictionary values
// (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
if (dictionary_->type_id() != Type::STRING &&
dictionary_->type_id() != Type::LARGE_STRING) {
dictionary_->type_id() != Type::LARGE_STRING &&
dictionary_->type_id() != Type::STRING_VIEW) {
cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
}

Expand Down Expand Up @@ -1263,6 +1308,13 @@ std::shared_ptr<Converter> Converter::Make(
return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
chunked_array);

case Type::STRING_VIEW:
Comment thread
thisisnic marked this conversation as resolved.
return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>(
chunked_array);

case Type::BINARY_VIEW:
return std::make_shared<arrow::r::Converter_BinaryView>(chunked_array);

case Type::DICTIONARY:
return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array);

Expand Down
60 changes: 38 additions & 22 deletions r/src/arrowExports.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading