apache · thisisnic · Apr 11, 2026 · Apr 11, 2026 · May 5, 2026 · May 5, 2026
@@ -241,6 +241,7 @@ struct MakeConverterImpl {
       DICTIONARY_CASE(LargeBinaryType);
       DICTIONARY_CASE(StringType);
       DICTIONARY_CASE(LargeStringType);
+      DICTIONARY_CASE(StringViewType);
       DICTIONARY_CASE(FixedSizeBinaryType);
 #undef DICTIONARY_CASE
       default:

@@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
     } else {
       ARROW_RETURN_NOT_OK(
           PyValue::Convert(this->value_type_, this->options_, value, view_));
-      return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
+      return this->value_builder_->Append(
+          std::string_view(view_.bytes, static_cast<size_t>(view_.size)));
     }
   }
 

@@ -397,6 +397,7 @@ export(set_io_thread_count)
 export(show_exec_plan)
 export(starts_with)
 export(string)
+export(string_view)
 export(struct)
 export(time32)
 export(time64)

@@ -203,6 +203,13 @@ Utf8 <- R6Class(
     code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow")
   )
 )
+StringView <- R6Class(  # R6 class for the string-view data type
+  "StringView",
+  inherit = DataType,  # adds nothing beyond DataType except code() below
+  public = list(  # code(): call that recreates this type; arrow::-qualified if namespace
+    code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow")
+  )
+)
 LargeUtf8 <- R6Class(
   "LargeUtf8",
   inherit = DataType,
@@ -505,6 +512,10 @@ bool <- boolean
 #' @export
 utf8 <- function() Utf8__initialize()
 
+#' @rdname data-type
+#' @export
+string_view <- function() StringView__initialize()  # no arguments; delegates to C++
+
 #' @rdname data-type
 #' @export
 large_utf8 <- function() LargeUtf8__initialize()
@@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) {
     boolean = "bool",
     bool = "bool",
     utf8 = "string",
+    utf8_view = "string_view",
+    string_view = "string_view",
     large_utf8 = "large_string",
     large_string = "large_string",
     binary = "binary",

@@ -290,26 +290,31 @@ struct Converter_String : public Converter {
 
   Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
                            R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
-    auto p_offset = array->data()->GetValues<int32_t>(1);
-    if (!p_offset) {
-      return Status::Invalid("Invalid offset buffer");
-    }
-    auto p_strings = array->data()->GetValues<char>(2, *p_offset);
-    if (!p_strings) {
-      // There is an offset buffer, but the data buffer is null
-      // There is at least one value in the array and not all the values are null
-      // That means all values are either empty strings or nulls so there is nothing to do
-
-      if (array->null_count()) {
-        arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
-                                                  array->offset(), n);
-        for (int i = 0; i < n; i++, null_reader.Next()) {
-          if (null_reader.IsNotSet()) {
-            SET_STRING_ELT(data, start + i, NA_STRING);
+    // StringViewArray uses a different memory layout (views + data buffers) rather
+    // than offsets, so skip the offset-based fast path and fall through to GetString().
+    if constexpr (!std::is_same_v<StringArrayType, arrow::StringViewArray>) {
+      auto p_offset = array->data()->GetValues<int32_t>(1);
+      if (!p_offset) {
+        return Status::Invalid("Invalid offset buffer");
+      }
+      auto p_strings = array->data()->GetValues<char>(2, *p_offset);
+      if (!p_strings) {
+        // There is an offset buffer, but the data buffer is null
+        // There is at least one value in the array and not all the values are null
+        // That means all values are either empty strings or nulls so there is nothing to
+        // do
+
+        if (array->null_count()) {
+          arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
+                                                    array->offset(), n);
+          for (int i = 0; i < n; i++, null_reader.Next()) {
+            if (null_reader.IsNotSet()) {
+              SET_STRING_ELT(data, start + i, NA_STRING);
+            }
           }
         }
+        return Status::OK();
       }
-      return Status::OK();
     }
 
     StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
@@ -726,7 +731,9 @@ class Converter_Dictionary : public Converter {
     // Alternative: preserve the logical type of the dictionary values
     // (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
     if (dictionary_->type_id() != Type::STRING &&
-        dictionary_->type_id() != Type::LARGE_STRING) {
+        dictionary_->type_id() != Type::LARGE_STRING &&
+        dictionary_->type_id() != Type::STRING_VIEW
+      ) {
       cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
     }
 
@@ -1263,6 +1270,10 @@ std::shared_ptr<Converter> Converter::Make(
       return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
           chunked_array);
 
+    case Type::STRING_VIEW:
+      return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>(
+          chunked_array);
+
     case Type::DICTIONARY:
       return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array);
 

@@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get(
 
     case Type::STRING:
       return "Utf8";
+    case Type::STRING_VIEW:
+      return "StringView";
     case Type::LARGE_STRING:
       return "LargeUtf8";
 
@@ -165,6 +167,9 @@ std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean()
 // [[arrow::export]]
 std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> StringView__initialize() { return arrow::utf8_view(); }  // factory for the utf8_view type
+
 // [[arrow::export]]
 std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); }
 

@@ -910,6 +910,49 @@ class RPrimitiveConverter<T, enable_if_string_like<T>>
   }
 };
 
+template <typename T>  // specialization chosen when enable_if_string_view<T> matches
+class RPrimitiveConverter<T, enable_if_string_view<T>>
+    : public PrimitiveConverter<T, RConverter> {
+ public:
+  Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+    RVectorType rtype = GetVectorType(x);
+    if (rtype != STRING) {  // only vectors classified as STRING are accepted
+      return Status::Invalid("Expecting a character vector");
+    }
+    return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset);
+  }
+  // Queues a task that calls Extend(values, size); offset keeps its default of 0.
+  void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+    auto task = [this, values, size]() { return this->Extend(values, size); };
+    tasks.Append(false, std::move(task));
+  }
+
+ private:
+  Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) {
+    RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset));  // slots for [offset, size)
+    const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset;
+    // Pass 1: sum the lengths of elements [offset, size); NA_STRING counts as 0.
+    int64_t total_length = 0;
+    for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+      SEXP si = *p_strings;
+      total_length += si == NA_STRING ? 0 : LENGTH(si);
+    }
+    RETURN_NOT_OK(this->primitive_builder_->ReserveData(total_length));
+    // Pass 2: rewind to `offset` and append; relies on the two reservations made above.
+    p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset;
+    for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+      SEXP si = *p_strings;
+      if (si == NA_STRING) {  // NA_STRING is appended as a null
+        this->primitive_builder_->UnsafeAppendNull();
+      } else {
+        this->primitive_builder_->UnsafeAppend(CHAR(si), LENGTH(si));
+      }
+    }
+
+    return Status::OK();
+  }
+};
+
 template <typename T>
 class RPrimitiveConverter<T, enable_if_t<is_duration_type<T>::value>>
     : public PrimitiveConverter<T, RConverter> {
@@ -1062,7 +1105,13 @@ struct RConverterTrait<
 };
 
 template <typename T>
-struct RConverterTrait<T, enable_if_binary_view_like<T>> {
+struct RConverterTrait<T, enable_if_string_view<T>> {
+  using type = RPrimitiveConverter<T>;  // string-view types map to RPrimitiveConverter
+};
+
+template <typename T>
+struct RConverterTrait<T, enable_if_t<is_binary_view_like_type<T>::value &&
+                                      !is_string_view_type<T>::value>> {  // other view types
   // not implemented
 };
 

@@ -203,6 +203,14 @@ test_that("Array supports character vectors (ARROW-3339)", {
   # with NA
   expect_array_roundtrip(c("itsy", NA, "spider"), utf8())
   expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8())
+
+  # string_view
+  expect_array_roundtrip(c("itsy", "bitsy", "spider"), string_view(), as = string_view())
+  expect_array_roundtrip(c("itsy", NA, "spider"), string_view(), as = string_view())
+
+  # string_view with empty strings
+  expect_array_roundtrip(c("", "bitsy", ""), string_view(), as = string_view())
+  expect_array_roundtrip(c("", NA, ""), string_view(), as = string_view())
 })
 
 test_that("Character vectors > 2GB become large_utf8", {