apache · thisisnic · Apr 11, 2026 · Apr 11, 2026 · May 5, 2026 · May 5, 2026
@@ -241,6 +241,7 @@ struct MakeConverterImpl {
       DICTIONARY_CASE(LargeBinaryType);
       DICTIONARY_CASE(StringType);
       DICTIONARY_CASE(LargeStringType);
+      DICTIONARY_CASE(StringViewType);
       DICTIONARY_CASE(FixedSizeBinaryType);
 #undef DICTIONARY_CASE
       default:

@@ -826,7 +826,8 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
     } else {
       ARROW_RETURN_NOT_OK(
           PyValue::Convert(this->value_type_, this->options_, value, view_));
-      return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
+      return this->value_builder_->Append(
+          std::string_view(view_.bytes, static_cast<size_t>(view_.size)));
     }
   }
 

@@ -397,6 +397,7 @@ export(set_io_thread_count)
 export(show_exec_plan)
 export(starts_with)
 export(string)
+export(string_view)
 export(struct)
 export(time32)
 export(time64)

@@ -203,6 +203,13 @@ Utf8 <- R6Class(
     code = function(namespace = FALSE) call2("utf8", .ns = if (namespace) "arrow")
   )
 )
+StringView <- R6Class(
+  "StringView",
+  inherit = DataType,
+  public = list(
+    code = function(namespace = FALSE) call2("string_view", .ns = if (namespace) "arrow")
+  )
+)
 LargeUtf8 <- R6Class(
   "LargeUtf8",
   inherit = DataType,
@@ -505,6 +512,10 @@ bool <- boolean
 #' @export
 utf8 <- function() Utf8__initialize()
 
+#' @rdname data-type
+#' @export
+string_view <- function() StringView__initialize()
+
 #' @rdname data-type
 #' @export
 large_utf8 <- function() LargeUtf8__initialize()
@@ -806,6 +817,8 @@ canonical_type_str <- function(type_str) {
     boolean = "bool",
     bool = "bool",
     utf8 = "string",
+    utf8_view = "string_view",
+    string_view = "string_view",
     large_utf8 = "large_string",
     large_string = "large_string",
     binary = "binary",

@@ -290,26 +290,32 @@ struct Converter_String : public Converter {
 
   Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
                            R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
-    auto p_offset = array->data()->GetValues<int32_t>(1);
-    if (!p_offset) {
-      return Status::Invalid("Invalid offset buffer");
-    }
-    auto p_strings = array->data()->GetValues<char>(2, *p_offset);
-    if (!p_strings) {
-      // There is an offset buffer, but the data buffer is null
-      // There is at least one value in the array and not all the values are null
-      // That means all values are either empty strings or nulls so there is nothing to do
-
-      if (array->null_count()) {
-        arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
-                                                  array->offset(), n);
-        for (int i = 0; i < n; i++, null_reader.Next()) {
-          if (null_reader.IsNotSet()) {
-            SET_STRING_ELT(data, start + i, NA_STRING);
+    // StringViewArray uses a different memory layout (views + data buffers) rather
+    // than offsets, so skip the offset-based fast path and fall through to the
+    // GetView()-based element loop below.
+    if (array->type_id() != Type::STRING_VIEW) {
+      auto p_offset = array->data()->GetValues<int32_t>(1);
+      if (!p_offset) {
+        return Status::Invalid("Invalid offset buffer");
+      }
+      auto p_strings = array->data()->GetValues<char>(2, *p_offset);
+      if (!p_strings) {
+        // There is an offset buffer, but the data buffer is null
+        // There is at least one value in the array and not all the values are null
+        // That means all values are either empty strings or nulls so there is nothing to
+        // do
+
+        if (array->null_count()) {
+          arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
+                                                    array->offset(), n);
+          for (int i = 0; i < n; i++, null_reader.Next()) {
+            if (null_reader.IsNotSet()) {
+              SET_STRING_ELT(data, start + i, NA_STRING);
+            }
           }
         }
+        return Status::OK();
       }
-      return Status::OK();
     }
 
     StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
@@ -726,7 +732,8 @@ class Converter_Dictionary : public Converter {
     // Alternative: preserve the logical type of the dictionary values
     // (e.g. if dict is timestamp, return a POSIXt R vector, not factor)
     if (dictionary_->type_id() != Type::STRING &&
-        dictionary_->type_id() != Type::LARGE_STRING) {
+        dictionary_->type_id() != Type::LARGE_STRING &&
+        dictionary_->type_id() != Type::STRING_VIEW) {
       cpp11::safe[Rf_warning]("Coercing dictionary values to R character factor levels");
     }
 
@@ -1263,6 +1270,10 @@ std::shared_ptr<Converter> Converter::Make(
       return std::make_shared<arrow::r::Converter_String<arrow::LargeStringArray>>(
           chunked_array);
 
+    case Type::STRING_VIEW:
+      return std::make_shared<arrow::r::Converter_String<arrow::StringViewArray>>(
+          chunked_array);
+
     case Type::DICTIONARY:
       return std::make_shared<arrow::r::Converter_Dictionary>(chunked_array);
 

@@ -57,6 +57,8 @@ const char* r6_class_name<arrow::DataType>::get(
 
     case Type::STRING:
       return "Utf8";
+    case Type::STRING_VIEW:
+      return "StringView";
     case Type::LARGE_STRING:
       return "LargeUtf8";
 
@@ -165,6 +167,9 @@ std::shared_ptr<arrow::DataType> Boolean__initialize() { return arrow::boolean()
 // [[arrow::export]]
 std::shared_ptr<arrow::DataType> Utf8__initialize() { return arrow::utf8(); }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> StringView__initialize() { return arrow::utf8_view(); }
+
 // [[arrow::export]]
 std::shared_ptr<arrow::DataType> LargeUtf8__initialize() { return arrow::large_utf8(); }
 

@@ -910,6 +910,43 @@ class RPrimitiveConverter<T, enable_if_string_like<T>>
   }
 };
 
+template <typename T>
+class RPrimitiveConverter<T, enable_if_string_view<T>>
+    : public PrimitiveConverter<T, RConverter> {
+ public:
+  Status Extend(SEXP x, int64_t size, int64_t offset = 0) override {
+    RVectorType rtype = GetVectorType(x);
+    if (rtype != STRING) {
+      return Status::Invalid("Expecting a character vector");
+    }
+    return UnsafeAppendUtf8Strings(arrow::r::utf8_strings(x), size, offset);
+  }
+
+  void DelayedExtend(SEXP values, int64_t size, RTasks& tasks) override {
+    auto task = [this, values, size]() { return this->Extend(values, size); };
+    tasks.Append(false, std::move(task));
+  }
+
+ private:
+  Status UnsafeAppendUtf8Strings(const cpp11::strings& s, int64_t size, int64_t offset) {
+    RETURN_NOT_OK(this->primitive_builder_->Reserve(size - offset));
+
+    // Use safe Append (not UnsafeAppend) because StringView's heap builder
+    // has a 2GB-per-block limit that prevents bulk data pre-reservation.
+    const SEXP* p_strings = reinterpret_cast<const SEXP*>(DATAPTR_RO(s)) + offset;
+    for (R_xlen_t i = offset; i < size; i++, ++p_strings) {
+      SEXP si = *p_strings;
+      if (si == NA_STRING) {
+        this->primitive_builder_->UnsafeAppendNull();
+      } else {
+        RETURN_NOT_OK(this->primitive_builder_->Append(CHAR(si), LENGTH(si)));
+      }
+    }
+
+    return Status::OK();
+  }
+};
+
 template <typename T>
 class RPrimitiveConverter<T, enable_if_t<is_duration_type<T>::value>>
     : public PrimitiveConverter<T, RConverter> {
@@ -1062,7 +1099,13 @@ struct RConverterTrait<
 };
 
 template <typename T>
-struct RConverterTrait<T, enable_if_binary_view_like<T>> {
+struct RConverterTrait<T, enable_if_string_view<T>> {
+  using type = RPrimitiveConverter<T>;
+};
+
+template <typename T>
+struct RConverterTrait<T, enable_if_t<is_binary_view_like_type<T>::value &&
+                                      !is_string_view_type<T>::value>> {
   // not implemented
 };
 

@@ -203,6 +203,8 @@ test_that("Array supports character vectors (ARROW-3339)", {
   # with NA
   expect_array_roundtrip(c("itsy", NA, "spider"), utf8())
   expect_array_roundtrip(c("itsy", NA, "spider"), large_utf8(), as = large_utf8())
+
+  expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as = string_view())
-  expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as = string_view())
+  expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as = string_view())
+  expect_array_roundtrip(c("itsy", NA, "", "a long non-inlined string", "another long string"), string_view(), as = string_view())
-  expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as = string_view())
+  expect_array_roundtrip(c("itsy", NA, "", "spider"), string_view(), as = string_view())
+  expect_array_roundtrip(c("itsy", NA, "", "a long non-inlined string", "another long string"), string_view(), as = string_view())
 })
 
 test_that("Character vectors > 2GB become large_utf8", {

@@ -371,6 +371,28 @@ test_that("Can create table with specific dictionary types", {
   expect_equal_data_frame(tab_large, fact)
 })
 
+test_that("Table converts dictionary arrays with wider index types back to R", {
+  fact <- example_data[, "fct"]
+  for (i in list(uint32(), int64(), uint64())) {
+    sch <- schema(fct = dictionary(i, utf8()))
+    tab <- Table$create(fact, schema = sch)
+    expect_equal(tab$schema, sch)
+    expect_equal_data_frame(tab, fact)
+  }
+})
+
+test_that("Table converts dictionary arrays with string_view values", {
+  expected <- data.frame(foo = factor(c("x", NA, "x")))
+  tab <- Table$create(expected, schema = schema(foo = dictionary(uint32(), string_view())))
+  expect_equal_data_frame(tab, expected)
+})
+
+test_that("Table round-trips string_view columns", {
+  expected <- data.frame(x = c("hello", NA, ""))
+  tab <- Table$create(expected, schema = schema(x = string_view()))
+  expect_equal_data_frame(tab, expected)
+})
+
 test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", {
   b1 <- record_batch(f = factor(c("a"), levels = c("a", "b")))
   b2 <- record_batch(f = factor(c("c"), levels = c("c", "d")))