Skip to content

Commit 653065b

Browse files
committed
Encapsulate View manipulation
1 parent 520ad68 commit 653065b

File tree

5 files changed

+718
-87
lines changed

5 files changed

+718
-87
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 103 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -22,52 +22,36 @@ use crate::types::bytes::ByteArrayNativeType;
2222
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2323
use crate::{Array, ArrayAccessor, ArrayRef};
2424
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
25-
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
25+
use arrow_data::{ArrayData, ArrayDataBuilder, OffsetView, View};
2626
use arrow_schema::{ArrowError, DataType};
2727
use std::any::Any;
2828
use std::fmt::Debug;
2929
use std::marker::PhantomData;
3030
use std::sync::Arc;
3131

32-
/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33-
///
34-
/// Different than [`crate::GenericByteArray`] as it stores both an offset and length
35-
/// meaning that take / filter operations can be implemented without copying the underlying data.
36-
///
37-
/// See [`StringViewArray`] for storing utf8 encoded string data and
38-
/// [`BinaryViewArray`] for storing bytes.
39-
///
40-
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
32+
/// [Variable-size Binary View Layout]: An array of variable length byte strings.
4133
///
4234
/// A `GenericByteViewArray` stores variable length byte strings. An array of
43-
/// `N` elements is stored as `N` fixed length "views" and a variable number
35+
/// `N` elements is stored as `N` fixed length [`View`]s and some number
4436
/// of variable length "buffers".
4537
///
46-
/// Each view is a `u128` value layout is different depending on the
47-
/// length of the string stored at that location:
38+
/// There are no constraints on offsets other than they must point into a valid
39+
/// buffer. The offsets can be out of order, non-continuous and overlapping.
4840
///
49-
/// ```text
50-
/// ┌──────┬────────────────────────┐
51-
/// │length│ string value │
52-
/// Strings (len <= 12) │ │ (padded with 0) │
53-
/// └──────┴────────────────────────┘
54-
/// 0 31 127
55-
///
56-
/// ┌───────┬───────┬───────┬───────┐
57-
/// │length │prefix │ buf │offset │
58-
/// Strings (len > 12) │ │ │ index │ │
59-
/// └───────┴───────┴───────┴───────┘
60-
/// 0 31 63 95 127
61-
/// ```
41+
/// Because `GenericByteViewArray` stores both an offset and length for each
42+
/// byte string, certain operations such as `take` and `filter` can be
43+
/// implemented without copying the underlying data, unlike
44+
/// [`GenericByteArray`], which requires the variable length data to be
45+
/// contiguous.
6246
///
63-
/// * Strings with length <= 12 are stored directly in the view.
47+
/// # See Also:
48+
/// * [`StringViewArray`] for storing UTF-8 string data
49+
/// * [`BinaryViewArray`] for storing bytes
50+
/// * [`View`] for the format of the views and interpreting the `u128` views
6451
///
65-
/// * Strings with length > 12: The first four bytes are stored inline in the
66-
/// view and the entire string is stored in one of the buffers.
52+
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
6753
///
68-
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
69-
/// than they must point into a valid buffer. However, they can be out of order,
70-
/// non continuous and overlapping.
54+
/// # Example
7155
///
7256
/// For example, in the following diagram, the strings "FishWasInTownToday" and
7357
/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
@@ -93,6 +77,7 @@ use std::sync::Arc;
9377
/// └───┘
9478
/// ```
9579
/// [`GenericByteArray`]: crate::array::GenericByteArray
80+
/// [`View`]: arrow_data::View
9681
pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
9782
data_type: DataType,
9883
views: ScalarBuffer<u128>,
@@ -114,16 +99,26 @@ impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
11499
}
115100

116101
impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
117-
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
102+
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking
103+
/// on failure.
118104
///
119-
/// # Panics
105+
/// See [Self::try_new] for parameters
120106
///
107+
/// # Panics
121108
/// Panics if [`GenericByteViewArray::try_new`] returns an error
109+
///
110+
/// [`View`]: arrow_data::View
122111
pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
123112
Self::try_new(views, buffers, nulls).unwrap()
124113
}
125114

126-
/// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
115+
/// Create a new [`GenericByteViewArray`] from the provided parts, returning
116+
/// an error on failure
117+
///
118+
/// # Parameters
119+
/// * `views`: a [`ScalarBuffer`] of u128 views (see [`View`] for format)
120+
/// * `buffers`: a vector of [`Buffer`]s storing the string data
121+
/// * `nulls`: an optional [`NullBuffer`] for null values
127122
///
128123
/// # Errors
129124
///
@@ -156,7 +151,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
156151
})
157152
}
158153

159-
/// Create a new [`GenericByteViewArray`] from the provided parts, without validation
154+
/// Create a new [`GenericByteViewArray`] from the provided parts, without
155+
/// validation
156+
///
157+
/// See [Self::try_new] for parameters
160158
///
161159
/// # Safety
162160
///
@@ -232,21 +230,76 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
232230
unsafe { self.value_unchecked(i) }
233231
}
234232

233+
/// Returns the view at index `i`
234+
pub fn view(&self, _i: usize) -> &View {
235+
// TODO
236+
// Need to do this so a reference to the view can be returned as bytes_unchecked
237+
todo!();
238+
}
239+
235240
/// Returns the element at index `i`
241+
///
236242
/// # Safety
237243
/// Caller is responsible for ensuring that the index is within the bounds of the array
238244
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
239245
let v = self.views.get_unchecked(idx);
240-
let len = *v as u32;
241-
let b = if len <= 12 {
242-
let ptr = self.views.as_ptr() as *const u8;
243-
std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize)
244-
} else {
245-
let view = ByteView::from(*v);
246-
let data = self.buffers.get_unchecked(view.buffer_index as usize);
247-
let offset = view.offset as usize;
248-
data.get_unchecked(offset..offset + len as usize)
249-
};
246+
match View::from(v) {
247+
View::Inline(inline_view) => {
248+
let bytes = inline_view.get_bytes_unchecked(v);
249+
T::Native::from_bytes_unchecked(bytes)
250+
}
251+
View::Offset(offset_view) => self.value_from_offset_view_unchecked(offset_view),
252+
}
253+
}
254+
255+
/// Return the value of element from this [`OffsetView`]
256+
///
257+
/// # Errors
258+
/// * the buffer index is out of bounds
259+
/// * offset / length is out of bounds of the buffer
260+
/// * The data is not valid for `T::Native` (e.g. not Utf8)
261+
pub fn value_from_offset_view<'a>(
262+
&'a self,
263+
offset_view: OffsetView<'_>,
264+
) -> Result<&'a T::Native, ArrowError> {
265+
let data = self
266+
.buffers
267+
.get(offset_view.buffer_index() as usize)
268+
.ok_or_else(|| {
269+
ArrowError::InvalidArgumentError(format!(
270+
"Invalid ByteView. Requested buffer {} but only has {} buffers",
271+
offset_view.buffer_index(),
272+
self.buffers.len()
273+
))
274+
})?;
275+
276+
let b = data.get(offset_view.range()).ok_or_else(|| {
277+
ArrowError::InvalidArgumentError(format!(
278+
"Invalid ByteView. Requested range {:?} but buffer {} only has {} bytes",
279+
offset_view.range(),
280+
offset_view.buffer_index(),
281+
data.len()
282+
))
283+
})?;
284+
285+
T::Native::try_from_bytes(b)
286+
}
287+
288+
/// Return the value from the [`OffsetView`]
289+
///
290+
/// # Safety
291+
/// The caller is responsible for ensuring:
292+
/// * the buffer index is within bounds
293+
/// * offset / length is within bounds of the buffer
294+
/// * The data is valid for `T::Native` (e.g. Utf8 for Strings)
295+
pub unsafe fn value_from_offset_view_unchecked<'a>(
296+
&'a self,
297+
offset_view: OffsetView<'_>,
298+
) -> &'a T::Native {
299+
let data = self
300+
.buffers
301+
.get_unchecked(offset_view.buffer_index() as usize);
302+
let b = data.get_unchecked(offset_view.range());
250303
T::Native::from_bytes_unchecked(b)
251304
}
252305

@@ -620,8 +673,8 @@ mod tests {
620673
view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
621674
view_buffer[4..].copy_from_slice(&data);
622675

623-
let view = ByteView::from(u128::from_le_bytes(view_buffer));
624-
let views = ScalarBuffer::from(vec![view.into()]);
676+
let view = u128::from_le_bytes(view_buffer);
677+
let views = ScalarBuffer::from(vec![view]);
625678
let buffers = vec![];
626679
StringViewArray::new(views, buffers, None);
627680
}
@@ -639,8 +692,8 @@ mod tests {
639692
view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
640693
view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
641694
view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
642-
let view = ByteView::from(u128::from_le_bytes(view_buffer));
643-
let views = ScalarBuffer::from(vec![view.into()]);
695+
let view = u128::from_le_bytes(view_buffer);
696+
let views = ScalarBuffer::from(vec![view]);
644697
let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
645698

646699
StringViewArray::new(views, buffers, None);

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::builder::ArrayBuilder;
1919
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2020
use crate::{ArrayRef, GenericByteViewArray};
2121
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
22-
use arrow_data::ByteView;
22+
use arrow_data::{OffsetViewBuilder, OwnedView};
2323

2424
use std::any::Any;
2525
use std::marker::PhantomData;
@@ -72,35 +72,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
7272
#[inline]
7373
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
7474
let v: &[u8] = value.as_ref().as_ref();
75-
let length: u32 = v.len().try_into().unwrap();
76-
if length <= 12 {
77-
let mut view_buffer = [0; 16];
78-
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
79-
view_buffer[4..4 + v.len()].copy_from_slice(v);
80-
self.views_builder.append(u128::from_le_bytes(view_buffer));
81-
self.null_buffer_builder.append_non_null();
82-
return;
83-
}
8475

85-
let required_cap = self.in_progress.len() + v.len();
86-
if self.in_progress.capacity() < required_cap {
87-
let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize));
88-
let flushed = std::mem::replace(&mut self.in_progress, in_progress);
89-
if !flushed.is_empty() {
90-
assert!(self.completed.len() < u32::MAX as usize);
91-
self.completed.push(flushed.into());
76+
let view: u128 = match OwnedView::from(v) {
77+
OwnedView::Inline(view) => view,
78+
OwnedView::Offset(view) => {
79+
let required_cap = self.in_progress.len() + v.len();
80+
if self.in_progress.capacity() < required_cap {
81+
let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize));
82+
let flushed = std::mem::replace(&mut self.in_progress, in_progress);
83+
if !flushed.is_empty() {
84+
assert!(self.completed.len() < u32::MAX as usize);
85+
self.completed.push(flushed.into());
86+
}
87+
};
88+
let builder = OffsetViewBuilder::from(view)
89+
.with_offset(self.in_progress.len() as u32)
90+
.with_buffer_index(self.completed.len() as u32);
91+
// copy the actual data into the in_progress buffer
92+
self.in_progress.extend_from_slice(v);
93+
builder.into()
9294
}
9395
};
94-
let offset = self.in_progress.len() as u32;
95-
self.in_progress.extend_from_slice(v);
96-
97-
let view = ByteView {
98-
length,
99-
prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
100-
buffer_index: self.completed.len() as u32,
101-
offset,
102-
};
103-
self.views_builder.append(view.into());
96+
self.views_builder.append(view);
10497
self.null_buffer_builder.append_non_null();
10598
}
10699

arrow-array/src/types.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,24 +1425,45 @@ pub(crate) mod bytes {
14251425
impl<O: OffsetSizeTrait> ByteArrayTypeSealed for GenericBinaryType<O> {}
14261426

14271427
pub trait ByteArrayNativeType: std::fmt::Debug + Send + Sync {
1428+
/// Convert bytes to this native type
1429+
///
14281430
/// # Safety
14291431
///
14301432
/// `b` must be a valid byte sequence for `Self`
14311433
unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self;
1434+
1435+
/// Convert bytes to this native type
1436+
///
1437+
/// # Errors
1438+
///
1439+
/// `b` is not a valid byte sequence for `Self` (e.g. not UTF8)
1440+
fn try_from_bytes(b: &[u8]) -> Result<&Self, ArrowError>;
14321441
}
14331442

14341443
impl ByteArrayNativeType for [u8] {
14351444
#[inline]
14361445
unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self {
14371446
b
14381447
}
1448+
1449+
#[inline]
1450+
fn try_from_bytes(b: &[u8]) -> Result<&Self, ArrowError> {
1451+
Ok(b)
1452+
}
14391453
}
14401454

14411455
impl ByteArrayNativeType for str {
14421456
#[inline]
14431457
unsafe fn from_bytes_unchecked(b: &[u8]) -> &Self {
14441458
std::str::from_utf8_unchecked(b)
14451459
}
1460+
1461+
#[inline]
1462+
fn try_from_bytes(b: &[u8]) -> Result<&Self, ArrowError> {
1463+
std::str::from_utf8(b).map_err(|e| {
1464+
ArrowError::InvalidArgumentError(format!("Encountered non UTF-8 data: {e}"))
1465+
})
1466+
}
14461467
}
14471468
}
14481469

0 commit comments

Comments
 (0)