@@ -22,52 +22,36 @@ use crate::types::bytes::ByteArrayNativeType;
22
22
use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
23
23
use crate :: { Array , ArrayAccessor , ArrayRef } ;
24
24
use arrow_buffer:: { Buffer , NullBuffer , ScalarBuffer } ;
25
- use arrow_data:: { ArrayData , ArrayDataBuilder , ByteView } ;
25
+ use arrow_data:: { ArrayData , ArrayDataBuilder , OffsetView , View } ;
26
26
use arrow_schema:: { ArrowError , DataType } ;
27
27
use std:: any:: Any ;
28
28
use std:: fmt:: Debug ;
29
29
use std:: marker:: PhantomData ;
30
30
use std:: sync:: Arc ;
31
31
32
- /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33
- ///
34
- /// Different than [`crate::GenericByteArray`] as it stores both an offset and length
35
- /// meaning that take / filter operations can be implemented without copying the underlying data.
36
- ///
37
- /// See [`StringViewArray`] for storing utf8 encoded string data and
38
- /// [`BinaryViewArray`] for storing bytes.
39
- ///
40
- /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
32
+ /// [Variable-size Binary View Layout]: An array of variable length byte strings.
41
33
///
42
34
/// A `GenericByteViewArray` stores variable length byte strings. An array of
43
- /// `N` elements is stored as `N` fixed length "views" and a variable number
35
+ /// `N` elements is stored as `N` fixed length [`View`]s and some number
44
36
/// of variable length "buffers".
45
37
///
46
- /// Each view is a `u128` value layout is different depending on the
47
- /// length of the string stored at that location:
38
+ /// There are no constraints on offsets other than they must point into a valid
39
+ /// buffer. The offsets can be out of order, non-contiguous and overlapping.
48
40
///
49
- /// ```text
50
- /// ┌──────┬────────────────────────┐
51
- /// │length│ string value │
52
- /// Strings (len <= 12) │ │ (padded with 0) │
53
- /// └──────┴────────────────────────┘
54
- /// 0 31 127
55
- ///
56
- /// ┌───────┬───────┬───────┬───────┐
57
- /// │length │prefix │ buf │offset │
58
- /// Strings (len > 12) │ │ │ index │ │
59
- /// └───────┴───────┴───────┴───────┘
60
- /// 0 31 63 95 127
61
- /// ```
41
+ /// Because `GenericByteViewArray` stores both an offset and length for each
42
+ /// byte string, certain operations such as `take` and `filter` can be
43
+ /// implemented without copying the underlying data, unlike
44
+ /// [`GenericByteArray`], which requires the variable length data to be
45
+ /// contiguous.
62
46
///
63
- /// * Strings with length <= 12 are stored directly in the view.
47
+ /// # See Also:
48
+ /// * [`StringViewArray`] for storing UTF-8 string data
49
+ /// * [`BinaryViewArray`] for storing bytes
50
+ /// * [`View`] for the format of the views and interpreting the `u128` views
64
51
///
65
- /// * Strings with length > 12: The first four bytes are stored inline in the
66
- /// view and the entire string is stored in one of the buffers.
52
+ /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
67
53
///
68
- /// Unlike [`GenericByteArray`], there are no constraints on the offsets other
69
- /// than they must point into a valid buffer. However, they can be out of order,
70
- /// non continuous and overlapping.
54
+ /// # Example
71
55
///
72
56
/// For example, in the following diagram, the strings "FishWasInTownToday" and
73
57
/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
@@ -93,6 +77,7 @@ use std::sync::Arc;
93
77
/// └───┘
94
78
/// ```
95
79
/// [`GenericByteArray`]: crate::array::GenericByteArray
80
+ /// [`View`]: arrow_data::View
96
81
pub struct GenericByteViewArray < T : ByteViewType + ?Sized > {
97
82
data_type : DataType ,
98
83
views : ScalarBuffer < u128 > ,
@@ -114,16 +99,26 @@ impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
114
99
}
115
100
116
101
impl < T : ByteViewType + ?Sized > GenericByteViewArray < T > {
117
- /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
102
+ /// Create a new [`GenericByteViewArray`] from the provided parts, panicking
103
+ /// on failure.
118
104
///
119
- /// # Panics
105
+ /// See [`Self::try_new`] for parameters
120
106
///
107
+ /// # Panics
121
108
/// Panics if [`GenericByteViewArray::try_new`] returns an error
109
+ ///
110
+ /// [`View`]: arrow_data::View
122
111
pub fn new ( views : ScalarBuffer < u128 > , buffers : Vec < Buffer > , nulls : Option < NullBuffer > ) -> Self {
123
112
Self :: try_new ( views, buffers, nulls) . unwrap ( )
124
113
}
125
114
126
- /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
115
+ /// Create a new [`GenericByteViewArray`] from the provided parts, returning
116
+ /// an error on failure
117
+ ///
118
+ /// # Parameters
119
+ /// * `views`: a [`ScalarBuffer`] of u128 views (see [`View`] for format)
120
+ /// * `buffers`: a vector of [`Buffer`]s storing the string data
121
+ /// * `nulls`: an optional [`NullBuffer`] for null values
127
122
///
128
123
/// # Errors
129
124
///
@@ -156,7 +151,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
156
151
} )
157
152
}
158
153
159
- /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
154
+ /// Create a new [`GenericByteViewArray`] from the provided parts, without
155
+ /// validation
156
+ ///
157
+ /// See [`Self::try_new`] for parameters
160
158
///
161
159
/// # Safety
162
160
///
@@ -232,21 +230,76 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
232
230
unsafe { self . value_unchecked ( i) }
233
231
}
234
232
233
+ /// Returns the view at index `i` (not yet implemented: currently panics via `todo!()`)
234
+ pub fn view ( & self , _i : usize ) -> & View {
235
+ // TODO
236
+ // Need to do this so a reference to the view can be returned as bytes_unchecked
237
+ todo ! ( ) ;
238
+ }
239
+
235
240
/// Returns the element at index `i`
241
+ ///
236
242
/// # Safety
237
243
/// Caller is responsible for ensuring that the index is within the bounds of the array
238
244
pub unsafe fn value_unchecked ( & self , idx : usize ) -> & T :: Native {
239
245
let v = self . views . get_unchecked ( idx) ;
240
- let len = * v as u32 ;
241
- let b = if len <= 12 {
242
- let ptr = self . views . as_ptr ( ) as * const u8 ;
243
- std:: slice:: from_raw_parts ( ptr. add ( idx * 16 + 4 ) , len as usize )
244
- } else {
245
- let view = ByteView :: from ( * v) ;
246
- let data = self . buffers . get_unchecked ( view. buffer_index as usize ) ;
247
- let offset = view. offset as usize ;
248
- data. get_unchecked ( offset..offset + len as usize )
249
- } ;
246
+ match View :: from ( v) {
247
+ View :: Inline ( inline_view) => {
248
+ let bytes = inline_view. get_bytes_unchecked ( v) ;
249
+ T :: Native :: from_bytes_unchecked ( bytes)
250
+ }
251
+ View :: Offset ( offset_view) => self . value_from_offset_view_unchecked ( offset_view) ,
252
+ }
253
+ }
254
+
255
+ /// Return the value of the element referenced by this [`OffsetView`]
256
+ ///
257
+ /// # Errors
258
+ /// * the buffer index is out of bounds
259
+ /// * offset / length is out of bounds of the buffer
260
+ /// * The data is not valid for `T::Native` (e.g. not Utf8)
261
+ pub fn value_from_offset_view < ' a > (
262
+ & ' a self ,
263
+ offset_view : OffsetView < ' _ > ,
264
+ ) -> Result < & ' a T :: Native , ArrowError > {
265
+ let data = self
266
+ . buffers
267
+ . get ( offset_view. buffer_index ( ) as usize )
268
+ . ok_or_else ( || {
269
+ ArrowError :: InvalidArgumentError ( format ! (
270
+ "Invalid ByteView. Requested buffer {} but only has {} buffers" ,
271
+ offset_view. buffer_index( ) ,
272
+ self . buffers. len( )
273
+ ) )
274
+ } ) ?;
275
+
276
+ let b = data. get ( offset_view. range ( ) ) . ok_or_else ( || {
277
+ ArrowError :: InvalidArgumentError ( format ! (
278
+ "Invalid ByteView. Requested range {:?} but buffer {} only has {} bytes" ,
279
+ offset_view. range( ) ,
280
+ offset_view. buffer_index( ) ,
281
+ data. len( )
282
+ ) )
283
+ } ) ?;
284
+
285
+ T :: Native :: try_from_bytes ( b)
286
+ }
287
+
288
+ /// Return the value from the [`OffsetView`]
289
+ ///
290
+ /// # Safety
291
+ /// The caller is responsible for ensuring:
292
+ /// * the buffer index is within bounds
293
+ /// * offset / length is within the bounds of the buffer
294
+ /// * The data is valid for `T::Native` (e.g. Utf8 for strings)
295
+ pub unsafe fn value_from_offset_view_unchecked < ' a > (
296
+ & ' a self ,
297
+ offset_view : OffsetView < ' _ > ,
298
+ ) -> & ' a T :: Native {
299
+ let data = self
300
+ . buffers
301
+ . get_unchecked ( offset_view. buffer_index ( ) as usize ) ;
302
+ let b = data. get_unchecked ( offset_view. range ( ) ) ;
250
303
T :: Native :: from_bytes_unchecked ( b)
251
304
}
252
305
@@ -620,8 +673,8 @@ mod tests {
620
673
view_buffer[ 0 ..4 ] . copy_from_slice ( & 1u32 . to_le_bytes ( ) ) ;
621
674
view_buffer[ 4 ..] . copy_from_slice ( & data) ;
622
675
623
- let view = ByteView :: from ( u128:: from_le_bytes ( view_buffer) ) ;
624
- let views = ScalarBuffer :: from ( vec ! [ view. into ( ) ] ) ;
676
+ let view = u128:: from_le_bytes ( view_buffer) ;
677
+ let views = ScalarBuffer :: from ( vec ! [ view] ) ;
625
678
let buffers = vec ! [ ] ;
626
679
StringViewArray :: new ( views, buffers, None ) ;
627
680
}
@@ -639,8 +692,8 @@ mod tests {
639
692
view_buffer[ 4 ..8 ] . copy_from_slice ( & input_str_1. as_bytes ( ) [ 0 ..4 ] ) ;
640
693
view_buffer[ 8 ..12 ] . copy_from_slice ( & 0u32 . to_le_bytes ( ) ) ;
641
694
view_buffer[ 12 ..] . copy_from_slice ( & 0u32 . to_le_bytes ( ) ) ;
642
- let view = ByteView :: from ( u128:: from_le_bytes ( view_buffer) ) ;
643
- let views = ScalarBuffer :: from ( vec ! [ view. into ( ) ] ) ;
695
+ let view = u128:: from_le_bytes ( view_buffer) ;
696
+ let views = ScalarBuffer :: from ( vec ! [ view] ) ;
644
697
let buffers = vec ! [ Buffer :: from_slice_ref( input_str_2. as_bytes( ) ) ] ;
645
698
646
699
StringViewArray :: new ( views, buffers, None ) ;
0 commit comments