diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index bed0bdf889b2..b53bbe7abc8c 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -55,7 +55,9 @@ impl OffsetSizeTrait for i64 { } /// An array of [variable length lists], similar to JSON arrays -/// (e.g. `["A", "B", "C"]`). +/// (e.g. `["A", "B", "C"]`). This struct specifically represents +/// the [list layout]. Refer to [`GenericListViewArray`] for the +/// [list-view layout]. /// /// Lists are represented using `offsets` into a `values` child /// array. Offsets are stored in two adjacent entries of an @@ -123,7 +125,10 @@ impl OffsetSizeTrait for i64 { /// ``` /// /// [`StringArray`]: crate::array::StringArray +/// [`GenericListViewArray`]: crate::array::GenericListViewArray /// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout +/// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout +/// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout pub struct GenericListArray { data_type: DataType, nulls: Option, diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs index 7e52a6f3e457..195ac7e11611 100644 --- a/arrow-array/src/array/list_view_array.rs +++ b/arrow-array/src/array/list_view_array.rs @@ -32,16 +32,81 @@ pub type ListViewArray = GenericListViewArray; /// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. pub type LargeListViewArray = GenericListViewArray; +/// An array of [variable length lists], specifically in the [list-view layout]. /// -/// Different from [`crate::GenericListArray`] as it stores both an offset and length -/// meaning that take / filter operations can be implemented without copying the underlying data. 
+/// Differs from [`GenericListArray`] (which represents the [list layout]) in that +/// the sizes of the child arrays are explicitly encoded in a separate buffer, instead +/// of being derived from the difference between subsequent offsets in the offset buffer. /// -/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +/// This allows the offsets (and subsequently child data) to be out of order. It also +/// allows take / filter operations to be implemented without copying the underlying data. +/// +/// # Representation +/// +/// Given the same example array from [`GenericListArray`], it would be represented +/// as such via a list-view layout array: +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// ┌ ─ ─ ─ ─ ─ ─ ┐ │ +/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ +/// │ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ 3 │ │ │ 1 │ │ A │ │ 0 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [] │ │ (3,0) │ │ 1 │ │ 3 │ │ 0 │ │ │ 1 │ │ B │ │ 1 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ NULL │ │ (?,?) │ │ 0 │ │ ? │ │ ? │ │ │ 1 │ │ C │ │ 2 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [D] │ │ (4,1) │ │ 1 │ │ 4 │ │ 1 │ │ │ ? │ │ ? │ │ 3 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [NULL, F] │ │ (5,2) │ │ 1 │ │ 5 │ │ 2 │ │ │ 1 │ │ D │ │ 4 │ +/// └─────────────┘ └───────┘ │ └───┘ └───┘ └───┘ ├───┤ ├───┤ +/// │ │ 0 │ │ ? 
│ │ 5 │ +/// Logical Logical │ Validity Offsets Sizes ├───┤ ├───┤ +/// Values Offset (nulls) │ │ 1 │ │ F │ │ 6 │ +/// & Size │ └───┘ └───┘ +/// │ Values │ │ +/// (offsets[i], │ ListViewArray (Array) +/// sizes[i]) └ ─ ─ ─ ─ ─ ─ ┘ │ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// ``` +/// +/// Another way of representing the same array but taking advantage of the offsets being out of order: +/// +/// ```text +/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// ┌ ─ ─ ─ ─ ─ ─ ┐ │ +/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ +/// │ [A,B,C] │ │ (2,3) │ │ 1 │ │ 2 │ │ 3 │ │ │ 0 │ │ ? │ │ 0 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [] │ │ (0,0) │ │ 1 │ │ 0 │ │ 0 │ │ │ 1 │ │ F │ │ 1 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ NULL │ │ (?,?) │ │ 0 │ │ ? │ │ ? │ │ │ 1 │ │ A │ │ 2 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [D] │ │ (5,1) │ │ 1 │ │ 5 │ │ 1 │ │ │ 1 │ │ B │ │ 3 │ +/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤ ├───┤ +/// │ [NULL, F] │ │ (0,2) │ │ 1 │ │ 0 │ │ 2 │ │ │ 1 │ │ C │ │ 4 │ +/// └─────────────┘ └───────┘ │ └───┘ └───┘ └───┘ ├───┤ ├───┤ +/// │ │ 1 │ │ D │ │ 5 │ +/// Logical Logical │ Validity Offsets Sizes └───┘ └───┘ +/// Values Offset (nulls) │ Values │ │ +/// & Size │ (Array) +/// └ ─ ─ ─ ─ ─ ─ ┘ │ +/// (offsets[i], │ ListViewArray +/// sizes[i]) │ +/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +/// ``` +/// +/// [`GenericListArray`]: crate::array::GenericListArray +/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout +/// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout +/// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout #[derive(Clone)] pub struct GenericListViewArray { data_type: DataType, nulls: Option, values: ArrayRef, + // Unlike GenericListArray, we do not use OffsetBuffer here as 
offsets are not + // guaranteed to be monotonically increasing. value_offsets: ScalarBuffer, value_sizes: ScalarBuffer, } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 8e77f0180246..5c9073c4eeb6 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -245,7 +245,7 @@ pub enum DataType { /// /// # Recommendation /// - /// Users should prefer [`DataType::Date32`] to cleanly represent the number + /// Users should prefer [`Date32`] to cleanly represent the number /// of days, or one of the Timestamp variants to include time as part of the /// representation, depending on their use case. /// @@ -253,6 +253,7 @@ pub enum DataType { /// /// For more details, see [#5288](https://github.com/apache/arrow-rs/issues/5288). /// + /// [`Date32`]: Self::Date32 /// [Schema.fbs]: https://github.com/apache/arrow/blob/main/format/Schema.fbs Date64, /// A signed 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. @@ -282,10 +283,12 @@ pub enum DataType { LargeBinary, /// Opaque binary data of variable length. /// - /// Logically the same as [`Self::Binary`], but the internal representation uses a view + /// Logically the same as [`Binary`], but the internal representation uses a view /// struct that contains the string length and either the string's entire data /// inline (for small strings) or an inlined prefix, an index of another buffer, /// and an offset pointing to a slice in that buffer (for non-small strings). + /// + /// [`Binary`]: Self::Binary BinaryView, /// A variable-length string in Unicode with UTF-8 encoding. 
/// @@ -299,10 +302,12 @@ pub enum DataType { LargeUtf8, /// A variable-length string in Unicode with UTF-8 encoding /// - /// Logically the same as [`Self::Utf8`], but the internal representation uses a view + /// Logically the same as [`Utf8`], but the internal representation uses a view /// struct that contains the string length and either the string's entire data /// inline (for small strings) or an inlined prefix, an index of another buffer, /// and an offset pointing to a slice in that buffer (for non-small strings). + /// + /// [`Utf8`]: Self::Utf8 Utf8View, /// A list of some logical data type with variable length. /// @@ -311,11 +316,12 @@ pub enum DataType { /// (NOT YET FULLY SUPPORTED) A list of some logical data type with variable length. /// + /// Logically the same as [`List`], but the internal representation differs in how child + /// data is referenced, allowing flexibility in how data is laid out. + /// /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. /// - /// The ListView layout is defined by three buffers: - /// a validity bitmap, an offsets buffer, and an additional sizes buffer. - /// Sizes and offsets are both 32 bits for this type + /// [`List`]: Self::List ListView(FieldRef), /// A list of some logical data type with fixed length. FixedSizeList(FieldRef, i32), @@ -326,11 +332,12 @@ pub enum DataType { /// (NOT YET FULLY SUPPORTED) A list of some logical data type with variable length and 64-bit offsets. /// + /// Logically the same as [`LargeList`], but the internal representation differs in how child + /// data is referenced, allowing flexibility in how data is laid out. + /// /// Note this data type is not yet fully supported. Using it with arrow APIs may result in `panic`s. /// - /// The LargeListView layout is defined by three buffers: - /// a validity bitmap, an offsets buffer, and an additional sizes buffer. 
- /// Sizes and offsets are both 64 bits for this type + /// [`LargeList`]: Self::LargeList LargeListView(FieldRef), /// A nested datatype that contains a number of sub-fields. Struct(Fields),