diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 8bbe175dafb8..2f670a64e108 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -286,7 +286,10 @@ impl ArrowReaderBuilder { pub struct ArrowReaderOptions { /// Should the reader strip any user defined metadata from the Arrow schema skip_arrow_metadata: bool, - /// If provided used as the schema for the file, otherwise the schema is read from the file + /// If provided used as the schema hint when determining the Arrow schema, + /// otherwise the schema hint is read from the [`ARROW_SCHEMA_META_KEY`] + /// + /// [`ARROW_SCHEMA_META_KEY`]: crate::arrow::ARROW_SCHEMA_META_KEY supplied_schema: Option<SchemaRef>, /// If true, attempt to read `OffsetIndex` and `ColumnIndex` pub(crate) page_index: bool, @@ -314,17 +317,27 @@ impl ArrowReaderOptions { } } - /// Provide a schema to use when reading the parquet file. If provided it - /// takes precedence over the schema inferred from the file or the schema defined - /// in the file's metadata. If the schema is not compatible with the file's - /// schema an error will be returned when constructing the builder. + /// Provide a schema hint to use when reading the Parquet file. + /// + /// If provided, this schema takes precedence over any arrow schema embedded + /// in the metadata (see the [`arrow`] documentation for more details). + /// + /// If the provided schema is not compatible with the data stored in the + /// parquet file schema, an error will be returned when constructing the + /// builder. /// - /// This option is only required if you want to cast columns to a different type. - /// For example, if you wanted to cast from an Int64 in the Parquet file to a Timestamp - /// in the Arrow schema. + /// This option is only required if you want to explicitly control the + /// conversion of Parquet types to Arrow types, such as casting a column to + /// a different type. 
For example, if you wanted to read an Int64 in + /// a Parquet file to a [`TimestampMicrosecondArray`] in the Arrow schema. + /// + /// [`arrow`]: crate::arrow + /// [`TimestampMicrosecondArray`]: arrow_array::TimestampMicrosecondArray + /// + /// # Notes /// - /// The supplied schema must have the same number of columns as the parquet schema and - /// the column names need to be the same. + /// The provided schema must have the same number of columns as the parquet schema and + /// the column names must be the same. /// /// # Example /// ``` diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index b89c6ddcf8da..76f8ef1bf068 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -15,13 +15,48 @@ // specific language governing permissions and limitations // under the License. -//! API for reading/writing -//! Arrow [RecordBatch](arrow_array::RecordBatch)es and -//! [Array](arrow_array::Array)s to/from Parquet Files. +//! API for reading/writing Arrow [`RecordBatch`]es and [`Array`]s to/from +//! Parquet Files. //! -//! See the [crate-level documentation](crate) for more details. +//! See the [crate-level documentation](crate) for more details on other APIs //! -//! # Example of writing Arrow record batch to Parquet file +//! # Schema Conversion +//! +//! These APIs ensure that data in Arrow [`RecordBatch`]es written to Parquet are +//! read back as [`RecordBatch`]es with the exact same types and values. +//! +//! Parquet and Arrow have different type systems, and there is not +//! always a one to one mapping between the systems. For example, data +//! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow +//! [`BinaryViewArray`] or [`BinaryArray`]. +//! +//! To recover the original Arrow types, the writers in this module add a "hint" to +//! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow +//! schema. The metadata hint follows the same convention as arrow-cpp based +//! 
implementations such as `pyarrow`. The reader looks for the schema hint in the +//! metadata to determine Arrow types, and if it is not present, infers the Arrow schema +//! from the Parquet schema. +//! +//! In situations where the embedded Arrow schema is not compatible with the Parquet +//! schema, the Parquet schema takes precedence and no error is raised. +//! See [#1663](https://github.com/apache/arrow-rs/issues/1663) +//! +//! You can also control the type conversion process in more detail using: +//! +//! * [`ArrowSchemaConverter`] to control the conversion of Arrow types to Parquet +//! types. +//! +//! * [`ArrowReaderOptions::with_schema`] to explicitly specify your own Arrow schema hint +//! to use when reading Parquet, overriding any metadata that may be present. +//! +//! [`RecordBatch`]: arrow_array::RecordBatch +//! [`Array`]: arrow_array::Array +//! [`BYTE_ARRAY`]: crate::basic::Type::BYTE_ARRAY +//! [`BinaryViewArray`]: arrow_array::BinaryViewArray +//! [`BinaryArray`]: arrow_array::BinaryArray +//! [`ArrowReaderOptions::with_schema`]: arrow_reader::ArrowReaderOptions::with_schema +//! +//! # Example: Writing Arrow `RecordBatch` to Parquet file //! //!```rust //! # use arrow_array::{Int32Array, ArrayRef}; @@ -53,7 +88,7 @@ //! writer.close().unwrap(); //! ``` //! -//! # Example of reading parquet file into arrow record batch +//! # Example: Reading Parquet file into Arrow `RecordBatch` //! //! ```rust //! # use std::fs::File; @@ -93,11 +128,10 @@ //! println!("Read {} records.", record_batch.num_rows()); //! ``` //! -//! # Example of reading non-uniformly encrypted parquet file into arrow record batch +//! # Example: Reading non-uniformly encrypted parquet file into arrow record batch //! //! Note: This requires the experimental `encryption` feature to be enabled at compile time. //! -//! #![cfg_attr(feature = "encryption", doc = "```rust")] #![cfg_attr(not(feature = "encryption"), doc = "```ignore")] //! 
# use arrow_array::{Int32Array, ArrayRef}; @@ -168,7 +202,6 @@ pub use self::async_reader::ParquetRecordBatchStreamBuilder; pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::{SchemaDescriptor, Type}; use arrow_schema::{FieldRef, Schema}; - // continue to export deprecated methods until they are removed #[allow(deprecated)] pub use self::schema::arrow_to_parquet_schema; @@ -178,7 +211,10 @@ pub use self::schema::{ parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels, }; -/// Schema metadata key used to store serialized Arrow IPC schema +/// Schema metadata key used to store serialized Arrow schema +/// +/// The Arrow schema is encoded using the Arrow IPC format, and then base64 +/// encoded. This is the same format used by arrow-cpp systems, such as pyarrow. pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema"; /// The value of this metadata key, if present on [`Field::metadata`], will be used diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 89c42f5eaf92..ad9e1f781f70 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -105,6 +105,12 @@ pub struct FieldLevels { /// /// Columns not included within [`ProjectionMask`] will be ignored. /// +/// The optional `hint` parameter is the desired Arrow schema. See the +/// [`arrow`] module documentation for more information. +/// +/// [`arrow`]: crate::arrow +/// +/// # Notes: /// Where a field type in `hint` is compatible with the corresponding parquet type in `schema`, it /// will be used, otherwise the default arrow type for the given parquet column type will be used. /// @@ -192,8 +198,12 @@ pub fn encode_arrow_schema(schema: &Schema) -> String { BASE64_STANDARD.encode(&len_prefix_schema) } -/// Mutates writer metadata by storing the encoded Arrow schema. +/// Mutates writer metadata by storing the encoded Arrow schema hint in +/// [`ARROW_SCHEMA_META_KEY`]. 
+/// /// If there is an existing Arrow schema metadata, it is replaced. +/// +/// [`ARROW_SCHEMA_META_KEY`]: crate::arrow::ARROW_SCHEMA_META_KEY pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) { let encoded = encode_arrow_schema(schema); @@ -224,7 +234,12 @@ pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterP /// Converter for Arrow schema to Parquet schema /// -/// Example: +/// See the documentation on the [`arrow`] module for background +/// information on how Arrow schema is represented in Parquet. +/// +/// [`arrow`]: crate::arrow +/// +/// # Example: /// ``` /// # use std::sync::Arc; /// # use arrow_schema::{Field, Schema, DataType};