From 11c99a3232761e6d12162f6c09822de821b61c96 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 06:51:05 -0400 Subject: [PATCH 01/15] Document Arrow <--> Parquet schema conversion better --- parquet/src/arrow/arrow_reader/mod.rs | 19 +++++++----- parquet/src/arrow/mod.rs | 44 +++++++++++++++++++++------ parquet/src/arrow/schema/mod.rs | 3 ++ 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 8bbe175dafb8..1fad2bc40fe8 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -314,14 +314,19 @@ impl ArrowReaderOptions { } } - /// Provide a schema to use when reading the parquet file. If provided it - /// takes precedence over the schema inferred from the file or the schema defined - /// in the file's metadata. If the schema is not compatible with the file's - /// schema an error will be returned when constructing the builder. + /// Provide a schema to use when reading the Parquet file. /// - /// This option is only required if you want to cast columns to a different type. - /// For example, if you wanted to cast from an Int64 in the Parquet file to a Timestamp - /// in the Arrow schema. + /// If provided, this schema takes precedence over the schema inferred from + /// the file or the schema defined in the file's metadata (see [`arrow`] + /// documentation for more details). If the provided schema is not compatible + /// with the file's schema, an error will be returned when constructing the builder. + /// + /// This option is only required if you want to explicitly control the + /// conversion of Parquet types to Arrow types, such as casting a column to + /// a different type. For example, if you wanted to read an Int64 in + /// a Parquet file to a [`TimestampMicrosecondArray`] in the Arrow schema. 
+ /// + /// # Notes /// /// The supplied schema must have the same number of columns as the parquet schema and /// the column names need to be the same. diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index b89c6ddcf8da..182e77038ea8 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -15,13 +15,41 @@ // specific language governing permissions and limitations // under the License. -//! API for reading/writing -//! Arrow [RecordBatch](arrow_array::RecordBatch)es and -//! [Array](arrow_array::Array)s to/from Parquet Files. +//! API for reading/writing Arrow [`RecordBatch`]es and [`Array`]s to/from +//! Parquet Files. //! -//! See the [crate-level documentation](crate) for more details. +//! See the [crate-level documentation](crate) for more details on other APIs //! -//! # Example of writing Arrow record batch to Parquet file +//! # Schema Conversion +//! +//! These APIs ensure that data in Arrow [`RecordBatch`]es written to Parquet are +//! read back as [`RecordBatch`]es with the exact same types and values. +//! +//! Parquet and Arrow have different type systems, and there is not +//! always a one to one mapping between the systems. For example, data +//! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow +//! [`BinaryViewArray`] or [`BinaryArray`]. +//! +//! To recover the original Arrow types, the writers in this module add +//! metadata in the [`ARROW_SCHEMA_META_KEY`] key to record the original Arrow +//! schema. The readers look for this metadata to determine Arrow types, and if +//! it is not present, use reasonable defaults. You can also control the type +//! conversion process in more detail using: +//! +//! * [`ArrowSchemaConverter`] control the conversion of Arrow types to Parquet +//! types. +//! +//! * [`ArrowReaderOptions::with_schema`] to explicitly specify what Arrow types +//! to use when reading Parquet, overriding any metadata that may be present. +//! +//! 
[`RecordBatch`]: arrow_array::RecordBatch +//! [`Array`]: arrow_array::Array +//! [`BYTE_ARRAY`]: crate::basic::Type::BYTE_ARRAY +//! [`BinaryViewArray`]: arrow_array::BinaryViewArray +//! [`BinaryArray`]: arrow_array::BinaryArray +//! [`ArrowReaderOptions::with_schema`]: arrow_reader::ArrowReaderOptions::with_schema +//! +//! # Example: Writing Arrow `RecordBatch` to Parquet file //! //!```rust //! # use arrow_array::{Int32Array, ArrayRef}; @@ -53,7 +81,7 @@ //! writer.close().unwrap(); //! ``` //! -//! # Example of reading parquet file into arrow record batch +//! # Example: Reading Parquet file into Arrow `RecordBatch` //! //! ```rust //! # use std::fs::File; @@ -93,11 +121,10 @@ //! println!("Read {} records.", record_batch.num_rows()); //! ``` //! -//! # Example of reading non-uniformly encrypted parquet file into arrow record batch +//! # Example: Reading non-uniformly encrypted parquet file into arrow record batch //! //! Note: This requires the experimental `encryption` feature to be enabled at compile time. //! -//! #![cfg_attr(feature = "encryption", doc = "```rust")] #![cfg_attr(not(feature = "encryption"), doc = "```ignore")] //! 
# use arrow_array::{Int32Array, ArrayRef}; @@ -168,7 +195,6 @@ pub use self::async_reader::ParquetRecordBatchStreamBuilder; pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::{SchemaDescriptor, Type}; use arrow_schema::{FieldRef, Schema}; - // continue to export deprecated methods until they are removed #[allow(deprecated)] pub use self::schema::arrow_to_parquet_schema; diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 89c42f5eaf92..fa6d23314c44 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -223,6 +223,9 @@ pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterP } /// Converter for Arrow schema to Parquet schema +/// +/// See the documentation on the [`arrow`] module for background +/// information on how Arrow schema is represented in Parquet. /// /// Example: /// ``` From 2949e780c52b287a6d930c039f7828fef33b11eb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 11:58:27 -0400 Subject: [PATCH 02/15] Add a note about arrow metadata convention --- parquet/src/arrow/mod.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 182e77038ea8..fccf66901d13 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -32,9 +32,10 @@ //! //! To recover the original Arrow types, the writers in this module add //! metadata in the [`ARROW_SCHEMA_META_KEY`] key to record the original Arrow -//! schema. The readers look for this metadata to determine Arrow types, and if -//! it is not present, use reasonable defaults. You can also control the type -//! conversion process in more detail using: +//! schema. The metadata follows the same convention as arrow-cpp based +//! implementations such as `pyarrow`. The reader looks for this metadata to +//! determine Arrow types, and if it is not present, use reasonable defaults. +//! 
You can also control the type conversion process in more detail using: //! //! * [`ArrowSchemaConverter`] control the conversion of Arrow types to Parquet //! types. @@ -204,7 +205,10 @@ pub use self::schema::{ parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels, }; -/// Schema metadata key used to store serialized Arrow IPC schema +/// Schema metadata key used to store serialized Arrow schema +/// +/// The Arrow schema is encoded using the Arrow IPC format, and then base64 +/// encoded. This is the same format used by arrow-cpp systems, such as pyarrow. pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema"; /// The value of this metadata key, if present on [`Field::metadata`], will be used From ca7b0c31b3227a2afc9658e3555c520579dedc03 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 11:58:46 -0400 Subject: [PATCH 03/15] lint --- parquet/src/arrow/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index fccf66901d13..dc70b60e06ce 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -32,9 +32,9 @@ //! //! To recover the original Arrow types, the writers in this module add //! metadata in the [`ARROW_SCHEMA_META_KEY`] key to record the original Arrow -//! schema. The metadata follows the same convention as arrow-cpp based -//! implementations such as `pyarrow`. The reader looks for this metadata to -//! determine Arrow types, and if it is not present, use reasonable defaults. +//! schema. The metadata follows the same convention as arrow-cpp based +//! implementations such as `pyarrow`. The reader looks for this metadata to +//! determine Arrow types, and if it is not present, use reasonable defaults. //! You can also control the type conversion process in more detail using: //! //! 
* [`ArrowSchemaConverter`] control the conversion of Arrow types to Parquet @@ -206,7 +206,7 @@ pub use self::schema::{ }; /// Schema metadata key used to store serialized Arrow schema -/// +/// /// The Arrow schema is encoded using the Arrow IPC format, and then base64 /// encoded. This is the same format used by arrow-cpp systems, such as pyarrow. pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema"; From 893718d042f840b7af88df02fa53b0b0332f0538 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 12:00:46 -0400 Subject: [PATCH 04/15] Fix links --- parquet/src/arrow/arrow_reader/mod.rs | 5 ++++- parquet/src/arrow/schema/mod.rs | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 1fad2bc40fe8..62052bc9918d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -317,7 +317,7 @@ impl ArrowReaderOptions { /// Provide a schema to use when reading the Parquet file. /// /// If provided, this schema takes precedence over the schema inferred from - /// the file or the schema defined in the file's metadata (see [`arrow`] + /// the file or the schema defined in the file's metadata (see the [`arrow`] /// documentation for more details). If the provided schema is not compatible /// with the file's schema, an error will be returned when constructing the builder. /// @@ -326,6 +326,9 @@ impl ArrowReaderOptions { /// a different type. For example, if you wanted to read an Int64 in /// a Parquet file to a [`TimestampMicrosecondArray`] in the Arrow schema. 
/// + /// [`arrow`]: crate::arrow + /// [`TimestampMicrosecondArray`]: arrow_array::TimestampMicrosecondArray + /// /// # Notes /// /// The supplied schema must have the same number of columns as the parquet schema and diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index fa6d23314c44..9fd34c4911cc 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -227,7 +227,9 @@ pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterP /// See the documentation on the [`arrow`] module for background /// information on how Arrow schema is represented in Parquet. /// -/// Example: +/// [`arrow`]: crate::arrow +/// +/// # Example: /// ``` /// # use std::sync::Arc; /// # use arrow_schema::{Field, Schema, DataType}; From 3a4e03da3a6e6375855fd1862672f5c6f9ede495 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 12:23:34 -0400 Subject: [PATCH 05/15] clarify what happens with provided schema --- parquet/src/arrow/arrow_reader/mod.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 62052bc9918d..855d98ea7443 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -316,10 +316,11 @@ impl ArrowReaderOptions { /// Provide a schema to use when reading the Parquet file. /// - /// If provided, this schema takes precedence over the schema inferred from - /// the file or the schema defined in the file's metadata (see the [`arrow`] - /// documentation for more details). If the provided schema is not compatible - /// with the file's schema, an error will be returned when constructing the builder. + /// If provided, this schema takes precedence over the schema defined in the + /// arrow file's metadata (see the [`arrow`] documentation for more details). 
+ /// If the provided schema is not compatible with the data stored in the + /// parquet file schema, an error will be returned when constructing the + /// builder. /// /// This option is only required if you want to explicitly control the /// conversion of Parquet types to Arrow types, such as casting a column to From 8b0920c85fd738165d0b8c7544a2ffdf9b83d84a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 12:28:33 -0400 Subject: [PATCH 06/15] More docs --- parquet/src/arrow/arrow_reader/mod.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 855d98ea7443..965600a553b8 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -286,7 +286,10 @@ impl ArrowReaderBuilder { pub struct ArrowReaderOptions { /// Should the reader strip any user defined metadata from the Arrow schema skip_arrow_metadata: bool, - /// If provided used as the schema for the file, otherwise the schema is read from the file + /// If provided used as the schema hint when determining the Arrow schema, + /// otherwise the schema hint is read from the [ARROW_SCHEMA_META_KEY] + /// + /// [ARROW_SCHEMA_META_KEY]: crate::arrow::ARROW_SCHEMA_META_KEY supplied_schema: Option, /// If true, attempt to read `OffsetIndex` and `ColumnIndex` pub(crate) page_index: bool, @@ -316,10 +319,10 @@ impl ArrowReaderOptions { /// Provide a schema to use when reading the Parquet file. /// - /// If provided, this schema takes precedence over the schema defined in the - /// arrow file's metadata (see the [`arrow`] documentation for more details). - /// If the provided schema is not compatible with the data stored in the - /// parquet file schema, an error will be returned when constructing the + /// If provided, this schema takes precedence over the schema defined in the + /// arrow file's metadata (see the [`arrow`] documentation for more details). 
+ /// If the provided schema is not compatible with the data stored in the + /// parquet file schema, an error will be returned when constructing the /// builder. /// /// This option is only required if you want to explicitly control the From e417988252d9167efa32113e7bdcb07bd42f246c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 12:38:40 -0400 Subject: [PATCH 07/15] fmt --- parquet/src/arrow/schema/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 9fd34c4911cc..f3ecac844a3e 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -223,8 +223,8 @@ pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterP } /// Converter for Arrow schema to Parquet schema -/// -/// See the documentation on the [`arrow`] module for background +/// +/// See the documentation on the [`arrow`] module for background /// information on how Arrow schema is represented in Parquet. /// /// [`arrow`]: crate::arrow From 812160005efe3afc63531b8ea051e1fa44a91f67 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 12:56:08 -0400 Subject: [PATCH 08/15] more clarification --- parquet/src/arrow/arrow_reader/mod.rs | 8 ++++---- parquet/src/arrow/mod.rs | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 965600a553b8..3c1a71f84525 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -319,8 +319,8 @@ impl ArrowReaderOptions { /// Provide a schema to use when reading the Parquet file. /// - /// If provided, this schema takes precedence over the schema defined in the - /// arrow file's metadata (see the [`arrow`] documentation for more details). 
+ /// If provided, this schema takes precedence over any schema defined in the + /// file's schema hint in the metadata (see the [`arrow`] documentation for more details). /// If the provided schema is not compatible with the data stored in the /// parquet file schema, an error will be returned when constructing the /// builder. @@ -335,8 +335,8 @@ impl ArrowReaderOptions { /// /// # Notes /// - /// The supplied schema must have the same number of columns as the parquet schema and - /// the column names need to be the same. + /// The provided schema must have the same number of columns as the parquet schema and + /// the column names must be the same. /// /// # Example /// ``` diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index dc70b60e06ce..6a5ed8da0a17 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -30,17 +30,17 @@ //! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow //! [`BinaryViewArray`] or [`BinaryArray`]. //! -//! To recover the original Arrow types, the writers in this module add -//! metadata in the [`ARROW_SCHEMA_META_KEY`] key to record the original Arrow -//! schema. The metadata follows the same convention as arrow-cpp based -//! implementations such as `pyarrow`. The reader looks for this metadata to -//! determine Arrow types, and if it is not present, use reasonable defaults. +//! To recover the original Arrow types, the writers in this module add a "hint" to +//! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow +//! schema. The metadata hint follows the same convention as arrow-cpp based +//! implementations such as `pyarrow`. The reader looks for the schema hint in the +//! metadata to determine Arrow types, and if it is not present, use reasonable defaults. //! You can also control the type conversion process in more detail using: //! //! * [`ArrowSchemaConverter`] control the conversion of Arrow types to Parquet //! types. //! -//! 
* [`ArrowReaderOptions::with_schema`] to explicitly specify what Arrow types +//! * [`ArrowReaderOptions::with_schema`] to explicitly specify your own Arrow schema hint //! to use when reading Parquet, overriding any metadata that may be present. //! //! [`RecordBatch`]: arrow_array::RecordBatch From 2defff51190cb01de50be316c54ee70e147f6308 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 13:01:31 -0400 Subject: [PATCH 09/15] More clarifications --- parquet/src/arrow/schema/mod.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index f3ecac844a3e..2ee9ff9c63fb 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -104,7 +104,13 @@ pub struct FieldLevels { /// Convert a parquet [`SchemaDescriptor`] to [`FieldLevels`] /// /// Columns not included within [`ProjectionMask`] will be ignored. -/// +/// +/// The optional `hint` parameter is the desired Arrow schema. See the +/// [`arrow`] module documentation for more information. +/// +/// [`arrow`]: crate::arrow +/// +/// # Notes: /// Where a field type in `hint` is compatible with the corresponding parquet type in `schema`, it /// will be used, otherwise the default arrow type for the given parquet column type will be used. /// @@ -192,8 +198,12 @@ pub fn encode_arrow_schema(schema: &Schema) -> String { BASE64_STANDARD.encode(&len_prefix_schema) } -/// Mutates writer metadata by storing the encoded Arrow schema. +/// Mutates writer metadata by storing the encoded Arrow schema hint in +/// [`ARROW_SCHEMA_META_KEY`]. +/// /// If there is an existing Arrow schema metadata, it is replaced. 
+/// +/// [`ARROW_SCHEMA_META_KEY`]: crate::arrow::ARROW_SCHEMA_META_KEY pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) { let encoded = encode_arrow_schema(schema); From 3426c8aeff9d160bd78b34a0a1052a97016065fe Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 13:31:56 -0400 Subject: [PATCH 10/15] fmt --- parquet/src/arrow/schema/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 2ee9ff9c63fb..ad9e1f781f70 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -104,12 +104,12 @@ pub struct FieldLevels { /// Convert a parquet [`SchemaDescriptor`] to [`FieldLevels`] /// /// Columns not included within [`ProjectionMask`] will be ignored. -/// +/// /// The optional `hint` parameter is the desired Arrow schema. See the /// [`arrow`] module documentation for more information. -/// +/// /// [`arrow`]: crate::arrow -/// +/// /// # Notes: /// Where a field type in `hint` is compatible with the corresponding parquet type in `schema`, it /// will be used, otherwise the default arrow type for the given parquet column type will be used. @@ -200,9 +200,9 @@ pub fn encode_arrow_schema(schema: &Schema) -> String { /// Mutates writer metadata by storing the encoded Arrow schema hint in /// [`ARROW_SCHEMA_META_KEY`]. -/// +/// /// If there is an existing Arrow schema metadata, it is replaced. 
-/// +/// /// [`ARROW_SCHEMA_META_KEY`]: crate::arrow::ARROW_SCHEMA_META_KEY pub fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut WriterProperties) { let encoded = encode_arrow_schema(schema); From eace423bb04bd65383424c40a1d4656c4faaeac5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 14:12:18 -0400 Subject: [PATCH 11/15] Update parquet/src/arrow/arrow_reader/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/arrow_reader/mod.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 3c1a71f84525..2f670a64e108 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -317,10 +317,11 @@ impl ArrowReaderOptions { } } - /// Provide a schema to use when reading the Parquet file. + /// Provide a schema hint to use when reading the Parquet file. + /// + /// If provided, this schema takes precedence over any arrow schema embedded + /// in the metadata (see the [`arrow`] documentation for more details). /// - /// If provided, this schema takes precedence over any schema defined in the - /// file's schema hint in the metadata (see the [`arrow`] documentation for more details). /// If the provided schema is not compatible with the data stored in the /// parquet file schema, an error will be returned when constructing the /// builder. 
From 3beb7a6c3b5fc35cef7a523b193ce137336b675f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 14:37:13 -0400 Subject: [PATCH 12/15] Update parquet/src/arrow/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 6a5ed8da0a17..9cb7c6148751 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -34,7 +34,12 @@ //! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow //! schema. The metadata hint follows the same convention as arrow-cpp based //! implementations such as `pyarrow`. The reader looks for the schema hint in the -//! metadata to determine Arrow types, and if it is not present, use reasonable defaults. +//! metadata to determine Arrow types, and if it is not present, infers the arrow schema +//! from the parquet schema. +//! +//! In situations where the embedded arrow schema is not compatible with the parquet +//! schema, the parquet schema takes precedence - see [#1663](https://github.com/apache/arrow-rs/issues/1663) +//! //! You can also control the type conversion process in more detail using: //! //! * [`ArrowSchemaConverter`] control the conversion of Arrow types to Parquet From 6f803b174e5bd470d777dd4b3393fb7339d53da2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 14:39:25 -0400 Subject: [PATCH 13/15] tweaks --- parquet/src/arrow/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 9cb7c6148751..e8e9f6f186e1 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -37,8 +37,9 @@ //! metadata to determine Arrow types, and if it is not present, infers the arrow schema //! from the parquet schema. //! -//! 
In situations where the embedded arrow schema is not compatible with the parquet -//! schema, the parquet schema takes precedence - see [#1663](https://github.com/apache/arrow-rs/issues/1663) +//! In situations where the embedded Arrow schema is not compatible with the parquet +//! schema, the parquet schema takes precedence and no error is raised. +//! See [#1663](https://github.com/apache/arrow-rs/issues/1663) //! //! You can also control the type conversion process in more detail using: //! From ad005543aad4b02fd3c906ea5062bd13fab130b6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 14:41:33 -0400 Subject: [PATCH 14/15] capitalization OCD --- parquet/src/arrow/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index e8e9f6f186e1..54314cef9a51 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -34,10 +34,10 @@ //! the metadata in the [`ARROW_SCHEMA_META_KEY`] key which records the original Arrow //! schema. The metadata hint follows the same convention as arrow-cpp based //! implementations such as `pyarrow`. The reader looks for the schema hint in the -//! metadata to determine Arrow types, and if it is not present, infers the arrow schema -//! from the parquet schema. +//! metadata to determine Arrow types, and if it is not present, infers the Arrow schema +//! from the Parquet schema. //! -//! In situations where the embedded Arrow schema is not compatible with the parquet +//! In situations where the embedded Arrow schema is not compatible with the Parquet //! schema, the parquet schema takes precedence and no error is raised. //! See [#1663](https://github.com/apache/arrow-rs/issues/1663) //! 
From 652937ff2e3141d65f09c0a62e0c582ef9770c7b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 7 May 2025 14:41:54 -0400 Subject: [PATCH 15/15] capitalization OCD --- parquet/src/arrow/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 54314cef9a51..76f8ef1bf068 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -38,7 +38,7 @@ //! from the Parquet schema. //! //! In situations where the embedded Arrow schema is not compatible with the Parquet -//! schema, the parquet schema takes precedence and no error is raised. +//! schema, the Parquet schema takes precedence and no error is raised. //! See [#1663](https://github.com/apache/arrow-rs/issues/1663) //! //! You can also control the type conversion process in more detail using: