// specific language governing permissions and limitations
// under the License.

- //! API for reading/writing
- //! Arrow [RecordBatch](arrow_array::RecordBatch)es and
- //! [Array](arrow_array::Array)s to/from Parquet Files.
+ //! API for reading/writing Arrow [`RecordBatch`]es and [`Array`]s to/from
+ //! Parquet files.
//!
- //! See the [crate-level documentation](crate) for more details.
+ //! See the [crate-level documentation](crate) for more details on other APIs.
//!
- //! # Example of writing Arrow record batch to Parquet file
+ //! # Schema Conversion
+ //!
+ //! These APIs ensure that data in Arrow [`RecordBatch`]es written to Parquet is
+ //! read back as [`RecordBatch`]es with the exact same types and values.
+ //!
+ //! Parquet and Arrow have different type systems, and there is not
+ //! always a one-to-one mapping between the systems. For example, data
+ //! stored as a Parquet [`BYTE_ARRAY`] can be read as either an Arrow
+ //! [`BinaryViewArray`] or [`BinaryArray`].
+ //!
+ //! To recover the original Arrow types, the writers in this module add a "hint" to
+ //! the metadata in the [`ARROW_SCHEMA_META_KEY`] key, which records the original Arrow
+ //! schema. The metadata hint follows the same convention as arrow-cpp based
+ //! implementations such as `pyarrow`. The reader looks for the schema hint in the
+ //! metadata to determine the Arrow types, and if it is not present, infers the Arrow
+ //! schema from the Parquet schema.
+ //!
+ //! In situations where the embedded Arrow schema is not compatible with the Parquet
+ //! schema, the Parquet schema takes precedence and no error is raised. See
+ //! [#1663](https://github.com/apache/arrow-rs/issues/1663).
+ //!
+ //! You can also control the type conversion process in more detail using:
+ //!
+ //! * [`ArrowSchemaConverter`] to control the conversion of Arrow types to Parquet
+ //!   types (see the first sketch below).
+ //!
+ //! * [`ArrowReaderOptions::with_schema`] to explicitly specify your own Arrow schema hint
+ //!   to use when reading Parquet, overriding any metadata that may be present
+ //!   (see the second sketch below).
+ //!
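+ //! For illustration, a minimal sketch of converting an Arrow schema to a Parquet
+ //! schema with [`ArrowSchemaConverter`] (the column name `id` is hypothetical):
+ //!
+ //! ```rust
+ //! # use arrow_schema::{DataType, Field, Schema};
+ //! # use parquet::arrow::ArrowSchemaConverter;
+ //! // Describe a single non-nullable Int32 column in Arrow terms
+ //! let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+ //! // Convert it to the equivalent Parquet SchemaDescriptor
+ //! let parquet_schema = ArrowSchemaConverter::new().convert(&schema).unwrap();
+ //! assert_eq!(parquet_schema.num_columns(), 1);
+ //! ```
+ //!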
+ //! [`RecordBatch`]: arrow_array::RecordBatch
+ //! [`Array`]: arrow_array::Array
+ //! [`BYTE_ARRAY`]: crate::basic::Type::BYTE_ARRAY
+ //! [`BinaryViewArray`]: arrow_array::BinaryViewArray
+ //! [`BinaryArray`]: arrow_array::BinaryArray
+ //! [`ArrowReaderOptions::with_schema`]: arrow_reader::ArrowReaderOptions::with_schema
+ //!
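+ //! As a second sketch, [`ArrowReaderOptions::with_schema`] can override the embedded
+ //! hint, here reading a `BYTE_ARRAY` column written as `Utf8` back as `Utf8View`
+ //! (the column name `col` is hypothetical):
+ //!
+ //! ```rust
+ //! # use std::sync::Arc;
+ //! # use arrow_array::{ArrayRef, RecordBatch, StringArray};
+ //! # use arrow_schema::{DataType, Field, Schema};
+ //! # use bytes::Bytes;
+ //! # use parquet::arrow::ArrowWriter;
+ //! # use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
+ //! // Write a single nullable Utf8 (Parquet BYTE_ARRAY) column to an in-memory buffer
+ //! let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, true)]));
+ //! let array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));
+ //! let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+ //! let mut buffer = Vec::new();
+ //! let mut writer = ArrowWriter::try_new(&mut buffer, schema, None).unwrap();
+ //! writer.write(&batch).unwrap();
+ //! writer.close().unwrap();
+ //!
+ //! // Supply a schema hint requesting Utf8View instead of the embedded Utf8
+ //! let hint = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8View, true)]));
+ //! let options = ArrowReaderOptions::new().with_schema(hint);
+ //! let mut reader =
+ //!     ParquetRecordBatchReaderBuilder::try_new_with_options(Bytes::from(buffer), options)
+ //!         .unwrap()
+ //!         .build()
+ //!         .unwrap();
+ //! let batch = reader.next().unwrap().unwrap();
+ //! assert_eq!(batch.schema().field(0).data_type(), &DataType::Utf8View);
+ //! ```
+ //!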
+ //! # Example: Writing Arrow `RecordBatch` to Parquet file
//!
//!```rust
//! # use arrow_array::{Int32Array, ArrayRef};
//! writer.close().unwrap();
//! ```
//!
- //! # Example of reading parquet file into arrow record batch
+ //! # Example: Reading Parquet file into Arrow `RecordBatch`
//!
//! ```rust
//! # use std::fs::File;
//! println!("Read {} records.", record_batch.num_rows());
//! ```
//!
- //! # Example of reading non-uniformly encrypted parquet file into arrow record batch
+ //! # Example: Reading non-uniformly encrypted Parquet file into Arrow `RecordBatch`
//!
//! Note: This requires the experimental `encryption` feature to be enabled at compile time.
//!
- //!
#![cfg_attr(feature = "encryption", doc = "```rust")]
#![cfg_attr(not(feature = "encryption"), doc = "```ignore")]
//! # use arrow_array::{Int32Array, ArrayRef};
@@ -168,7 +202,6 @@ pub use self::async_reader::ParquetRecordBatchStreamBuilder;
pub use self::async_writer::AsyncArrowWriter;
use crate::schema::types::{SchemaDescriptor, Type};
use arrow_schema::{FieldRef, Schema};
-
// continue to export deprecated methods until they are removed
#[allow(deprecated)]
pub use self::schema::arrow_to_parquet_schema;
@@ -178,7 +211,10 @@ pub use self::schema::{
    parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, ArrowSchemaConverter, FieldLevels,
};

- /// Schema metadata key used to store serialized Arrow IPC schema
+ /// Schema metadata key used to store serialized Arrow schema
+ ///
+ /// The Arrow schema is encoded using the Arrow IPC format, and then base64
+ /// encoded. This is the same format used by arrow-cpp based systems, such as `pyarrow`.
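+ ///
+ /// For illustration, a minimal sketch (assuming default writer properties, with a
+ /// hypothetical column named `col`) showing the hint in a written file's
+ /// key-value metadata:
+ ///
+ /// ```rust
+ /// # use std::sync::Arc;
+ /// # use arrow_array::{ArrayRef, Int32Array, RecordBatch};
+ /// # use bytes::Bytes;
+ /// # use parquet::arrow::{ArrowWriter, ARROW_SCHEMA_META_KEY};
+ /// # use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+ /// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+ /// let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
+ /// let mut buffer = Vec::new();
+ /// let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+ /// writer.write(&batch).unwrap();
+ /// writer.close().unwrap();
+ ///
+ /// // The footer key-value metadata now carries the base64 encoded Arrow schema
+ /// let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)).unwrap();
+ /// let kv = builder.metadata().file_metadata().key_value_metadata().unwrap();
+ /// assert!(kv.iter().any(|kv| kv.key == ARROW_SCHEMA_META_KEY));
+ /// ```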
pub const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema";

/// The value of this metadata key, if present on [`Field::metadata`], will be used