15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- //! Provides API for converting parquet schema to arrow schema and vice versa.
19
- //!
20
- //! The main interfaces for converting parquet schema to arrow schema are
21
- //! `parquet_to_arrow_schema`, `parquet_to_arrow_schema_by_columns` and
22
- //! `parquet_to_arrow_field`.
23
- //!
24
- //! The interfaces for converting arrow schema to parquet schema is coming.
18
+ //! Converting Parquet schema <--> Arrow schema: [`ArrowSchemaConverter`] and [`parquet_to_arrow_schema`]
25
19
26
20
use base64:: prelude:: BASE64_STANDARD ;
27
21
use base64:: Engine ;
@@ -226,27 +220,134 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
226
220
}
227
221
}
228
222
229
- /// Convert arrow schema to parquet schema
223
+ /// Converter for Arrow schema to Parquet schema
230
224
///
231
- /// The name of the root schema element defaults to `"arrow_schema"`, this can be
232
- /// overridden with [`arrow_to_parquet_schema_with_root`]
233
- pub fn arrow_to_parquet_schema ( schema : & Schema , coerce_types : bool ) -> Result < SchemaDescriptor > {
234
- arrow_to_parquet_schema_with_root ( schema, "arrow_schema" , coerce_types)
225
+ /// Example:
226
+ /// ```
227
+ /// # use std::sync::Arc;
228
+ /// # use arrow_schema::{Field, Schema, DataType};
229
+ /// # use parquet::arrow::ArrowSchemaConverter;
230
+ /// use parquet::schema::types::{SchemaDescriptor, Type};
231
+ /// use parquet::basic; // note there are two `Type`s in the following example
232
+ /// // create an Arrow Schema
233
+ /// let arrow_schema = Schema::new(vec![
234
+ /// Field::new("a", DataType::Int64, true),
235
+ /// Field::new("b", DataType::Date32, true),
236
+ /// ]);
237
+ /// // convert the Arrow schema to a Parquet schema
238
+ /// let parquet_schema = ArrowSchemaConverter::new()
239
+ /// .convert(&arrow_schema)
240
+ /// .unwrap();
241
+ ///
242
+ /// let expected_parquet_schema = SchemaDescriptor::new(
243
+ /// Arc::new(
244
+ /// Type::group_type_builder("arrow_schema")
245
+ /// .with_fields(vec![
246
+ /// Arc::new(
247
+ /// Type::primitive_type_builder("a", basic::Type::INT64)
248
+ /// .build().unwrap()
249
+ /// ),
250
+ /// Arc::new(
251
+ /// Type::primitive_type_builder("b", basic::Type::INT32)
252
+ /// .with_converted_type(basic::ConvertedType::DATE)
253
+ /// .with_logical_type(Some(basic::LogicalType::Date))
254
+ /// .build().unwrap()
255
+ /// ),
256
+ /// ])
257
+ /// .build().unwrap()
258
+ /// )
259
+ /// );
260
+ /// assert_eq!(parquet_schema, expected_parquet_schema);
261
+ /// ```
262
+ #[ derive( Debug ) ]
263
+ pub struct ArrowSchemaConverter < ' a > {
264
+ /// Name of the root schema in Parquet
265
+ schema_root : & ' a str ,
266
+ /// Should we coerce Arrow types to compatible Parquet types?
267
+ ///
268
+ /// See docs on [`Self::with_coerce_types`]
269
+ coerce_types : bool ,
235
270
}
236
271
237
- /// Convert arrow schema to parquet schema specifying the name of the root schema element
238
- pub fn arrow_to_parquet_schema_with_root (
239
- schema : & Schema ,
240
- root : & str ,
241
- coerce_types : bool ,
242
- ) -> Result < SchemaDescriptor > {
243
- let fields = schema
244
- . fields ( )
245
- . iter ( )
246
- . map ( |field| arrow_to_parquet_type ( field, coerce_types) . map ( Arc :: new) )
247
- . collect :: < Result < _ > > ( ) ?;
248
- let group = Type :: group_type_builder ( root) . with_fields ( fields) . build ( ) ?;
249
- Ok ( SchemaDescriptor :: new ( Arc :: new ( group) ) )
272
+ impl Default for ArrowSchemaConverter < ' _ > {
273
+ fn default ( ) -> Self {
274
+ Self :: new ( )
275
+ }
276
+ }
277
+
278
+ impl < ' a > ArrowSchemaConverter < ' a > {
279
+ /// Create a new converter
280
+ pub fn new ( ) -> Self {
281
+ Self {
282
+ schema_root : "arrow_schema" ,
283
+ coerce_types : false ,
284
+ }
285
+ }
286
+
287
+ /// Should Arrow types be coerced into Parquet native types (default `false`).
288
+ ///
289
+ /// Setting this option to `true` will result in Parquet files that can be
290
+ /// read by more readers, but may lose precision for Arrow types such as
291
+ /// [`DataType::Date64`] which have no direct [corresponding Parquet type].
292
+ ///
293
+ /// By default, this converter does not coerce to native Parquet types. Enabling type
294
+ /// coercion allows for meaningful representations that do not require
295
+ /// downstream readers to consider the embedded Arrow schema, and can allow
296
+ /// for greater compatibility with other Parquet implementations. However,
297
+ /// type coercion also prevents data from being losslessly round-tripped.
298
+ ///
299
+ /// # Discussion
300
+ ///
301
+ /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
302
+ /// corresponding Parquet logical type. Thus, they cannot be losslessly
303
+ /// round-tripped when stored using the appropriate Parquet logical type.
304
+ /// For example, some Date64 values may be truncated when stored with
305
+ /// parquet's native 32 bit date type.
306
+ ///
307
+ /// For [`List`] and [`Map`] types, some Parquet readers expect certain
308
+ /// schema elements to have specific names (earlier versions of the spec
309
+ /// were somewhat ambiguous on this point). Type coercion will use the names
310
+ /// prescribed by the Parquet specification, potentially losing naming
311
+ /// metadata from the Arrow schema.
312
+ ///
313
+ /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
314
+ /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
315
+ /// [corresponding Parquet type]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date
316
+ ///
317
+ pub fn with_coerce_types ( mut self , coerce_types : bool ) -> Self {
318
+ self . coerce_types = coerce_types;
319
+ self
320
+ }
321
+
322
+ /// Set the root schema element name (defaults to `"arrow_schema"`).
323
+ pub fn schema_root ( mut self , schema_root : & ' a str ) -> Self {
324
+ self . schema_root = schema_root;
325
+ self
326
+ }
327
+
328
+ /// Convert the specified Arrow [`Schema`] to the desired Parquet [`SchemaDescriptor`]
329
+ ///
330
+ /// See example in [`ArrowSchemaConverter`]
331
+ pub fn convert ( & self , schema : & Schema ) -> Result < SchemaDescriptor > {
332
+ let fields = schema
333
+ . fields ( )
334
+ . iter ( )
335
+ . map ( |field| arrow_to_parquet_type ( field, self . coerce_types ) . map ( Arc :: new) )
336
+ . collect :: < Result < _ > > ( ) ?;
337
+ let group = Type :: group_type_builder ( self . schema_root )
338
+ . with_fields ( fields)
339
+ . build ( ) ?;
340
+ Ok ( SchemaDescriptor :: new ( Arc :: new ( group) ) )
341
+ }
342
+ }
343
+
344
+ /// Convert arrow schema to parquet schema
345
+ ///
346
+ /// The name of the root schema element defaults to `"arrow_schema"`; this can be
347
+ /// overridden with [`ArrowSchemaConverter::schema_root`]
348
+ #[ deprecated( since = "54.0.0" , note = "Use `ArrowSchemaConverter` instead" ) ]
349
+ pub fn arrow_to_parquet_schema ( schema : & Schema ) -> Result < SchemaDescriptor > {
350
+ ArrowSchemaConverter :: new ( ) . convert ( schema)
250
351
}
251
352
252
353
fn parse_key_value_metadata (
@@ -1488,7 +1589,10 @@ mod tests {
1488
1589
" ;
1489
1590
let parquet_group_type = parse_message_type ( message_type) . unwrap ( ) ;
1490
1591
let parquet_schema = SchemaDescriptor :: new ( Arc :: new ( parquet_group_type) ) ;
1491
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, true ) . unwrap ( ) ;
1592
+ let converted_arrow_schema = ArrowSchemaConverter :: new ( )
1593
+ . with_coerce_types ( true )
1594
+ . convert ( & arrow_schema)
1595
+ . unwrap ( ) ;
1492
1596
assert_eq ! (
1493
1597
parquet_schema. columns( ) . len( ) ,
1494
1598
converted_arrow_schema. columns( ) . len( )
@@ -1512,7 +1616,10 @@ mod tests {
1512
1616
" ;
1513
1617
let parquet_group_type = parse_message_type ( message_type) . unwrap ( ) ;
1514
1618
let parquet_schema = SchemaDescriptor :: new ( Arc :: new ( parquet_group_type) ) ;
1515
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, false ) . unwrap ( ) ;
1619
+ let converted_arrow_schema = ArrowSchemaConverter :: new ( )
1620
+ . with_coerce_types ( false )
1621
+ . convert ( & arrow_schema)
1622
+ . unwrap ( ) ;
1516
1623
assert_eq ! (
1517
1624
parquet_schema. columns( ) . len( ) ,
1518
1625
converted_arrow_schema. columns( ) . len( )
@@ -1668,7 +1775,7 @@ mod tests {
1668
1775
Field :: new( "decimal256" , DataType :: Decimal256 ( 39 , 2 ) , false ) ,
1669
1776
] ;
1670
1777
let arrow_schema = Schema :: new ( arrow_fields) ;
1671
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, false ) . unwrap ( ) ;
1778
+ let converted_arrow_schema = ArrowSchemaConverter :: new ( ) . convert ( & arrow_schema) . unwrap ( ) ;
1672
1779
1673
1780
assert_eq ! (
1674
1781
parquet_schema. columns( ) . len( ) ,
@@ -1705,9 +1812,10 @@ mod tests {
1705
1812
false ,
1706
1813
) ] ;
1707
1814
let arrow_schema = Schema :: new ( arrow_fields) ;
1708
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, true ) ;
1815
+ let converted_arrow_schema = ArrowSchemaConverter :: new ( )
1816
+ . with_coerce_types ( true )
1817
+ . convert ( & arrow_schema) ;
1709
1818
1710
- assert ! ( converted_arrow_schema. is_err( ) ) ;
1711
1819
converted_arrow_schema. unwrap ( ) ;
1712
1820
}
1713
1821
@@ -1978,7 +2086,9 @@ mod tests {
1978
2086
// don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
1979
2087
let arrow_schema = crate :: arrow:: parquet_to_arrow_schema ( & schema_descriptor, None ) ?;
1980
2088
1981
- let parq_schema_descr = crate :: arrow:: arrow_to_parquet_schema ( & arrow_schema, true ) ?;
2089
+ let parq_schema_descr = ArrowSchemaConverter :: new ( )
2090
+ . with_coerce_types ( true )
2091
+ . convert ( & arrow_schema) ?;
1982
2092
let parq_fields = parq_schema_descr. root_schema ( ) . get_fields ( ) ;
1983
2093
assert_eq ! ( parq_fields. len( ) , 2 ) ;
1984
2094
assert_eq ! ( parq_fields[ 0 ] . get_basic_info( ) . id( ) , 1 ) ;
0 commit comments