@@ -225,29 +225,99 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
225
225
}
226
226
}
227
227
228
+ /// Converter for arrow schema to parquet schema
229
+ ///
230
+ /// Example:
231
+ /// ```
232
+ /// # use arrow_schema::{Field, Schema, DataType};
233
+ /// use parquet::arrow::ArrowToParquetSchemaConverter;
234
+ /// let schema = Schema::new(vec![
235
+ /// Field::new("a", DataType::Int64, false),
236
+ /// Field::new("b", DataType::Date32, false),
237
+ /// ]);
238
+ ///
239
+ /// let parquet_schema = ArrowToParquetSchemaConverter::new(&schema)
240
+ /// .build()
241
+ /// .unwrap();
242
+ ///
243
+ ///
244
+ /// ```
245
+ #[ derive( Debug ) ]
246
+ pub struct ArrowToParquetSchemaConverter < ' a > {
247
+ /// The schema to convert
248
+ schema : & ' a Schema ,
249
+ /// Name of the root schema in Parquet
250
+ schema_root : & ' a str ,
251
+ /// Should we coerce Arrow types to compatible Parquet types?
252
+ ///
253
+ /// See docs on [`Self::with_coerce_types`]
254
+ coerce_types : bool
255
+ }
256
+
257
+ impl < ' a > ArrowToParquetSchemaConverter < ' a > {
258
+ /// Create a new converter
259
+ pub fn new ( schema : & ' a Schema ) -> Self {
260
+ Self {
261
+ schema,
262
+ schema_root : "arrow_schema" ,
263
+ coerce_types : false ,
264
+ }
265
+ }
266
+
267
+ /// Should arrow types be coerced into parquet native types (default false).
268
+ ///
269
+ /// Setting this option to `true` will result in parquet files that can be
270
+ /// read by more readers, but may lose precision for arrow types such as
271
+ /// [`DataType::Date64`] which have no direct corresponding Parquet type.
272
+ ///
273
+ /// # Discussion
274
+ ///
275
+ /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
276
+ /// corresponding Parquet logical type. Thus, they can not be losslessly
277
+ /// round-tripped when stored using the appropriate Parquet logical type.
278
+ ///
279
+ /// For example, some Date64 values may be truncated when stored with
280
+ /// parquet's native 32 bit date type.
281
+ ///
282
+ /// By default, the arrow writer does not coerce to native parquet types. It
283
+ /// writes data in such a way that it can be losslessly round-tripped.
284
+ /// However, this means downstream readers must be aware of and correctly
285
+ /// interpret the embedded Arrow schema.
286
+ pub fn with_coerce_types ( mut self , coerce_types : bool ) -> Self {
287
+ self . coerce_types = coerce_types;
288
+ self
289
+ }
290
+
291
+ /// Set the root schema element name (defaults to `"arrow_schema"`).
292
+ pub fn schema_root ( mut self , schema_root : & ' a str ) -> Self {
293
+ self . schema_root = schema_root;
294
+ self
295
+ }
296
+
297
+ /// Build the desired parquet [`SchemaDescriptor`]
298
+ pub fn build ( self ) -> Result < SchemaDescriptor > {
299
+ let Self { schema, schema_root : root_schema_name, coerce_types } = self ;
300
+ let fields = schema
301
+ . fields ( )
302
+ . iter ( )
303
+ . map ( |field| arrow_to_parquet_type ( field, coerce_types) . map ( Arc :: new) )
304
+ . collect :: < Result < _ > > ( ) ?;
305
+ let group = Type :: group_type_builder ( root_schema_name) . with_fields ( fields) . build ( ) ?;
306
+ Ok ( SchemaDescriptor :: new ( Arc :: new ( group) ) )
307
+ }
308
+ }
309
+
228
310
/// Convert arrow schema to parquet schema
229
311
///
230
312
/// The name of the root schema element defaults to `"arrow_schema"`, this can be
231
313
/// overridden with [`ArrowToParquetSchemaConverter::schema_root`]
232
- pub fn arrow_to_parquet_schema ( schema : & Schema , coerce_types : bool ) -> Result < SchemaDescriptor > {
233
- arrow_to_parquet_schema_with_root ( schema, "arrow_schema" , coerce_types)
234
- }
314
+ #[ deprecated( since = "54.0.0" , note = "Use `ArrowToParquetSchemaConverter` instead" ) ]
315
+ pub fn arrow_to_parquet_schema ( schema : & Schema ) -> Result < SchemaDescriptor > {
235
316
236
- /// Convert arrow schema to parquet schema specifying the name of the root schema element
237
- pub fn arrow_to_parquet_schema_with_root (
238
- schema : & Schema ,
239
- root : & str ,
240
- coerce_types : bool ,
241
- ) -> Result < SchemaDescriptor > {
242
- let fields = schema
243
- . fields ( )
244
- . iter ( )
245
- . map ( |field| arrow_to_parquet_type ( field, coerce_types) . map ( Arc :: new) )
246
- . collect :: < Result < _ > > ( ) ?;
247
- let group = Type :: group_type_builder ( root) . with_fields ( fields) . build ( ) ?;
248
- Ok ( SchemaDescriptor :: new ( Arc :: new ( group) ) )
317
+ ArrowToParquetSchemaConverter :: new ( schema) . build ( )
249
318
}
250
319
320
+
251
321
fn parse_key_value_metadata (
252
322
key_value_metadata : Option < & Vec < KeyValue > > ,
253
323
) -> Option < HashMap < String , String > > {
@@ -1569,7 +1639,7 @@ mod tests {
1569
1639
Field :: new( "decimal256" , DataType :: Decimal256 ( 39 , 2 ) , false ) ,
1570
1640
] ;
1571
1641
let arrow_schema = Schema :: new ( arrow_fields) ;
1572
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, false ) . unwrap ( ) ;
1642
+ let converted_arrow_schema = ArrowToParquetSchemaConverter :: new ( & arrow_schema) . build ( ) . unwrap ( ) ;
1573
1643
1574
1644
assert_eq ! (
1575
1645
parquet_schema. columns( ) . len( ) ,
@@ -1606,9 +1676,10 @@ mod tests {
1606
1676
false ,
1607
1677
) ] ;
1608
1678
let arrow_schema = Schema :: new ( arrow_fields) ;
1609
- let converted_arrow_schema = arrow_to_parquet_schema ( & arrow_schema, true ) ;
1679
+ let converted_arrow_schema = ArrowToParquetSchemaConverter :: new ( & arrow_schema)
1680
+ . with_coerce_types ( true )
1681
+ . build ( ) ;
1610
1682
1611
- assert ! ( converted_arrow_schema. is_err( ) ) ;
1612
1683
converted_arrow_schema. unwrap ( ) ;
1613
1684
}
1614
1685
@@ -1878,7 +1949,9 @@ mod tests {
1878
1949
// don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
1879
1950
let arrow_schema = crate :: arrow:: parquet_to_arrow_schema ( & schema_descriptor, None ) ?;
1880
1951
1881
- let parq_schema_descr = crate :: arrow:: arrow_to_parquet_schema ( & arrow_schema, true ) ?;
1952
+ let parq_schema_descr = crate :: arrow:: ArrowToParquetSchemaConverter :: new ( & arrow_schema)
1953
+ . with_coerce_types ( true )
1954
+ . build ( ) ?;
1882
1955
let parq_fields = parq_schema_descr. root_schema ( ) . get_fields ( ) ;
1883
1956
assert_eq ! ( parq_fields. len( ) , 2 ) ;
1884
1957
assert_eq ! ( parq_fields[ 0 ] . get_basic_info( ) . id( ) , 1 ) ;
0 commit comments