@@ -27,6 +27,55 @@ use crate::proto;
2727
2828use arrow:: datatypes:: { DataType as ArrowDataType , Field , Schema , TimeUnit , UnionMode } ;
2929
/// Time unit used for Arrow timestamp types when converting ORC timestamps to Arrow.
///
/// The [`Default`] value is [`TimestampPrecision::Nanosecond`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum TimestampPrecision {
    /// Produce microsecond timestamps (the lower-precision choice).
    Microsecond,
    /// Produce nanosecond timestamps (the higher-precision choice; this is the default).
    #[default]
    Nanosecond,
}
39+
40+ /// Builder for configuring Arrow schema conversion options.
41+ #[ derive( Debug , Clone ) ]
42+ pub struct ArrowSchemaOptions {
43+ timestamp_precision : TimestampPrecision ,
44+ }
45+
46+ impl Default for ArrowSchemaOptions {
47+ fn default ( ) -> Self {
48+ Self :: new ( )
49+ }
50+ }
51+
52+ impl ArrowSchemaOptions {
53+ /// Create a new options builder with default values.
54+ /// - Timestamp precision is [`TimestampPrecision::Nanosecond`]
55+ pub fn new ( ) -> Self {
56+ Self {
57+ timestamp_precision : TimestampPrecision :: default ( ) ,
58+ }
59+ }
60+
61+ /// Set the timestamp precision for converting ORC timestamps to Arrow.
62+ ///
63+ /// ORC timestamps have nanosecond precision, but you may want to convert
64+ /// them to microseconds for compatibility with systems that don't support
65+ /// nanosecond precision.
66+ ///
67+ /// Default: [`TimestampPrecision::Nanosecond`]
68+ pub fn with_timestamp_precision ( mut self , precision : TimestampPrecision ) -> Self {
69+ self . timestamp_precision = precision;
70+ self
71+ }
72+
73+ /// Get the timestamp precision
74+ fn timestamp_precision ( & self ) -> TimestampPrecision {
75+ self . timestamp_precision
76+ }
77+ }
78+
3079/// Represents the root data type of the ORC file. Contains multiple named child types
3180/// which map to the columns available. Allows projecting only specific columns from
3281/// the base schema.
@@ -63,11 +112,22 @@ impl RootDataType {
63112
64113 /// Convert into an Arrow schema.
65114 pub fn create_arrow_schema ( & self , user_metadata : & HashMap < String , String > ) -> Schema {
115+ self . create_arrow_schema_with_options ( user_metadata, ArrowSchemaOptions :: new ( ) )
116+ }
117+
118+ /// Convert into an Arrow schema with custom options.
119+ pub fn create_arrow_schema_with_options (
120+ & self ,
121+ user_metadata : & HashMap < String , String > ,
122+ options : ArrowSchemaOptions ,
123+ ) -> Schema {
66124 let fields = self
67125 . children
68126 . iter ( )
69127 . map ( |col| {
70- let dt = col. data_type ( ) . to_arrow_data_type ( ) ;
128+ let dt = col
129+ . data_type ( )
130+ . to_arrow_data_type_with_options ( options. clone ( ) ) ;
71131 Field :: new ( col. name ( ) , dt, true )
72132 } )
73133 . collect :: < Vec < _ > > ( ) ;
@@ -434,7 +494,19 @@ impl DataType {
434494 Ok ( dt)
435495 }
436496
497+ /// Convert this ORC data type to an Arrow data type with default options.
437498 pub fn to_arrow_data_type ( & self ) -> ArrowDataType {
499+ self . to_arrow_data_type_with_options ( ArrowSchemaOptions :: new ( ) )
500+ }
501+
502+ /// Convert this ORC data type to an Arrow data type with custom options.
503+ pub fn to_arrow_data_type_with_options ( & self , options : ArrowSchemaOptions ) -> ArrowDataType {
504+ let timestamp_precision = options. timestamp_precision ( ) ;
505+ let time_unit = match timestamp_precision {
506+ TimestampPrecision :: Microsecond => TimeUnit :: Microsecond ,
507+ TimestampPrecision :: Nanosecond => TimeUnit :: Nanosecond ,
508+ } ;
509+
438510 match self {
439511 DataType :: Boolean { .. } => ArrowDataType :: Boolean ,
440512 DataType :: Byte { .. } => ArrowDataType :: Int8 ,
@@ -450,33 +522,35 @@ impl DataType {
450522 DataType :: Decimal {
451523 precision, scale, ..
452524 } => ArrowDataType :: Decimal128 ( * precision as u8 , * scale as i8 ) , // TODO: safety of cast?
453- DataType :: Timestamp { .. } => ArrowDataType :: Timestamp ( TimeUnit :: Nanosecond , None ) ,
525+ DataType :: Timestamp { .. } => ArrowDataType :: Timestamp ( time_unit , None ) ,
454526 DataType :: TimestampWithLocalTimezone { .. } => {
455- ArrowDataType :: Timestamp ( TimeUnit :: Nanosecond , Some ( "UTC" . into ( ) ) )
527+ ArrowDataType :: Timestamp ( time_unit , Some ( "UTC" . into ( ) ) )
456528 }
457529 DataType :: Date { .. } => ArrowDataType :: Date32 ,
458530 DataType :: Struct { children, .. } => {
459531 let children = children
460532 . iter ( )
461533 . map ( |col| {
462- let dt = col. data_type ( ) . to_arrow_data_type ( ) ;
534+ let dt = col
535+ . data_type ( )
536+ . to_arrow_data_type_with_options ( options. clone ( ) ) ;
463537 Field :: new ( col. name ( ) , dt, true )
464538 } )
465539 . collect ( ) ;
466540 ArrowDataType :: Struct ( children)
467541 }
468542 DataType :: List { child, .. } => {
469- let child = child. to_arrow_data_type ( ) ;
543+ let child = child. to_arrow_data_type_with_options ( options ) ;
470544 ArrowDataType :: new_list ( child, true )
471545 }
472546 DataType :: Map { key, value, .. } => {
473547 // TODO: this needs to be kept in sync with MapArrayDecoder
474548 // move to common location?
475549 // TODO: should it be "keys" and "values" (like arrow-rs)
476550 // or "key" and "value" like PyArrow and in Schema.fbs?
477- let key = key. to_arrow_data_type ( ) ;
551+ let key = key. to_arrow_data_type_with_options ( options . clone ( ) ) ;
478552 let key = Field :: new ( "keys" , key, false ) ;
479- let value = value. to_arrow_data_type ( ) ;
553+ let value = value. to_arrow_data_type_with_options ( options ) ;
480554 let value = Field :: new ( "values" , value, true ) ;
481555
482556 let dt = ArrowDataType :: Struct ( vec ! [ key, value] . into ( ) ) ;
@@ -492,7 +566,7 @@ impl DataType {
492566 // TODO: Support up to including 256
493567 // Need to do Union within Union
494568 let index = index as u8 as i8 ;
495- let arrow_dt = variant. to_arrow_data_type ( ) ;
569+ let arrow_dt = variant. to_arrow_data_type_with_options ( options . clone ( ) ) ;
496570 // Name shouldn't matter here (only ORC struct types give names to subtypes anyway)
497571 // Using naming convention following PyArrow for easier comparison
498572 let field = Arc :: new ( Field :: new ( format ! ( "_union_{index}" ) , arrow_dt, true ) ) ;
0 commit comments