Skip to content

Commit 7ea0ce7

Browse files
authored
fix timestamp 0001-01-01 overflow (#67)
* fix: timestamp 0001-01-01 overflow * builder
1 parent ed48394 commit 7ea0ce7

File tree

6 files changed

+186
-19
lines changed

6 files changed

+186
-19
lines changed

src/array_decoder/timestamp.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -251,11 +251,28 @@ impl<T: ArrowTimestampType> ArrayBatchDecoder for TimestampOffsetArrayDecoder<T>
251251
let convert_timezone = |ts| {
252252
// Convert from writer timezone to reader timezone (which we default to UTC)
253253
// TODO: more efficient way of doing this?
254-
self.writer_tz
255-
.timestamp_nanos(ts)
256-
.naive_local()
257-
.and_utc()
258-
.timestamp_nanos_opt()
254+
let microseconds_in_timeunit = match T::UNIT {
255+
TimeUnit::Second => 1_000_000,
256+
TimeUnit::Millisecond => 1_000,
257+
TimeUnit::Microsecond => 1,
258+
TimeUnit::Nanosecond => -1, // not used in this case
259+
};
260+
261+
match T::UNIT {
262+
TimeUnit::Second | TimeUnit::Millisecond | TimeUnit::Microsecond => self
263+
.writer_tz
264+
.timestamp_micros(ts * microseconds_in_timeunit)
265+
.single()
266+
.map(|dt| {
267+
dt.naive_local().and_utc().timestamp_micros() / microseconds_in_timeunit
268+
}),
269+
TimeUnit::Nanosecond => self
270+
.writer_tz
271+
.timestamp_nanos(ts)
272+
.naive_local()
273+
.and_utc()
274+
.timestamp_nanos_opt(),
275+
}
259276
};
260277
let array = array
261278
// first try to convert all non-nullable batches to non-nullable batches

src/arrow_reader.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use crate::projection::ProjectionMask;
2929
use crate::reader::metadata::{read_metadata, FileMetadata};
3030
use crate::reader::ChunkReader;
3131
use crate::row_selection::RowSelection;
32-
use crate::schema::RootDataType;
32+
use crate::schema::{ArrowSchemaOptions, RootDataType, TimestampPrecision};
3333
use crate::stripe::{Stripe, StripeMetadata};
3434

3535
const DEFAULT_BATCH_SIZE: usize = 8192;
@@ -42,6 +42,7 @@ pub struct ArrowReaderBuilder<R> {
4242
pub(crate) schema_ref: Option<SchemaRef>,
4343
pub(crate) file_byte_range: Option<Range<usize>>,
4444
pub(crate) row_selection: Option<RowSelection>,
45+
pub(crate) timestamp_precision: TimestampPrecision,
4546
}
4647

4748
impl<R> ArrowReaderBuilder<R> {
@@ -54,6 +55,7 @@ impl<R> ArrowReaderBuilder<R> {
5455
schema_ref: None,
5556
file_byte_range: None,
5657
row_selection: None,
58+
timestamp_precision: TimestampPrecision::default(),
5759
}
5860
}
5961

@@ -109,6 +111,28 @@ impl<R> ArrowReaderBuilder<R> {
109111
self
110112
}
111113

114+
/// Sets the timestamp precision for reading timestamp columns.
115+
///
116+
/// By default, timestamps are read with nanosecond precision.
117+
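/// Use this method to switch to microsecond precision, either for compatibility or to read timestamps that fall outside the range representable in nanoseconds (for example, dates around year 0001).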
118+
///
119+
/// # Examples
120+
///
121+
/// ```no_run
122+
/// # use std::fs::File;
123+
/// # use orc_rust::arrow_reader::ArrowReaderBuilder;
124+
/// # use orc_rust::schema::TimestampPrecision;
125+
/// let file = File::open("/path/to/file.orc").unwrap();
126+
/// let reader = ArrowReaderBuilder::try_new(file)
127+
/// .unwrap()
128+
/// .with_timestamp_precision(TimestampPrecision::Microsecond)
129+
/// .build();
130+
/// ```
131+
pub fn with_timestamp_precision(mut self, precision: TimestampPrecision) -> Self {
132+
self.timestamp_precision = precision;
133+
self
134+
}
135+
112136
/// Returns the currently computed schema
113137
///
114138
/// Unless [`with_schema`](Self::with_schema) was called, this is computed dynamically
@@ -124,9 +148,11 @@ impl<R> ArrowReaderBuilder<R> {
124148
.iter()
125149
.map(|(key, value)| (key.clone(), String::from_utf8_lossy(value).to_string()))
126150
.collect::<HashMap<_, _>>();
127-
self.schema_ref
128-
.clone()
129-
.unwrap_or_else(|| Arc::new(projected_data_type.create_arrow_schema(&metadata)))
151+
self.schema_ref.clone().unwrap_or_else(|| {
152+
let options =
153+
ArrowSchemaOptions::new().with_timestamp_precision(self.timestamp_precision);
154+
Arc::new(projected_data_type.create_arrow_schema_with_options(&metadata, options))
155+
})
130156
}
131157
}
132158

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,4 @@ pub use arrow_writer::{ArrowWriter, ArrowWriterBuilder};
7272
#[cfg(feature = "async")]
7373
pub use async_arrow_reader::ArrowStreamReader;
7474
pub use row_selection::{RowSelection, RowSelector};
75+
pub use schema::{ArrowSchemaOptions, TimestampPrecision};

src/schema.rs

Lines changed: 82 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,55 @@ use crate::proto;
2727

2828
use arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit, UnionMode};
2929

30+
/// Configuration for timestamp precision when converting ORC timestamps to Arrow.
31+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
32+
pub enum TimestampPrecision {
33+
/// Convert timestamps to microseconds (lower precision).
34+
Microsecond,
35+
/// Convert timestamps to nanoseconds (default, higher precision).
36+
#[default]
37+
Nanosecond,
38+
}
39+
40+
/// Builder for configuring Arrow schema conversion options.
41+
#[derive(Debug, Clone)]
42+
pub struct ArrowSchemaOptions {
43+
timestamp_precision: TimestampPrecision,
44+
}
45+
46+
impl Default for ArrowSchemaOptions {
47+
fn default() -> Self {
48+
Self::new()
49+
}
50+
}
51+
52+
impl ArrowSchemaOptions {
53+
/// Create a new options builder with default values.
54+
/// - Timestamp precision is [`TimestampPrecision::Nanosecond`]
55+
pub fn new() -> Self {
56+
Self {
57+
timestamp_precision: TimestampPrecision::default(),
58+
}
59+
}
60+
61+
/// Set the timestamp precision for converting ORC timestamps to Arrow.
62+
///
63+
/// ORC timestamps have nanosecond precision, but you may want to convert
64+
/// them to microseconds for compatibility with systems that don't support
65+
/// nanosecond precision.
66+
///
67+
/// Default: [`TimestampPrecision::Nanosecond`]
68+
pub fn with_timestamp_precision(mut self, precision: TimestampPrecision) -> Self {
69+
self.timestamp_precision = precision;
70+
self
71+
}
72+
73+
/// Get the timestamp precision
74+
fn timestamp_precision(&self) -> TimestampPrecision {
75+
self.timestamp_precision
76+
}
77+
}
78+
3079
/// Represents the root data type of the ORC file. Contains multiple named child types
3180
/// which map to the columns available. Allows projecting only specific columns from
3281
/// the base schema.
@@ -63,11 +112,22 @@ impl RootDataType {
63112

64113
/// Convert into an Arrow schema.
65114
pub fn create_arrow_schema(&self, user_metadata: &HashMap<String, String>) -> Schema {
115+
self.create_arrow_schema_with_options(user_metadata, ArrowSchemaOptions::new())
116+
}
117+
118+
/// Convert into an Arrow schema with custom options.
119+
pub fn create_arrow_schema_with_options(
120+
&self,
121+
user_metadata: &HashMap<String, String>,
122+
options: ArrowSchemaOptions,
123+
) -> Schema {
66124
let fields = self
67125
.children
68126
.iter()
69127
.map(|col| {
70-
let dt = col.data_type().to_arrow_data_type();
128+
let dt = col
129+
.data_type()
130+
.to_arrow_data_type_with_options(options.clone());
71131
Field::new(col.name(), dt, true)
72132
})
73133
.collect::<Vec<_>>();
@@ -434,7 +494,19 @@ impl DataType {
434494
Ok(dt)
435495
}
436496

497+
/// Convert this ORC data type to an Arrow data type with default options.
437498
pub fn to_arrow_data_type(&self) -> ArrowDataType {
499+
self.to_arrow_data_type_with_options(ArrowSchemaOptions::new())
500+
}
501+
502+
/// Convert this ORC data type to an Arrow data type with custom options.
503+
pub fn to_arrow_data_type_with_options(&self, options: ArrowSchemaOptions) -> ArrowDataType {
504+
let timestamp_precision = options.timestamp_precision();
505+
let time_unit = match timestamp_precision {
506+
TimestampPrecision::Microsecond => TimeUnit::Microsecond,
507+
TimestampPrecision::Nanosecond => TimeUnit::Nanosecond,
508+
};
509+
438510
match self {
439511
DataType::Boolean { .. } => ArrowDataType::Boolean,
440512
DataType::Byte { .. } => ArrowDataType::Int8,
@@ -450,33 +522,35 @@ impl DataType {
450522
DataType::Decimal {
451523
precision, scale, ..
452524
} => ArrowDataType::Decimal128(*precision as u8, *scale as i8), // TODO: safety of cast?
453-
DataType::Timestamp { .. } => ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
525+
DataType::Timestamp { .. } => ArrowDataType::Timestamp(time_unit, None),
454526
DataType::TimestampWithLocalTimezone { .. } => {
455-
ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()))
527+
ArrowDataType::Timestamp(time_unit, Some("UTC".into()))
456528
}
457529
DataType::Date { .. } => ArrowDataType::Date32,
458530
DataType::Struct { children, .. } => {
459531
let children = children
460532
.iter()
461533
.map(|col| {
462-
let dt = col.data_type().to_arrow_data_type();
534+
let dt = col
535+
.data_type()
536+
.to_arrow_data_type_with_options(options.clone());
463537
Field::new(col.name(), dt, true)
464538
})
465539
.collect();
466540
ArrowDataType::Struct(children)
467541
}
468542
DataType::List { child, .. } => {
469-
let child = child.to_arrow_data_type();
543+
let child = child.to_arrow_data_type_with_options(options);
470544
ArrowDataType::new_list(child, true)
471545
}
472546
DataType::Map { key, value, .. } => {
473547
// TODO: this needs to be kept in sync with MapArrayDecoder
474548
// move to common location?
475549
// TODO: should it be "keys" and "values" (like arrow-rs)
476550
// or "key" and "value" like PyArrow and in Schema.fbs?
477-
let key = key.to_arrow_data_type();
551+
let key = key.to_arrow_data_type_with_options(options.clone());
478552
let key = Field::new("keys", key, false);
479-
let value = value.to_arrow_data_type();
553+
let value = value.to_arrow_data_type_with_options(options);
480554
let value = Field::new("values", value, true);
481555

482556
let dt = ArrowDataType::Struct(vec![key, value].into());
@@ -492,7 +566,7 @@ impl DataType {
492566
// TODO: Support up to including 256
493567
// Need to do Union within Union
494568
let index = index as u8 as i8;
495-
let arrow_dt = variant.to_arrow_data_type();
569+
let arrow_dt = variant.to_arrow_data_type_with_options(options.clone());
496570
// Name shouldn't matter here (only ORC struct types give names to subtypes anyway)
497571
// Using naming convention following PyArrow for easier comparison
498572
let field = Arc::new(Field::new(format!("_union_{index}"), arrow_dt, true));

tests/basic/main.rs

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use std::fs::File;
1919
use std::ops::Range;
2020
use std::sync::Arc;
2121

22+
use crate::misc::{LONG_BOOL_EXPECTED, LONG_STRING_DICT_EXPECTED, LONG_STRING_EXPECTED};
2223
use arrow::datatypes::{DataType, Decimal128Type, DecimalType, Field, Schema, TimeUnit};
2324
use arrow::record_batch::{RecordBatch, RecordBatchReader};
2425
use arrow::util::pretty;
@@ -28,8 +29,7 @@ use orc_rust::arrow_reader::{ArrowReader, ArrowReaderBuilder};
2829
#[cfg(feature = "async")]
2930
use orc_rust::async_arrow_reader::ArrowStreamReader;
3031
use orc_rust::projection::ProjectionMask;
31-
32-
use crate::misc::{LONG_BOOL_EXPECTED, LONG_STRING_DICT_EXPECTED, LONG_STRING_EXPECTED};
32+
use orc_rust::TimestampPrecision;
3333

3434
mod misc;
3535

@@ -701,6 +701,55 @@ pub fn decimal128_timestamps_1900_test() {
701701
assert_batches_eq(&[batch], &expected);
702702
}
703703

704+
#[test]
705+
pub fn timestamps_0001_test() {
706+
let path = integration_path("timestamps_0001.orc");
707+
let f = File::open(path).expect("no file found");
708+
let mut reader = ArrowReaderBuilder::try_new(f)
709+
.unwrap()
710+
.with_schema(Arc::new(Schema::new(vec![Field::new(
711+
"c1",
712+
DataType::Timestamp(TimeUnit::Microsecond, None),
713+
true,
714+
)])))
715+
.build();
716+
let batch = reader.next().unwrap().unwrap();
717+
718+
let expected = [
719+
"+---------------------+",
720+
"| c1 |",
721+
"+---------------------+",
722+
"| 0000-12-30T00:00:00 |",
723+
"+---------------------+",
724+
];
725+
assert_batches_eq(&[batch], &expected);
726+
}
727+
728+
#[test]
729+
pub fn timestamps_0001_projection_test() {
730+
let path = integration_path("timestamps_0001.orc");
731+
let f = File::open(path).expect("no file found");
732+
733+
let builder = ArrowReaderBuilder::try_new(f).unwrap();
734+
let projection = ProjectionMask::named_roots(builder.file_metadata().root_data_type(), &["c1"]);
735+
736+
let mut reader = builder
737+
.with_projection(projection)
738+
.with_timestamp_precision(TimestampPrecision::Microsecond)
739+
.build();
740+
741+
let batch = reader.next().unwrap().unwrap();
742+
743+
let expected = [
744+
"+---------------------+",
745+
"| c1 |",
746+
"+---------------------+",
747+
"| 0000-12-30T00:00:00 |",
748+
"+---------------------+",
749+
];
750+
assert_batches_eq(&[batch], &expected);
751+
}
752+
704753
// From https://github.com/apache/arrow-rs/blob/7705acad845e8b2a366a08640f7acb4033ed7049/arrow-flight/src/sql/metadata/mod.rs#L67-L75
705754
pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
706755
let formatted = pretty::pretty_format_batches(batches).unwrap().to_string();
338 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)