diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index b2c87bba5143..43f5875ee6c0 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -1346,8 +1346,10 @@ mod tests { IntervalMonthDayNanoType::make_value(35, -19, 41899000000000000) ]) ); - let a = IntervalMonthDayNanoArray::from(vec![i64::MAX as i128]); - let b = IntervalMonthDayNanoArray::from(vec![1]); + let max_nanos = IntervalMonthDayNanoType::make_value(0, 0, i64::MAX); + let a = IntervalMonthDayNanoArray::from(vec![max_nanos]); + let one_nanos = IntervalMonthDayNanoType::make_value(0, 0, 1); + let b = IntervalMonthDayNanoArray::from(vec![one_nanos]); let err = add(&a, &b).unwrap_err().to_string(); assert_eq!( err, diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 038b2a291f58..250fd4d1baa0 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -264,11 +264,11 @@ Each field is independent (e.g. there is no constraint that the quantity of nanoseconds represents less than a day's worth of time). ```text -┌──────────────────────────────┬─────────────┬──────────────┐ -│ Nanos │ Days │ Months │ -│ (64 bits) │ (32 bits) │ (32 bits) │ -└──────────────────────────────┴─────────────┴──────────────┘ - 0 63 95 127 bit offset +┌───────────────┬─────────────┬─────────────────────────────┐ +│ Months │ Days │ Nanos │ +│ (32 bits) │ (32 bits) │ (64 bits) │ +└───────────────┴─────────────┴─────────────────────────────┘ + 0 32 64 128 bit offset ``` Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L409-L415) for more details @@ -290,9 +290,9 @@ representation which is fast and efficient, but leads to potentially surprising results. For example a -`IntervalMonthDayNano` of `1 month` will compare as **greater** than a -`IntervalMonthDayNano` of `100 days` because the binary representation of `1 month` -is larger than the binary representation of 100 days. +`IntervalMonthDayNano` of `1 month` will compare as **less** than a +`IntervalMonthDayNano` of `1 days` because the binary representation of `1 month` +is smaller than the binary representation of 1 days. "# ); make_type!( @@ -928,13 +928,14 @@ impl IntervalDayTimeType { int32_t milliseconds = 0; ... } - 64 56 48 40 32 24 16 8 0 - +-------+-------+-------+-------+-------+-------+-------+-------+ - | days | milliseconds | - +-------+-------+-------+-------+-------+-------+-------+-------+ + ┌──────────────┬──────────────┐ + │ Days │ Milliseconds │ + │ (32 bits) │ (32 bits) │ + └──────────────┴──────────────┘ + 0 31 63 bit offset */ - let m = millis as u64 & u32::MAX as u64; - let d = (days as u64 & u32::MAX as u64) << 32; + let m = (millis as u64 & u32::MAX as u64) << 32; + let d = days as u64 & u32::MAX as u64; (m | d) as ::Native } @@ -945,8 +946,8 @@ impl IntervalDayTimeType { /// * `i` - The IntervalDayTimeType to convert #[inline] pub fn to_parts(i: ::Native) -> (i32, i32) { - let days = (i >> 32) as i32; - let ms = i as i32; + let days = i as i32; + let ms = (i >> 32) as i32; (days, ms) } } @@ -972,14 +973,16 @@ impl IntervalMonthDayNanoType { int32_t days; int64_t nanoseconds; } - 128 112 96 80 64 48 32 16 0 - +-------+-------+-------+-------+-------+-------+-------+-------+ - | months | days | nanos | - +-------+-------+-------+-------+-------+-------+-------+-------+ + ┌───────────────┬─────────────┬─────────────────────────────┐ + │ Months │ Days │ Nanos │ + │ (32 bits) │ (32 bits) │ (64 bits) │ + └───────────────┴─────────────┴─────────────────────────────┘ + 0 32 64 128 bit offset + */ - let m = (months as u128 & u32::MAX as u128) << 96; - let d = (days as u128 & u32::MAX as u128) << 64; - let n = nanos as u128 & u64::MAX as u128; + let m = months as u128 & u32::MAX as u128; + let d = (days as u128 & u32::MAX as u128) << 32; + let n = (nanos as u128 & u64::MAX as u128) << 64; (m | d | n) as ::Native } @@ -992,9 +995,9 @@ impl IntervalMonthDayNanoType { pub fn to_parts( i: ::Native, ) -> (i32, i32, i64) { - let months = (i >> 96) as i32; - let days = (i >> 64) as i32; - let nanos = i as i64; + let months = i as i32; + let days = (i >> 32) as i32; + let nanos = (i >> 64) as i64; (months, days, nanos) } } diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 8b7579c4cfc0..3f266a603b48 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -37,32 +37,34 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -mod decimal; -mod dictionary; -mod list; -mod string; -use crate::cast::decimal::*; -use crate::cast::dictionary::*; -use crate::cast::list::*; -use crate::cast::string::*; - -use chrono::{NaiveTime, Offset, TimeZone, Utc}; use std::cmp::Ordering; use std::sync::Arc; -use crate::display::{ArrayFormatter, FormatOptions}; -use crate::parse::{ - parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, - string_to_datetime, Parser, -}; +use chrono::{NaiveTime, Offset, TimeZone, Utc}; +use num::cast::AsPrimitive; +use num::{NumCast, ToPrimitive}; + use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; use arrow_buffer::{i256, ArrowNativeType, OffsetBuffer}; use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; -use num::cast::AsPrimitive; -use num::{NumCast, ToPrimitive}; + +use crate::cast::decimal::*; +use crate::cast::dictionary::*; +use crate::cast::list::*; +use crate::cast::string::*; +use crate::display::{ArrayFormatter, FormatOptions}; +use crate::parse::{ + parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, + string_to_datetime, Parser, +}; + +mod decimal; +mod dictionary; +mod list; +mod string; /// CastOptions provides a way to override the default cast behaviors #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -384,10 +386,13 @@ fn cast_month_day_nano_to_duration>( _ => unreachable!(), }; + let convert = |x| { + let (month, day, nanos) = IntervalMonthDayNanoType::to_parts(x); + (month == 0 && day == 0).then_some(nanos / scale) + }; + if cast_options.safe { - let iter = array - .iter() - .map(|v| v.and_then(|v| (v >> 64 == 0).then_some((v as i64) / scale))); + let iter = array.iter().map(|x| x.and_then(convert)); Ok(Arc::new(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) })) @@ -395,12 +400,13 @@ fn cast_month_day_nano_to_duration>( let vec = array .iter() .map(|v| { - v.map(|v| match v >> 64 { - 0 => Ok((v as i64) / scale), - _ => Err(ArrowError::ComputeError( - "Cannot convert interval containing non-zero months or days to duration" - .to_string(), - )), + v.map(|v| { + convert(v).ok_or_else(|| { + ArrowError::ComputeError( + "Cannot convert interval containing non-zero months or days to duration" + .to_string(), + ) + }) }) .transpose() }) @@ -2240,10 +2246,11 @@ where #[cfg(test)] mod tests { - use arrow_buffer::{Buffer, NullBuffer}; use chrono::NaiveDate; use half::f16; + use arrow_buffer::{Buffer, NullBuffer}; + use super::*; macro_rules! generate_cast_test_case { @@ -8272,10 +8279,10 @@ mod tests { }; // from interval month day nano to duration second - let array = vec![1234567].into(); + let array = vec![IntervalMonthDayNanoType::make_value(0, 0, 1_000_000_000)].into(); let casted_array: DurationSecondArray = cast_from_interval_to_duration(&array, &nullable).unwrap(); - assert_eq!(casted_array.value(0), 0); + assert_eq!(casted_array.value(0), 1); let array = vec![i128::MAX].into(); let casted_array: DurationSecondArray = @@ -8286,7 +8293,7 @@ mod tests { assert!(res.is_err()); // from interval month day nano to duration millisecond - let array = vec![1234567].into(); + let array = vec![IntervalMonthDayNanoType::make_value(0, 0, 1234567)].into(); let casted_array: DurationMillisecondArray = cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 1); @@ -8300,7 +8307,7 @@ mod tests { assert!(res.is_err()); // from interval month day nano to duration microsecond - let array = vec![1234567].into(); + let array = vec![IntervalMonthDayNanoType::make_value(0, 0, 1234567)].into(); let casted_array: DurationMicrosecondArray = cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 1234); @@ -8315,7 +8322,7 @@ mod tests { assert!(casted_array.is_err()); // from interval month day nano to duration nanosecond - let array = vec![1234567].into(); + let array = vec![IntervalMonthDayNanoType::make_value(0, 0, 1234567)].into(); let casted_array: DurationNanosecondArray = cast_from_interval_to_duration(&array, &nullable).unwrap(); assert_eq!(casted_array.value(0), 1234567); @@ -8373,7 +8380,7 @@ mod tests { #[test] fn test_cast_from_interval_year_month_to_interval_month_day_nano() { // from interval year month to interval month day nano - let array = vec![1234567]; + let array = vec![IntervalYearMonthType::make_value(23, 34)]; let casted_array = cast_from_interval_year_month_to_interval_month_day_nano( array, &CastOptions::default(), @@ -8383,7 +8390,10 @@ mod tests { casted_array.data_type(), &DataType::Interval(IntervalUnit::MonthDayNano) ); - assert_eq!(casted_array.value(0), 97812474910747780469848774134464512); + assert_eq!( + casted_array.value(0), + IntervalMonthDayNanoType::make_value(310, 0, 0) + ); } /// helper function to test casting from interval day time to interval month day nano @@ -8406,7 +8416,7 @@ mod tests { #[test] fn test_cast_from_interval_day_time_to_interval_month_day_nano() { // from interval day time to interval month day nano - let array = vec![123]; + let array = vec![IntervalDayTimeType::make_value(23, 123)]; let casted_array = cast_from_interval_day_time_to_interval_month_day_nano(array, &CastOptions::default()) .unwrap(); @@ -8414,7 +8424,10 @@ mod tests { casted_array.data_type(), &DataType::Interval(IntervalUnit::MonthDayNano) ); - assert_eq!(casted_array.value(0), 123000000); + assert_eq!( + casted_array.value(0), + IntervalMonthDayNanoType::make_value(0, 23, 123000000) + ); } #[test] diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index a5f69b660944..d0c4caf426a3 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -660,19 +660,16 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { - let value: u64 = self.value(idx) as u64; + let (days, millis) = IntervalDayTimeType::to_parts(self.value(idx)); - let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; - let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; - - let secs = milliseconds_part / 1_000; + let secs = millis / 1_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let milliseconds = milliseconds_part % 1_000; + let milliseconds = millis % 1_000; let secs_sign = if secs < 0 || milliseconds < 0 { "-" @@ -683,7 +680,7 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { write!( f, "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", - days_parts, + days, hours, mins, secs_sign, @@ -696,28 +693,23 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { - let value: u128 = self.value(idx) as u128; - - let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; - let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; - let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; - - let secs = nanoseconds_part / 1_000_000_000; + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(self.value(idx)); + let secs = nanos / 1_000_000_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let nanoseconds = nanoseconds_part % 1_000_000_000; + let nanoseconds = nanos % 1_000_000_000; let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; write!( f, "0 years {} mons {} days {} hours {} mins {}{}.{:09} secs", - months_part, - days_part, + months, + days, hours, mins, secs_sign, diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 4d552b038a7d..df68e7eb72ee 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -1714,18 +1714,17 @@ mod tests { IntervalDayTimeType::make_value(1, 3000), // 90M milliseconds IntervalDayTimeType::make_value(0, 90_000_000), + IntervalDayTimeType::make_value(4, 10), ], vec![ IntervalDayTimeType::make_value(0, 1000), IntervalDayTimeType::make_value(1, 0), IntervalDayTimeType::make_value(10, 0), IntervalDayTimeType::make_value(2, 1), - // NB even though 1 day is less than 90M milliseconds long, - // it compares as greater because the underlying type stores - // days and milliseconds as different fields IntervalDayTimeType::make_value(0, 12), + IntervalDayTimeType::make_value(56, 10), ], - vec![false, true, true, true ,false] + vec![true, false, false, false, false, true] ); cmp_vec!( @@ -1771,7 +1770,7 @@ mod tests { // 100 days (note is treated as greater than 1 month as the underlying integer representation) IntervalMonthDayNanoType::make_value(0, 100, 0), ], - vec![false, false, true, false, false] + vec![false, true, true, true, true] ); } diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index e793038de929..8304daff0424 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -227,14 +227,12 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, cmp(0, 1)); - assert_eq!(Ordering::Greater, cmp(1, 0)); - - // somewhat confusingly, while 90M milliseconds is more than 1 day, - // it will compare less as the comparison is done on the underlying - // values not field by field - assert_eq!(Ordering::Greater, cmp(1, 2)); - assert_eq!(Ordering::Less, cmp(2, 1)); + // Ordering is based on the underlying values + // leading to potentially surprising results + assert_eq!(Ordering::Greater, cmp(0, 1)); + assert_eq!(Ordering::Less, cmp(1, 0)); + assert_eq!(Ordering::Less, cmp(1, 2)); + assert_eq!(Ordering::Greater, cmp(2, 1)); } #[test] @@ -271,14 +269,12 @@ pub mod tests { let cmp = build_compare(&array, &array).unwrap(); - assert_eq!(Ordering::Less, cmp(0, 1)); - assert_eq!(Ordering::Greater, cmp(1, 0)); - - // somewhat confusingly, while 100 days is more than 1 month in all cases - // it will compare less as the comparison is done on the underlying - // values not field by field - assert_eq!(Ordering::Greater, cmp(1, 2)); - assert_eq!(Ordering::Less, cmp(2, 1)); + // Ordering is based on the underlying values + // leading to potentially surprising results + assert_eq!(Ordering::Greater, cmp(0, 1)); + assert_eq!(Ordering::Less, cmp(1, 0)); + assert_eq!(Ordering::Less, cmp(1, 2)); + assert_eq!(Ordering::Greater, cmp(2, 1)); } #[test]