From c5a9953f2a9193e9f31366219d2ee1853215a6d0 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 8 Dec 2023 13:38:12 -0500 Subject: [PATCH] Clarify interval comparison behavior with documentation and tests (#5192) * Clarify interval comparison behavior with documentation and tests * refine language --- arrow-array/src/array/primitive_array.rs | 8 +- arrow-array/src/types.rs | 71 +++++++++++++- arrow-ord/src/comparison.rs | 113 +++++++++++++++++++++++ arrow-ord/src/ord.rs | 67 ++++++++++++++ 4 files changed, 255 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 1112acacfcd9..2296cebd4681 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -352,12 +352,18 @@ pub type Time64MicrosecondArray = PrimitiveArray; pub type Time64NanosecondArray = PrimitiveArray; /// A [`PrimitiveArray`] of “calendar” intervals in months +/// +/// See [`IntervalYearMonthType`] for details on representation and caveats. pub type IntervalYearMonthArray = PrimitiveArray; /// A [`PrimitiveArray`] of “calendar” intervals in days and milliseconds +/// +/// See [`IntervalDayTimeType`] for details on representation and caveats. pub type IntervalDayTimeArray = PrimitiveArray; -/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds +/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and nanoseconds. +/// +/// See [`IntervalMonthDayNanoType`] for details on representation and caveats. pub type IntervalMonthDayNanoArray = PrimitiveArray; /// A [`PrimitiveArray`] of elapsed durations in seconds diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 16d0e822d052..6e177838c4f5 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -213,19 +213,84 @@ make_type!( IntervalYearMonthType, i32, DataType::Interval(IntervalUnit::YearMonth), - "A “calendar” interval type in months." + "A “calendar” interval stored as the number of whole months." ); make_type!( IntervalDayTimeType, i64, DataType::Interval(IntervalUnit::DayTime), - "A “calendar” interval type in days and milliseconds." + r#"A “calendar” interval type in days and milliseconds. + +## Representation +This type is stored as a single 64 bit integer, interpreted as two i32 fields: +1. the number of elapsed days +2. The number of milliseconds (no leap seconds), + +```text + ┌──────────────┬──────────────┐ + │ Days │ Milliseconds │ + │ (32 bits) │ (32 bits) │ + └──────────────┴──────────────┘ + 0 31 63 bit offset +``` +Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L406-L408) for more details + +## Note on Comparing and Ordering for Calendar Types + +Values of `IntervalDayTimeType` are compared using their binary representation, +which can lead to surprising results. Please see the description of ordering on +[`IntervalMonthDayNanoType`] for more details +"# ); make_type!( IntervalMonthDayNanoType, i128, DataType::Interval(IntervalUnit::MonthDayNano), - "A “calendar” interval type in months, days, and nanoseconds." + r#"A “calendar” interval type in months, days, and nanoseconds. + +## Representation +This type is stored as a single 128 bit integer, +interpreted as three different signed integral fields: + +1. The number of months (32 bits) +2. The number days (32 bits) +2. The number of nanoseconds (64 bits). + +Nanoseconds does not allow for leap seconds. +Each field is independent (e.g. there is no constraint that the quantity of +nanoseconds represents less than a day's worth of time). + +```text +┌──────────────────────────────┬─────────────┬──────────────┐ +│ Nanos │ Days │ Months │ +│ (64 bits) │ (32 bits) │ (32 bits) │ +└──────────────────────────────┴─────────────┴──────────────┘ + 0 63 95 127 bit offset +``` +Please see the [Arrow Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L409-L415) for more details + +## Note on Comparing and Ordering for Calendar Types +Values of `IntervalMonthDayNanoType` are compared using their binary representation, +which can lead to surprising results. + +Spans of time measured in calendar units are not fixed in absolute size (e.g. +number of seconds) which makes defining comparisons and ordering non trivial. +For example `1 month` is 28 days for February but `1 month` is 31 days +in December. + +This makes the seemingly simple operation of comparing two intervals +complicated in practice. For example is `1 month` more or less than `30 days`? The +answer depends on what month you are talking about. + +This crate defines comparisons for calendar types using their binary +representation which is fast and efficient, but leads +to potentially surprising results. + +For example a +`IntervalMonthDayNano` of `1 month` will compare as **greater** than a +`IntervalMonthDayNano` of `100 days` because the binary representation of `1 month` +is larger than the binary representation of 100 days. +"# ); make_type!( DurationSecondType, diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs index 4dbb395192e1..4d552b038a7d 100644 --- a/arrow-ord/src/comparison.rs +++ b/arrow-ord/src/comparison.rs @@ -1407,6 +1407,48 @@ mod tests { vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], vec![false, false, true, false, false, false, false, true, false, false] ); + + cmp_vec!( + eq, + eq_dyn, + IntervalYearMonthArray, + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(2, 1), + // 1 year + IntervalYearMonthType::make_value(1, 0), + ], + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(1, 2), + // NB 12 months is treated as equal to a year (as the underlying + // type stores number of months) + IntervalYearMonthType::make_value(0, 12), + ], + vec![true, false, true] + ); + + cmp_vec!( + eq, + eq_dyn, + IntervalMonthDayNanoArray, + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(3, 2, 1), + // 1 month + IntervalMonthDayNanoType::make_value(1, 0, 0), + IntervalMonthDayNanoType::make_value(1, 0, 0), + ], + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(1, 2, 3), + // 30 days is not treated as a month + IntervalMonthDayNanoType::make_value(0, 30, 0), + // 100 days + IntervalMonthDayNanoType::make_value(0, 100, 0), + ], + vec![true, false, false, false] + ); } #[test] @@ -1660,6 +1702,77 @@ mod tests { vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], vec![false, false, false, true, true, false, false, false, true, true] ); + + cmp_vec!( + lt, + lt_dyn, + IntervalDayTimeArray, + vec![ + IntervalDayTimeType::make_value(1, 0), + IntervalDayTimeType::make_value(0, 1000), + IntervalDayTimeType::make_value(1, 1000), + IntervalDayTimeType::make_value(1, 3000), + // 90M milliseconds + IntervalDayTimeType::make_value(0, 90_000_000), + ], + vec![ + IntervalDayTimeType::make_value(0, 1000), + IntervalDayTimeType::make_value(1, 0), + IntervalDayTimeType::make_value(10, 0), + IntervalDayTimeType::make_value(2, 1), + // NB even though 1 day is less than 90M milliseconds long, + // it compares as greater because the underlying type stores + // days and milliseconds as different fields + IntervalDayTimeType::make_value(0, 12), + ], + vec![false, true, true, true ,false] + ); + + cmp_vec!( + lt, + lt_dyn, + IntervalYearMonthArray, + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(2, 1), + IntervalYearMonthType::make_value(1, 2), + // 1 year + IntervalYearMonthType::make_value(1, 0), + ], + vec![ + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(1, 2), + IntervalYearMonthType::make_value(2, 1), + // NB 12 months is treated as equal to a year (as the underlying + // type stores number of months) + IntervalYearMonthType::make_value(0, 12), + ], + vec![false, false, true, false] + ); + + cmp_vec!( + lt, + lt_dyn, + IntervalMonthDayNanoArray, + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(3, 2, 1), + // 1 month + IntervalMonthDayNanoType::make_value(1, 0, 0), + IntervalMonthDayNanoType::make_value(1, 2, 0), + IntervalMonthDayNanoType::make_value(1, 0, 0), + ], + vec![ + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(1, 2, 3), + IntervalMonthDayNanoType::make_value(2, 0, 0), + // 30 days is not treated as a month + IntervalMonthDayNanoType::make_value(0, 30, 0), + // 100 days (note is treated as greater than 1 month as the underlying integer representation) + IntervalMonthDayNanoType::make_value(0, 100, 0), + ], + vec![false, false, true, false, false] + ); } #[test] diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs index 28ca07cce260..f6bd39c9cd5d 100644 --- a/arrow-ord/src/ord.rs +++ b/arrow-ord/src/ord.rs @@ -216,6 +216,73 @@ pub mod tests { assert_eq!(Ordering::Greater, cmp(1, 0)); } + #[test] + fn test_interval_day_time() { + let array = IntervalDayTimeArray::from(vec![ + // 0 days, 1 second + IntervalDayTimeType::make_value(0, 1000), + // 1 day, 2 milliseconds + IntervalDayTimeType::make_value(1, 2), + // 90M milliseconds (which is more than is in 1 day) + IntervalDayTimeType::make_value(0, 90_000_000), + ]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); + + // somewhat confusingly, while 90M milliseconds is more than 1 day, + // it will compare less as the comparison is done on the underlying + // values not field by field + assert_eq!(Ordering::Greater, cmp(1, 2)); + assert_eq!(Ordering::Less, cmp(2, 1)); + } + + #[test] + fn test_interval_year_month() { + let array = IntervalYearMonthArray::from(vec![ + // 1 year, 0 months + IntervalYearMonthType::make_value(1, 0), + // 0 years, 13 months + IntervalYearMonthType::make_value(0, 13), + // 1 year, 1 month + IntervalYearMonthType::make_value(1, 1), + ]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); + + // the underlying representation is months, so both quantities are the same + assert_eq!(Ordering::Equal, cmp(1, 2)); + assert_eq!(Ordering::Equal, cmp(2, 1)); + } + + #[test] + fn test_interval_month_day_nano() { + let array = IntervalMonthDayNanoArray::from(vec![ + // 100 days + IntervalMonthDayNanoType::make_value(0, 100, 0), + // 1 month + IntervalMonthDayNanoType::make_value(1, 0, 0), + // 100 day, 1 nanoseconds + IntervalMonthDayNanoType::make_value(0, 100, 2), + ]); + + let cmp = build_compare(&array, &array).unwrap(); + + assert_eq!(Ordering::Less, cmp(0, 1)); + assert_eq!(Ordering::Greater, cmp(1, 0)); + + // somewhat confusingly, while 100 days is more than 1 month in all cases + // it will compare less as the comparison is done on the underlying + // values not field by field + assert_eq!(Ordering::Greater, cmp(1, 2)); + assert_eq!(Ordering::Less, cmp(2, 1)); + } + #[test] fn test_decimal() { let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)]