From 08ac58761cb9a80b6ffd651727aa957db0c32bd7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 10 Dec 2024 19:21:48 +0000
Subject: [PATCH 01/29] Update prost-build requirement from =0.13.3 to =0.13.4 (#6860)

Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version.
- [Release notes](https://github.com/tokio-rs/prost/releases)
- [Changelog](https://github.com/tokio-rs/prost/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tokio-rs/prost/compare/v0.13.3...v0.13.4)

---
updated-dependencies:
- dependency-name: prost-build
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 arrow-flight/gen/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml
index ccd32b433cf..6358227a891 100644
--- a/arrow-flight/gen/Cargo.toml
+++ b/arrow-flight/gen/Cargo.toml
@@ -32,5 +32,5 @@ publish = false
 [dependencies]
 # Pin specific version of the tonic-build dependencies to avoid auto-generated
 # (and checked in) arrow.flight.protocol.rs from changing
-prost-build = { version = "=0.13.3", default-features = false }
+prost-build = { version = "=0.13.4", default-features = false }
 tonic-build = { version = "=0.12.3", default-features = false, features = ["transport", "prost"] }

From dc45d7d18857941c12082c0a6288048baf0c83ee Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 10 Dec 2024 21:39:31 -0700
Subject: [PATCH 02/29] chore: add cast_decimal benchmark (#6850)

* add cast_decimal bench

* format

* address feedback
---
 arrow/benches/cast_kernels.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs
index ec7990d3d76..5c4fcff13de 100644
--- a/arrow/benches/cast_kernels.rs
+++ b/arrow/benches/cast_kernels.rs
@@ -250,6 +250,9 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("cast decimal128 to decimal128 512", |b| {
         b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5)))
     });
+    c.bench_function("cast decimal128 to decimal128 512 lower precision", |b| {
+        b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(6, 5)))
+    });
     c.bench_function("cast decimal128 to decimal256 512", |b| {
         b.iter(|| cast_array(&decimal128_array, DataType::Decimal256(50, 5)))
     });

From 06a015770098a569b67855dfaa18bdfa7c18ff92 Mon Sep 17 00:00:00 2001
From: Onur Satici
Date: Wed, 11 Dec 2024 13:23:05 +0000
Subject: [PATCH 03/29] add buffered data_pages to parquet column writer total bytes estimation (#6862)

* add buffered data_pages to parquet column writer memory size estimation

* move to get estimated total bytes
---
 parquet/src/column/writer/mod.rs | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8d0be5f9f8c..16de0ba7898 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
     /// anticipated encoded size.
#[cfg(feature = "arrow")] pub(crate) fn get_estimated_total_bytes(&self) -> u64 { - self.column_metrics.total_bytes_written + self.data_pages + .iter() + .map(|page| page.data().len() as u64) + .sum::() + + self.column_metrics.total_bytes_written + self.encoder.estimated_data_page_size() as u64 + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 } @@ -3422,6 +3426,26 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); } + #[test] + #[cfg(feature = "arrow")] + fn test_column_writer_get_estimated_total_bytes() { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + assert_eq!(writer.get_estimated_total_bytes(), 0); + + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_one_page = writer.get_estimated_total_bytes(); + assert_eq!(size_with_one_page, 20); + + writer.write_batch(&[5, 6, 7, 8], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_two_pages = writer.get_estimated_total_bytes(); + // different pages have different compressed lengths + assert_eq!(size_with_two_pages, 20 + 21); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]], From 50cf8bd80652ec9710bdcb1de568d52618233441 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 11 Dec 2024 23:31:02 +0900 Subject: [PATCH 04/29] Always explicitly disable `gzip` automatic decompression on reqwest client used by object_store (#6843) * Explicitly disable gzip on reqwest client used by object_store * Add comment * Add integration test for checking reqwest gzip feature * Fix lint * Add comment explaining why gzip feature is enabled --- object_store/Cargo.toml | 2 ++ object_store/src/client/mod.rs | 4 ++++ object_store/tests/http.rs | 43 ++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 object_store/tests/http.rs diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a0473362513..bcc8e0b9224 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -77,6 +77,8 @@ http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" +# The "gzip" feature for reqwest is enabled for an integration test. +reqwest = { version = "0.12", features = ["gzip"] } http = "1.1.0" [[test]] diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 76d1c1f22f5..1b7ce5aa7a7 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -671,6 +671,10 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } + // Reqwest will remove the `Content-Length` header if it is configured to + // transparently decompress the body via the non-default `gzip` feature. + builder = builder.no_gzip(); + builder .https_only(!self.allow_http.get()?) .build() diff --git a/object_store/tests/http.rs b/object_store/tests/http.rs new file mode 100644 index 00000000000..a9b3145bb66 --- /dev/null +++ b/object_store/tests/http.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the HTTP store implementation + +#[cfg(feature = "http")] +use object_store::{http::HttpBuilder, path::Path, GetOptions, GetRange, ObjectStore}; + +/// Tests that even when reqwest has the `gzip` feature enabled, the HTTP store +/// does not error on a missing `Content-Length` header. +#[tokio::test] +#[cfg(feature = "http")] +async fn test_http_store_gzip() { + let http_store = HttpBuilder::new() + .with_url("https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main") + .build() + .unwrap(); + + let _ = http_store + .get_opts( + &Path::parse("LICENSE.txt").unwrap(), + GetOptions { + range: Some(GetRange::Bounded(0..100)), + ..Default::default() + }, + ) + .await + .unwrap(); +} From 4acf4d3139b0cf1bad371a6d4632d6de3bf2babe Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 11 Dec 2024 16:47:51 +0100 Subject: [PATCH 05/29] Enable unused_crate_dependencies Rust lint, remove unused dependencies (#6804) * Disallow unused dependencies in arrow-array * Disallow unused dependencies in arrow-schema * check all modules on gh * Remove arrow-csv's unused dependencies * Remove arrow-ord's unused dependencies * Remove arrow-row's unused dependencies * Remove arrow's unused dependencies * Remove unused dependencies in arrow-flight and in tests And disable the check since it feels impossible/infeasible to configure it the right way. 
* Unify arrow workflow step names --- .github/workflows/arrow.yml | 202 +++++++++++++----- arrow-csv/Cargo.toml | 2 +- arrow-flight/Cargo.toml | 12 +- arrow-flight/src/lib.rs | 2 + arrow-integration-testing/Cargo.toml | 5 +- .../src/bin/arrow-file-to-stream.rs | 3 + .../src/bin/arrow-json-integration-test.rs | 3 + .../src/bin/arrow-stream-to-file.rs | 3 + .../src/bin/flight-test-integration-client.rs | 3 + .../src/bin/flight-test-integration-server.rs | 3 + arrow-integration-testing/src/lib.rs | 2 + arrow-ord/Cargo.toml | 2 +- arrow-row/Cargo.toml | 6 - arrow/Cargo.toml | 4 +- 14 files changed, 183 insertions(+), 69 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d065c554bbf..daf38f2523f 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -61,39 +61,39 @@ jobs: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - - name: Test arrow-buffer with all features + - name: Test arrow-buffer run: cargo test -p arrow-buffer --all-features - - name: Test arrow-data with all features + - name: Test arrow-data run: cargo test -p arrow-data --all-features - - name: Test arrow-schema with all features + - name: Test arrow-schema run: cargo test -p arrow-schema --all-features - - name: Test arrow-array with all features + - name: Test arrow-array run: cargo test -p arrow-array --all-features - - name: Test arrow-select with all features + - name: Test arrow-select run: cargo test -p arrow-select --all-features - - name: Test arrow-cast with all features + - name: Test arrow-cast run: cargo test -p arrow-cast --all-features - - name: Test arrow-ipc with all features + - name: Test arrow-ipc run: cargo test -p arrow-ipc --all-features - - name: Test arrow-csv with all features + - name: Test arrow-csv run: cargo test -p arrow-csv --all-features - - name: Test arrow-json with all features + - name: Test arrow-json run: cargo test -p arrow-json --all-features - - name: Test arrow-avro with all features + - name: Test arrow-avro run: cargo test -p arrow-avro --all-features - - name: Test arrow-string with all features + - name: Test arrow-string run: cargo test -p arrow-string --all-features - - name: Test arrow-ord with all features + - name: Test arrow-ord run: cargo test -p arrow-ord --all-features - - name: Test arrow-arith with all features + - name: Test arrow-arith run: cargo test -p arrow-arith --all-features - - name: Test arrow-row with all features + - name: Test arrow-row run: cargo test -p arrow-row --all-features - - name: Test arrow-integration-test with all features + - name: Test arrow-integration-test run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features run: cargo test -p arrow - - name: Test arrow with all features except pyarrow + - name: Test arrow except pyarrow run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | @@ -163,37 +163,139 @@ jobs: uses: ./.github/actions/setup-builder - name: Setup Clippy run: rustup component add clippy - - name: Clippy arrow-buffer with all features - run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings - - name: Clippy arrow-data with all features - run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - - name: Clippy arrow-schema with all features - run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - - name: Clippy arrow-array with all features - run: cargo clippy -p 
arrow-array --all-targets --all-features -- -D warnings - - name: Clippy arrow-select with all features - run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - - name: Clippy arrow-cast with all features - run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings - - name: Clippy arrow-ipc with all features - run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings - - name: Clippy arrow-csv with all features - run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - - name: Clippy arrow-json with all features - run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings - - name: Clippy arrow-avro with all features - run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - - name: Clippy arrow-string with all features - run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - - name: Clippy arrow-ord with all features - run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings - - name: Clippy arrow-arith with all features - run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings - - name: Clippy arrow-row with all features - run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - - name: Clippy arrow with all features - run: cargo clippy -p arrow --all-features --all-targets -- -D warnings - - name: Clippy arrow-integration-test with all features - run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - - name: Clippy arrow-integration-testing with all features - run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings + - name: Clippy arrow-buffer + run: | + mod=arrow-buffer + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-data + run: | + mod=arrow-data + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-schema + run: | + mod=arrow-schema + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-array + run: | + mod=arrow-array + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-select + run: | + mod=arrow-select + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. 
+ cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-cast + run: | + mod=arrow-cast + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ipc + run: | + mod=arrow-ipc + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-csv + run: | + mod=arrow-csv + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-json + run: | + mod=arrow-json + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-avro + run: | + mod=arrow-avro + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-string + run: | + mod=arrow-string + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ord + run: | + mod=arrow-ord + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-arith + run: | + mod=arrow-arith + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-row + run: | + mod=arrow-row + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. 
+ cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow + run: | + mod=arrow + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-test + run: | + mod=arrow-integration-test + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-testing + run: | + mod=arrow-integration-testing + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 4729779c43b..8823924eb55 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -35,7 +35,6 @@ bench = false [dependencies] arrow-array = { workspace = true } -arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } @@ -45,6 +44,7 @@ lazy_static = { version = "1.4", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] +arrow-buffer = { workspace = true } tempfile = "3.3" futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 012d3947f02..fbb295036a9 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -43,11 +43,11 @@ base64 = { version = "0.22", default-features = false, features = ["std"] } bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } -paste = { version = "1.0" } +paste = { version = "1.0" , optional = true } prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] } # For Timestamp type prost-types = { version = "0.13.1", default-features = false } -tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"], optional = true } tonic = { version = "0.12.3", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies @@ -61,11 +61,10 @@ all-features = true [features] default = [] -flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell"] +flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell", "dep:paste"] tls = ["tonic/tls"] - # Enable 
CLI tools -cli = ["dep:anyhow", "arrow-array/chrono-tz", "arrow-cast/prettyprint", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["arrow-array/chrono-tz", "arrow-cast/prettyprint", "tonic/tls-webpki-roots", "dep:anyhow", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber"] [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } @@ -75,6 +74,9 @@ http-body = "1.0.0" hyper-util = "0.1" pin-project-lite = "0.2" tempfile = "3.3" +tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } tokio-stream = { version = "0.1", features = ["net"] } tower = { version = "0.5.0", features = ["util"] } uuid = { version = "1.10.0", features = ["v4"] } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 9f18416c06e..81fa99faeb9 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -38,6 +38,8 @@ //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] #![warn(missing_docs)] +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 0ebbc05a8b8..8654b4b9273 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -36,18 +36,17 @@ logging = ["tracing-subscriber"] [dependencies] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } -arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } -async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } prost = { version = "0.13", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -tokio = { version = "1.0", default-features = false } +tokio = { version = "1.0", default-features = false, features = [ "rt-multi-thread"] } tonic = { version = "0.12", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } [dev-dependencies] +arrow-buffer = { path = "../arrow-buffer", default-features = false } tempfile = { version = "3", default-features = false } diff --git a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs index 3e027faef91..661f0a047db 100644 --- a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs +++ b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. 
+// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::Result; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::StreamWriter; diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index cc3dd2110e3..6a901cc63ba 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; diff --git a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs index 07ac5c7ddd4..8b4bb332781 100644 --- a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs +++ b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use std::io; use arrow::error::Result; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index b8bbb952837..0d16fe3b403 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_client_scenarios; use clap::Parser; type Error = Box; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-server.rs b/arrow-integration-testing/src/bin/flight-test-integration-server.rs index 5310d07d4f8..94be7130979 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-server.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-server.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_server_scenarios; use clap::Parser; diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index c8ce01e9f13..e669690ef4f 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -17,6 +17,8 @@ //! 
Common code used in the integration test binaries +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] #![warn(missing_docs)] use serde_json::Value; diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index a2894c02029..8d74d2f97d7 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -39,7 +39,7 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] +half = { version = "2.1", default-features = false, features = ["num-traits"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3754afb4dbc..90d99684d26 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -33,12 +33,6 @@ name = "arrow_row" path = "src/lib.rs" bench = false -[target.'cfg(target_arch = "wasm32")'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } - [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 32b2384afe7..8860cd61c5b 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -56,8 +56,6 @@ arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.23", default-features = false, optional = true } -chrono = { workspace = true, optional = true } - [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] @@ -72,7 +70,7 @@ prettyprint = ["arrow-cast/prettyprint"] # not the core arrow code itself. Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. 
-test_utils = ["rand", "dep:chrono"] +test_utils = ["dep:rand"] pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive From e613622fb69a7cc941bfb97bf1020bee580cfb86 Mon Sep 17 00:00:00 2001 From: Kaifeng Zheng <100595273+Kev1n8@users.noreply.github.com> Date: Fri, 13 Dec 2024 04:39:29 +0800 Subject: [PATCH 06/29] Enable matching temporal as from_type to Utf8View (#6872) * enable matching temporal from_type * drop todo message --- arrow-cast/src/cast/mod.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 0a44392a245..5c892783d4a 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -1472,7 +1472,7 @@ pub fn cast_with_options( (BinaryView, _) => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), - (from_type, Utf8View) if from_type.is_numeric() => { + (from_type, Utf8View) if from_type.is_primitive() => { value_to_string_view(array, cast_options) } (from_type, LargeUtf8) if from_type.is_primitive() => { @@ -5258,9 +5258,6 @@ mod tests { assert_eq!("2018-12-25T00:00:00", c.value(1)); } - // Cast Timestamp to Utf8View is not supported yet - // TODO: Implement casting from Timestamp to Utf8View - // https://github.com/apache/arrow-rs/issues/6734 macro_rules! assert_cast_timestamp_to_string { ($array:expr, $datatype:expr, $output_array_type: ty, $expected:expr) => {{ let out = cast(&$array, &$datatype).unwrap(); @@ -5295,7 +5292,7 @@ mod tests { None, ]; - // assert_cast_timestamp_to_string!(array, DataType::Utf8View, StringViewArray, expected); + assert_cast_timestamp_to_string!(array, DataType::Utf8View, StringViewArray, expected); assert_cast_timestamp_to_string!(array, DataType::Utf8, StringArray, expected); assert_cast_timestamp_to_string!(array, DataType::LargeUtf8, LargeStringArray, expected); } @@ -5319,7 +5316,13 @@ mod tests { Some("2018-12-25 00:00:02.001000"), None, ]; - // assert_cast_timestamp_to_string!(array_without_tz, DataType::Utf8View, StringViewArray, cast_options, expected); + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::Utf8View, + StringViewArray, + cast_options, + expected + ); assert_cast_timestamp_to_string!( array_without_tz, DataType::Utf8, @@ -5343,7 +5346,13 @@ mod tests { Some("2018-12-25 05:45:02.001000"), None, ]; - // assert_cast_timestamp_to_string!(array_with_tz, DataType::Utf8View, StringViewArray, cast_options, expected); + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::Utf8View, + StringViewArray, + cast_options, + expected + ); assert_cast_timestamp_to_string!( array_with_tz, DataType::Utf8, From 84dba34e6754b34baf19bd694506278cdc176a2e Mon Sep 17 00:00:00 2001 From: Andrew Varnon Date: Thu, 12 Dec 2024 17:58:55 -0500 Subject: [PATCH 07/29] Use randomized content ID for Azure multipart uploads (#6869) * Use randomized content ID for Azure multipart uploads * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! 
Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/aws/mod.rs | 1 + object_store/src/azure/client.rs | 6 ++- object_store/src/azure/mod.rs | 1 + object_store/src/gcp/mod.rs | 1 + object_store/src/integration.rs | 87 ++++++++++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 2 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a7f9264a681..7f449c49963 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -570,6 +570,7 @@ mod tests { rename_and_copy(&integration).await; stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; signing(&integration).await; s3_encryption(&integration).await; put_get_attributes(&integration).await; diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 76dedd71aa5..69ff39526be 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -36,6 +36,7 @@ use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use hyper::http::HeaderName; +use rand::Rng as _; use reqwest::{ header::{HeaderMap, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE, IF_MATCH, IF_NONE_MATCH}, Client as ReqwestClient, Method, RequestBuilder, Response, @@ -556,10 +557,11 @@ impl AzureClient { pub(crate) async fn put_block( &self, path: &Path, - part_idx: usize, + _part_idx: usize, payload: PutPayload, ) -> Result { - let content_id = format!("{part_idx:20}"); + let part_idx = u128::from_be_bytes(rand::thread_rng().gen()); + let content_id = format!("{part_idx:032x}"); let block_id = BASE64_STANDARD.encode(&content_id); self.put_request(path, payload) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 177bffb653a..81b6667bc05 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -314,6 +314,7 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, false).await; signing(&integration).await; let validate = !integration.client.config().disable_tagging; diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 039ec46b68c..5199135ba6b 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -297,6 +297,7 @@ mod test { // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; put_opts(&integration, true).await; diff --git a/object_store/src/integration.rs b/object_store/src/integration.rs index 30177878306..20e95fddc47 100644 --- a/object_store/src/integration.rs +++ b/object_store/src/integration.rs @@ -24,6 +24,8 @@ //! //! They are intended solely for testing purposes. 
+use core::str; + use crate::multipart::MultipartStore; use crate::path::Path; use crate::{ @@ -1109,3 +1111,88 @@ async fn delete_fixtures(storage: &DynObjectStore) { .await .unwrap(); } + +/// Tests a race condition where 2 threads are performing multipart writes to the same path +pub async fn multipart_race_condition(storage: &dyn ObjectStore, last_writer_wins: bool) { + let path = Path::from("test_multipart_race_condition"); + + let mut multipart_upload_1 = storage.put_multipart(&path).await.unwrap(); + let mut multipart_upload_2 = storage.put_multipart(&path).await.unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 0)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 0)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 1)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 1)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 2)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 2)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 3)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 3)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 4)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 4)).into()) + .await + .unwrap(); + + multipart_upload_1.complete().await.unwrap(); + + if last_writer_wins { + multipart_upload_2.complete().await.unwrap(); + } else { + let err = multipart_upload_2.complete().await.unwrap_err(); + + assert!(matches!(err, crate::Error::Generic { .. }), "{err}"); + } + + let get_result = storage.get(&path).await.unwrap(); + let bytes = get_result.bytes().await.unwrap(); + let string_contents = str::from_utf8(&bytes).unwrap(); + + if last_writer_wins { + assert!(string_contents.starts_with( + format!( + "2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } else { + assert!(string_contents.starts_with( + format!( + "1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } +} From eb7ab83aab6b54897b00da1eb739c863d61445a2 Mon Sep 17 00:00:00 2001 From: Himadri Pal Date: Thu, 12 Dec 2024 15:00:22 -0800 Subject: [PATCH 08/29] fix: decimal conversion looses value on lower precision (#6836) * decimal conversion looses value on lower precision, throws error now on overflow. * fix review comments and fix formatting. * for simple case of equal scale and bigger precision, no conversion needed. 
revert whitespace changes formatting check --------- Co-authored-by: himadripal --- arrow-cast/src/cast/decimal.rs | 49 +++++++++-------- arrow-cast/src/cast/mod.rs | 99 ++++++++++++++++++++++++++++++---- 2 files changed, 116 insertions(+), 32 deletions(-) diff --git a/arrow-cast/src/cast/decimal.rs b/arrow-cast/src/cast/decimal.rs index 74cd2637e6a..ba82ca9040c 100644 --- a/arrow-cast/src/cast/decimal.rs +++ b/arrow-cast/src/cast/decimal.rs @@ -111,9 +111,13 @@ where O::Native::from_decimal(adjusted) }; - Ok(match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + Ok(if cast_options.safe { + array.unary_opt(|x| f(x).filter(|v| O::is_valid_decimal_precision(*v, output_precision))) + } else { + array.try_unary(|x| { + f(x).ok_or_else(|| error(x)) + .and_then(|v| O::validate_decimal_precision(v, output_precision).map(|_| v)) + })? }) } @@ -137,15 +141,20 @@ where let f = |x| O::Native::from_decimal(x).and_then(|x| x.mul_checked(mul).ok()); - Ok(match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + Ok(if cast_options.safe { + array.unary_opt(|x| f(x).filter(|v| O::is_valid_decimal_precision(*v, output_precision))) + } else { + array.try_unary(|x| { + f(x).ok_or_else(|| error(x)) + .and_then(|v| O::validate_decimal_precision(v, output_precision).map(|_| v)) + })? }) } // Only support one type of decimal cast operations pub(crate) fn cast_decimal_to_decimal_same_type( array: &PrimitiveArray, + input_precision: u8, input_scale: i8, output_precision: u8, output_scale: i8, @@ -155,20 +164,11 @@ where T: DecimalType, T::Native: DecimalCast + ArrowNativeTypeOp, { - let array: PrimitiveArray = match input_scale.cmp(&output_scale) { - Ordering::Equal => { - // the scale doesn't change, the native value don't need to be changed + let array: PrimitiveArray = + if input_scale == output_scale && input_precision <= output_precision { array.clone() - } - Ordering::Greater => convert_to_smaller_scale_decimal::( - array, - input_scale, - output_precision, - output_scale, - cast_options, - )?, - Ordering::Less => { - // input_scale < output_scale + } else if input_scale < output_scale { + // the scale doesn't change, but precision may change and cause overflow convert_to_bigger_or_equal_scale_decimal::( array, input_scale, @@ -176,8 +176,15 @@ where output_scale, cast_options, )? - } - }; + } else { + convert_to_smaller_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )? 
+ }; Ok(Arc::new(array.with_precision_and_scale( output_precision, diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 5c892783d4a..ba470635c6c 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -830,18 +830,20 @@ pub fn cast_with_options( (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => { cast_map_values(array.as_map(), to_type, cast_options, ordered1.to_owned()) } - (Decimal128(_, s1), Decimal128(p2, s2)) => { + (Decimal128(p1, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), + *p1, *s1, *p2, *s2, cast_options, ) } - (Decimal256(_, s1), Decimal256(p2, s2)) => { + (Decimal256(p1, s1), Decimal256(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), + *p1, *s1, *p2, *s2, @@ -2694,13 +2696,16 @@ mod tests { // negative test let array = vec![Some(123456), None]; let array = create_decimal_array(array, 10, 0).unwrap(); - let result = cast(&array, &DataType::Decimal128(2, 2)); - assert!(result.is_ok()); - let array = result.unwrap(); - let array: &Decimal128Array = array.as_primitive(); - let err = array.validate_decimal_precision(2); + let result_safe = cast(&array, &DataType::Decimal128(2, 2)); + assert!(result_safe.is_ok()); + let options = CastOptions { + safe: false, + ..Default::default() + }; + + let result_unsafe = cast_with_options(&array, &DataType::Decimal128(2, 2), &options); assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal128 of precision 2. Max is 99", - err.unwrap_err().to_string()); + result_unsafe.unwrap_err().to_string()); } #[test] @@ -8460,7 +8465,7 @@ mod tests { let input_type = DataType::Decimal128(10, 3); let output_type = DataType::Decimal256(10, 5); assert!(can_cast_types(&input_type, &output_type)); - let array = vec![Some(i128::MAX), Some(i128::MIN)]; + let array = vec![Some(123456), Some(-123456)]; let input_decimal_array = create_decimal_array(array, 10, 3).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; @@ -8470,8 +8475,8 @@ mod tests { Decimal256Array, &output_type, vec![ - Some(i256::from_i128(i128::MAX).mul_wrapping(hundred)), - Some(i256::from_i128(i128::MIN).mul_wrapping(hundred)) + Some(i256::from_i128(123456).mul_wrapping(hundred)), + Some(i256::from_i128(-123456).mul_wrapping(hundred)) ] ); } @@ -9935,4 +9940,76 @@ mod tests { "Cast non-nullable to non-nullable struct field returning null should fail", ); } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal128(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. 
Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_lower_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 4); + let output_type = DataType::Decimal128(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_greater_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal128(6, 3); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 1234567890 is too large to store in a Decimal128 of precision 6. Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_diff_type() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal256(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456789 is too large to store in a Decimal256 of precision 6. 
Max is 999999"); + } } From c4dbf0d8af6ca5a19b8b2ea777da3c276807fc5e Mon Sep 17 00:00:00 2001 From: niebayes Date: Fri, 13 Dec 2024 20:23:33 +0800 Subject: [PATCH 09/29] fix: make GetCatalogsBuilder sort catalog names (#6864) * fix: sort catalog names * apply code suggestion from tustvold Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/sql/metadata/catalogs.rs | 30 ++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/arrow-flight/src/sql/metadata/catalogs.rs b/arrow-flight/src/sql/metadata/catalogs.rs index 327fed81077..e27c63c3932 100644 --- a/arrow-flight/src/sql/metadata/catalogs.rs +++ b/arrow-flight/src/sql/metadata/catalogs.rs @@ -68,7 +68,8 @@ impl GetCatalogsBuilder { /// builds a `RecordBatch` with the correct schema for a /// [`CommandGetCatalogs`] response pub fn build(self) -> Result { - let Self { catalogs } = self; + let Self { mut catalogs } = self; + catalogs.sort_unstable(); let batch = RecordBatch::try_new( Arc::clone(&GET_CATALOG_SCHEMA), @@ -98,3 +99,30 @@ static GET_CATALOG_SCHEMA: Lazy = Lazy::new(|| { false, )])) }); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_catalogs_are_sorted() { + let batch = ["a_catalog", "c_catalog", "b_catalog"] + .into_iter() + .fold(GetCatalogsBuilder::new(), |mut builder, catalog| { + builder.append(catalog); + builder + }) + .build() + .unwrap(); + let catalogs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .flatten() + .collect::>(); + assert!(catalogs.is_sorted()); + assert_eq!(catalogs, ["a_catalog", "b_catalog", "c_catalog"]); + } +} From fc6936aff50b0b1c8bea984bba2d57dac68a98b3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 16 Dec 2024 19:29:17 +0100 Subject: [PATCH 10/29] Add deprecation warnings for everything related to `dict_id` (#6873) * Add deprecation warnings for everything related to `dict_id` * Add allow deprecations where necessary --- arrow-flight/src/encode.rs | 9 ++++ arrow-flight/src/lib.rs | 1 + arrow-flight/src/utils.rs | 1 + arrow-integration-test/src/field.rs | 27 +++++++----- arrow-integration-test/src/lib.rs | 3 ++ arrow-integration-test/src/schema.rs | 1 + .../integration_test.rs | 1 + .../integration_test.rs | 1 + arrow-ipc/src/convert.rs | 5 +++ arrow-ipc/src/reader.rs | 14 ++++++ arrow-ipc/src/reader/stream.rs | 2 + arrow-ipc/src/writer.rs | 44 +++++++++++++++++++ arrow-schema/src/ffi.rs | 1 + arrow-schema/src/field.rs | 36 ++++++++++++++- arrow-schema/src/schema.rs | 19 ++++++-- parquet/src/arrow/arrow_writer/mod.rs | 3 ++ parquet/src/arrow/schema/complex.rs | 5 ++- parquet/src/arrow/schema/mod.rs | 2 + 18 files changed, 158 insertions(+), 17 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 3ca192d0bb9..315b7b3cb6e 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -535,8 +535,10 @@ fn prepare_field_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -583,7 +585,9 @@ fn prepare_schema_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -650,10 
+654,12 @@ struct FlightIpcEncoder { impl FlightIpcEncoder { fn new(options: IpcWriteOptions, error_on_replacement: bool) -> Self { + #[allow(deprecated)] let preserve_dict_id = options.preserve_dict_id(); Self { options, data_gen: IpcDataGenerator::default(), + #[allow(deprecated)] dictionary_tracker: DictionaryTracker::new_with_preserve_dict_id( error_on_replacement, preserve_dict_id, @@ -1541,6 +1547,7 @@ mod tests { async fn verify_flight_round_trip(mut batches: Vec) { let expected_schema = batches.first().unwrap().schema(); + #[allow(deprecated)] let encoder = FlightDataEncoderBuilder::default() .with_options(IpcWriteOptions::default().with_preserve_dict_id(false)) .with_dictionary_handling(DictionaryHandling::Resend) @@ -1568,6 +1575,7 @@ mod tests { HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), ); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let got = prepare_schema_for_flight(&schema, &mut dictionary_tracker, false); @@ -1598,6 +1606,7 @@ mod tests { options: &IpcWriteOptions, ) -> (Vec, FlightData) { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 81fa99faeb9..1dd2700794f 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -143,6 +143,7 @@ pub struct IpcMessage(pub Bytes); fn flight_schema_as_encoded_data(arrow_schema: &Schema, options: &IpcWriteOptions) -> EncodedData { let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); data_gen.schema_to_bytes_with_dictionary_tracker(arrow_schema, &mut dict_tracker, options) diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 3242f511ae6..428dde73ca6 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -90,6 +90,7 @@ pub fn batches_to_flight_data( let mut flight_data = vec![]; let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 32edc416593..4b896ed391b 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -252,6 +252,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { _ => data_type, }; + #[allow(deprecated)] let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) @@ -274,17 +275,21 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { }; match field.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": field.name(), - "nullable": field.is_nullable(), - "type": data_type_to_json(value_type), - "children": children, - "dictionary": { - "id": field.dict_id().unwrap(), - "indexType": data_type_to_json(index_type), - "isOrdered": field.dict_is_ordered().unwrap(), - } - }), + DataType::Dictionary(ref index_type, ref value_type) => { + #[allow(deprecated)] + let dict_id = field.dict_id().unwrap(); + serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": 
dict_id, + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }) + } _ => serde_json::json!({ "name": field.name(), "nullable": field.is_nullable(), diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index a25f07ded1d..f025009c22d 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -787,6 +787,7 @@ pub fn array_from_json( Ok(Arc::new(array)) } DataType::Dictionary(key_type, value_type) => { + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) })?; @@ -930,10 +931,12 @@ pub fn dictionary_array_from_json( let null_buf = create_null_buf(&json_col); // build the key data into a buffer, then construct values separately + #[allow(deprecated)] let key_field = Field::new_dict( "key", dict_key.clone(), field.is_nullable(), + #[allow(deprecated)] field .dict_id() .expect("Dictionary fields must have a dict_id value"), diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index fb91aba00df..512f0aed8e5 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -189,6 +189,7 @@ mod tests { Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c33", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 62cdca975b0..406419028d0 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -72,6 +72,7 @@ async fn upload_data( let (mut upload_tx, upload_rx) = mpsc::channel(10); let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 27721750081..92989a20393 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -119,6 +119,7 @@ impl FlightService for FlightServiceImpl { .ok_or_else(|| Status::not_found(format!("Could not find flight. 
{key}")))?; let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 0aa07a6a47d..37c5a19439c 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -165,6 +165,7 @@ pub fn schema_to_fb_offset<'a>( impl From> for Field { fn from(field: crate::Field) -> Field { let arrow_field = if let Some(dictionary) = field.dictionary() { + #[allow(deprecated)] Field::new_dict( field.name().unwrap(), get_data_type(field, true), @@ -519,6 +520,7 @@ pub(crate) fn build_field<'a>( match dictionary_tracker { Some(tracker) => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] tracker.set_dict_id(field), field .dict_is_ordered() @@ -527,6 +529,7 @@ pub(crate) fn build_field<'a>( )), None => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] field .dict_id() .expect("Dictionary type must have a dictionary id"), @@ -1143,6 +1146,7 @@ mod tests { ), true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -1150,6 +1154,7 @@ mod tests { 123, true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 481fefbd5fb..9ff4da30ed8 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -196,6 +196,7 @@ fn create_array( let index_node = reader.next_node(field)?; let index_buffers = [reader.next_buffer()?, reader.next_buffer()?]; + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::ParseError(format!("Field {field} does not have dict id")) })?; @@ -617,6 +618,7 @@ fn read_dictionary_impl( } let id = batch.id(); + #[allow(deprecated)] let fields_using_this_dictionary = schema.fields_with_dict_id(id); let first_field = fields_using_this_dictionary.first().ok_or_else(|| { ArrowError::InvalidArgumentError(format!("dictionary id {id} not found in schema")) @@ -1725,6 +1727,7 @@ mod tests { let mut writer = crate::writer::FileWriter::try_new_with_options( &mut buf, batch.schema_ref(), + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .unwrap(); @@ -1869,6 +1872,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, values); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1876,6 +1880,7 @@ mod tests { 1, false, )); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1956,6 +1961,7 @@ mod tests { #[test] fn test_roundtrip_stream_dict_of_list_of_dict() { // list + #[allow(deprecated)] let list_data_type = DataType::List(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1967,6 +1973,7 @@ mod tests { test_roundtrip_stream_dict_of_list_of_dict_impl::(list_data_type, offsets); // large list + #[allow(deprecated)] let list_data_type = DataType::LargeList(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1985,6 +1992,7 @@ mod 
tests { let dict_array = DictionaryArray::new(keys, Arc::new(values)); let dict_data = dict_array.into_data(); + #[allow(deprecated)] let list_data_type = DataType::FixedSizeList( Arc::new(Field::new_dict( "item", @@ -2075,6 +2083,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone()); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)), @@ -2085,6 +2094,7 @@ mod tests { let value_dict_keys = Int8Array::from_iter_values([0, 3, 0, 1, 2, 0, 1]); let value_dict_array = DictionaryArray::new(value_dict_keys, bin_view_array); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::BinaryView)), @@ -2150,6 +2160,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2187,6 +2198,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2326,6 +2338,7 @@ mod tests { ["a", "b"] .iter() .map(|name| { + #[allow(deprecated)] Field::new_dict( name.to_string(), DataType::Dictionary( @@ -2360,6 +2373,7 @@ mod tests { let mut writer = crate::writer::StreamWriter::try_new_with_options( &mut buf, batch.schema().as_ref(), + #[allow(deprecated)] crate::writer::IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/reader/stream.rs b/arrow-ipc/src/reader/stream.rs index de5f5bdd629..9b0eea9b619 100644 --- a/arrow-ipc/src/reader/stream.rs +++ b/arrow-ipc/src/reader/stream.rs @@ -324,6 +324,7 @@ mod tests { "test1", DataType::RunEndEncoded( Arc::new(Field::new("run_ends".to_string(), DataType::Int32, false)), + #[allow(deprecated)] Arc::new(Field::new_dict( "values".to_string(), DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -353,6 +354,7 @@ mod tests { let mut writer = StreamWriter::try_new_with_options( &mut buffer, &schema, + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 32dab762447..ee5b9a54cc9 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -65,6 +65,10 @@ pub struct IpcWriteOptions { /// schema or generate unique dictionary IDs internally during encoding. /// /// Defaults to `false` + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." 
+ )] preserve_dict_id: bool, } @@ -108,6 +112,7 @@ impl IpcWriteOptions { | crate::MetadataVersion::V3 => Err(ArrowError::InvalidArgumentError( "Writing IPC metadata version 3 and lower not supported".to_string(), )), + #[allow(deprecated)] crate::MetadataVersion::V4 => Ok(Self { alignment, write_legacy_ipc_format, @@ -121,6 +126,7 @@ impl IpcWriteOptions { "Legacy IPC format only supported on metadata version 4".to_string(), )) } else { + #[allow(deprecated)] Ok(Self { alignment, write_legacy_ipc_format, @@ -138,7 +144,12 @@ impl IpcWriteOptions { /// Return whether the writer is configured to preserve the dictionary IDs /// defined in the schema + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn preserve_dict_id(&self) -> bool { + #[allow(deprecated)] self.preserve_dict_id } @@ -149,6 +160,11 @@ impl IpcWriteOptions { /// to the dictionary batches in order to encode them correctly /// /// The default will change to `false` in future releases + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] + #[allow(deprecated)] pub fn with_preserve_dict_id(mut self, preserve_dict_id: bool) -> Self { self.preserve_dict_id = preserve_dict_id; self @@ -157,6 +173,7 @@ impl IpcWriteOptions { impl Default for IpcWriteOptions { fn default() -> Self { + #[allow(deprecated)] Self { alignment: 64, write_legacy_ipc_format: false, @@ -420,6 +437,7 @@ impl IpcDataGenerator { // It's importnat to only take the dict_id at this point, because the dict ID // sequence is assigned depth-first, so we need to first encode children and have // them take their assigned dict IDs before we take the dict ID for this field. + #[allow(deprecated)] let dict_id = dict_id_seq .next() .or_else(|| field.dict_id()) @@ -767,6 +785,10 @@ pub struct DictionaryTracker { written: HashMap, dict_ids: Vec, error_on_replacement: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] preserve_dict_id: bool, } @@ -782,6 +804,7 @@ impl DictionaryTracker { /// the last seen dictionary ID (or using `0` if no other dictionary IDs have been /// seen) pub fn new(error_on_replacement: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -795,7 +818,12 @@ impl DictionaryTracker { /// If `error_on_replacement` /// is true, an error will be generated if an update to an /// existing dictionary is attempted. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn new_with_preserve_dict_id(error_on_replacement: bool, preserve_dict_id: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -811,8 +839,14 @@ impl DictionaryTracker { /// /// If `preserve_dict_id` is false, this will return the value of the last `dict_id` assigned incremented by 1 /// or 0 in the case where no dictionary IDs have yet been assigned + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." 
+ )] pub fn set_dict_id(&mut self, field: &Field) -> i64 { + #[allow(deprecated)] let next = if self.preserve_dict_id { + #[allow(deprecated)] field.dict_id().expect("no dict_id in field") } else { self.dict_ids @@ -936,7 +970,9 @@ impl FileWriter { writer.write_all(&super::ARROW_MAGIC)?; writer.write_all(&PADDING[..pad_len])?; // write the schema, set the written bytes to the schema + header + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let encoded_message = data_gen.schema_to_bytes_with_dictionary_tracker( @@ -1013,7 +1049,9 @@ impl FileWriter { let mut fbb = FlatBufferBuilder::new(); let dictionaries = fbb.create_vector(&self.dictionary_blocks); let record_batches = fbb.create_vector(&self.record_blocks); + #[allow(deprecated)] let preserve_dict_id = self.write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let schema = IpcSchemaEncoder::new() @@ -1144,7 +1182,9 @@ impl StreamWriter { write_options: IpcWriteOptions, ) -> Result { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, preserve_dict_id); @@ -2032,6 +2072,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Field::new_dict("dict", array.data_type().clone(), false, 2, false); let union_fields = [(0, Arc::new(dctfield))].into_iter().collect(); @@ -2049,6 +2090,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); @@ -2065,6 +2107,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Arc::new(Field::new_dict( "dict", array.data_type().clone(), @@ -2085,6 +2128,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![struct_array]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 08b1e6d0b18..96c80974982 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -923,6 +923,7 @@ mod tests { #[test] fn test_dictionary_ordered() { + #[allow(deprecated)] let schema = Schema::new(vec![Field::new_dict( "dict", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 131e23d68b9..7d47c0ae1de 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -38,6 +38,10 @@ pub struct Field { name: String, data_type: DataType, nullable: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. 
@@ -122,6 +126,7 @@ impl Field { /// Creates a new field with the given name, type, and nullability pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -151,6 +156,10 @@ impl Field { } /// Creates a new field that has additional dictionary information + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter." + )] pub fn new_dict( name: impl Into, data_type: DataType, @@ -158,6 +167,7 @@ impl Field { dict_id: i64, dict_is_ordered: bool, ) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -386,19 +396,30 @@ impl Field { /// Returns a vector containing all (potentially nested) `Field` instances selected by the /// dictionary ID they use #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> { self.fields() .into_iter() .filter(|&field| { - matches!(field.data_type(), DataType::Dictionary(_, _)) && field.dict_id == id + #[allow(deprecated)] + let matching_dict_id = field.dict_id == id; + matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id }) .collect() } /// Returns the dictionary ID, if this is a dictionary type. #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub const fn dict_id(&self) -> Option { match self.data_type { + #[allow(deprecated)] DataType::Dictionary(_, _) => Some(self.dict_id), _ => None, } @@ -428,6 +449,7 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { + #[allow(deprecated)] if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError(format!( "Fail to merge schema field '{}' because from dict_id = {} does not match {}", @@ -570,9 +592,11 @@ impl Field { /// * self.metadata is a superset of other.metadata /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { + #[allow(deprecated)] + let matching_dict_id = self.dict_id == other.dict_id; self.name == other.name && self.data_type.contains(&other.data_type) - && self.dict_id == other.dict_id + && matching_dict_id && self.dict_is_ordered == other.dict_is_ordered // self need to be nullable or both of them are not nullable && (self.nullable || !other.nullable) @@ -621,6 +645,7 @@ mod test { fn test_new_dict_with_string() { // Fields should allow owned Strings to support reuse let s = "c1"; + #[allow(deprecated)] Field::new_dict(s, DataType::Int64, false, 4, false); } @@ -738,6 +763,7 @@ mod test { #[test] fn test_fields_with_dict_id() { + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -745,6 +771,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict2", DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()), @@ -781,9 +808,11 @@ mod test { false, ); + #[allow(deprecated)] for field in field.fields_with_dict_id(10) { assert_eq!(dict1, *field); } + #[allow(deprecated)] for field in field.fields_with_dict_id(20) { assert_eq!(dict2, *field); } @@ -798,6 +827,7 @@ mod test { #[test] fn test_field_comparison_case() { // dictionary-encoding properties 
not used for field comparison + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -805,6 +835,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -816,6 +847,7 @@ mod test { assert_eq!(dict1, dict2); assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2)); + #[allow(deprecated)] let dict1 = Field::new_dict( "dict0", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 47c22e2a931..6c79da53f98 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -389,7 +389,12 @@ impl Schema { /// Returns a vector of immutable references to all [`Field`] instances selected by /// the dictionary ID they use. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { + #[allow(deprecated)] self.fields .iter() .flat_map(|f| f.fields_with_dict_id(dict_id)) @@ -638,7 +643,9 @@ mod tests { assert_eq!(first_name.name(), "first_name"); assert_eq!(first_name.data_type(), &DataType::Utf8); assert!(!first_name.is_nullable()); - assert_eq!(first_name.dict_id(), None); + #[allow(deprecated)] + let dict_id = first_name.dict_id(); + assert_eq!(dict_id, None); assert_eq!(first_name.dict_is_ordered(), None); let metadata = first_name.metadata(); @@ -655,7 +662,9 @@ mod tests { interests.data_type(), &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) ); - assert_eq!(interests.dict_id(), Some(123)); + #[allow(deprecated)] + let dict_id = interests.dict_id(); + assert_eq!(dict_id, Some(123)); assert_eq!(interests.dict_is_ordered(), Some(true)); } @@ -691,6 +700,7 @@ mod tests { fn schema_field_with_dict_id() { let schema = person_schema(); + #[allow(deprecated)] let fields_dict_123: Vec<_> = schema .fields_with_dict_id(123) .iter() @@ -698,7 +708,9 @@ mod tests { .collect(); assert_eq!(fields_dict_123, vec!["interests"]); - assert!(schema.fields_with_dict_id(456).is_empty()); + #[allow(deprecated)] + let is_empty = schema.fields_with_dict_id(456).is_empty(); + assert!(is_empty); } fn person_schema() -> Schema { @@ -718,6 +730,7 @@ mod tests { ])), false, ), + #[allow(deprecated)] Field::new_dict( "interests", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index bb6ebf75ec8..59cfac04719 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2587,6 +2587,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -2608,6 +2609,7 @@ mod tests { #[test] fn arrow_writer_primitive_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), @@ -2630,6 +2632,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary_unsigned_index() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", 
DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index d2f5a0bd8d6..16d46bd852d 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -552,8 +552,11 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<& match arrow_hint { Some(hint) => { // If the inferred type is a dictionary, preserve dictionary metadata + #[allow(deprecated)] let field = match (&data_type, hint.dict_id(), hint.dict_is_ordered()) { - (DataType::Dictionary(_, _), Some(id), Some(ordered)) => { + (DataType::Dictionary(_, _), Some(id), Some(ordered)) => + { + #[allow(deprecated)] Field::new_dict(name, data_type, nullable, id, ordered) } _ => Field::new(name, data_type, nullable), diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 0fbcb4856e4..26b0d8b7b88 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -178,6 +178,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { /// Encodes the Arrow schema into the IPC format, and base64 encodes it fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(true, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); @@ -1823,6 +1824,7 @@ mod tests { // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c31", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), From 9ffa06543be51613ea1f509e63f6e7405b7d9989 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 16 Dec 2024 10:32:18 -0800 Subject: [PATCH 11/29] Improvements to UTF-8 statistics truncation (#6870) * fix a few edge cases with utf-8 incrementing * add todo * simplify truncation * add another test * note case where string should render right to left * rework entirely, also avoid UTF8 processing if not required by the schema * more consistent naming * modify some tests to truncate in the middle of a multibyte char * add test and docstring * document truncate_min_value too --- parquet/src/column/writer/mod.rs | 293 +++++++++++++++++++++++++------ 1 file changed, 236 insertions(+), 57 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 16de0ba7898..8dc1d0db447 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -878,24 +878,67 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } + /// Returns `true` if this column's logical type is a UTF-8 string. + fn is_utf8(&self) -> bool { + self.get_descriptor().logical_type() == Some(LogicalType::String) + || self.get_descriptor().converted_type() == ConvertedType::UTF8 + } + + /// Truncates a binary statistic to at most `truncation_length` bytes. + /// + /// If truncation is not possible, returns `data`. + /// + /// The `bool` in the returned tuple indicates whether truncation occurred or not. + /// + /// UTF-8 Note: + /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will + /// also remain valid UTF-8, but may be less tnan `truncation_length` bytes to avoid splitting + /// on non-character boundaries. 
fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) - .and_then(|l| match str::from_utf8(data) { - Ok(str_data) => truncate_utf8(str_data, l), - Err(_) => Some(data[..l].to_vec()), - }) + .and_then(|l| + // don't do extra work if this column isn't UTF-8 + if self.is_utf8() { + match str::from_utf8(data) { + Ok(str_data) => truncate_utf8(str_data, l), + Err(_) => Some(data[..l].to_vec()), + } + } else { + Some(data[..l].to_vec()) + } + ) .map(|truncated| (truncated, true)) .unwrap_or_else(|| (data.to_vec(), false)) } + /// Truncates a binary statistic to at most `truncation_length` bytes, and then increment the + /// final byte(s) to yield a valid upper bound. This may result in a result of less than + /// `truncation_length` bytes if the last byte(s) overflows. + /// + /// If truncation is not possible, returns `data`. + /// + /// The `bool` in the returned tuple indicates whether truncation occurred or not. + /// + /// UTF-8 Note: + /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will + /// also remain valid UTF-8 (but again may be less than `truncation_length` bytes). If `data` + /// does not contain valid UTF-8, then truncation will occur as if the column is non-string + /// binary. fn truncate_max_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) - .and_then(|l| match str::from_utf8(data) { - Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), - Err(_) => increment(data[..l].to_vec()), - }) + .and_then(|l| + // don't do extra work if this column isn't UTF-8 + if self.is_utf8() { + match str::from_utf8(data) { + Ok(str_data) => truncate_and_increment_utf8(str_data, l), + Err(_) => increment(data[..l].to_vec()), + } + } else { + increment(data[..l].to_vec()) + } + ) .map(|truncated| (truncated, true)) .unwrap_or_else(|| (data.to_vec(), false)) } @@ -1418,13 +1461,50 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { (a[1..]) > (b[1..]) } -/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string, -/// while being less than `length` bytes and non-empty +/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 string, +/// while being less than `length` bytes and non-empty. Returns `None` if truncation +/// is not possible within those constraints. +/// +/// The caller guarantees that data.len() > length. fn truncate_utf8(data: &str, length: usize) -> Option> { let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?; Some(data.as_bytes()[..split].to_vec()) } +/// Truncate a UTF-8 slice and increment it's final character. The returned value is the +/// longest such slice that is still a valid UTF-8 string while being less than `length` +/// bytes and non-empty. Returns `None` if no such transformation is possible. +/// +/// The caller guarantees that data.len() > length. +fn truncate_and_increment_utf8(data: &str, length: usize) -> Option> { + // UTF-8 is max 4 bytes, so start search 3 back from desired length + let lower_bound = length.saturating_sub(3); + let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?; + increment_utf8(data.get(..split)?) +} + +/// Increment the final character in a UTF-8 string in such a way that the returned result +/// is still a valid UTF-8 string. 
The returned string may be shorter than the input if the +/// last character(s) cannot be incremented (due to overflow or producing invalid code points). +/// Returns `None` if the string cannot be incremented. +/// +/// Note that this implementation will not promote an N-byte code point to (N+1) bytes. +fn increment_utf8(data: &str) -> Option> { + for (idx, original_char) in data.char_indices().rev() { + let original_len = original_char.len_utf8(); + if let Some(next_char) = char::from_u32(original_char as u32 + 1) { + // do not allow increasing byte width of incremented char + if next_char.len_utf8() == original_len { + let mut result = data.as_bytes()[..idx + original_len].to_vec(); + next_char.encode_utf8(&mut result[idx..]); + return Some(result); + } + } + } + + None +} + /// Try and increment the bytes from right to left. /// /// Returns `None` if all bytes are set to `u8::MAX`. @@ -1441,29 +1521,15 @@ fn increment(mut data: Vec) -> Option> { None } -/// Try and increment the the string's bytes from right to left, returning when the result -/// is a valid UTF8 string. Returns `None` when it can't increment any byte. -fn increment_utf8(mut data: Vec) -> Option> { - for idx in (0..data.len()).rev() { - let original = data[idx]; - let (byte, overflow) = original.overflowing_add(1); - if !overflow { - data[idx] = byte; - if str::from_utf8(&data).is_ok() { - return Some(data); - } - data[idx] = original; - } - } - - None -} - #[cfg(test)] mod tests { - use crate::file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + use crate::{ + file::{properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, writer::SerializedFileWriter}, + schema::parser::parse_message_type, + }; + use core::str; use rand::distributions::uniform::SampleUniform; - use std::sync::Arc; + use std::{fs::File, sync::Arc}; use crate::column::{ page::PageReader, @@ -3140,39 +3206,69 @@ mod tests { #[test] fn test_increment_utf8() { + let test_inc = |o: &str, expected: &str| { + if let Ok(v) = String::from_utf8(increment_utf8(o).unwrap()) { + // Got the expected result... 
+ assert_eq!(v, expected); + // and it's greater than the original string + assert!(*v > *o); + // Also show that BinaryArray level comparison works here + let mut greater = ByteArray::new(); + greater.set_data(Bytes::from(v)); + let mut original = ByteArray::new(); + original.set_data(Bytes::from(o.as_bytes().to_vec())); + assert!(greater > original); + } else { + panic!("Expected incremented UTF8 string to also be valid."); + } + }; + // Basic ASCII case - let v = increment_utf8("hello".as_bytes().to_vec()).unwrap(); - assert_eq!(&v, "hellp".as_bytes()); + test_inc("hello", "hellp"); + + // 1-byte ending in max 1-byte + test_inc("a\u{7f}", "b"); - // Also show that BinaryArray level comparison works here - let mut greater = ByteArray::new(); - greater.set_data(Bytes::from(v)); - let mut original = ByteArray::new(); - original.set_data(Bytes::from("hello".as_bytes().to_vec())); - assert!(greater > original); + // 1-byte max should not truncate as it would need 2-byte code points + assert!(increment_utf8("\u{7f}\u{7f}").is_none()); // UTF8 string - let s = "❤️🧡💛💚💙💜"; - let v = increment_utf8(s.as_bytes().to_vec()).unwrap(); + test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝"); - if let Ok(new) = String::from_utf8(v) { - assert_ne!(&new, s); - assert_eq!(new, "❤️🧡💛💚💙💝"); - assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap()); - } else { - panic!("Expected incremented UTF8 string to also be valid.") - } + // 2-byte without overflow + test_inc("éééé", "éééê"); - // Max UTF8 character - should be a No-Op - let s = char::MAX.to_string(); - assert_eq!(s.len(), 4); - let v = increment_utf8(s.as_bytes().to_vec()); - assert!(v.is_none()); + // 2-byte that overflows lowest byte + test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}"); + + // 2-byte ending in max 2-byte + test_inc("a\u{7ff}", "b"); + + // Max 2-byte should not truncate as it would need 3-byte code points + assert!(increment_utf8("\u{7ff}\u{7ff}").is_none()); + + // 3-byte without overflow [U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). + test_inc("ࠀࠀ", "ࠀࠁ"); + + // 3-byte ending in max 3-byte + test_inc("a\u{ffff}", "b"); + + // Max 3-byte should not truncate as it would need 4-byte code points + assert!(increment_utf8("\u{ffff}\u{ffff}").is_none()); - // Handle multi-byte UTF8 characters - let s = "a\u{10ffff}"; - let v = increment_utf8(s.as_bytes().to_vec()); - assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes()); + // 4-byte without overflow + test_inc("𐀀𐀀", "𐀀𐀁"); + + // 4-byte ending in max unicode + test_inc("a\u{10ffff}", "b"); + + // Max 4-byte should not truncate + assert!(increment_utf8("\u{10ffff}\u{10ffff}").is_none()); + + // Skip over surrogate pair range (0xD800..=0xDFFF) + //test_inc("a\u{D7FF}", "a\u{e000}"); + test_inc("a\u{D7FF}", "b"); } #[test] @@ -3182,7 +3278,6 @@ mod tests { let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); assert_eq!(r.len(), data.as_bytes().len()); assert_eq!(&r, data.as_bytes()); - println!("len is {}", data.len()); // We slice it away from the UTF8 boundary let r = truncate_utf8(data, 13).unwrap(); @@ -3192,6 +3287,90 @@ mod tests { // One multi-byte code point, and a length shorter than it, so we can't slice it let r = truncate_utf8("\u{0836}", 1); assert!(r.is_none()); + + // Test truncate and increment for max bounds on UTF-8 statistics + // 7-bit (i.e. 
ASCII) + let r = truncate_and_increment_utf8("yyyyyyyyy", 8).unwrap(); + assert_eq!(&r, "yyyyyyyz".as_bytes()); + + // 2-byte without overflow + let r = truncate_and_increment_utf8("ééééé", 7).unwrap(); + assert_eq!(&r, "ééê".as_bytes()); + + // 2-byte that overflows lowest byte + let r = truncate_and_increment_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8).unwrap(); + assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes()); + + // max 2-byte should not truncate as it would need 3-byte code points + let r = truncate_and_increment_utf8("߿߿߿߿߿", 8); + assert!(r.is_none()); + + // 3-byte without overflow [U+800, U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). + let r = truncate_and_increment_utf8("ࠀࠀࠀࠀ", 8).unwrap(); + assert_eq!(&r, "ࠀࠁ".as_bytes()); + + // max 3-byte should not truncate as it would need 4-byte code points + let r = truncate_and_increment_utf8("\u{ffff}\u{ffff}\u{ffff}", 8); + assert!(r.is_none()); + + // 4-byte without overflow + let r = truncate_and_increment_utf8("𐀀𐀀𐀀𐀀", 9).unwrap(); + assert_eq!(&r, "𐀀𐀁".as_bytes()); + + // max 4-byte should not truncate + let r = truncate_and_increment_utf8("\u{10ffff}\u{10ffff}", 8); + assert!(r.is_none()); + } + + #[test] + // Check fallback truncation of statistics that should be UTF-8, but aren't + // (see https://github.com/apache/arrow-rs/pull/6870). + fn test_byte_array_truncate_invalid_utf8_statistics() { + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + + // Create Vec containing non-UTF8 bytes + let data = vec![ByteArray::from(vec![128u8; 32]); 7]; + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) + .set_statistics_truncate_length(Some(8)) + .build(), + ); + + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let stats = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap() + .statistics + .as_ref() + .unwrap(); + assert!(!stats.is_max_value_exact.unwrap()); + // Truncation of invalid UTF-8 should fall back to binary truncation, so last byte should + // be incremented by 1. 
+ assert_eq!( + stats.max_value, + Some([128, 128, 128, 128, 128, 128, 128, 129].to_vec()) + ); } #[test] From 2808625bea15401e89343cd184deefca931243e2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 16 Dec 2024 16:32:08 -0500 Subject: [PATCH 12/29] Add `ArrowToParquetSchemaConverter`, deprecate `arrow_to_parquet_schema` (#6840) * Add ArrowToParquetSchemaConverter, deprecate `arrow_to_parquet_schema` et al * Fmt * update test * Update parquet/src/arrow/schema/mod.rs Co-authored-by: Ed Seidl * Apply suggestions from code review Co-authored-by: Ed Seidl * Improve comments * Add more detail to WriterPropertiesBuilder docs * Update parquet/src/file/properties.rs Co-authored-by: Ed Seidl * Fix some more capitalization and add a link to Parquet date spec * Update parquet/src/arrow/schema/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Revert "Update parquet/src/arrow/schema/mod.rs" This reverts commit bd4e2d5135e3e4fe2d3dc61e043659243f4423bb. * rename to ArrowSchemaConverter * change from build --> convert * update doc * fix fmt --------- Co-authored-by: Ed Seidl Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/arrow_writer/mod.rs | 26 ++-- parquet/src/arrow/mod.rs | 8 +- parquet/src/arrow/schema/mod.rs | 172 +++++++++++++++++++++----- parquet/src/file/properties.rs | 37 +++--- 4 files changed, 180 insertions(+), 63 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 59cfac04719..871b140768c 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -30,12 +30,10 @@ use arrow_array::types::*; use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter}; use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef}; -use super::schema::{ - add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, - arrow_to_parquet_schema_with_root, decimal_length_from_precision, -}; +use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision}; use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder; +use crate::arrow::ArrowSchemaConverter; use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; use crate::column::writer::encoder::ColumnValueEncoder; use crate::column::writer::{ @@ -181,10 +179,11 @@ impl ArrowWriter { options: ArrowWriterOptions, ) -> Result { let mut props = options.properties; - let schema = match options.schema_root { - Some(s) => arrow_to_parquet_schema_with_root(&arrow_schema, &s, props.coerce_types())?, - None => arrow_to_parquet_schema(&arrow_schema, props.coerce_types())?, - }; + let mut converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types()); + if let Some(schema_root) = &options.schema_root { + converter = converter.schema_root(schema_root); + } + let schema = converter.convert(&arrow_schema)?; if !options.skip_arrow_metadata { // add serialized arrow schema add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props); @@ -390,9 +389,9 @@ impl ArrowWriterOptions { } /// Set the name of the root parquet schema element (defaults to `"arrow_schema"`) - pub fn with_schema_root(self, name: String) -> Self { + pub fn with_schema_root(self, schema_root: String) -> Self { Self { - schema_root: Some(name), + schema_root: Some(schema_root), ..self } } @@ -538,7 +537,7 @@ impl ArrowColumnChunk { /// # use std::sync::Arc; /// # use arrow_array::*; /// # use arrow_schema::*; -/// # use 
parquet::arrow::arrow_to_parquet_schema; +/// # use parquet::arrow::ArrowSchemaConverter; /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers}; /// # use parquet::file::properties::WriterProperties; /// # use parquet::file::writer::SerializedFileWriter; @@ -550,7 +549,10 @@ impl ArrowColumnChunk { /// /// // Compute the parquet schema /// let props = Arc::new(WriterProperties::default()); -/// let parquet_schema = arrow_to_parquet_schema(schema.as_ref(), props.coerce_types()).unwrap(); +/// let parquet_schema = ArrowSchemaConverter::new() +/// .with_coerce_types(props.coerce_types()) +/// .convert(&schema) +/// .unwrap(); /// /// // Create writers for each of the leaf columns /// let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap(); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 2d09cd19203..d77436bc1ff 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -116,9 +116,13 @@ pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::SchemaDescriptor; use arrow_schema::{FieldRef, Schema}; +// continue to export deprecated methods until they are removed +#[allow(deprecated)] +pub use self::schema::arrow_to_parquet_schema; + pub use self::schema::{ - arrow_to_parquet_schema, parquet_to_arrow_field_levels, parquet_to_arrow_schema, - parquet_to_arrow_schema_by_columns, FieldLevels, + parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, + ArrowSchemaConverter, FieldLevels, }; /// Schema metadata key used to store serialized Arrow IPC schema diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 26b0d8b7b88..4ae3fdb8e5c 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -15,13 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Provides API for converting parquet schema to arrow schema and vice versa. -//! -//! The main interfaces for converting parquet schema to arrow schema are -//! `parquet_to_arrow_schema`, `parquet_to_arrow_schema_by_columns` and -//! `parquet_to_arrow_field`. -//! -//! The interfaces for converting arrow schema to parquet schema is coming. +//! 
Converting Parquet schema <--> Arrow schema: [`ArrowSchemaConverter`] and [parquet_to_arrow_schema] use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -226,27 +220,134 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut } } -/// Convert arrow schema to parquet schema +/// Converter for Arrow schema to Parquet schema /// -/// The name of the root schema element defaults to `"arrow_schema"`, this can be -/// overridden with [`arrow_to_parquet_schema_with_root`] -pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> Result { - arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types) +/// Example: +/// ``` +/// # use std::sync::Arc; +/// # use arrow_schema::{Field, Schema, DataType}; +/// # use parquet::arrow::ArrowSchemaConverter; +/// use parquet::schema::types::{SchemaDescriptor, Type}; +/// use parquet::basic; // note there are two `Type`s in the following example +/// // create an Arrow Schema +/// let arrow_schema = Schema::new(vec![ +/// Field::new("a", DataType::Int64, true), +/// Field::new("b", DataType::Date32, true), +/// ]); +/// // convert the Arrow schema to a Parquet schema +/// let parquet_schema = ArrowSchemaConverter::new() +/// .convert(&arrow_schema) +/// .unwrap(); +/// +/// let expected_parquet_schema = SchemaDescriptor::new( +/// Arc::new( +/// Type::group_type_builder("arrow_schema") +/// .with_fields(vec![ +/// Arc::new( +/// Type::primitive_type_builder("a", basic::Type::INT64) +/// .build().unwrap() +/// ), +/// Arc::new( +/// Type::primitive_type_builder("b", basic::Type::INT32) +/// .with_converted_type(basic::ConvertedType::DATE) +/// .with_logical_type(Some(basic::LogicalType::Date)) +/// .build().unwrap() +/// ), +/// ]) +/// .build().unwrap() +/// ) +/// ); +/// assert_eq!(parquet_schema, expected_parquet_schema); +/// ``` +#[derive(Debug)] +pub struct ArrowSchemaConverter<'a> { + /// Name of the root schema in Parquet + schema_root: &'a str, + /// Should we coerce Arrow types to compatible Parquet types? + /// + /// See docs on [Self::with_coerce_types]` + coerce_types: bool, } -/// Convert arrow schema to parquet schema specifying the name of the root schema element -pub fn arrow_to_parquet_schema_with_root( - schema: &Schema, - root: &str, - coerce_types: bool, -) -> Result { - let fields = schema - .fields() - .iter() - .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new)) - .collect::>()?; - let group = Type::group_type_builder(root).with_fields(fields).build()?; - Ok(SchemaDescriptor::new(Arc::new(group))) +impl Default for ArrowSchemaConverter<'_> { + fn default() -> Self { + Self::new() + } +} + +impl<'a> ArrowSchemaConverter<'a> { + /// Create a new converter + pub fn new() -> Self { + Self { + schema_root: "arrow_schema", + coerce_types: false, + } + } + + /// Should Arrow types be coerced into Parquet native types (default `false`). + /// + /// Setting this option to `true` will result in Parquet files that can be + /// read by more readers, but may lose precision for Arrow types such as + /// [`DataType::Date64`] which have no direct [corresponding Parquet type]. + /// + /// By default, this converter does not coerce to native Parquet types. Enabling type + /// coercion allows for meaningful representations that do not require + /// downstream readers to consider the embedded Arrow schema, and can allow + /// for greater compatibility with other Parquet implementations. However, + /// type coercion also prevents data from being losslessly round-tripped. 
+ /// + /// # Discussion + /// + /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no + /// corresponding Parquet logical type. Thus, they can not be losslessly + /// round-tripped when stored using the appropriate Parquet logical type. + /// For example, some Date64 values may be truncated when stored with + /// parquet's native 32 bit date type. + /// + /// For [`List`] and [`Map`] types, some Parquet readers expect certain + /// schema elements to have specific names (earlier versions of the spec + /// were somewhat ambiguous on this point). Type coercion will use the names + /// prescribed by the Parquet specification, potentially losing naming + /// metadata from the Arrow schema. + /// + /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + /// [corresponding Parquet type]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date + /// + pub fn with_coerce_types(mut self, coerce_types: bool) -> Self { + self.coerce_types = coerce_types; + self + } + + /// Set the root schema element name (defaults to `"arrow_schema"`). + pub fn schema_root(mut self, schema_root: &'a str) -> Self { + self.schema_root = schema_root; + self + } + + /// Convert the specified Arrow [`Schema`] to the desired Parquet [`SchemaDescriptor`] + /// + /// See example in [`ArrowSchemaConverter`] + pub fn convert(&self, schema: &Schema) -> Result { + let fields = schema + .fields() + .iter() + .map(|field| arrow_to_parquet_type(field, self.coerce_types).map(Arc::new)) + .collect::>()?; + let group = Type::group_type_builder(self.schema_root) + .with_fields(fields) + .build()?; + Ok(SchemaDescriptor::new(Arc::new(group))) + } +} + +/// Convert arrow schema to parquet schema +/// +/// The name of the root schema element defaults to `"arrow_schema"`, this can be +/// overridden with [`ArrowSchemaConverter`] +#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")] +pub fn arrow_to_parquet_schema(schema: &Schema) -> Result { + ArrowSchemaConverter::new().convert(schema) } fn parse_key_value_metadata( @@ -1488,7 +1589,10 @@ mod tests { "; let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema) + .unwrap(); assert_eq!( parquet_schema.columns().len(), converted_arrow_schema.columns().len() @@ -1512,7 +1616,10 @@ mod tests { "; let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(false) + .convert(&arrow_schema) + .unwrap(); assert_eq!( parquet_schema.columns().len(), converted_arrow_schema.columns().len() @@ -1668,7 +1775,7 @@ mod tests { Field::new("decimal256", DataType::Decimal256(39, 2), false), ]; let arrow_schema = Schema::new(arrow_fields); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new().convert(&arrow_schema).unwrap(); assert_eq!( parquet_schema.columns().len(), @@ -1705,9 +1812,10 @@ mod tests { 
false, )]; let arrow_schema = Schema::new(arrow_fields); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema); - assert!(converted_arrow_schema.is_err()); converted_arrow_schema.unwrap(); } @@ -1978,7 +2086,9 @@ mod tests { // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?; - let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema, true)?; + let parq_schema_descr = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema)?; let parq_fields = parq_schema_descr.root_schema().get_fields(); assert_eq!(parq_fields.len(), 2); assert_eq!(parq_fields[0].get_basic_info().id(), 1); diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index aac450acd82..7b688333e54 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -16,14 +16,13 @@ // under the License. //! Configuration via [`WriterProperties`] and [`ReaderProperties`] -use std::str::FromStr; -use std::{collections::HashMap, sync::Arc}; - use crate::basic::{Compression, Encoding}; use crate::compression::{CodecOptions, CodecOptionsBuilder}; use crate::file::metadata::KeyValue; use crate::format::SortingColumn; use crate::schema::types::ColumnPath; +use std::str::FromStr; +use std::{collections::HashMap, sync::Arc}; /// Default value for [`WriterProperties::data_page_size_limit`] pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; @@ -780,22 +779,24 @@ impl WriterPropertiesBuilder { self } - /// Sets flag to control if type coercion is enabled (defaults to `false`). + /// Should the writer coerce types to parquet native types (defaults to `false`). /// - /// # Notes - /// Some Arrow types do not have a corresponding Parquet logical type. - /// Affected Arrow data types include `Date64`, `Timestamp` and `Interval`. - /// Also, for [`List`] and [`Map`] types, Parquet expects certain schema elements - /// to have specific names to be considered fully compliant. - /// Writers have the option to coerce these types and names to match those required - /// by the Parquet specification. - /// This type coercion allows for meaningful representations that do not require - /// downstream readers to consider the embedded Arrow schema, and can allow for greater - /// compatibility with other Parquet implementations. However, type - /// coercion also prevents the data from being losslessly round-tripped. - /// - /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists - /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + /// Leaving this option the default `false` will ensure the exact same data + /// written to parquet using this library will be read. + /// + /// Setting this option to `true` will result in parquet files that can be + /// read by more readers, but potentially lose information in the process. + /// + /// * Types such as [`DataType::Date64`], which have no direct corresponding + /// Parquet type, may be stored with lower precision. + /// + /// * The internal field names of `List` and `Map` types will be renamed if + /// necessary to match what is required by the newest Parquet specification. 
+ /// + /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details + /// + /// [`DataType::Date64`]: arrow_schema::DataType::Date64 + /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types pub fn set_coerce_types(mut self, coerce_types: bool) -> Self { self.coerce_types = coerce_types; self From 9a74a2588ec99221fd66565b9099b2c0056492ec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:52:53 -0500 Subject: [PATCH 13/29] Update sysinfo requirement from 0.32.0 to 0.33.0 (#6835) * Update sysinfo requirement from 0.32.0 to 0.33.0 Updates the requirements on [sysinfo](https://github.com/GuillaumeGomez/sysinfo) to permit the latest version. - [Changelog](https://github.com/GuillaumeGomez/sysinfo/blob/master/CHANGELOG.md) - [Commits](https://github.com/GuillaumeGomez/sysinfo/commits) --- updated-dependencies: - dependency-name: sysinfo dependency-type: direct:production ... Signed-off-by: dependabot[bot] * chore: Update example for API change --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Andrew Lamb --- parquet/Cargo.toml | 2 +- parquet/examples/write_parquet.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 4064baba094..19f89071077 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -67,7 +67,7 @@ hashbrown = { version = "0.15", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } -sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] } +sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } [dev-dependencies] diff --git a/parquet/examples/write_parquet.rs b/parquet/examples/write_parquet.rs index 1b51d40c813..ebdd9527b6f 100644 --- a/parquet/examples/write_parquet.rs +++ b/parquet/examples/write_parquet.rs @@ -28,7 +28,7 @@ use parquet::arrow::ArrowWriter as ParquetWriter; use parquet::basic::Encoding; use parquet::errors::Result; use parquet::file::properties::{BloomFilterPosition, WriterProperties}; -use sysinfo::{MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System}; +use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System}; #[derive(ValueEnum, Clone)] enum BloomFilterPositionArg { @@ -97,8 +97,7 @@ fn main() -> Result<()> { let file = File::create(args.path).unwrap(); let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?; - let mut system = - System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything())); + let mut system = System::new_with_specifics(RefreshKind::everything()); eprintln!( "{} Writing {} batches of {} rows. 
RSS = {}", now(), From 123045cc766d42d1eb06ee8bb3f09e39ea995ddc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 16 Dec 2024 16:07:57 -0800 Subject: [PATCH 14/29] deprecate max_statistics_size writer property (#6884) --- parquet/src/bin/parquet-rewrite.rs | 1 + parquet/src/file/properties.rs | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs index eaecda50375..5a1ec94d550 100644 --- a/parquet/src/bin/parquet-rewrite.rs +++ b/parquet/src/bin/parquet-rewrite.rs @@ -242,6 +242,7 @@ fn main() { if let Some(value) = args.dictionary_page_size_limit { writer_properties_builder = writer_properties_builder.set_dictionary_page_size_limit(value); } + #[allow(deprecated)] if let Some(value) = args.max_statistics_size { writer_properties_builder = writer_properties_builder.set_max_statistics_size(value); } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 7b688333e54..dc918f6b563 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -41,6 +41,7 @@ pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; /// Default value for [`WriterProperties::statistics_enabled`] pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; /// Default value for [`WriterProperties::max_statistics_size`] +#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; /// Default value for [`WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; @@ -350,7 +351,9 @@ impl WriterProperties { /// Returns max size for statistics. /// Only applicable if statistics are enabled. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn max_statistics_size(&self, col: &ColumnPath) -> usize { + #[allow(deprecated)] self.column_properties .get(col) .and_then(|c| c.max_statistics_size()) @@ -601,7 +604,9 @@ impl WriterPropertiesBuilder { /// Sets default max statistics size for all columns (defaults to `4096`). /// /// Applicable only if statistics are enabled. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn set_max_statistics_size(mut self, value: usize) -> Self { + #[allow(deprecated)] self.default_column_properties .set_max_statistics_size(value); self @@ -706,7 +711,9 @@ impl WriterPropertiesBuilder { /// Sets max size for statistics for a specific column. /// /// Takes precedence over [`Self::set_max_statistics_size`]. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { + #[allow(deprecated)] self.get_mut_props(col).set_max_statistics_size(value); self } @@ -896,6 +903,7 @@ struct ColumnProperties { codec: Option, dictionary_enabled: Option, statistics_enabled: Option, + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] max_statistics_size: Option, /// bloom filter related properties bloom_filter_properties: Option, @@ -934,6 +942,8 @@ impl ColumnProperties { } /// Sets max size for statistics for this column. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] + #[allow(deprecated)] fn set_max_statistics_size(&mut self, value: usize) { self.max_statistics_size = Some(value); } @@ -998,7 +1008,9 @@ impl ColumnProperties { } /// Returns optional max size in bytes for statistics. 
+ #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] fn max_statistics_size(&self) -> Option { + #[allow(deprecated)] self.max_statistics_size } @@ -1142,10 +1154,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("col")), DEFAULT_STATISTICS_ENABLED ); - assert_eq!( - props.max_statistics_size(&ColumnPath::from("col")), - DEFAULT_MAX_STATISTICS_SIZE - ); assert!(props .bloom_filter_properties(&ColumnPath::from("col")) .is_none()); @@ -1222,13 +1230,11 @@ mod tests { .set_compression(Compression::GZIP(Default::default())) .set_dictionary_enabled(false) .set_statistics_enabled(EnabledStatistics::None) - .set_max_statistics_size(50) // specific column settings .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY) .set_column_dictionary_enabled(ColumnPath::from("col"), true) .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk) - .set_column_max_statistics_size(ColumnPath::from("col"), 123) .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64) .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1) @@ -1260,7 +1266,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("a")), EnabledStatistics::None ); - assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50); assert_eq!( props.encoding(&ColumnPath::from("col")), @@ -1275,7 +1280,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("col")), EnabledStatistics::Chunk ); - assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); assert_eq!( props.bloom_filter_properties(&ColumnPath::from("col")), Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 }) From f8f24cf58311fee7bb2843144a615a80fb281fcb Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:48:34 +0200 Subject: [PATCH 15/29] docs: fix typo (#6890) --- arrow-array/src/array/dictionary_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index e69de783dd5..f852b57fb65 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -249,7 +249,7 @@ pub struct DictionaryArray { /// map to the real values. keys: PrimitiveArray, - /// Array of dictionary values (can by any DataType). + /// Array of dictionary values (can be any DataType). values: ArrayRef, /// Values are ordered. 
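A minimal sketch of how downstream writer configuration reads once the `max_statistics_size` knob from PATCH 14/29 above is deprecated. This is not part of the patch series; it assumes the parquet 54.0.0 `WriterProperties` builder API, and the values shown are illustrative only.

```rust
// Minimal sketch, assuming the parquet 54.0.0 builder API; values are illustrative.
use parquet::file::properties::{EnabledStatistics, WriterProperties};

fn writer_props() -> WriterProperties {
    WriterProperties::builder()
        // Statistics generation is still controlled here.
        .set_statistics_enabled(EnabledStatistics::Page)
        // .set_max_statistics_size(4096) would now emit a deprecation warning
        // and, per the deprecation note, no longer affects the written file.
        .build()
}
```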
From 54dccadccc6b599b93a46aef5f03dfd4d9b7349e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 17 Dec 2024 10:49:07 -0800 Subject: [PATCH 16/29] fix deprecation notice (#6889) --- parquet/src/arrow/schema/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 4ae3fdb8e5c..5d3d7b2a654 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -345,7 +345,7 @@ impl<'a> ArrowSchemaConverter<'a> { /// /// The name of the root schema element defaults to `"arrow_schema"`, this can be /// overridden with [`ArrowSchemaConverter`] -#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")] +#[deprecated(since = "54.0.0", note = "Use `ArrowSchemaConverter` instead")] pub fn arrow_to_parquet_schema(schema: &Schema) -> Result { ArrowSchemaConverter::new().convert(schema) } From 10793608c84f9d26d89d32e2cf72107e632ee84c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 04:52:44 -0500 Subject: [PATCH 17/29] Add Field::with_dict_is_ordered (#6885) --- arrow-schema/src/field.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 7d47c0ae1de..13bb7abf51b 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -426,6 +426,19 @@ impl Field { } /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type. + /// + /// # Example + /// ``` + /// # use arrow_schema::{DataType, Field}; + /// // non dictionaries do not have a dict is ordered flat + /// let field = Field::new("c1", DataType::Int64, false); + /// assert_eq!(field.dict_is_ordered(), None); + /// // by default dictionary is not ordered + /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false); + /// assert_eq!(field.dict_is_ordered(), Some(false)); + /// let field = field.with_dict_is_ordered(true); + /// assert_eq!(field.dict_is_ordered(), Some(true)); + /// ``` #[inline] pub const fn dict_is_ordered(&self) -> Option { match self.data_type { @@ -434,6 +447,18 @@ impl Field { } } + /// Set the is ordered field for this `Field`, if it is a dictionary. + /// + /// Does nothing if this is not a dictionary type. + /// + /// See [`Field::dict_is_ordered`] for more information. + pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self { + if matches!(self.data_type, DataType::Dictionary(_, _)) { + self.dict_is_ordered = dict_is_ordered; + }; + self + } + /// Merge this field into self if it is compatible. /// /// Struct fields are merged recursively. From 3317989711a4c081f5118d8fe126280ced7a92fb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 05:52:15 -0500 Subject: [PATCH 18/29] Minor: make it easier to find fix instructions when `cargo fmt` on parquet fails (#6886) * Minor: make it easier to find instructions when fmt fails * purposely introduce a fmt issue * Revert "purposely introduce a fmt issue" This reverts commit 440e52079135df85128b15936425d2b5af488007. 
* Update .github/workflows/rust.yml Co-authored-by: Ed Seidl --------- Co-authored-by: Ed Seidl --- .github/workflows/rust.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ff5040fd294..72a53263d33 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -101,12 +101,13 @@ jobs: - name: Format arrow run: cargo fmt --all -- --check - name: Format parquet - # Many modules in parquet are skipped, so check parquet separately. If this check fails, run: - # cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs` - # from the top level arrow-rs directory and check in the result. + # Many modules in parquet are skipped, so check parquet separately # https://github.com/apache/arrow-rs/issues/6179 working-directory: parquet - run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs` + run: | + # if this fails, run this from the parquet directory: + # cargo fmt -p parquet -- --config skip_children=true `find . -name "*.rs" \! -name format.rs` + cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs` - name: Format object_store working-directory: object_store run: cargo fmt --all -- --check From 565c24b8071269b02c3937e34c51eacf0f4cbad6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 05:52:40 -0500 Subject: [PATCH 19/29] Minor: add comments explaining bad MSRV, output in json (#6857) * Minor: add comments explaining bad MSRV * purposely introduce msrv brek * output in JSON format * Revert "purposely introduce msrv brek" This reverts commit 61872b69a5a85748031fe852e48b8e3d3381d270. --- .github/workflows/rust.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 72a53263d33..044250b7043 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -113,7 +113,7 @@ jobs: run: cargo fmt --all -- --check msrv: - name: Verify MSRV + name: Verify MSRV (Minimum Supported Rust Version) runs-on: ubuntu-latest container: image: amd64/rust @@ -127,13 +127,19 @@ jobs: run: cargo update -p ahash --precise 0.8.7 - name: Check arrow working-directory: arrow - run: cargo msrv --log-target stdout verify + run: | + # run `cd arrow; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json - name: Check parquet working-directory: parquet - run: cargo msrv --log-target stdout verify + run: | + # run `cd parquet; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json - name: Check arrow-flight working-directory: arrow-flight - run: cargo msrv --log-target stdout verify + run: | + # run `cd arrow-flight; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json - name: Downgrade object_store dependencies working-directory: object_store # Necessary because tokio 1.30.0 updates MSRV to 1.63 @@ -143,4 +149,6 @@ jobs: cargo update -p url --precise 2.5.0 - name: Check object_store working-directory: object_store - run: cargo msrv --log-target stdout verify + run: | + # run `cd object_store; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json From 61b7876d7a7d9de93e9db37bc96f87b1d646037f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 07:25:59 -0500 Subject: [PATCH 20/29] Add 53.4.0 to release 
schedule (#6896) * Add 54.4.0 to release schedule * prettoer --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 57794b1d6a4..f995ff6ad47 100644 --- a/README.md +++ b/README.md @@ -63,13 +63,14 @@ is described in the [contributing] guide. Planned Release Schedule -| Approximate Date | Version | Notes | -| ---------------- | -------- | --------------------------------------- | -| Nov 2024 | `53.3.0` | Minor, NO breaking API changes | -| Dec 2024 | `54.0.0` | Major, potentially breaking API changes | -| Jan 2025 | `54.1.0` | Minor, NO breaking API changes | -| Feb 2025 | `54.2.0` | Minor, NO breaking API changes | -| Mar 2025 | `55.0.0` | Major, potentially breaking API changes | +| Approximate Date | Version | Notes | +| ---------------- | -------- | ------------------------------------------ | +| Nov 2024 | `53.3.0` | Minor, NO breaking API changes | +| Dec 2024 | `54.0.0` | Major, potentially breaking API changes | +| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) | +| Jan 2025 | `54.1.0` | Minor, NO breaking API changes | +| Feb 2025 | `54.2.0` | Minor, NO breaking API changes | +| Mar 2025 | `55.0.0` | Major, potentially breaking API changes | [this ticket]: https://github.com/apache/arrow-rs/issues/5368 [semantic versioning]: https://semver.org/ From 13f3f213360441e6e7baaa1b98d23a0690d9572e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 07:49:51 -0500 Subject: [PATCH 21/29] Add deprecation / API removal policy (#6852) * Add deprecation / API removal policy * Increase proposal to 2 releases * change from policy to guidelines, add flexibility * prettier * Make instructions more actionable --- README.md | 27 +++++++++++++++++++++++++++ arrow/README.md | 2 +- parquet/README.md | 2 +- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f995ff6ad47..723249ad29e 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,33 @@ versions approximately every 2 months. [`object_store`]: https://crates.io/crates/object_store +### Deprecation Guidelines + +Minor releases may deprecate, but not remove APIs. Deprecating APIs allows +downstream Rust programs to still compile, but generate compiler warnings. This +gives downstream crates time to migrate prior to API removal. + +To deprecate an API: + +- Mark the API as deprecated using `#[deprecated]` and specify the exact arrow-rs version in which it was deprecated +- Concisely describe the preferred API to help the user transition + +The deprecated version is the next version which will be released (please +consult the list above). To mark the API as deprecated, use the +`#[deprecated(since = "...", note = "...")]` attribute. + +Foe example + +```rust +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] +``` + +In general, deprecated APIs will remain in the codebase for at least two major releases after +they were deprecated (typically between 6 - 9 months later). For example, an API +deprecated in `51.3.0` can be removed in `54.0.0` (or later). Deprecated APIs +may be removed earlier or later than these guidelines at the discretion of the +maintainers. + ## Related Projects There are several related crates in different repositories diff --git a/arrow/README.md b/arrow/README.md index a1444005ec0..79aefaae905 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -37,7 +37,7 @@ This crate is tested with the latest stable version of Rust. 
We do not currently The `arrow` crate follows the [SemVer standard] defined by Cargo and works well within the Rust crate ecosystem. See the [repository README] for more details on -the release schedule and version. +the release schedule, version and deprecation policy. [SemVer standard]: https://doc.rust-lang.org/cargo/reference/semver.html [repository README]: https://github.com/apache/arrow-rs diff --git a/parquet/README.md b/parquet/README.md index e9f52ff279d..9ff1d921d69 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -36,7 +36,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The `parquet` crate follows the [SemVer standard] defined by Cargo and works well within the Rust crate ecosystem. See the [repository README] for more details on -the release schedule and version. +the release schedule, version and deprecation policy. [semver standard]: https://doc.rust-lang.org/cargo/reference/semver.html [repository readme]: https://github.com/apache/arrow-rs From cbe176533b227f31f467b9c9b512d65a8406ce10 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 18 Dec 2024 05:57:53 -0800 Subject: [PATCH 22/29] Enable string-based column projections from Parquet files (#6871) * add function to create ProjectionMask from column names * add some more tests --- parquet/src/arrow/arrow_reader/mod.rs | 68 ++++++++++ parquet/src/arrow/mod.rs | 178 +++++++++++++++++++++++++- parquet/src/arrow/schema/mod.rs | 11 ++ 3 files changed, 256 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 378884a1c43..6eba04c86f9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -989,6 +989,21 @@ mod tests { assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]); } + #[test] + fn test_arrow_reader_single_column_by_name() { + let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet"); + + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let original_schema = Arc::clone(builder.schema()); + + let mask = ProjectionMask::columns(builder.parquet_schema(), ["blog_id"]); + let reader = builder.with_projection(mask).build().unwrap(); + + // Verify that the schema was correctly parsed + assert_eq!(1, reader.schema().fields().len()); + assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]); + } + #[test] fn test_null_column_reader_test() { let mut file = tempfile::tempfile().unwrap(); @@ -2563,6 +2578,59 @@ mod tests { } } + #[test] + // same as test_read_structs but constructs projection mask via column names + fn test_read_structs_by_name() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/nested_structs.rust.parquet"); + let file = File::open(&path).unwrap(); + let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); + + for batch in record_batch_reader { + batch.unwrap(); + } + + let file = File::open(&path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + + let mask = ProjectionMask::columns( + builder.parquet_schema(), + ["roll_num.count", "PC_CUR.mean", "PC_CUR.sum"], + ); + let projected_reader = builder + .with_projection(mask) + .with_batch_size(60) + .build() + .unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new( + "roll_num", + ArrowDataType::Struct(Fields::from(vec![Field::new( + "count", + ArrowDataType::UInt64, + false, + )])), + false, + ), + Field::new( + "PC_CUR", + 
ArrowDataType::Struct(Fields::from(vec![ + Field::new("mean", ArrowDataType::Int64, false), + Field::new("sum", ArrowDataType::Int64, false), + ])), + false, + ), + ]); + + assert_eq!(&expected_schema, projected_reader.schema().as_ref()); + + for batch in projected_reader { + let batch = batch.unwrap(); + assert_eq!(batch.schema().as_ref(), &expected_schema); + } + } + #[test] fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index d77436bc1ff..6777e00fb05 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -108,12 +108,14 @@ pub mod async_writer; mod record_reader; experimental!(mod schema); +use std::sync::Arc; + pub use self::arrow_writer::ArrowWriter; #[cfg(feature = "async")] pub use self::async_reader::ParquetRecordBatchStreamBuilder; #[cfg(feature = "async")] pub use self::async_writer::AsyncArrowWriter; -use crate::schema::types::SchemaDescriptor; +use crate::schema::types::{SchemaDescriptor, Type}; use arrow_schema::{FieldRef, Schema}; // continue to export deprecated methods until they are removed @@ -210,6 +212,71 @@ impl ProjectionMask { Self { mask: Some(mask) } } + // Given a starting point in the schema, do a DFS for that node adding leaf paths to `paths`. + fn find_leaves(root: &Arc, parent: Option<&String>, paths: &mut Vec) { + let path = parent + .map(|p| [p, root.name()].join(".")) + .unwrap_or(root.name().to_string()); + if root.is_group() { + for child in root.get_fields() { + Self::find_leaves(child, Some(&path), paths); + } + } else { + // Reached a leaf, add to paths + paths.push(path); + } + } + + /// Create a [`ProjectionMask`] which selects only the named columns + /// + /// All leaf columns that fall below a given name will be selected. For example, given + /// the schema + /// ```ignore + /// message schema { + /// OPTIONAL group a (MAP) { + /// REPEATED group key_value { + /// REQUIRED BYTE_ARRAY key (UTF8); // leaf index 0 + /// OPTIONAL group value (MAP) { + /// REPEATED group key_value { + /// REQUIRED INT32 key; // leaf index 1 + /// REQUIRED BOOLEAN value; // leaf index 2 + /// } + /// } + /// } + /// } + /// REQUIRED INT32 b; // leaf index 3 + /// REQUIRED DOUBLE c; // leaf index 4 + /// } + /// ``` + /// `["a.key_value.value", "c"]` would return leaf columns 1, 2, and 4. `["a"]` would return + /// columns 0, 1, and 2. + /// + /// Note: repeated or out of order indices will not impact the final mask. + /// + /// i.e. `["b", "c"]` will construct the same mask as `["c", "b", "c"]`. 
+ pub fn columns<'a>( + schema: &SchemaDescriptor, + names: impl IntoIterator, + ) -> Self { + // first make vector of paths for leaf columns + let mut paths: Vec = vec![]; + for root in schema.root_schema().get_fields() { + Self::find_leaves(root, None, &mut paths); + } + assert_eq!(paths.len(), schema.num_columns()); + + let mut mask = vec![false; schema.num_columns()]; + for name in names { + for idx in 0..schema.num_columns() { + if paths[idx].starts_with(name) { + mask[idx] = true; + } + } + } + + Self { mask: Some(mask) } + } + /// Returns true if the leaf column `leaf_idx` is included by the mask pub fn leaf_included(&self, leaf_idx: usize) -> bool { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) @@ -246,10 +313,14 @@ mod test { use crate::arrow::ArrowWriter; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter}; use crate::file::properties::{EnabledStatistics, WriterProperties}; + use crate::schema::parser::parse_message_type; + use crate::schema::types::SchemaDescriptor; use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use bytes::Bytes; use std::sync::Arc; + use super::ProjectionMask; + #[test] // Reproducer for https://github.com/apache/arrow-rs/issues/6464 fn test_metadata_read_write_partial_offset() { @@ -375,4 +446,109 @@ mod test { .unwrap(); Bytes::from(buf) } + + #[test] + fn test_mask_from_column_names() { + let message_type = " + message test_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + } + } + } + REQUIRED INT32 b; + REQUIRED DOUBLE c; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["foo", "bar"]); + assert_eq!(mask.mask.unwrap(), vec![false; 5]); + + let mask = ProjectionMask::columns(&schema, []); + assert_eq!(mask.mask.unwrap(), vec![false; 5]); + + let mask = ProjectionMask::columns(&schema, ["a", "c"]); + assert_eq!(mask.mask.unwrap(), [true, true, true, false, true]); + + let mask = ProjectionMask::columns(&schema, ["a.key_value.key", "c"]); + assert_eq!(mask.mask.unwrap(), [true, false, false, false, true]); + + let mask = ProjectionMask::columns(&schema, ["a.key_value.value", "b"]); + assert_eq!(mask.mask.unwrap(), [false, true, true, true, false]); + + let message_type = " + message test_schema { + OPTIONAL group a (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL BYTE_ARRAY element (UTF8); + } + } + } + } + } + } + REQUIRED INT32 b; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = ProjectionMask::columns(&schema, ["a.list.element", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = + ProjectionMask::columns(&schema, ["a.list.element.list.element.list.element", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = ProjectionMask::columns(&schema, ["b"]); + assert_eq!(mask.mask.unwrap(), [false, true]); + + let message_type = " + message test_schema { + OPTIONAL INT32 a; + OPTIONAL INT32 b; + OPTIONAL INT32 c; + OPTIONAL INT32 d; + OPTIONAL 
INT32 e; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true, false, false, false]); + + let mask = ProjectionMask::columns(&schema, ["d", "b", "d"]); + assert_eq!(mask.mask.unwrap(), [false, true, false, true, false]); + + let message_type = " + message test_schema { + OPTIONAL INT32 a; + OPTIONAL INT32 b; + OPTIONAL INT32 a; + OPTIONAL INT32 d; + OPTIONAL INT32 e; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "e"]); + assert_eq!(mask.mask.unwrap(), [true, false, true, false, true]); + } } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 5d3d7b2a654..212dec52583 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -1399,6 +1399,17 @@ mod tests { for i in 0..arrow_fields.len() { assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } + + let mask = + ProjectionMask::columns(&parquet_schema, ["group2.leaf4", "group1.leaf1", "leaf5"]); + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(&parquet_schema, mask, None).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); + } } #[test] From fc814bca6b743010bcde972e611c8ff8ea68c9f5 Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 18 Dec 2024 21:58:19 +0800 Subject: [PATCH 23/29] doc: add comment for timezone string (#6899) * doc: add comment for timezone string Signed-off-by: xxchan * Update arrow-schema/src/datatype.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Signed-off-by: xxchan Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-schema/src/datatype.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index a6333c80480..7cd53b13c73 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -196,6 +196,14 @@ pub enum DataType { /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); /// ``` + /// + /// Timezone string parsing + /// ----------------------- + /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930". + /// + /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/) + /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) + /// timezones. Timestamp(TimeUnit, Option>), /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days. 
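The `ProjectionMask::columns` API introduced in PATCH 22/29 above selects leaf columns by dotted path. Below is a minimal usage sketch, not part of the patch series; the file path is a placeholder and the column names assume the example map schema from that patch's doc comment.

```rust
use std::fs::File;

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;

fn main() {
    // Placeholder path: any Parquet file whose schema contains the named columns.
    let file = File::open("data.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();

    // Naming a group (e.g. "a") selects all of its leaves; dotted paths select
    // individual leaves. Repeated or out-of-order names do not change the mask.
    let mask = ProjectionMask::columns(builder.parquet_schema(), ["b", "a.key_value.key"]);

    let reader = builder.with_projection(mask).build().unwrap();
    for batch in reader {
        println!("read {} rows", batch.unwrap().num_rows());
    }
}
```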
From 2887cc1030b2954ffcaba30f6a6d566b7017dc25 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 19 Dec 2024 08:56:05 -0500 Subject: [PATCH 24/29] Update version to 54.0.0, add CHANGELOG (#6894) * Update version to 54.0.0 * Update changelog * update notes * updtes * update --- CHANGELOG-old.md | 170 ++++++++++++++++++++++++++++++ CHANGELOG.md | 173 +++++++++++++++++-------------- Cargo.toml | 32 +++--- arrow-flight/README.md | 2 +- dev/release/update_change_log.sh | 4 +- 5 files changed, 283 insertions(+), 98 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 376da627711..3fb17b390ac 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,176 @@ # Historical Changelog +## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) + +- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) +- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- DictionaryHandling does not recurse into Map fields [\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Array writer output empty when no record is written [\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. 
[\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) + +**Closed issues:** + +- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) +- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) +- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix LIKE with escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- minor fix on checking index 
[\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) +- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) +- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) +- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve test\_auth error message when contains\(\) fails [\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Include offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) +- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([jonmmease](https://github.com/jonmmease)) +- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) +- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make downcast macros hygenic \(\#6400\) [\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix arrow-json writer empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) +- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) +- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) + +**Implemented enhancements:** + +- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555) +- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waynexia](https://github.com/waynexia)) +- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker)) + +**Fixed bugs:** + +- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) +- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- DictionaryHandling does not recurse into Map fields 
[\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Array writer output empty when no record is written [\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) + +**Closed issues:** + +- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) +- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) +- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- 
Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix LIKE with escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) +- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) +- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) +- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve test\_auth error message when contains\(\) fails 
[\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Include offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) +- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) +- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) +- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make downcast macros hygenic \(\#6400\) 
[\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix arrow-json writer empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) +- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) +- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me)) +- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev)) +- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi)) ## [53.2.0](https://github.com/apache/arrow-rs/tree/53.2.0) (2024-10-21) [Full Changelog](https://github.com/apache/arrow-rs/compare/53.1.0...53.2.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b729360608..a7f2a4ff34d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,101 +19,116 @@ # Changelog -## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) +## [54.0.0](https://github.com/apache/arrow-rs/tree/54.0.0) (2024-12-18) -[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.3.0...54.0.0) + +**Breaking changes:** + +- avoid redundant parsing of repeated value in RleDecoder [\#6834](https://github.com/apache/arrow-rs/pull/6834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Handling nullable DictionaryArray in CSV parser [\#6830](https://github.com/apache/arrow-rs/pull/6830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([edmondop](https://github.com/edmondop)) +- fix\(flightsql\): remove Any encoding of DoPutUpdateResult [\#6825](https://github.com/apache/arrow-rs/pull/6825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([davisp](https://github.com/davisp)) +- arrow-ipc: Default to not preserving dict IDs [\#6788](https://github.com/apache/arrow-rs/pull/6788) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Remove some very old deprecated functions [\#6774](https://github.com/apache/arrow-rs/pull/6774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- update to pyo3 0.23.0 [\#6745](https://github.com/apache/arrow-rs/pull/6745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Remove APIs deprecated since v 4.4.0 [\#6722](https://github.com/apache/arrow-rs/pull/6722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Return `None` when Parquet page indexes are not present in file [\#6639](https://github.com/apache/arrow-rs/pull/6639) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add `ParquetError::NeedMoreData` mark `ParquetError` as `non_exhaustive` [\#6630](https://github.com/apache/arrow-rs/pull/6630) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove APIs deprecated since v 2.0.0 [\#6609](https://github.com/apache/arrow-rs/pull/6609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) **Implemented enhancements:** -- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555) -- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([waynexia](https://github.com/waynexia)) -- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker)) +- Parquet schema hint doesn't support integer types upcasting [\#6891](https://github.com/apache/arrow-rs/issues/6891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet UTF-8 max statistics are overly pessimistic [\#6867](https://github.com/apache/arrow-rs/issues/6867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add builder support for Int8 keys [\#6844](https://github.com/apache/arrow-rs/issues/6844) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Formalize the name of the nested `Field` in a list [\#6784](https://github.com/apache/arrow-rs/issues/6784) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Allow disabling the writing of Parquet Offset Index [\#6778](https://github.com/apache/arrow-rs/issues/6778) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `parquet::record::make_row` is not exposed to users, leaving no option to users to manually create `Row` objects [\#6761](https://github.com/apache/arrow-rs/issues/6761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Avoid `from_num_days_from_ce_opt` calls in `timestamp_s_to_datetime` if we don't need [\#6746](https://github.com/apache/arrow-rs/issues/6746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Temporal -\> Utf8View casting [\#6734](https://github.com/apache/arrow-rs/issues/6734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Option To Coerce List Type on Parquet Write [\#6733](https://github.com/apache/arrow-rs/issues/6733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Numeric -\> Utf8View casting [\#6714](https://github.com/apache/arrow-rs/issues/6714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Utf8View \<=\> boolean casting [\#6713](https://github.com/apache/arrow-rs/issues/6713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) -- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- DictionaryHandling does not recurse into Map fields [\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Array writer output empty when no record is written 
[\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Buffer::bit_slice` loses length with byte-aligned offsets [\#6895](https://github.com/apache/arrow-rs/issues/6895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet arrow writer doesn't track memory size correctly for fixed sized lists [\#6839](https://github.com/apache/arrow-rs/issues/6839) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Casting Decimal128 to Decimal128 with smaller precision produces incorrect results in some cases [\#6833](https://github.com/apache/arrow-rs/issues/6833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Should empty nullable dictionary be parsed as null from arrow-csv? [\#6821](https://github.com/apache/arrow-rs/issues/6821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Array take doesn't make fields nullable [\#6809](https://github.com/apache/arrow-rs/issues/6809) +- Arrow Flight Encodes a Slice's List Offsets If the slice offset is starts with zero [\#6803](https://github.com/apache/arrow-rs/issues/6803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet readers incorrectly interpret legacy nested lists [\#6756](https://github.com/apache/arrow-rs/issues/6756) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- filter\_bits under-allocates resulting boolean buffer [\#6750](https://github.com/apache/arrow-rs/issues/6750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Multi-language support issues with Arrow FlightSQL client's execute\_update and execute\_ingest methods [\#6545](https://github.com/apache/arrow-rs/issues/6545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Documentation updates:** -- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Performance improvements:** - -- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Should we document at what 
rate deprecated APIs are removed? [\#6851](https://github.com/apache/arrow-rs/issues/6851) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix docstring for `Format::with_header` in `arrow-csv` [\#6856](https://github.com/apache/arrow-rs/pull/6856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kylebarron](https://github.com/kylebarron)) +- Add deprecation / API removal policy [\#6852](https://github.com/apache/arrow-rs/pull/6852) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Minor: add example for creating `SchemaDescriptor` [\#6841](https://github.com/apache/arrow-rs/pull/6841) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- chore: enrich panic context when BooleanBuffer fails to create [\#6810](https://github.com/apache/arrow-rs/pull/6810) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) **Closed issues:** -- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] GetCatalogsBuilder does not sort the catalog names [\#6807](https://github.com/apache/arrow-rs/issues/6807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add a lint to automatically check for unused dependencies [\#6796](https://github.com/apache/arrow-rs/issues/6796) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Merged pull requests:** -- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) -- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) -- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) -- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) -- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) -- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Fix LIKE with 
escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) -- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) -- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) -- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) -- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) -- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) -- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) -- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve test\_auth error message when contains\(\) fails [\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) -- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) -- Include 
offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) -- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) -- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) -- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) -- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) -- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Make downcast macros hygenic \(\#6400\) [\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix arrow-json writer 
empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) -- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) -- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me)) -- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev)) -- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi)) +- doc: add comment for timezone string [\#6899](https://github.com/apache/arrow-rs/pull/6899) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) +- docs: fix typo [\#6890](https://github.com/apache/arrow-rs/pull/6890) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Minor: Fix deprecation notice for `arrow_to_parquet_schema` [\#6889](https://github.com/apache/arrow-rs/pull/6889) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add Field::with\_dict\_is\_ordered [\#6885](https://github.com/apache/arrow-rs/pull/6885) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Deprecate "max statistics size" property in `WriterProperties` [\#6884](https://github.com/apache/arrow-rs/pull/6884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add deprecation warnings for everything related to `dict_id` [\#6873](https://github.com/apache/arrow-rs/pull/6873) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([brancz](https://github.com/brancz)) +- Enable matching temporal as from\_type to Utf8View [\#6872](https://github.com/apache/arrow-rs/pull/6872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kev1n8](https://github.com/Kev1n8)) +- Enable string-based column projections from Parquet files [\#6871](https://github.com/apache/arrow-rs/pull/6871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Improvements to UTF-8 statistics truncation [\#6870](https://github.com/apache/arrow-rs/pull/6870) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([etseidl](https://github.com/etseidl)) +- fix: make GetCatalogsBuilder sort catalog names [\#6864](https://github.com/apache/arrow-rs/pull/6864) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([niebayes](https://github.com/niebayes)) +- add buffered data\_pages to parquet column writer total bytes estimation [\#6862](https://github.com/apache/arrow-rs/pull/6862) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([onursatici](https://github.com/onursatici)) +- Update prost-build requirement from =0.13.3 to =0.13.4 [\#6860](https://github.com/apache/arrow-rs/pull/6860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Minor: add comments explaining bad MSRV, output in json [\#6857](https://github.com/apache/arrow-rs/pull/6857) ([alamb](https://github.com/alamb)) +- perf: Use Cow in get\_format\_string in FFI\_ArrowSchema [\#6853](https://github.com/apache/arrow-rs/pull/6853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- chore: add cast\_decimal benchmark [\#6850](https://github.com/apache/arrow-rs/pull/6850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- arrow-array::builder: support Int8, Int16 and Int64 keys [\#6845](https://github.com/apache/arrow-rs/pull/6845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ajwerner](https://github.com/ajwerner)) +- Add `ArrowToParquetSchemaConverter`, deprecate `arrow_to_parquet_schema` [\#6840](https://github.com/apache/arrow-rs/pull/6840) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Remove APIs deprecated in 50.0.0 [\#6838](https://github.com/apache/arrow-rs/pull/6838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- fix: decimal conversion looses value on lower precision [\#6836](https://github.com/apache/arrow-rs/pull/6836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([himadripal](https://github.com/himadripal)) +- Update sysinfo requirement from 0.32.0 to 0.33.0 [\#6835](https://github.com/apache/arrow-rs/pull/6835) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Optionally coerce names of maps and lists to match Parquet specification [\#6828](https://github.com/apache/arrow-rs/pull/6828) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove deprecated unary\_dyn and try\_unary\_dyn [\#6824](https://github.com/apache/arrow-rs/pull/6824) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Remove deprecated flight\_data\_from\_arrow\_batch [\#6823](https://github.com/apache/arrow-rs/pull/6823) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- \[arrow-cast\] Support cast boolean from/to string view [\#6822](https://github.com/apache/arrow-rs/pull/6822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- Hook up Avro Decoder [\#6820](https://github.com/apache/arrow-rs/pull/6820) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix arrow-avro compilation without default features [\#6819](https://github.com/apache/arrow-rs/pull/6819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Support shrink to empty [\#6817](https://github.com/apache/arrow-rs/pull/6817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[arrow-cast\] Support cast numeric to string view \(alternate\) [\#6816](https://github.com/apache/arrow-rs/pull/6816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Hide implicit optional dependency features in arrow-flight [\#6806](https://github.com/apache/arrow-rs/pull/6806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- fix: Encoding of List offsets was incorrect when slice offsets begin with zero [\#6805](https://github.com/apache/arrow-rs/pull/6805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HawaiianSpork](https://github.com/HawaiianSpork)) +- Enable unused\_crate\_dependencies Rust lint, remove unused dependencies [\#6804](https://github.com/apache/arrow-rs/pull/6804) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Minor: Fix docstrings for `ColumnProperties::statistics_enabled` property [\#6798](https://github.com/apache/arrow-rs/pull/6798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add option to disable writing of Parquet offset index [\#6797](https://github.com/apache/arrow-rs/pull/6797) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove unused dependencies [\#6792](https://github.com/apache/arrow-rs/pull/6792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Add `Array::shrink_to_fit(&mut self)` [\#6790](https://github.com/apache/arrow-rs/pull/6790) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([emilk](https://github.com/emilk)) +- Formalize the default nested list field name to `item` [\#6785](https://github.com/apache/arrow-rs/pull/6785) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([gruuya](https://github.com/gruuya)) +- Improve UnionArray logical\_nulls tests [\#6781](https://github.com/apache/arrow-rs/pull/6781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gstvg](https://github.com/gstvg)) +- Improve list builder usage example in docs [\#6775](https://github.com/apache/arrow-rs/pull/6775) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Update proc-macro2 requirement from =1.0.89 to =1.0.92 [\#6772](https://github.com/apache/arrow-rs/pull/6772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow NullBuffer construction directly from array 
[\#6769](https://github.com/apache/arrow-rs/pull/6769) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Include license and notice files in published crates [\#6767](https://github.com/apache/arrow-rs/pull/6767) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([ankane](https://github.com/ankane)) +- fix: remove redundant `bit_util::ceil` [\#6766](https://github.com/apache/arrow-rs/pull/6766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([miroim](https://github.com/miroim)) +- Remove 'make\_row', expose a 'Row::new' method instead. [\#6763](https://github.com/apache/arrow-rs/pull/6763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jonded94](https://github.com/jonded94)) +- Read nested Parquet 2-level lists correctly [\#6757](https://github.com/apache/arrow-rs/pull/6757) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Split `timestamp_s_to_datetime` to `date` and `time` to avoid unnecessary computation [\#6755](https://github.com/apache/arrow-rs/pull/6755) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- More trivial implementation of `Box` and `Box` [\#6748](https://github.com/apache/arrow-rs/pull/6748) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ethe](https://github.com/ethe)) +- Update cache action to v4 [\#6744](https://github.com/apache/arrow-rs/pull/6744) ([findepi](https://github.com/findepi)) +- Remove redundant implementation of `StringArrayType` [\#6743](https://github.com/apache/arrow-rs/pull/6743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- Fix Dictionary logical nulls for RunArray/UnionArray Values [\#6740](https://github.com/apache/arrow-rs/pull/6740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Allow reading Parquet maps that lack a `values` field [\#6730](https://github.com/apache/arrow-rs/pull/6730) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Improve default implementation of Array::is\_nullable [\#6721](https://github.com/apache/arrow-rs/pull/6721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix Buffer::bit\_slice losing length with byte-aligned offsets [\#6707](https://github.com/apache/arrow-rs/pull/6707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) diff --git a/Cargo.toml b/Cargo.toml index 375a4efac55..75ba410f12a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ exclude = [ ] [workspace.package] -version = "53.3.0" +version = "54.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -77,20 +77,20 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "53.3.0", path = "./arrow", default-features = false } -arrow-arith = { version = "53.3.0", path = "./arrow-arith" } -arrow-array = { version = "53.3.0", path = "./arrow-array" } -arrow-buffer = { version = "53.3.0", 
path = "./arrow-buffer" } -arrow-cast = { version = "53.3.0", path = "./arrow-cast" } -arrow-csv = { version = "53.3.0", path = "./arrow-csv" } -arrow-data = { version = "53.3.0", path = "./arrow-data" } -arrow-ipc = { version = "53.3.0", path = "./arrow-ipc" } -arrow-json = { version = "53.3.0", path = "./arrow-json" } -arrow-ord = { version = "53.3.0", path = "./arrow-ord" } -arrow-row = { version = "53.3.0", path = "./arrow-row" } -arrow-schema = { version = "53.3.0", path = "./arrow-schema" } -arrow-select = { version = "53.3.0", path = "./arrow-select" } -arrow-string = { version = "53.3.0", path = "./arrow-string" } -parquet = { version = "53.3.0", path = "./parquet", default-features = false } +arrow = { version = "54.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "54.0.0", path = "./arrow-arith" } +arrow-array = { version = "54.0.0", path = "./arrow-array" } +arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "54.0.0", path = "./arrow-cast" } +arrow-csv = { version = "54.0.0", path = "./arrow-csv" } +arrow-data = { version = "54.0.0", path = "./arrow-data" } +arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" } +arrow-json = { version = "54.0.0", path = "./arrow-json" } +arrow-ord = { version = "54.0.0", path = "./arrow-ord" } +arrow-row = { version = "54.0.0", path = "./arrow-row" } +arrow-schema = { version = "54.0.0", path = "./arrow-schema" } +arrow-select = { version = "54.0.0", path = "./arrow-select" } +arrow-string = { version = "54.0.0", path = "./arrow-string" } +parquet = { version = "54.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 661abfc5869..3ffc8780c2f 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -31,7 +31,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "53.3.0" +arrow-flight = "54.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index d00cc498625..4a2f5e3f198 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="53.2.0" -FUTURE_RELEASE="53.3.0" +SINCE_TAG="53.3.0" +FUTURE_RELEASE="54.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 1dbe523754eda1cefe526f00da2b92d2e83fcfa7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 19 Dec 2024 14:51:30 -0500 Subject: [PATCH 25/29] [object store] Add planned release schedule (#6904) --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 723249ad29e..ed42f630514 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,13 @@ versions approximately every 2 months. [`object_store`]: https://crates.io/crates/object_store +Planned Release Schedule + +| Approximate Date | Version | Notes | +| ---------------- | -------- | --------------------------------------- | +| Dec 2024 | `0.11.2` | Minor, NO breaking API changes | +| Feb 2025 | `0.12.0` | Major, potentially breaking API changes | + ### Deprecation Guidelines Minor releases may deprecate, but not remove APIs. 
Deprecating APIs allows

From 2908a80d9ca3e3fb0414e35b67856f1fb761304c Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Thu, 19 Dec 2024 21:52:18 +0200
Subject: [PATCH 26/29] add `extend_dictionary` in dictionary builder for improved performance (#6875)

* add `extend_dictionary` in dictionary builder for improved performance

* fix extends all nulls

* support null in mapped value

* adding comment

* run `clippy` and `fmt`

* fix ci

* Apply suggestions from code review

Co-authored-by: Andrew Lamb

---------

Co-authored-by: Andrew Lamb
---
 .../generic_bytes_dictionary_builder.rs       | 187 ++++++++++++++++-
 .../builder/primitive_dictionary_builder.rs   | 198 +++++++++++++++++-
 2 files changed, 379 insertions(+), 6 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
index bb0fb5e91be..ead151d5cee 100644
--- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
@@ -17,7 +17,7 @@
 
 use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
 use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
-use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
+use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray};
 use arrow_buffer::ArrowNativeType;
 use arrow_schema::{ArrowError, DataType};
 use hashbrown::HashTable;
@@ -305,6 +305,63 @@ where
         };
     }
 
+    /// Extends builder with an existing dictionary array.
+    ///
+    /// This is the same as [`Self::extend`] but is faster as it translates
+    /// the dictionary values once rather than doing a lookup for each item in the iterator
+    ///
+    /// when dictionary values are null (the actual mapped values) the keys are null
+    ///
+    pub fn extend_dictionary(
+        &mut self,
+        dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
+    ) -> Result<(), ArrowError> {
+        let values = dictionary.values();
+
+        let v_len = values.len();
+        let k_len = dictionary.keys().len();
+        if v_len == 0 && k_len == 0 {
+            return Ok(());
+        }
+
+        // All nulls
+        if v_len == 0 {
+            self.append_nulls(k_len);
+            return Ok(());
+        }
+
+        if k_len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Dictionary keys should not be empty when values are not empty".to_string(),
+            ));
+        }
+
+        // Orphan values will be carried over to the new dictionary
+        let mapped_values = values
+            .iter()
+            // Dictionary values can technically be null, so we need to handle that
+            .map(|dict_value| {
+                dict_value
+                    .map(|dict_value| self.get_or_insert_key(dict_value))
+                    .transpose()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Just insert the keys without additional lookups
+        dictionary.keys().iter().for_each(|key| match key {
+            None => self.append_null(),
+            Some(original_dict_index) => {
+                let index = original_dict_index.as_usize().min(v_len - 1);
+                match mapped_values[index] {
+                    None => self.append_null(),
+                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
+                }
+            }
+        });
+
+        Ok(())
+    }
+
     /// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -445,8 +502,9 @@ mod tests { use super::*; use crate::array::Int8Array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -664,4 +722,129 @@ mod tests { assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); assert_eq!(dict.values().len(), 4); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend([None::<&str>]); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 6); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some("e"), + Some("e"), + Some("f"), + Some("e"), + Some("d"), + Some("a"), + Some("b"), + Some("c"), + Some("a"), + Some("b"), + Some("c"), + None, + Some("c"), + Some("d"), + Some("a"), + None + ] + ); + } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = GenericByteBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value("I like worm hugs"); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(Utf8Type::DATA_TYPE), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_string::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some("I like worm hugs")] + ); + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some("I like worm hugs")]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac40f8a469d..282f0ae9d5b 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs 
+++ b/arrow-array/src/builder/primitive_dictionary_builder.rs
@@ -17,7 +17,9 @@
 
 use crate::builder::{ArrayBuilder, PrimitiveBuilder};
 use crate::types::ArrowDictionaryKeyType;
-use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
+use crate::{
+    Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray,
+};
 use arrow_buffer::{ArrowNativeType, ToByteSlice};
 use arrow_schema::{ArrowError, DataType};
 use std::any::Any;
@@ -44,7 +46,7 @@ impl<T: ToByteSlice> PartialEq for Value<T> {
 
 impl<T: ToByteSlice> Eq for Value<T> {}
 
-/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray)
+/// Builder for [`DictionaryArray`] of [`PrimitiveArray`]
 ///
 /// # Example:
 ///
@@ -303,6 +305,63 @@ where
         };
     }
 
+    /// Extends builder with dictionary
+    ///
+    /// This is the same as [`Self::extend`] but is faster as it translates
+    /// the dictionary values once rather than doing a lookup for each item in the iterator
+    ///
+    /// when dictionary values are null (the actual mapped values) the keys are null
+    ///
+    pub fn extend_dictionary(
+        &mut self,
+        dictionary: &TypedDictionaryArray<K, PrimitiveArray<V>>,
+    ) -> Result<(), ArrowError> {
+        let values = dictionary.values();
+
+        let v_len = values.len();
+        let k_len = dictionary.keys().len();
+        if v_len == 0 && k_len == 0 {
+            return Ok(());
+        }
+
+        // All nulls
+        if v_len == 0 {
+            self.append_nulls(k_len);
+            return Ok(());
+        }
+
+        if k_len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Dictionary keys should not be empty when values are not empty".to_string(),
+            ));
+        }
+
+        // Orphan values will be carried over to the new dictionary
+        let mapped_values = values
+            .iter()
+            // Dictionary values can technically be null, so we need to handle that
+            .map(|dict_value| {
+                dict_value
+                    .map(|dict_value| self.get_or_insert_key(dict_value))
+                    .transpose()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Just insert the keys without additional lookups
+        dictionary.keys().iter().for_each(|key| match key {
+            None => self.append_null(),
+            Some(original_dict_index) => {
+                let index = original_dict_index.as_usize().min(v_len - 1);
+                match mapped_values[index] {
+                    None => self.append_null(),
+                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
+                }
+            }
+        });
+
+        Ok(())
+    }
+
     /// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); @@ -368,9 +427,9 @@ impl Extend> mod tests { use super::*; - use crate::array::UInt32Array; - use crate::array::UInt8Array; + use crate::array::{Int32Array, UInt32Array, UInt8Array}; use crate::builder::Decimal128Builder; + use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] @@ -443,4 +502,135 @@ mod tests { ) ); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([None::]); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 7); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some(6), + Some(6), + Some(7), + Some(6), + Some(5), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + None, + Some(4), + Some(5), + Some(1), + Some(3), + Some(1), + None + ] + ); + } + + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = PrimitiveBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value(42); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_primitive::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some(42)] + ); + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some(42)]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } From 2c84f243b882eff69806cd7294d38bf422fdb24a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Dec 2024 16:18:55 -0500 Subject: [PATCH 27/29] [object_store]: Version and Changelog for 0.11.2 (#6908) * [object_store]: Version and Changelog for 0.11.2 * increment version * update script * changelog * tweaks * Update object_store/CHANGELOG.md Co-authored-by: 
Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/CHANGELOG-old.md | 39 ++++++++++++++ object_store/CHANGELOG.md | 51 ++++++++++--------- object_store/Cargo.toml | 2 +- object_store/dev/release/README.md | 5 +- object_store/dev/release/update_change_log.sh | 4 +- 5 files changed, 72 insertions(+), 29 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 28dbde4e7b7..c42689240dd 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,45 @@ # Historical Changelog + +## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) + +**Implemented enhancements:** + +- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) +- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) +- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) +- \[object-store\] Require tokio 1.29.0. 
[\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) +- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) +- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) +- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) +- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) +- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) + + + ## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 95585983572..0e834c5e2ef 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,41 +19,42 @@ # Changelog -## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) +## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2) **Implemented enhancements:** -- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- 
S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) -- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) -- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) -- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) -- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) -- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) -- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) -- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) -- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) -- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) -- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) -- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) -- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- docs: Add 
parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) -- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon)) +- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc)) +- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca)) +- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron)) +- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon)) +- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal)) +- Lower GCP token min\_ttl to 4 minutes and add backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde)) +- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 
bcc8e0b9224..bf254b3a0bb 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.11.1" +version = "0.11.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md index 912ff4cd8ba..2dd1f6243c0 100644 --- a/object_store/dev/release/README.md +++ b/object_store/dev/release/README.md @@ -24,7 +24,10 @@ This file documents the release process for the `object_store` crate. -At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. +We release a new version of `object_store` according to the schedule listed in +the [main README.md] + +[main README.md]: https://github.com/apache/arrow-rs?tab=readme-ov-file#object_store-crate As we are still in an early phase, we use the 0.x version scheme. If any code has been merged to main that has a breaking API change, as defined in [Rust RFC 1105] diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 30724478ae1..2797b62c001 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.11.0" -FUTURE_RELEASE="object_store_0.11.1" +SINCE_TAG="object_store_0.11.1" +FUTURE_RELEASE="object_store_0.11.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 10cf03c35808eedc191a9b3330bc7c268b7b71c1 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 24 Dec 2024 22:22:33 +0800 Subject: [PATCH 28/29] feat(parquet): Add next_row_group API for ParquetRecordBatchStream (#6907) * feat(parquet): Add next_row_group API for ParquetRecordBatchStream Signed-off-by: Xuanwo * chore: Returning error instead of using unreachable Signed-off-by: Xuanwo --------- Signed-off-by: Xuanwo --- parquet/src/arrow/async_reader/mod.rs | 132 ++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c408456df14..96715e1164b 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -613,6 +613,9 @@ impl std::fmt::Debug for StreamState { /// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] /// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// +/// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, +/// allowing users to decode record batches separately from I/O. pub struct ParquetRecordBatchStream { metadata: Arc, @@ -654,6 +657,70 @@ impl ParquetRecordBatchStream { } } +impl ParquetRecordBatchStream +where + T: AsyncFileReader + Unpin + Send + 'static, +{ + /// Fetches the next row group from the stream. + /// + /// Users can continue to call this function to get row groups and decode them concurrently. + /// + /// ## Notes + /// + /// ParquetRecordBatchStream should be used either as a `Stream` or with `next_row_group`; they should not be used simultaneously. + /// + /// ## Returns + /// + /// - `Ok(None)` if the stream has ended. + /// - `Err(error)` if the stream has errored. All subsequent calls will return `Ok(None)`. + /// - `Ok(Some(reader))` which holds all the data for the row group. 
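
To make the documented contract concrete before the implementation that follows, here is a minimal, illustrative sketch (not part of this patch) of how `next_row_group` could be paired with `tokio::task::spawn_blocking` so that CPU-bound decoding stays off the I/O path. The file name, the batch size, and the assumption that `parquet` is built with the `arrow` and `async` features and tokio with `rt-multi-thread`, `macros`, and `fs` are hypothetical details added only for the example.

```rust
// Illustrative sketch only: fetch row groups with `next_row_group` and decode
// each one on a blocking task. Assumes parquet with the `arrow` + `async`
// features and tokio with `rt-multi-thread`, `macros`, and `fs`.
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main]
async fn main() {
    // tokio::fs::File implements AsyncFileReader, so it can feed the stream builder.
    let file = tokio::fs::File::open("data.parquet")
        .await
        .expect("open parquet file");

    let mut stream = ParquetRecordBatchStreamBuilder::new(file)
        .await
        .expect("create stream builder")
        .with_batch_size(8192)
        .build()
        .expect("build ParquetRecordBatchStream");

    // I/O side: pull one row group at a time from the stream.
    let mut decode_tasks = Vec::new();
    while let Some(reader) = stream.next_row_group().await.expect("fetch row group") {
        // CPU side: the returned reader is a synchronous iterator over
        // `Result<RecordBatch>`, so it can be drained off the async runtime.
        decode_tasks.push(tokio::task::spawn_blocking(move || {
            reader
                .map(|batch| batch.expect("decode batch"))
                .collect::<Vec<_>>()
        }));
    }

    for task in decode_tasks {
        let batches = task.await.expect("decode task panicked");
        println!("decoded {} record batches", batches.len());
    }
}
```

The integration test added later in this patch exercises the same API sequentially, collecting the readers first and decoding them afterwards.
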
+ pub async fn next_row_group(&mut self) -> Result> { + loop { + match &mut self.state { + StreamState::Decoding(_) | StreamState::Reading(_) => { + return Err(ParquetError::General( + "Cannot combine the use of next_row_group with the Stream API".to_string(), + )) + } + StreamState::Init => { + let row_group_idx = match self.row_groups.pop_front() { + Some(idx) => idx, + None => return Ok(None), + }; + + let row_count = self.metadata.row_group(row_group_idx).num_rows() as usize; + + let selection = self.selection.as_mut().map(|s| s.split_off(row_count)); + + let reader_factory = self.reader.take().expect("lost reader"); + + let (reader_factory, maybe_reader) = reader_factory + .read_row_group( + row_group_idx, + selection, + self.projection.clone(), + self.batch_size, + ) + .await + .map_err(|err| { + self.state = StreamState::Error; + err + })?; + self.reader = Some(reader_factory); + + if let Some(reader) = maybe_reader { + return Ok(Some(reader)); + } else { + // All rows skipped, read next row group + continue; + } + } + StreamState::Error => return Ok(None), // Ends the stream as error happens. + } + } + } +} + impl Stream for ParquetRecordBatchStream where T: AsyncFileReader + Unpin + Send + 'static, @@ -1020,6 +1087,71 @@ mod tests { ); } + #[tokio::test] + async fn test_async_reader_with_next_row_group() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_plain.parquet"); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = ParquetMetaDataReader::new() + .parse_and_finish(&data) + .unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let requests = async_reader.requests.clone(); + let builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); + + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); + let mut stream = builder + .with_projection(mask.clone()) + .with_batch_size(1024) + .build() + .unwrap(); + + let mut readers = vec![]; + while let Some(reader) = stream.next_row_group().await.unwrap() { + readers.push(reader); + } + + let async_batches: Vec<_> = readers + .into_iter() + .flat_map(|r| r.map(|v| v.unwrap()).collect::>()) + .collect(); + + let sync_batches = ParquetRecordBatchReaderBuilder::try_new(data) + .unwrap() + .with_projection(mask) + .with_batch_size(104) + .build() + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(async_batches, sync_batches); + + let requests = requests.lock().unwrap(); + let (offset_1, length_1) = metadata.row_group(0).column(1).byte_range(); + let (offset_2, length_2) = metadata.row_group(0).column(2).byte_range(); + + assert_eq!( + &requests[..], + &[ + offset_1 as usize..(offset_1 + length_1) as usize, + offset_2 as usize..(offset_2 + length_2) as usize + ] + ); + } + #[tokio::test] async fn test_async_reader_with_index() { let testdata = arrow::util::test_util::parquet_test_data(); From 63899b7ad9d093e86acc6c98604aa66431102993 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:49:36 +0200 Subject: [PATCH 29/29] chore(arrow-ord): move `can_rank` to the `rank` file (#6910) --- arrow-ord/src/rank.rs | 9 +++++++++ arrow-ord/src/sort.rs | 11 +---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs index 
ecc693bab4e..e61cebef38e 100644 --- a/arrow-ord/src/rank.rs +++ b/arrow-ord/src/rank.rs @@ -24,6 +24,15 @@ use arrow_buffer::NullBuffer; use arrow_schema::{ArrowError, DataType, SortOptions}; use std::cmp::Ordering; +/// Whether `arrow_ord::rank` can rank an array of given data type. +pub(crate) fn can_rank(data_type: &DataType) -> bool { + data_type.is_primitive() + || matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary + ) +} + /// Assigns a rank to each value in `array` based on its position in the sorted order /// /// Where values are equal, they will be assigned the highest of their ranks, diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 60fc4a91852..51a6659e631 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -30,7 +30,7 @@ use arrow_select::take::take; use std::cmp::Ordering; use std::sync::Arc; -use crate::rank::rank; +use crate::rank::{can_rank, rank}; pub use arrow_schema::SortOptions; /// Sort the `ArrayRef` using `SortOptions`. @@ -190,15 +190,6 @@ fn partition_validity(array: &dyn Array) -> (Vec, Vec) { } } -/// Whether `arrow_ord::rank` can rank an array of given data type. -fn can_rank(data_type: &DataType) -> bool { - data_type.is_primitive() - || matches!( - data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary - ) -} - /// Whether `sort_to_indices` can sort an array of given data type. fn can_sort_to_indices(data_type: &DataType) -> bool { data_type.is_primitive()