From 08ac58761cb9a80b6ffd651727aa957db0c32bd7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 10 Dec 2024 19:21:48 +0000
Subject: [PATCH 01/29] Update prost-build requirement from =0.13.3 to =0.13.4 (#6860)

Updates the requirements on [prost-build](https://github.com/tokio-rs/prost) to permit the latest version.
- [Release notes](https://github.com/tokio-rs/prost/releases)
- [Changelog](https://github.com/tokio-rs/prost/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tokio-rs/prost/compare/v0.13.3...v0.13.4)

---
updated-dependencies:
- dependency-name: prost-build
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 arrow-flight/gen/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml
index ccd32b433cf..6358227a891 100644
--- a/arrow-flight/gen/Cargo.toml
+++ b/arrow-flight/gen/Cargo.toml
@@ -32,5 +32,5 @@ publish = false
 [dependencies]
 # Pin specific version of the tonic-build dependencies to avoid auto-generated
 # (and checked in) arrow.flight.protocol.rs from changing
-prost-build = { version = "=0.13.3", default-features = false }
+prost-build = { version = "=0.13.4", default-features = false }
 tonic-build = { version = "=0.12.3", default-features = false, features = ["transport", "prost"] }

From dc45d7d18857941c12082c0a6288048baf0c83ee Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 10 Dec 2024 21:39:31 -0700
Subject: [PATCH 02/29] chore: add cast_decimal benchmark (#6850)

* add cast_decimal bench

* format

* address feedback
---
 arrow/benches/cast_kernels.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs
index ec7990d3d76..5c4fcff13de 100644
--- a/arrow/benches/cast_kernels.rs
+++ b/arrow/benches/cast_kernels.rs
@@ -250,6 +250,9 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("cast decimal128 to decimal128 512", |b| {
         b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5)))
     });
+    c.bench_function("cast decimal128 to decimal128 512 lower precision", |b| {
+        b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(6, 5)))
+    });
     c.bench_function("cast decimal128 to decimal256 512", |b| {
         b.iter(|| cast_array(&decimal128_array, DataType::Decimal256(50, 5)))
     });

From 06a015770098a569b67855dfaa18bdfa7c18ff92 Mon Sep 17 00:00:00 2001
From: Onur Satici
Date: Wed, 11 Dec 2024 13:23:05 +0000
Subject: [PATCH 03/29] add buffered data_pages to parquet column writer total bytes estimation (#6862)

* add buffered data_pages to parquet column writer memory size estimation

* move to get estimated total bytes
---
 parquet/src/column/writer/mod.rs | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 8d0be5f9f8c..16de0ba7898 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -574,7 +574,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
     /// anticipated encoded size.
#[cfg(feature = "arrow")] pub(crate) fn get_estimated_total_bytes(&self) -> u64 { - self.column_metrics.total_bytes_written + self.data_pages + .iter() + .map(|page| page.data().len() as u64) + .sum::() + + self.column_metrics.total_bytes_written + self.encoder.estimated_data_page_size() as u64 + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64 } @@ -3422,6 +3426,26 @@ mod tests { assert!(stats.max_bytes_opt().is_none()); } + #[test] + #[cfg(feature = "arrow")] + fn test_column_writer_get_estimated_total_bytes() { + let page_writer = get_test_page_writer(); + let props = Default::default(); + let mut writer = get_test_column_writer::(page_writer, 0, 0, props); + assert_eq!(writer.get_estimated_total_bytes(), 0); + + writer.write_batch(&[1, 2, 3, 4], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_one_page = writer.get_estimated_total_bytes(); + assert_eq!(size_with_one_page, 20); + + writer.write_batch(&[5, 6, 7, 8], None, None).unwrap(); + writer.add_data_page().unwrap(); + let size_with_two_pages = writer.get_estimated_total_bytes(); + // different pages have different compressed lengths + assert_eq!(size_with_two_pages, 20 + 21); + } + fn write_multiple_pages( column_descr: &Arc, pages: &[&[Option]], From 50cf8bd80652ec9710bdcb1de568d52618233441 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 11 Dec 2024 23:31:02 +0900 Subject: [PATCH 04/29] Always explicitly disable `gzip` automatic decompression on reqwest client used by object_store (#6843) * Explicitly disable gzip on reqwest client used by object_store * Add comment * Add integration test for checking reqwest gzip feature * Fix lint * Add comment explaining why gzip feature is enabled --- object_store/Cargo.toml | 2 ++ object_store/src/client/mod.rs | 4 ++++ object_store/tests/http.rs | 43 ++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 object_store/tests/http.rs diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index a0473362513..bcc8e0b9224 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -77,6 +77,8 @@ http-body-util = "0.1" rand = "0.8" tempfile = "3.1.0" regex = "1.11.1" +# The "gzip" feature for reqwest is enabled for an integration test. +reqwest = { version = "0.12", features = ["gzip"] } http = "1.1.0" [[test]] diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs index 76d1c1f22f5..1b7ce5aa7a7 100644 --- a/object_store/src/client/mod.rs +++ b/object_store/src/client/mod.rs @@ -671,6 +671,10 @@ impl ClientOptions { builder = builder.danger_accept_invalid_certs(true) } + // Reqwest will remove the `Content-Length` header if it is configured to + // transparently decompress the body via the non-default `gzip` feature. + builder = builder.no_gzip(); + builder .https_only(!self.allow_http.get()?) .build() diff --git a/object_store/tests/http.rs b/object_store/tests/http.rs new file mode 100644 index 00000000000..a9b3145bb66 --- /dev/null +++ b/object_store/tests/http.rs @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tests the HTTP store implementation + +#[cfg(feature = "http")] +use object_store::{http::HttpBuilder, path::Path, GetOptions, GetRange, ObjectStore}; + +/// Tests that even when reqwest has the `gzip` feature enabled, the HTTP store +/// does not error on a missing `Content-Length` header. +#[tokio::test] +#[cfg(feature = "http")] +async fn test_http_store_gzip() { + let http_store = HttpBuilder::new() + .with_url("https://raw.githubusercontent.com/apache/arrow-rs/refs/heads/main") + .build() + .unwrap(); + + let _ = http_store + .get_opts( + &Path::parse("LICENSE.txt").unwrap(), + GetOptions { + range: Some(GetRange::Bounded(0..100)), + ..Default::default() + }, + ) + .await + .unwrap(); +} From 4acf4d3139b0cf1bad371a6d4632d6de3bf2babe Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Wed, 11 Dec 2024 16:47:51 +0100 Subject: [PATCH 05/29] Enable unused_crate_dependencies Rust lint, remove unused dependencies (#6804) * Disallow unused dependencies in arrow-array * Disallow unused dependencies in arrow-schema * check all modules on gh * Remove arrow-csv's unused dependencies * Remove arrow-ord's unused dependencies * Remove arrow-row's unused dependencies * Remove arrow's unused dependencies * Remove unused dependencies in arrow-flight and in tests And disable the check since it feels impossible/infeasible to configure it the right way. 
* Unify arrow workflow step names --- .github/workflows/arrow.yml | 202 +++++++++++++----- arrow-csv/Cargo.toml | 2 +- arrow-flight/Cargo.toml | 12 +- arrow-flight/src/lib.rs | 2 + arrow-integration-testing/Cargo.toml | 5 +- .../src/bin/arrow-file-to-stream.rs | 3 + .../src/bin/arrow-json-integration-test.rs | 3 + .../src/bin/arrow-stream-to-file.rs | 3 + .../src/bin/flight-test-integration-client.rs | 3 + .../src/bin/flight-test-integration-server.rs | 3 + arrow-integration-testing/src/lib.rs | 2 + arrow-ord/Cargo.toml | 2 +- arrow-row/Cargo.toml | 6 - arrow/Cargo.toml | 4 +- 14 files changed, 183 insertions(+), 69 deletions(-) diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml index d065c554bbf..daf38f2523f 100644 --- a/.github/workflows/arrow.yml +++ b/.github/workflows/arrow.yml @@ -61,39 +61,39 @@ jobs: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - - name: Test arrow-buffer with all features + - name: Test arrow-buffer run: cargo test -p arrow-buffer --all-features - - name: Test arrow-data with all features + - name: Test arrow-data run: cargo test -p arrow-data --all-features - - name: Test arrow-schema with all features + - name: Test arrow-schema run: cargo test -p arrow-schema --all-features - - name: Test arrow-array with all features + - name: Test arrow-array run: cargo test -p arrow-array --all-features - - name: Test arrow-select with all features + - name: Test arrow-select run: cargo test -p arrow-select --all-features - - name: Test arrow-cast with all features + - name: Test arrow-cast run: cargo test -p arrow-cast --all-features - - name: Test arrow-ipc with all features + - name: Test arrow-ipc run: cargo test -p arrow-ipc --all-features - - name: Test arrow-csv with all features + - name: Test arrow-csv run: cargo test -p arrow-csv --all-features - - name: Test arrow-json with all features + - name: Test arrow-json run: cargo test -p arrow-json --all-features - - name: Test arrow-avro with all features + - name: Test arrow-avro run: cargo test -p arrow-avro --all-features - - name: Test arrow-string with all features + - name: Test arrow-string run: cargo test -p arrow-string --all-features - - name: Test arrow-ord with all features + - name: Test arrow-ord run: cargo test -p arrow-ord --all-features - - name: Test arrow-arith with all features + - name: Test arrow-arith run: cargo test -p arrow-arith --all-features - - name: Test arrow-row with all features + - name: Test arrow-row run: cargo test -p arrow-row --all-features - - name: Test arrow-integration-test with all features + - name: Test arrow-integration-test run: cargo test -p arrow-integration-test --all-features - name: Test arrow with default features run: cargo test -p arrow - - name: Test arrow with all features except pyarrow + - name: Test arrow except pyarrow run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz - name: Run examples run: | @@ -163,37 +163,139 @@ jobs: uses: ./.github/actions/setup-builder - name: Setup Clippy run: rustup component add clippy - - name: Clippy arrow-buffer with all features - run: cargo clippy -p arrow-buffer --all-targets --all-features -- -D warnings - - name: Clippy arrow-data with all features - run: cargo clippy -p arrow-data --all-targets --all-features -- -D warnings - - name: Clippy arrow-schema with all features - run: cargo clippy -p arrow-schema --all-targets --all-features -- -D warnings - - name: Clippy arrow-array with all features - run: cargo clippy -p 
arrow-array --all-targets --all-features -- -D warnings - - name: Clippy arrow-select with all features - run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings - - name: Clippy arrow-cast with all features - run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings - - name: Clippy arrow-ipc with all features - run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings - - name: Clippy arrow-csv with all features - run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings - - name: Clippy arrow-json with all features - run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings - - name: Clippy arrow-avro with all features - run: cargo clippy -p arrow-avro --all-targets --all-features -- -D warnings - - name: Clippy arrow-string with all features - run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings - - name: Clippy arrow-ord with all features - run: cargo clippy -p arrow-ord --all-targets --all-features -- -D warnings - - name: Clippy arrow-arith with all features - run: cargo clippy -p arrow-arith --all-targets --all-features -- -D warnings - - name: Clippy arrow-row with all features - run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings - - name: Clippy arrow with all features - run: cargo clippy -p arrow --all-features --all-targets -- -D warnings - - name: Clippy arrow-integration-test with all features - run: cargo clippy -p arrow-integration-test --all-targets --all-features -- -D warnings - - name: Clippy arrow-integration-testing with all features - run: cargo clippy -p arrow-integration-testing --all-targets --all-features -- -D warnings + - name: Clippy arrow-buffer + run: | + mod=arrow-buffer + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-data + run: | + mod=arrow-data + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-schema + run: | + mod=arrow-schema + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-array + run: | + mod=arrow-array + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-select + run: | + mod=arrow-select + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. 
+ cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-cast + run: | + mod=arrow-cast + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ipc + run: | + mod=arrow-ipc + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-csv + run: | + mod=arrow-csv + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-json + run: | + mod=arrow-json + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-avro + run: | + mod=arrow-avro + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-string + run: | + mod=arrow-string + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-ord + run: | + mod=arrow-ord + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-arith + run: | + mod=arrow-arith + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-row + run: | + mod=arrow-row + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. 
+ cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow + run: | + mod=arrow + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-test + run: | + mod=arrow-integration-test + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies + - name: Clippy arrow-integration-testing + run: | + mod=arrow-integration-testing + cargo clippy -p "$mod" --all-targets --all-features -- -D warnings + # Dependency checks excluding tests & benches. + cargo clippy -p "$mod" -- -D unused_crate_dependencies + cargo clippy -p "$mod" --all-features -- -D unused_crate_dependencies + cargo clippy -p "$mod" --no-default-features -- -D unused_crate_dependencies diff --git a/arrow-csv/Cargo.toml b/arrow-csv/Cargo.toml index 4729779c43b..8823924eb55 100644 --- a/arrow-csv/Cargo.toml +++ b/arrow-csv/Cargo.toml @@ -35,7 +35,6 @@ bench = false [dependencies] arrow-array = { workspace = true } -arrow-buffer = { workspace = true } arrow-cast = { workspace = true } arrow-schema = { workspace = true } chrono = { workspace = true } @@ -45,6 +44,7 @@ lazy_static = { version = "1.4", default-features = false } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } [dev-dependencies] +arrow-buffer = { workspace = true } tempfile = "3.3" futures = "0.3" tokio = { version = "1.27", default-features = false, features = ["io-util"] } diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 012d3947f02..fbb295036a9 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -43,11 +43,11 @@ base64 = { version = "0.22", default-features = false, features = ["std"] } bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } -paste = { version = "1.0" } +paste = { version = "1.0" , optional = true } prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] } # For Timestamp type prost-types = { version = "0.13.1", default-features = false } -tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"], optional = true } tonic = { version = "0.12.3", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies @@ -61,11 +61,10 @@ all-features = true [features] default = [] -flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell"] +flight-sql-experimental = ["dep:arrow-arith", "dep:arrow-data", "dep:arrow-ord", "dep:arrow-row", "dep:arrow-select", "dep:arrow-string", "dep:once_cell", "dep:paste"] tls = ["tonic/tls"] - # Enable 
CLI tools -cli = ["dep:anyhow", "arrow-array/chrono-tz", "arrow-cast/prettyprint", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber", "tonic/tls-webpki-roots"] +cli = ["arrow-array/chrono-tz", "arrow-cast/prettyprint", "tonic/tls-webpki-roots", "dep:anyhow", "dep:clap", "dep:tracing-log", "dep:tracing-subscriber"] [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } @@ -75,6 +74,9 @@ http-body = "1.0.0" hyper-util = "0.1" pin-project-lite = "0.2" tempfile = "3.3" +tracing-log = { version = "0.2" } +tracing-subscriber = { version = "0.3.1", default-features = false, features = ["ansi", "env-filter", "fmt"] } +tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } tokio-stream = { version = "0.1", features = ["net"] } tower = { version = "0.5.0", features = ["util"] } uuid = { version = "1.10.0", features = ["v4"] } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 9f18416c06e..81fa99faeb9 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -38,6 +38,8 @@ //! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html #![allow(rustdoc::invalid_html_tags)] #![warn(missing_docs)] +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] use arrow_ipc::{convert, writer, writer::EncodedData, writer::IpcWriteOptions}; use arrow_schema::{ArrowError, Schema}; diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 0ebbc05a8b8..8654b4b9273 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -36,18 +36,17 @@ logging = ["tracing-subscriber"] [dependencies] arrow = { path = "../arrow", default-features = false, features = ["test_utils", "ipc", "ipc_compression", "json", "ffi"] } arrow-flight = { path = "../arrow-flight", default-features = false } -arrow-buffer = { path = "../arrow-buffer", default-features = false } arrow-integration-test = { path = "../arrow-integration-test", default-features = false } -async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } prost = { version = "0.13", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -tokio = { version = "1.0", default-features = false } +tokio = { version = "1.0", default-features = false, features = [ "rt-multi-thread"] } tonic = { version = "0.12", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } [dev-dependencies] +arrow-buffer = { path = "../arrow-buffer", default-features = false } tempfile = { version = "3", default-features = false } diff --git a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs index 3e027faef91..661f0a047db 100644 --- a/arrow-integration-testing/src/bin/arrow-file-to-stream.rs +++ b/arrow-integration-testing/src/bin/arrow-file-to-stream.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. 
+// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::Result; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::StreamWriter; diff --git a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs index cc3dd2110e3..6a901cc63ba 100644 --- a/arrow-integration-testing/src/bin/arrow-json-integration-test.rs +++ b/arrow-integration-testing/src/bin/arrow-json-integration-test.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow::error::{ArrowError, Result}; use arrow::ipc::reader::FileReader; use arrow::ipc::writer::FileWriter; diff --git a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs index 07ac5c7ddd4..8b4bb332781 100644 --- a/arrow-integration-testing/src/bin/arrow-stream-to-file.rs +++ b/arrow-integration-testing/src/bin/arrow-stream-to-file.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use std::io; use arrow::error::Result; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-client.rs b/arrow-integration-testing/src/bin/flight-test-integration-client.rs index b8bbb952837..0d16fe3b403 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-client.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-client.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_client_scenarios; use clap::Parser; type Error = Box; diff --git a/arrow-integration-testing/src/bin/flight-test-integration-server.rs b/arrow-integration-testing/src/bin/flight-test-integration-server.rs index 5310d07d4f8..94be7130979 100644 --- a/arrow-integration-testing/src/bin/flight-test-integration-server.rs +++ b/arrow-integration-testing/src/bin/flight-test-integration-server.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] + use arrow_integration_testing::flight_server_scenarios; use clap::Parser; diff --git a/arrow-integration-testing/src/lib.rs b/arrow-integration-testing/src/lib.rs index c8ce01e9f13..e669690ef4f 100644 --- a/arrow-integration-testing/src/lib.rs +++ b/arrow-integration-testing/src/lib.rs @@ -17,6 +17,8 @@ //! 
Common code used in the integration test binaries +// The unused_crate_dependencies lint does not work well for crates defining additional examples/bin targets +#![allow(unused_crate_dependencies)] #![warn(missing_docs)] use serde_json::Value; diff --git a/arrow-ord/Cargo.toml b/arrow-ord/Cargo.toml index a2894c02029..8d74d2f97d7 100644 --- a/arrow-ord/Cargo.toml +++ b/arrow-ord/Cargo.toml @@ -39,7 +39,7 @@ arrow-buffer = { workspace = true } arrow-data = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } -half = { version = "2.1", default-features = false, features = ["num-traits"] } [dev-dependencies] +half = { version = "2.1", default-features = false, features = ["num-traits"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } diff --git a/arrow-row/Cargo.toml b/arrow-row/Cargo.toml index 3754afb4dbc..90d99684d26 100644 --- a/arrow-row/Cargo.toml +++ b/arrow-row/Cargo.toml @@ -33,12 +33,6 @@ name = "arrow_row" path = "src/lib.rs" bench = false -[target.'cfg(target_arch = "wasm32")'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] } - -[target.'cfg(not(target_arch = "wasm32"))'.dependencies] -ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } - [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 32b2384afe7..8860cd61c5b 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -56,8 +56,6 @@ arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.23", default-features = false, optional = true } -chrono = { workspace = true, optional = true } - [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] @@ -72,7 +70,7 @@ prettyprint = ["arrow-cast/prettyprint"] # not the core arrow code itself. Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. 
-test_utils = ["rand", "dep:chrono"] +test_utils = ["dep:rand"] pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive From e613622fb69a7cc941bfb97bf1020bee580cfb86 Mon Sep 17 00:00:00 2001 From: Kaifeng Zheng <100595273+Kev1n8@users.noreply.github.com> Date: Fri, 13 Dec 2024 04:39:29 +0800 Subject: [PATCH 06/29] Enable matching temporal as from_type to Utf8View (#6872) * enable matching temporal from_type * drop todo message --- arrow-cast/src/cast/mod.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 0a44392a245..5c892783d4a 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -1472,7 +1472,7 @@ pub fn cast_with_options( (BinaryView, _) => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), - (from_type, Utf8View) if from_type.is_numeric() => { + (from_type, Utf8View) if from_type.is_primitive() => { value_to_string_view(array, cast_options) } (from_type, LargeUtf8) if from_type.is_primitive() => { @@ -5258,9 +5258,6 @@ mod tests { assert_eq!("2018-12-25T00:00:00", c.value(1)); } - // Cast Timestamp to Utf8View is not supported yet - // TODO: Implement casting from Timestamp to Utf8View - // https://github.com/apache/arrow-rs/issues/6734 macro_rules! assert_cast_timestamp_to_string { ($array:expr, $datatype:expr, $output_array_type: ty, $expected:expr) => {{ let out = cast(&$array, &$datatype).unwrap(); @@ -5295,7 +5292,7 @@ mod tests { None, ]; - // assert_cast_timestamp_to_string!(array, DataType::Utf8View, StringViewArray, expected); + assert_cast_timestamp_to_string!(array, DataType::Utf8View, StringViewArray, expected); assert_cast_timestamp_to_string!(array, DataType::Utf8, StringArray, expected); assert_cast_timestamp_to_string!(array, DataType::LargeUtf8, LargeStringArray, expected); } @@ -5319,7 +5316,13 @@ mod tests { Some("2018-12-25 00:00:02.001000"), None, ]; - // assert_cast_timestamp_to_string!(array_without_tz, DataType::Utf8View, StringViewArray, cast_options, expected); + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::Utf8View, + StringViewArray, + cast_options, + expected + ); assert_cast_timestamp_to_string!( array_without_tz, DataType::Utf8, @@ -5343,7 +5346,13 @@ mod tests { Some("2018-12-25 05:45:02.001000"), None, ]; - // assert_cast_timestamp_to_string!(array_with_tz, DataType::Utf8View, StringViewArray, cast_options, expected); + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::Utf8View, + StringViewArray, + cast_options, + expected + ); assert_cast_timestamp_to_string!( array_with_tz, DataType::Utf8, From 84dba34e6754b34baf19bd694506278cdc176a2e Mon Sep 17 00:00:00 2001 From: Andrew Varnon Date: Thu, 12 Dec 2024 17:58:55 -0500 Subject: [PATCH 07/29] Use randomized content ID for Azure multipart uploads (#6869) * Use randomized content ID for Azure multipart uploads * Update object_store/src/azure/client.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! 
Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads * fixup! Use randomized content ID for Azure multipart uploads --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/src/aws/mod.rs | 1 + object_store/src/azure/client.rs | 6 ++- object_store/src/azure/mod.rs | 1 + object_store/src/gcp/mod.rs | 1 + object_store/src/integration.rs | 87 ++++++++++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 2 deletions(-) diff --git a/object_store/src/aws/mod.rs b/object_store/src/aws/mod.rs index a7f9264a681..7f449c49963 100644 --- a/object_store/src/aws/mod.rs +++ b/object_store/src/aws/mod.rs @@ -570,6 +570,7 @@ mod tests { rename_and_copy(&integration).await; stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; signing(&integration).await; s3_encryption(&integration).await; put_get_attributes(&integration).await; diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 76dedd71aa5..69ff39526be 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -36,6 +36,7 @@ use base64::Engine; use bytes::{Buf, Bytes}; use chrono::{DateTime, Utc}; use hyper::http::HeaderName; +use rand::Rng as _; use reqwest::{ header::{HeaderMap, HeaderValue, CONTENT_LENGTH, CONTENT_TYPE, IF_MATCH, IF_NONE_MATCH}, Client as ReqwestClient, Method, RequestBuilder, Response, @@ -556,10 +557,11 @@ impl AzureClient { pub(crate) async fn put_block( &self, path: &Path, - part_idx: usize, + _part_idx: usize, payload: PutPayload, ) -> Result { - let content_id = format!("{part_idx:20}"); + let part_idx = u128::from_be_bytes(rand::thread_rng().gen()); + let content_id = format!("{part_idx:032x}"); let block_id = BASE64_STANDARD.encode(&content_id); self.put_request(path, payload) diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index 177bffb653a..81b6667bc05 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -314,6 +314,7 @@ mod tests { stream_get(&integration).await; put_opts(&integration, true).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, false).await; signing(&integration).await; let validate = !integration.client.config().disable_tagging; diff --git a/object_store/src/gcp/mod.rs b/object_store/src/gcp/mod.rs index 039ec46b68c..5199135ba6b 100644 --- a/object_store/src/gcp/mod.rs +++ b/object_store/src/gcp/mod.rs @@ -297,6 +297,7 @@ mod test { // https://github.com/fsouza/fake-gcs-server/issues/852 stream_get(&integration).await; multipart(&integration, &integration).await; + multipart_race_condition(&integration, true).await; // Fake GCS server doesn't currently honor preconditions get_opts(&integration).await; put_opts(&integration, true).await; diff --git a/object_store/src/integration.rs b/object_store/src/integration.rs index 30177878306..20e95fddc47 100644 --- a/object_store/src/integration.rs +++ b/object_store/src/integration.rs @@ -24,6 +24,8 @@ //! //! They are intended solely for testing purposes. 
+use core::str; + use crate::multipart::MultipartStore; use crate::path::Path; use crate::{ @@ -1109,3 +1111,88 @@ async fn delete_fixtures(storage: &DynObjectStore) { .await .unwrap(); } + +/// Tests a race condition where 2 threads are performing multipart writes to the same path +pub async fn multipart_race_condition(storage: &dyn ObjectStore, last_writer_wins: bool) { + let path = Path::from("test_multipart_race_condition"); + + let mut multipart_upload_1 = storage.put_multipart(&path).await.unwrap(); + let mut multipart_upload_2 = storage.put_multipart(&path).await.unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 0)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 0)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 1)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 1)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 2)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 2)).into()) + .await + .unwrap(); + + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 3)).into()) + .await + .unwrap(); + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 3)).into()) + .await + .unwrap(); + + multipart_upload_1 + .put_part(Bytes::from(format!("1:{:05300000},", 4)).into()) + .await + .unwrap(); + multipart_upload_2 + .put_part(Bytes::from(format!("2:{:05300000},", 4)).into()) + .await + .unwrap(); + + multipart_upload_1.complete().await.unwrap(); + + if last_writer_wins { + multipart_upload_2.complete().await.unwrap(); + } else { + let err = multipart_upload_2.complete().await.unwrap_err(); + + assert!(matches!(err, crate::Error::Generic { .. }), "{err}"); + } + + let get_result = storage.get(&path).await.unwrap(); + let bytes = get_result.bytes().await.unwrap(); + let string_contents = str::from_utf8(&bytes).unwrap(); + + if last_writer_wins { + assert!(string_contents.starts_with( + format!( + "2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},2:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } else { + assert!(string_contents.starts_with( + format!( + "1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},1:{:05300000},", + 0, 1, 2, 3, 4 + ) + .as_str() + )); + } +} From eb7ab83aab6b54897b00da1eb739c863d61445a2 Mon Sep 17 00:00:00 2001 From: Himadri Pal Date: Thu, 12 Dec 2024 15:00:22 -0800 Subject: [PATCH 08/29] fix: decimal conversion looses value on lower precision (#6836) * decimal conversion looses value on lower precision, throws error now on overflow. * fix review comments and fix formatting. * for simple case of equal scale and bigger precision, no conversion needed. 
revert whitespace changes formatting check --------- Co-authored-by: himadripal --- arrow-cast/src/cast/decimal.rs | 49 +++++++++-------- arrow-cast/src/cast/mod.rs | 99 ++++++++++++++++++++++++++++++---- 2 files changed, 116 insertions(+), 32 deletions(-) diff --git a/arrow-cast/src/cast/decimal.rs b/arrow-cast/src/cast/decimal.rs index 74cd2637e6a..ba82ca9040c 100644 --- a/arrow-cast/src/cast/decimal.rs +++ b/arrow-cast/src/cast/decimal.rs @@ -111,9 +111,13 @@ where O::Native::from_decimal(adjusted) }; - Ok(match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + Ok(if cast_options.safe { + array.unary_opt(|x| f(x).filter(|v| O::is_valid_decimal_precision(*v, output_precision))) + } else { + array.try_unary(|x| { + f(x).ok_or_else(|| error(x)) + .and_then(|v| O::validate_decimal_precision(v, output_precision).map(|_| v)) + })? }) } @@ -137,15 +141,20 @@ where let f = |x| O::Native::from_decimal(x).and_then(|x| x.mul_checked(mul).ok()); - Ok(match cast_options.safe { - true => array.unary_opt(f), - false => array.try_unary(|x| f(x).ok_or_else(|| error(x)))?, + Ok(if cast_options.safe { + array.unary_opt(|x| f(x).filter(|v| O::is_valid_decimal_precision(*v, output_precision))) + } else { + array.try_unary(|x| { + f(x).ok_or_else(|| error(x)) + .and_then(|v| O::validate_decimal_precision(v, output_precision).map(|_| v)) + })? }) } // Only support one type of decimal cast operations pub(crate) fn cast_decimal_to_decimal_same_type( array: &PrimitiveArray, + input_precision: u8, input_scale: i8, output_precision: u8, output_scale: i8, @@ -155,20 +164,11 @@ where T: DecimalType, T::Native: DecimalCast + ArrowNativeTypeOp, { - let array: PrimitiveArray = match input_scale.cmp(&output_scale) { - Ordering::Equal => { - // the scale doesn't change, the native value don't need to be changed + let array: PrimitiveArray = + if input_scale == output_scale && input_precision <= output_precision { array.clone() - } - Ordering::Greater => convert_to_smaller_scale_decimal::( - array, - input_scale, - output_precision, - output_scale, - cast_options, - )?, - Ordering::Less => { - // input_scale < output_scale + } else if input_scale < output_scale { + // the scale doesn't change, but precision may change and cause overflow convert_to_bigger_or_equal_scale_decimal::( array, input_scale, @@ -176,8 +176,15 @@ where output_scale, cast_options, )? - } - }; + } else { + convert_to_smaller_scale_decimal::( + array, + input_scale, + output_precision, + output_scale, + cast_options, + )? 
+ }; Ok(Arc::new(array.with_precision_and_scale( output_precision, diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 5c892783d4a..ba470635c6c 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -830,18 +830,20 @@ pub fn cast_with_options( (Map(_, ordered1), Map(_, ordered2)) if ordered1 == ordered2 => { cast_map_values(array.as_map(), to_type, cast_options, ordered1.to_owned()) } - (Decimal128(_, s1), Decimal128(p2, s2)) => { + (Decimal128(p1, s1), Decimal128(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), + *p1, *s1, *p2, *s2, cast_options, ) } - (Decimal256(_, s1), Decimal256(p2, s2)) => { + (Decimal256(p1, s1), Decimal256(p2, s2)) => { cast_decimal_to_decimal_same_type::( array.as_primitive(), + *p1, *s1, *p2, *s2, @@ -2694,13 +2696,16 @@ mod tests { // negative test let array = vec![Some(123456), None]; let array = create_decimal_array(array, 10, 0).unwrap(); - let result = cast(&array, &DataType::Decimal128(2, 2)); - assert!(result.is_ok()); - let array = result.unwrap(); - let array: &Decimal128Array = array.as_primitive(); - let err = array.validate_decimal_precision(2); + let result_safe = cast(&array, &DataType::Decimal128(2, 2)); + assert!(result_safe.is_ok()); + let options = CastOptions { + safe: false, + ..Default::default() + }; + + let result_unsafe = cast_with_options(&array, &DataType::Decimal128(2, 2), &options); assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal128 of precision 2. Max is 99", - err.unwrap_err().to_string()); + result_unsafe.unwrap_err().to_string()); } #[test] @@ -8460,7 +8465,7 @@ mod tests { let input_type = DataType::Decimal128(10, 3); let output_type = DataType::Decimal256(10, 5); assert!(can_cast_types(&input_type, &output_type)); - let array = vec![Some(i128::MAX), Some(i128::MIN)]; + let array = vec![Some(123456), Some(-123456)]; let input_decimal_array = create_decimal_array(array, 10, 3).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; @@ -8470,8 +8475,8 @@ mod tests { Decimal256Array, &output_type, vec![ - Some(i256::from_i128(i128::MAX).mul_wrapping(hundred)), - Some(i256::from_i128(i128::MIN).mul_wrapping(hundred)) + Some(i256::from_i128(123456).mul_wrapping(hundred)), + Some(i256::from_i128(-123456).mul_wrapping(hundred)) ] ); } @@ -9935,4 +9940,76 @@ mod tests { "Cast non-nullable to non-nullable struct field returning null should fail", ); } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_same_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal128(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. 
Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_lower_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 4); + let output_type = DataType::Decimal128(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456790 is too large to store in a Decimal128 of precision 6. Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_greater_scale() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal128(6, 3); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 1234567890 is too large to store in a Decimal128 of precision 6. Max is 999999"); + } + + #[test] + fn test_decimal_to_decimal_throw_error_on_precision_overflow_diff_type() { + let array = vec![Some(123456789)]; + let array = create_decimal_array(array, 24, 2).unwrap(); + println!("{:?}", array); + let input_type = DataType::Decimal128(24, 2); + let output_type = DataType::Decimal256(6, 2); + assert!(can_cast_types(&input_type, &output_type)); + + let options = CastOptions { + safe: false, + ..Default::default() + }; + let result = cast_with_options(&array, &output_type, &options); + assert_eq!(result.unwrap_err().to_string(), + "Invalid argument error: 123456789 is too large to store in a Decimal256 of precision 6. 
Max is 999999"); + } } From c4dbf0d8af6ca5a19b8b2ea777da3c276807fc5e Mon Sep 17 00:00:00 2001 From: niebayes Date: Fri, 13 Dec 2024 20:23:33 +0800 Subject: [PATCH 09/29] fix: make GetCatalogsBuilder sort catalog names (#6864) * fix: sort catalog names * apply code suggestion from tustvold Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-flight/src/sql/metadata/catalogs.rs | 30 ++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/arrow-flight/src/sql/metadata/catalogs.rs b/arrow-flight/src/sql/metadata/catalogs.rs index 327fed81077..e27c63c3932 100644 --- a/arrow-flight/src/sql/metadata/catalogs.rs +++ b/arrow-flight/src/sql/metadata/catalogs.rs @@ -68,7 +68,8 @@ impl GetCatalogsBuilder { /// builds a `RecordBatch` with the correct schema for a /// [`CommandGetCatalogs`] response pub fn build(self) -> Result { - let Self { catalogs } = self; + let Self { mut catalogs } = self; + catalogs.sort_unstable(); let batch = RecordBatch::try_new( Arc::clone(&GET_CATALOG_SCHEMA), @@ -98,3 +99,30 @@ static GET_CATALOG_SCHEMA: Lazy = Lazy::new(|| { false, )])) }); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_catalogs_are_sorted() { + let batch = ["a_catalog", "c_catalog", "b_catalog"] + .into_iter() + .fold(GetCatalogsBuilder::new(), |mut builder, catalog| { + builder.append(catalog); + builder + }) + .build() + .unwrap(); + let catalogs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .flatten() + .collect::>(); + assert!(catalogs.is_sorted()); + assert_eq!(catalogs, ["a_catalog", "b_catalog", "c_catalog"]); + } +} From fc6936aff50b0b1c8bea984bba2d57dac68a98b3 Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Mon, 16 Dec 2024 19:29:17 +0100 Subject: [PATCH 10/29] Add deprecation warnings for everything related to `dict_id` (#6873) * Add deprecation warnings for everything related to `dict_id` * Add allow deprecations where necessary --- arrow-flight/src/encode.rs | 9 ++++ arrow-flight/src/lib.rs | 1 + arrow-flight/src/utils.rs | 1 + arrow-integration-test/src/field.rs | 27 +++++++----- arrow-integration-test/src/lib.rs | 3 ++ arrow-integration-test/src/schema.rs | 1 + .../integration_test.rs | 1 + .../integration_test.rs | 1 + arrow-ipc/src/convert.rs | 5 +++ arrow-ipc/src/reader.rs | 14 ++++++ arrow-ipc/src/reader/stream.rs | 2 + arrow-ipc/src/writer.rs | 44 +++++++++++++++++++ arrow-schema/src/ffi.rs | 1 + arrow-schema/src/field.rs | 36 ++++++++++++++- arrow-schema/src/schema.rs | 19 ++++++-- parquet/src/arrow/arrow_writer/mod.rs | 3 ++ parquet/src/arrow/schema/complex.rs | 5 ++- parquet/src/arrow/schema/mod.rs | 2 + 18 files changed, 158 insertions(+), 17 deletions(-) diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs index 3ca192d0bb9..315b7b3cb6e 100644 --- a/arrow-flight/src/encode.rs +++ b/arrow-flight/src/encode.rs @@ -535,8 +535,10 @@ fn prepare_field_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -583,7 +585,9 @@ fn prepare_schema_for_flight( ) .with_metadata(field.metadata().clone()) } else { + #[allow(deprecated)] let dict_id = dictionary_tracker.set_dict_id(field.as_ref()); + #[allow(deprecated)] Field::new_dict( field.name(), field.data_type().clone(), @@ -650,10 
+654,12 @@ struct FlightIpcEncoder { impl FlightIpcEncoder { fn new(options: IpcWriteOptions, error_on_replacement: bool) -> Self { + #[allow(deprecated)] let preserve_dict_id = options.preserve_dict_id(); Self { options, data_gen: IpcDataGenerator::default(), + #[allow(deprecated)] dictionary_tracker: DictionaryTracker::new_with_preserve_dict_id( error_on_replacement, preserve_dict_id, @@ -1541,6 +1547,7 @@ mod tests { async fn verify_flight_round_trip(mut batches: Vec) { let expected_schema = batches.first().unwrap().schema(); + #[allow(deprecated)] let encoder = FlightDataEncoderBuilder::default() .with_options(IpcWriteOptions::default().with_preserve_dict_id(false)) .with_dictionary_handling(DictionaryHandling::Resend) @@ -1568,6 +1575,7 @@ mod tests { HashMap::from([("some_key".to_owned(), "some_value".to_owned())]), ); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let got = prepare_schema_for_flight(&schema, &mut dictionary_tracker, false); @@ -1598,6 +1606,7 @@ mod tests { options: &IpcWriteOptions, ) -> (Vec, FlightData) { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 81fa99faeb9..1dd2700794f 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -143,6 +143,7 @@ pub struct IpcMessage(pub Bytes); fn flight_schema_as_encoded_data(arrow_schema: &Schema, options: &IpcWriteOptions) -> EncodedData { let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); data_gen.schema_to_bytes_with_dictionary_tracker(arrow_schema, &mut dict_tracker, options) diff --git a/arrow-flight/src/utils.rs b/arrow-flight/src/utils.rs index 3242f511ae6..428dde73ca6 100644 --- a/arrow-flight/src/utils.rs +++ b/arrow-flight/src/utils.rs @@ -90,6 +90,7 @@ pub fn batches_to_flight_data( let mut flight_data = vec![]; let data_gen = writer::IpcDataGenerator::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); diff --git a/arrow-integration-test/src/field.rs b/arrow-integration-test/src/field.rs index 32edc416593..4b896ed391b 100644 --- a/arrow-integration-test/src/field.rs +++ b/arrow-integration-test/src/field.rs @@ -252,6 +252,7 @@ pub fn field_from_json(json: &serde_json::Value) -> Result { _ => data_type, }; + #[allow(deprecated)] let mut field = Field::new_dict(name, data_type, nullable, dict_id, dict_is_ordered); field.set_metadata(metadata); Ok(field) @@ -274,17 +275,21 @@ pub fn field_to_json(field: &Field) -> serde_json::Value { }; match field.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => serde_json::json!({ - "name": field.name(), - "nullable": field.is_nullable(), - "type": data_type_to_json(value_type), - "children": children, - "dictionary": { - "id": field.dict_id().unwrap(), - "indexType": data_type_to_json(index_type), - "isOrdered": field.dict_is_ordered().unwrap(), - } - }), + DataType::Dictionary(ref index_type, ref value_type) => { + #[allow(deprecated)] + let dict_id = field.dict_id().unwrap(); + serde_json::json!({ + "name": field.name(), + "nullable": field.is_nullable(), + "type": data_type_to_json(value_type), + "children": children, + "dictionary": { + "id": 
dict_id, + "indexType": data_type_to_json(index_type), + "isOrdered": field.dict_is_ordered().unwrap(), + } + }) + } _ => serde_json::json!({ "name": field.name(), "nullable": field.is_nullable(), diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index a25f07ded1d..f025009c22d 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -787,6 +787,7 @@ pub fn array_from_json( Ok(Arc::new(array)) } DataType::Dictionary(key_type, value_type) => { + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) })?; @@ -930,10 +931,12 @@ pub fn dictionary_array_from_json( let null_buf = create_null_buf(&json_col); // build the key data into a buffer, then construct values separately + #[allow(deprecated)] let key_field = Field::new_dict( "key", dict_key.clone(), field.is_nullable(), + #[allow(deprecated)] field .dict_id() .expect("Dictionary fields must have a dict_id value"), diff --git a/arrow-integration-test/src/schema.rs b/arrow-integration-test/src/schema.rs index fb91aba00df..512f0aed8e5 100644 --- a/arrow-integration-test/src/schema.rs +++ b/arrow-integration-test/src/schema.rs @@ -189,6 +189,7 @@ mod tests { Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c33", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs index 62cdca975b0..406419028d0 100644 --- a/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_client_scenarios/integration_test.rs @@ -72,6 +72,7 @@ async fn upload_data( let (mut upload_tx, upload_rx) = mpsc::channel(10); let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dict_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); diff --git a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs index 27721750081..92989a20393 100644 --- a/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs +++ b/arrow-integration-testing/src/flight_server_scenarios/integration_test.rs @@ -119,6 +119,7 @@ impl FlightService for FlightServiceImpl { .ok_or_else(|| Status::not_found(format!("Could not find flight. 
{key}")))?; let options = arrow::ipc::writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(false, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); diff --git a/arrow-ipc/src/convert.rs b/arrow-ipc/src/convert.rs index 0aa07a6a47d..37c5a19439c 100644 --- a/arrow-ipc/src/convert.rs +++ b/arrow-ipc/src/convert.rs @@ -165,6 +165,7 @@ pub fn schema_to_fb_offset<'a>( impl From> for Field { fn from(field: crate::Field) -> Field { let arrow_field = if let Some(dictionary) = field.dictionary() { + #[allow(deprecated)] Field::new_dict( field.name().unwrap(), get_data_type(field, true), @@ -519,6 +520,7 @@ pub(crate) fn build_field<'a>( match dictionary_tracker { Some(tracker) => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] tracker.set_dict_id(field), field .dict_is_ordered() @@ -527,6 +529,7 @@ pub(crate) fn build_field<'a>( )), None => Some(get_fb_dictionary( index_type, + #[allow(deprecated)] field .dict_id() .expect("Dictionary type must have a dictionary id"), @@ -1143,6 +1146,7 @@ mod tests { ), true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -1150,6 +1154,7 @@ mod tests { 123, true, ), + #[allow(deprecated)] Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 481fefbd5fb..9ff4da30ed8 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -196,6 +196,7 @@ fn create_array( let index_node = reader.next_node(field)?; let index_buffers = [reader.next_buffer()?, reader.next_buffer()?]; + #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { ArrowError::ParseError(format!("Field {field} does not have dict id")) })?; @@ -617,6 +618,7 @@ fn read_dictionary_impl( } let id = batch.id(); + #[allow(deprecated)] let fields_using_this_dictionary = schema.fields_with_dict_id(id); let first_field = fields_using_this_dictionary.first().ok_or_else(|| { ArrowError::InvalidArgumentError(format!("dictionary id {id} not found in schema")) @@ -1725,6 +1727,7 @@ mod tests { let mut writer = crate::writer::FileWriter::try_new_with_options( &mut buf, batch.schema_ref(), + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .unwrap(); @@ -1869,6 +1872,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 2, 1, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, values); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1876,6 +1880,7 @@ mod tests { 1, false, )); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1956,6 +1961,7 @@ mod tests { #[test] fn test_roundtrip_stream_dict_of_list_of_dict() { // list + #[allow(deprecated)] let list_data_type = DataType::List(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1967,6 +1973,7 @@ mod tests { test_roundtrip_stream_dict_of_list_of_dict_impl::(list_data_type, offsets); // large list + #[allow(deprecated)] let list_data_type = DataType::LargeList(Arc::new(Field::new_dict( "item", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), @@ -1985,6 +1992,7 @@ mod 
tests { let dict_array = DictionaryArray::new(keys, Arc::new(values)); let dict_data = dict_array.into_data(); + #[allow(deprecated)] let list_data_type = DataType::FixedSizeList( Arc::new(Field::new_dict( "item", @@ -2075,6 +2083,7 @@ mod tests { let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]); let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone()); + #[allow(deprecated)] let keys_field = Arc::new(Field::new_dict( "keys", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)), @@ -2085,6 +2094,7 @@ mod tests { let value_dict_keys = Int8Array::from_iter_values([0, 3, 0, 1, 2, 0, 1]); let value_dict_array = DictionaryArray::new(value_dict_keys, bin_view_array); + #[allow(deprecated)] let values_field = Arc::new(Field::new_dict( "values", DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::BinaryView)), @@ -2150,6 +2160,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2187,6 +2198,7 @@ mod tests { .unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); let (_, encoded) = gen .encoded_batch(&batch, &mut dict_tracker, &Default::default()) @@ -2326,6 +2338,7 @@ mod tests { ["a", "b"] .iter() .map(|name| { + #[allow(deprecated)] Field::new_dict( name.to_string(), DataType::Dictionary( @@ -2360,6 +2373,7 @@ mod tests { let mut writer = crate::writer::StreamWriter::try_new_with_options( &mut buf, batch.schema().as_ref(), + #[allow(deprecated)] crate::writer::IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/reader/stream.rs b/arrow-ipc/src/reader/stream.rs index de5f5bdd629..9b0eea9b619 100644 --- a/arrow-ipc/src/reader/stream.rs +++ b/arrow-ipc/src/reader/stream.rs @@ -324,6 +324,7 @@ mod tests { "test1", DataType::RunEndEncoded( Arc::new(Field::new("run_ends".to_string(), DataType::Int32, false)), + #[allow(deprecated)] Arc::new(Field::new_dict( "values".to_string(), DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -353,6 +354,7 @@ mod tests { let mut writer = StreamWriter::try_new_with_options( &mut buffer, &schema, + #[allow(deprecated)] IpcWriteOptions::default().with_preserve_dict_id(false), ) .expect("Failed to create StreamWriter"); diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 32dab762447..ee5b9a54cc9 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -65,6 +65,10 @@ pub struct IpcWriteOptions { /// schema or generate unique dictionary IDs internally during encoding. /// /// Defaults to `false` + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." 
+ )] preserve_dict_id: bool, } @@ -108,6 +112,7 @@ impl IpcWriteOptions { | crate::MetadataVersion::V3 => Err(ArrowError::InvalidArgumentError( "Writing IPC metadata version 3 and lower not supported".to_string(), )), + #[allow(deprecated)] crate::MetadataVersion::V4 => Ok(Self { alignment, write_legacy_ipc_format, @@ -121,6 +126,7 @@ impl IpcWriteOptions { "Legacy IPC format only supported on metadata version 4".to_string(), )) } else { + #[allow(deprecated)] Ok(Self { alignment, write_legacy_ipc_format, @@ -138,7 +144,12 @@ impl IpcWriteOptions { /// Return whether the writer is configured to preserve the dictionary IDs /// defined in the schema + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn preserve_dict_id(&self) -> bool { + #[allow(deprecated)] self.preserve_dict_id } @@ -149,6 +160,11 @@ impl IpcWriteOptions { /// to the dictionary batches in order to encode them correctly /// /// The default will change to `false` in future releases + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] + #[allow(deprecated)] pub fn with_preserve_dict_id(mut self, preserve_dict_id: bool) -> Self { self.preserve_dict_id = preserve_dict_id; self @@ -157,6 +173,7 @@ impl IpcWriteOptions { impl Default for IpcWriteOptions { fn default() -> Self { + #[allow(deprecated)] Self { alignment: 64, write_legacy_ipc_format: false, @@ -420,6 +437,7 @@ impl IpcDataGenerator { // It's importnat to only take the dict_id at this point, because the dict ID // sequence is assigned depth-first, so we need to first encode children and have // them take their assigned dict IDs before we take the dict ID for this field. + #[allow(deprecated)] let dict_id = dict_id_seq .next() .or_else(|| field.dict_id()) @@ -767,6 +785,10 @@ pub struct DictionaryTracker { written: HashMap, dict_ids: Vec, error_on_replacement: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] preserve_dict_id: bool, } @@ -782,6 +804,7 @@ impl DictionaryTracker { /// the last seen dictionary ID (or using `0` if no other dictionary IDs have been /// seen) pub fn new(error_on_replacement: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -795,7 +818,12 @@ impl DictionaryTracker { /// If `error_on_replacement` /// is true, an error will be generated if an update to an /// existing dictionary is attempted. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn new_with_preserve_dict_id(error_on_replacement: bool, preserve_dict_id: bool) -> Self { + #[allow(deprecated)] Self { written: HashMap::new(), dict_ids: Vec::new(), @@ -811,8 +839,14 @@ impl DictionaryTracker { /// /// If `preserve_dict_id` is false, this will return the value of the last `dict_id` assigned incremented by 1 /// or 0 in the case where no dictionary IDs have yet been assigned + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." 
+ )] pub fn set_dict_id(&mut self, field: &Field) -> i64 { + #[allow(deprecated)] let next = if self.preserve_dict_id { + #[allow(deprecated)] field.dict_id().expect("no dict_id in field") } else { self.dict_ids @@ -936,7 +970,9 @@ impl FileWriter { writer.write_all(&super::ARROW_MAGIC)?; writer.write_all(&PADDING[..pad_len])?; // write the schema, set the written bytes to the schema + header + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let encoded_message = data_gen.schema_to_bytes_with_dictionary_tracker( @@ -1013,7 +1049,9 @@ impl FileWriter { let mut fbb = FlatBufferBuilder::new(); let dictionaries = fbb.create_vector(&self.dictionary_blocks); let record_batches = fbb.create_vector(&self.record_blocks); + #[allow(deprecated)] let preserve_dict_id = self.write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(true, preserve_dict_id); let schema = IpcSchemaEncoder::new() @@ -1144,7 +1182,9 @@ impl StreamWriter { write_options: IpcWriteOptions, ) -> Result { let data_gen = IpcDataGenerator::default(); + #[allow(deprecated)] let preserve_dict_id = write_options.preserve_dict_id; + #[allow(deprecated)] let mut dictionary_tracker = DictionaryTracker::new_with_preserve_dict_id(false, preserve_dict_id); @@ -2032,6 +2072,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Field::new_dict("dict", array.data_type().clone(), false, 2, false); let union_fields = [(0, Arc::new(dctfield))].into_iter().collect(); @@ -2049,6 +2090,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); @@ -2065,6 +2107,7 @@ mod tests { let array = Arc::new(inner) as ArrayRef; // Dict field with id 2 + #[allow(deprecated)] let dctfield = Arc::new(Field::new_dict( "dict", array.data_type().clone(), @@ -2085,6 +2128,7 @@ mod tests { let batch = RecordBatch::try_new(schema, vec![struct_array]).unwrap(); let gen = IpcDataGenerator {}; + #[allow(deprecated)] let mut dict_tracker = DictionaryTracker::new_with_preserve_dict_id(false, true); gen.encoded_batch(&batch, &mut dict_tracker, &Default::default()) .unwrap(); diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs index 08b1e6d0b18..96c80974982 100644 --- a/arrow-schema/src/ffi.rs +++ b/arrow-schema/src/ffi.rs @@ -923,6 +923,7 @@ mod tests { #[test] fn test_dictionary_ordered() { + #[allow(deprecated)] let schema = Schema::new(vec![Field::new_dict( "dict", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 131e23d68b9..7d47c0ae1de 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -38,6 +38,10 @@ pub struct Field { name: String, data_type: DataType, nullable: bool, + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] dict_id: i64, dict_is_ordered: bool, /// A map of key-value pairs containing additional custom meta data. 
@@ -122,6 +126,7 @@ impl Field { /// Creates a new field with the given name, type, and nullability pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -151,6 +156,10 @@ impl Field { } /// Creates a new field that has additional dictionary information + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter." + )] pub fn new_dict( name: impl Into, data_type: DataType, @@ -158,6 +167,7 @@ impl Field { dict_id: i64, dict_is_ordered: bool, ) -> Self { + #[allow(deprecated)] Field { name: name.into(), data_type, @@ -386,19 +396,30 @@ impl Field { /// Returns a vector containing all (potentially nested) `Field` instances selected by the /// dictionary ID they use #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> { self.fields() .into_iter() .filter(|&field| { - matches!(field.data_type(), DataType::Dictionary(_, _)) && field.dict_id == id + #[allow(deprecated)] + let matching_dict_id = field.dict_id == id; + matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id }) .collect() } /// Returns the dictionary ID, if this is a dictionary type. #[inline] + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it." + )] pub const fn dict_id(&self) -> Option { match self.data_type { + #[allow(deprecated)] DataType::Dictionary(_, _) => Some(self.dict_id), _ => None, } @@ -428,6 +449,7 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> { + #[allow(deprecated)] if from.dict_id != self.dict_id { return Err(ArrowError::SchemaError(format!( "Fail to merge schema field '{}' because from dict_id = {} does not match {}", @@ -570,9 +592,11 @@ impl Field { /// * self.metadata is a superset of other.metadata /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { + #[allow(deprecated)] + let matching_dict_id = self.dict_id == other.dict_id; self.name == other.name && self.data_type.contains(&other.data_type) - && self.dict_id == other.dict_id + && matching_dict_id && self.dict_is_ordered == other.dict_is_ordered // self need to be nullable or both of them are not nullable && (self.nullable || !other.nullable) @@ -621,6 +645,7 @@ mod test { fn test_new_dict_with_string() { // Fields should allow owned Strings to support reuse let s = "c1"; + #[allow(deprecated)] Field::new_dict(s, DataType::Int64, false, 4, false); } @@ -738,6 +763,7 @@ mod test { #[test] fn test_fields_with_dict_id() { + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -745,6 +771,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict2", DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()), @@ -781,9 +808,11 @@ mod test { false, ); + #[allow(deprecated)] for field in field.fields_with_dict_id(10) { assert_eq!(dict1, *field); } + #[allow(deprecated)] for field in field.fields_with_dict_id(20) { assert_eq!(dict2, *field); } @@ -798,6 +827,7 @@ mod test { #[test] fn test_field_comparison_case() { // dictionary-encoding properties 
not used for field comparison + #[allow(deprecated)] let dict1 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -805,6 +835,7 @@ mod test { 10, false, ); + #[allow(deprecated)] let dict2 = Field::new_dict( "dict1", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), @@ -816,6 +847,7 @@ mod test { assert_eq!(dict1, dict2); assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2)); + #[allow(deprecated)] let dict1 = Field::new_dict( "dict0", DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()), diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 47c22e2a931..6c79da53f98 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -389,7 +389,12 @@ impl Schema { /// Returns a vector of immutable references to all [`Field`] instances selected by /// the dictionary ID they use. + #[deprecated( + since = "54.0.0", + note = "The ability to preserve dictionary IDs will be removed. With it, all functions related to it." + )] pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { + #[allow(deprecated)] self.fields .iter() .flat_map(|f| f.fields_with_dict_id(dict_id)) @@ -638,7 +643,9 @@ mod tests { assert_eq!(first_name.name(), "first_name"); assert_eq!(first_name.data_type(), &DataType::Utf8); assert!(!first_name.is_nullable()); - assert_eq!(first_name.dict_id(), None); + #[allow(deprecated)] + let dict_id = first_name.dict_id(); + assert_eq!(dict_id, None); assert_eq!(first_name.dict_is_ordered(), None); let metadata = first_name.metadata(); @@ -655,7 +662,9 @@ mod tests { interests.data_type(), &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) ); - assert_eq!(interests.dict_id(), Some(123)); + #[allow(deprecated)] + let dict_id = interests.dict_id(); + assert_eq!(dict_id, Some(123)); assert_eq!(interests.dict_is_ordered(), Some(true)); } @@ -691,6 +700,7 @@ mod tests { fn schema_field_with_dict_id() { let schema = person_schema(); + #[allow(deprecated)] let fields_dict_123: Vec<_> = schema .fields_with_dict_id(123) .iter() @@ -698,7 +708,9 @@ mod tests { .collect(); assert_eq!(fields_dict_123, vec!["interests"]); - assert!(schema.fields_with_dict_id(456).is_empty()); + #[allow(deprecated)] + let is_empty = schema.fields_with_dict_id(456).is_empty(); + assert!(is_empty); } fn person_schema() -> Schema { @@ -718,6 +730,7 @@ mod tests { ])), false, ), + #[allow(deprecated)] Field::new_dict( "interests", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index bb6ebf75ec8..59cfac04719 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -2587,6 +2587,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), @@ -2608,6 +2609,7 @@ mod tests { #[test] fn arrow_writer_primitive_dictionary() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::UInt32)), @@ -2630,6 +2632,7 @@ mod tests { #[test] fn arrow_writer_string_dictionary_unsigned_index() { // define schema + #[allow(deprecated)] let schema = Arc::new(Schema::new(vec![Field::new_dict( "dictionary", 
DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index d2f5a0bd8d6..16d46bd852d 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -552,8 +552,11 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<& match arrow_hint { Some(hint) => { // If the inferred type is a dictionary, preserve dictionary metadata + #[allow(deprecated)] let field = match (&data_type, hint.dict_id(), hint.dict_is_ordered()) { - (DataType::Dictionary(_, _), Some(id), Some(ordered)) => { + (DataType::Dictionary(_, _), Some(id), Some(ordered)) => + { + #[allow(deprecated)] Field::new_dict(name, data_type, nullable, id, ordered) } _ => Field::new(name, data_type, nullable), diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 0fbcb4856e4..26b0d8b7b88 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -178,6 +178,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { /// Encodes the Arrow schema into the IPC format, and base64 encodes it fn encode_arrow_schema(schema: &Schema) -> String { let options = writer::IpcWriteOptions::default(); + #[allow(deprecated)] let mut dictionary_tracker = writer::DictionaryTracker::new_with_preserve_dict_id(true, options.preserve_dict_id()); let data_gen = writer::IpcDataGenerator::default(); @@ -1823,6 +1824,7 @@ mod tests { // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + #[allow(deprecated)] Field::new_dict( "c31", DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), From 9ffa06543be51613ea1f509e63f6e7405b7d9989 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 16 Dec 2024 10:32:18 -0800 Subject: [PATCH 11/29] Improvements to UTF-8 statistics truncation (#6870) * fix a few edge cases with utf-8 incrementing * add todo * simplify truncation * add another test * note case where string should render right to left * rework entirely, also avoid UTF8 processing if not required by the schema * more consistent naming * modify some tests to truncate in the middle of a multibyte char * add test and docstring * document truncate_min_value too --- parquet/src/column/writer/mod.rs | 293 +++++++++++++++++++++++++------ 1 file changed, 236 insertions(+), 57 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 16de0ba7898..8dc1d0db447 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -878,24 +878,67 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } + /// Returns `true` if this column's logical type is a UTF-8 string. + fn is_utf8(&self) -> bool { + self.get_descriptor().logical_type() == Some(LogicalType::String) + || self.get_descriptor().converted_type() == ConvertedType::UTF8 + } + + /// Truncates a binary statistic to at most `truncation_length` bytes. + /// + /// If truncation is not possible, returns `data`. + /// + /// The `bool` in the returned tuple indicates whether truncation occurred or not. + /// + /// UTF-8 Note: + /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will + /// also remain valid UTF-8, but may be less tnan `truncation_length` bytes to avoid splitting + /// on non-character boundaries. 
fn truncate_min_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) - .and_then(|l| match str::from_utf8(data) { - Ok(str_data) => truncate_utf8(str_data, l), - Err(_) => Some(data[..l].to_vec()), - }) + .and_then(|l| + // don't do extra work if this column isn't UTF-8 + if self.is_utf8() { + match str::from_utf8(data) { + Ok(str_data) => truncate_utf8(str_data, l), + Err(_) => Some(data[..l].to_vec()), + } + } else { + Some(data[..l].to_vec()) + } + ) .map(|truncated| (truncated, true)) .unwrap_or_else(|| (data.to_vec(), false)) } + /// Truncates a binary statistic to at most `truncation_length` bytes, and then increment the + /// final byte(s) to yield a valid upper bound. This may result in a result of less than + /// `truncation_length` bytes if the last byte(s) overflows. + /// + /// If truncation is not possible, returns `data`. + /// + /// The `bool` in the returned tuple indicates whether truncation occurred or not. + /// + /// UTF-8 Note: + /// If the column type indicates UTF-8, and `data` contains valid UTF-8, then the result will + /// also remain valid UTF-8 (but again may be less than `truncation_length` bytes). If `data` + /// does not contain valid UTF-8, then truncation will occur as if the column is non-string + /// binary. fn truncate_max_value(&self, truncation_length: Option, data: &[u8]) -> (Vec, bool) { truncation_length .filter(|l| data.len() > *l) - .and_then(|l| match str::from_utf8(data) { - Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8), - Err(_) => increment(data[..l].to_vec()), - }) + .and_then(|l| + // don't do extra work if this column isn't UTF-8 + if self.is_utf8() { + match str::from_utf8(data) { + Ok(str_data) => truncate_and_increment_utf8(str_data, l), + Err(_) => increment(data[..l].to_vec()), + } + } else { + increment(data[..l].to_vec()) + } + ) .map(|truncated| (truncated, true)) .unwrap_or_else(|| (data.to_vec(), false)) } @@ -1418,13 +1461,50 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool { (a[1..]) > (b[1..]) } -/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string, -/// while being less than `length` bytes and non-empty +/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 string, +/// while being less than `length` bytes and non-empty. Returns `None` if truncation +/// is not possible within those constraints. +/// +/// The caller guarantees that data.len() > length. fn truncate_utf8(data: &str, length: usize) -> Option> { let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?; Some(data.as_bytes()[..split].to_vec()) } +/// Truncate a UTF-8 slice and increment it's final character. The returned value is the +/// longest such slice that is still a valid UTF-8 string while being less than `length` +/// bytes and non-empty. Returns `None` if no such transformation is possible. +/// +/// The caller guarantees that data.len() > length. +fn truncate_and_increment_utf8(data: &str, length: usize) -> Option> { + // UTF-8 is max 4 bytes, so start search 3 back from desired length + let lower_bound = length.saturating_sub(3); + let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?; + increment_utf8(data.get(..split)?) +} + +/// Increment the final character in a UTF-8 string in such a way that the returned result +/// is still a valid UTF-8 string. 
The returned string may be shorter than the input if the +/// last character(s) cannot be incremented (due to overflow or producing invalid code points). +/// Returns `None` if the string cannot be incremented. +/// +/// Note that this implementation will not promote an N-byte code point to (N+1) bytes. +fn increment_utf8(data: &str) -> Option> { + for (idx, original_char) in data.char_indices().rev() { + let original_len = original_char.len_utf8(); + if let Some(next_char) = char::from_u32(original_char as u32 + 1) { + // do not allow increasing byte width of incremented char + if next_char.len_utf8() == original_len { + let mut result = data.as_bytes()[..idx + original_len].to_vec(); + next_char.encode_utf8(&mut result[idx..]); + return Some(result); + } + } + } + + None +} + /// Try and increment the bytes from right to left. /// /// Returns `None` if all bytes are set to `u8::MAX`. @@ -1441,29 +1521,15 @@ fn increment(mut data: Vec) -> Option> { None } -/// Try and increment the the string's bytes from right to left, returning when the result -/// is a valid UTF8 string. Returns `None` when it can't increment any byte. -fn increment_utf8(mut data: Vec) -> Option> { - for idx in (0..data.len()).rev() { - let original = data[idx]; - let (byte, overflow) = original.overflowing_add(1); - if !overflow { - data[idx] = byte; - if str::from_utf8(&data).is_ok() { - return Some(data); - } - data[idx] = original; - } - } - - None -} - #[cfg(test)] mod tests { - use crate::file::properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + use crate::{ + file::{properties::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, writer::SerializedFileWriter}, + schema::parser::parse_message_type, + }; + use core::str; use rand::distributions::uniform::SampleUniform; - use std::sync::Arc; + use std::{fs::File, sync::Arc}; use crate::column::{ page::PageReader, @@ -3140,39 +3206,69 @@ mod tests { #[test] fn test_increment_utf8() { + let test_inc = |o: &str, expected: &str| { + if let Ok(v) = String::from_utf8(increment_utf8(o).unwrap()) { + // Got the expected result... 
+ assert_eq!(v, expected); + // and it's greater than the original string + assert!(*v > *o); + // Also show that BinaryArray level comparison works here + let mut greater = ByteArray::new(); + greater.set_data(Bytes::from(v)); + let mut original = ByteArray::new(); + original.set_data(Bytes::from(o.as_bytes().to_vec())); + assert!(greater > original); + } else { + panic!("Expected incremented UTF8 string to also be valid."); + } + }; + // Basic ASCII case - let v = increment_utf8("hello".as_bytes().to_vec()).unwrap(); - assert_eq!(&v, "hellp".as_bytes()); + test_inc("hello", "hellp"); + + // 1-byte ending in max 1-byte + test_inc("a\u{7f}", "b"); - // Also show that BinaryArray level comparison works here - let mut greater = ByteArray::new(); - greater.set_data(Bytes::from(v)); - let mut original = ByteArray::new(); - original.set_data(Bytes::from("hello".as_bytes().to_vec())); - assert!(greater > original); + // 1-byte max should not truncate as it would need 2-byte code points + assert!(increment_utf8("\u{7f}\u{7f}").is_none()); // UTF8 string - let s = "❤️🧡💛💚💙💜"; - let v = increment_utf8(s.as_bytes().to_vec()).unwrap(); + test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝"); - if let Ok(new) = String::from_utf8(v) { - assert_ne!(&new, s); - assert_eq!(new, "❤️🧡💛💚💙💝"); - assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap()); - } else { - panic!("Expected incremented UTF8 string to also be valid.") - } + // 2-byte without overflow + test_inc("éééé", "éééê"); - // Max UTF8 character - should be a No-Op - let s = char::MAX.to_string(); - assert_eq!(s.len(), 4); - let v = increment_utf8(s.as_bytes().to_vec()); - assert!(v.is_none()); + // 2-byte that overflows lowest byte + test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}"); + + // 2-byte ending in max 2-byte + test_inc("a\u{7ff}", "b"); + + // Max 2-byte should not truncate as it would need 3-byte code points + assert!(increment_utf8("\u{7ff}\u{7ff}").is_none()); + + // 3-byte without overflow [U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). + test_inc("ࠀࠀ", "ࠀࠁ"); + + // 3-byte ending in max 3-byte + test_inc("a\u{ffff}", "b"); + + // Max 3-byte should not truncate as it would need 4-byte code points + assert!(increment_utf8("\u{ffff}\u{ffff}").is_none()); - // Handle multi-byte UTF8 characters - let s = "a\u{10ffff}"; - let v = increment_utf8(s.as_bytes().to_vec()); - assert_eq!(&v.unwrap(), "b\u{10ffff}".as_bytes()); + // 4-byte without overflow + test_inc("𐀀𐀀", "𐀀𐀁"); + + // 4-byte ending in max unicode + test_inc("a\u{10ffff}", "b"); + + // Max 4-byte should not truncate + assert!(increment_utf8("\u{10ffff}\u{10ffff}").is_none()); + + // Skip over surrogate pair range (0xD800..=0xDFFF) + //test_inc("a\u{D7FF}", "a\u{e000}"); + test_inc("a\u{D7FF}", "b"); } #[test] @@ -3182,7 +3278,6 @@ mod tests { let r = truncate_utf8(data, data.as_bytes().len()).unwrap(); assert_eq!(r.len(), data.as_bytes().len()); assert_eq!(&r, data.as_bytes()); - println!("len is {}", data.len()); // We slice it away from the UTF8 boundary let r = truncate_utf8(data, 13).unwrap(); @@ -3192,6 +3287,90 @@ mod tests { // One multi-byte code point, and a length shorter than it, so we can't slice it let r = truncate_utf8("\u{0836}", 1); assert!(r.is_none()); + + // Test truncate and increment for max bounds on UTF-8 statistics + // 7-bit (i.e. 
ASCII) + let r = truncate_and_increment_utf8("yyyyyyyyy", 8).unwrap(); + assert_eq!(&r, "yyyyyyyz".as_bytes()); + + // 2-byte without overflow + let r = truncate_and_increment_utf8("ééééé", 7).unwrap(); + assert_eq!(&r, "ééê".as_bytes()); + + // 2-byte that overflows lowest byte + let r = truncate_and_increment_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 8).unwrap(); + assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes()); + + // max 2-byte should not truncate as it would need 3-byte code points + let r = truncate_and_increment_utf8("߿߿߿߿߿", 8); + assert!(r.is_none()); + + // 3-byte without overflow [U+800, U+800, U+800] -> [U+800, U+801] (note that these + // characters should render right to left). + let r = truncate_and_increment_utf8("ࠀࠀࠀࠀ", 8).unwrap(); + assert_eq!(&r, "ࠀࠁ".as_bytes()); + + // max 3-byte should not truncate as it would need 4-byte code points + let r = truncate_and_increment_utf8("\u{ffff}\u{ffff}\u{ffff}", 8); + assert!(r.is_none()); + + // 4-byte without overflow + let r = truncate_and_increment_utf8("𐀀𐀀𐀀𐀀", 9).unwrap(); + assert_eq!(&r, "𐀀𐀁".as_bytes()); + + // max 4-byte should not truncate + let r = truncate_and_increment_utf8("\u{10ffff}\u{10ffff}", 8); + assert!(r.is_none()); + } + + #[test] + // Check fallback truncation of statistics that should be UTF-8, but aren't + // (see https://github.com/apache/arrow-rs/pull/6870). + fn test_byte_array_truncate_invalid_utf8_statistics() { + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + + // Create Vec containing non-UTF8 bytes + let data = vec![ByteArray::from(vec![128u8; 32]); 7]; + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) + .set_statistics_truncate_length(Some(8)) + .build(), + ); + + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let stats = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap() + .statistics + .as_ref() + .unwrap(); + assert!(!stats.is_max_value_exact.unwrap()); + // Truncation of invalid UTF-8 should fall back to binary truncation, so last byte should + // be incremented by 1. 
+ assert_eq!( + stats.max_value, + Some([128, 128, 128, 128, 128, 128, 128, 129].to_vec()) + ); } #[test] From 2808625bea15401e89343cd184deefca931243e2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 16 Dec 2024 16:32:08 -0500 Subject: [PATCH 12/29] Add `ArrowToParquetSchemaConverter`, deprecate `arrow_to_parquet_schema` (#6840) * Add ArrowToParquetSchemaConverter, deprecate `arrow_to_parquet_schema` et al * Fmt * update test * Update parquet/src/arrow/schema/mod.rs Co-authored-by: Ed Seidl * Apply suggestions from code review Co-authored-by: Ed Seidl * Improve comments * Add more detail to WriterPropertiesBuilder docs * Update parquet/src/file/properties.rs Co-authored-by: Ed Seidl * Fix some more capitalization and add a link to Parquet date spec * Update parquet/src/arrow/schema/mod.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Revert "Update parquet/src/arrow/schema/mod.rs" This reverts commit bd4e2d5135e3e4fe2d3dc61e043659243f4423bb. * rename to ArrowSchemaConverter * change from build --> convert * update doc * fix fmt --------- Co-authored-by: Ed Seidl Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- parquet/src/arrow/arrow_writer/mod.rs | 26 ++-- parquet/src/arrow/mod.rs | 8 +- parquet/src/arrow/schema/mod.rs | 172 +++++++++++++++++++++----- parquet/src/file/properties.rs | 37 +++--- 4 files changed, 180 insertions(+), 63 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 59cfac04719..871b140768c 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -30,12 +30,10 @@ use arrow_array::types::*; use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter}; use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef}; -use super::schema::{ - add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema, - arrow_to_parquet_schema_with_root, decimal_length_from_precision, -}; +use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision}; use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder; +use crate::arrow::ArrowSchemaConverter; use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; use crate::column::writer::encoder::ColumnValueEncoder; use crate::column::writer::{ @@ -181,10 +179,11 @@ impl ArrowWriter { options: ArrowWriterOptions, ) -> Result { let mut props = options.properties; - let schema = match options.schema_root { - Some(s) => arrow_to_parquet_schema_with_root(&arrow_schema, &s, props.coerce_types())?, - None => arrow_to_parquet_schema(&arrow_schema, props.coerce_types())?, - }; + let mut converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types()); + if let Some(schema_root) = &options.schema_root { + converter = converter.schema_root(schema_root); + } + let schema = converter.convert(&arrow_schema)?; if !options.skip_arrow_metadata { // add serialized arrow schema add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props); @@ -390,9 +389,9 @@ impl ArrowWriterOptions { } /// Set the name of the root parquet schema element (defaults to `"arrow_schema"`) - pub fn with_schema_root(self, name: String) -> Self { + pub fn with_schema_root(self, schema_root: String) -> Self { Self { - schema_root: Some(name), + schema_root: Some(schema_root), ..self } } @@ -538,7 +537,7 @@ impl ArrowColumnChunk { /// # use std::sync::Arc; /// # use arrow_array::*; /// # use arrow_schema::*; -/// # use 
parquet::arrow::arrow_to_parquet_schema; +/// # use parquet::arrow::ArrowSchemaConverter; /// # use parquet::arrow::arrow_writer::{ArrowLeafColumn, compute_leaves, get_column_writers}; /// # use parquet::file::properties::WriterProperties; /// # use parquet::file::writer::SerializedFileWriter; @@ -550,7 +549,10 @@ impl ArrowColumnChunk { /// /// // Compute the parquet schema /// let props = Arc::new(WriterProperties::default()); -/// let parquet_schema = arrow_to_parquet_schema(schema.as_ref(), props.coerce_types()).unwrap(); +/// let parquet_schema = ArrowSchemaConverter::new() +/// .with_coerce_types(props.coerce_types()) +/// .convert(&schema) +/// .unwrap(); /// /// // Create writers for each of the leaf columns /// let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap(); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 2d09cd19203..d77436bc1ff 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -116,9 +116,13 @@ pub use self::async_writer::AsyncArrowWriter; use crate::schema::types::SchemaDescriptor; use arrow_schema::{FieldRef, Schema}; +// continue to export deprecated methods until they are removed +#[allow(deprecated)] +pub use self::schema::arrow_to_parquet_schema; + pub use self::schema::{ - arrow_to_parquet_schema, parquet_to_arrow_field_levels, parquet_to_arrow_schema, - parquet_to_arrow_schema_by_columns, FieldLevels, + parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns, + ArrowSchemaConverter, FieldLevels, }; /// Schema metadata key used to store serialized Arrow IPC schema diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 26b0d8b7b88..4ae3fdb8e5c 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -15,13 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Provides API for converting parquet schema to arrow schema and vice versa. -//! -//! The main interfaces for converting parquet schema to arrow schema are -//! `parquet_to_arrow_schema`, `parquet_to_arrow_schema_by_columns` and -//! `parquet_to_arrow_field`. -//! -//! The interfaces for converting arrow schema to parquet schema is coming. +//! 
Converting Parquet schema <--> Arrow schema: [`ArrowSchemaConverter`] and [parquet_to_arrow_schema] use base64::prelude::BASE64_STANDARD; use base64::Engine; @@ -226,27 +220,134 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut } } -/// Convert arrow schema to parquet schema +/// Converter for Arrow schema to Parquet schema /// -/// The name of the root schema element defaults to `"arrow_schema"`, this can be -/// overridden with [`arrow_to_parquet_schema_with_root`] -pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> Result { - arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types) +/// Example: +/// ``` +/// # use std::sync::Arc; +/// # use arrow_schema::{Field, Schema, DataType}; +/// # use parquet::arrow::ArrowSchemaConverter; +/// use parquet::schema::types::{SchemaDescriptor, Type}; +/// use parquet::basic; // note there are two `Type`s in the following example +/// // create an Arrow Schema +/// let arrow_schema = Schema::new(vec![ +/// Field::new("a", DataType::Int64, true), +/// Field::new("b", DataType::Date32, true), +/// ]); +/// // convert the Arrow schema to a Parquet schema +/// let parquet_schema = ArrowSchemaConverter::new() +/// .convert(&arrow_schema) +/// .unwrap(); +/// +/// let expected_parquet_schema = SchemaDescriptor::new( +/// Arc::new( +/// Type::group_type_builder("arrow_schema") +/// .with_fields(vec![ +/// Arc::new( +/// Type::primitive_type_builder("a", basic::Type::INT64) +/// .build().unwrap() +/// ), +/// Arc::new( +/// Type::primitive_type_builder("b", basic::Type::INT32) +/// .with_converted_type(basic::ConvertedType::DATE) +/// .with_logical_type(Some(basic::LogicalType::Date)) +/// .build().unwrap() +/// ), +/// ]) +/// .build().unwrap() +/// ) +/// ); +/// assert_eq!(parquet_schema, expected_parquet_schema); +/// ``` +#[derive(Debug)] +pub struct ArrowSchemaConverter<'a> { + /// Name of the root schema in Parquet + schema_root: &'a str, + /// Should we coerce Arrow types to compatible Parquet types? + /// + /// See docs on [Self::with_coerce_types]` + coerce_types: bool, } -/// Convert arrow schema to parquet schema specifying the name of the root schema element -pub fn arrow_to_parquet_schema_with_root( - schema: &Schema, - root: &str, - coerce_types: bool, -) -> Result { - let fields = schema - .fields() - .iter() - .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new)) - .collect::>()?; - let group = Type::group_type_builder(root).with_fields(fields).build()?; - Ok(SchemaDescriptor::new(Arc::new(group))) +impl Default for ArrowSchemaConverter<'_> { + fn default() -> Self { + Self::new() + } +} + +impl<'a> ArrowSchemaConverter<'a> { + /// Create a new converter + pub fn new() -> Self { + Self { + schema_root: "arrow_schema", + coerce_types: false, + } + } + + /// Should Arrow types be coerced into Parquet native types (default `false`). + /// + /// Setting this option to `true` will result in Parquet files that can be + /// read by more readers, but may lose precision for Arrow types such as + /// [`DataType::Date64`] which have no direct [corresponding Parquet type]. + /// + /// By default, this converter does not coerce to native Parquet types. Enabling type + /// coercion allows for meaningful representations that do not require + /// downstream readers to consider the embedded Arrow schema, and can allow + /// for greater compatibility with other Parquet implementations. However, + /// type coercion also prevents data from being losslessly round-tripped. 
+ /// + /// # Discussion + /// + /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no + /// corresponding Parquet logical type. Thus, they can not be losslessly + /// round-tripped when stored using the appropriate Parquet logical type. + /// For example, some Date64 values may be truncated when stored with + /// parquet's native 32 bit date type. + /// + /// For [`List`] and [`Map`] types, some Parquet readers expect certain + /// schema elements to have specific names (earlier versions of the spec + /// were somewhat ambiguous on this point). Type coercion will use the names + /// prescribed by the Parquet specification, potentially losing naming + /// metadata from the Arrow schema. + /// + /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + /// [corresponding Parquet type]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date + /// + pub fn with_coerce_types(mut self, coerce_types: bool) -> Self { + self.coerce_types = coerce_types; + self + } + + /// Set the root schema element name (defaults to `"arrow_schema"`). + pub fn schema_root(mut self, schema_root: &'a str) -> Self { + self.schema_root = schema_root; + self + } + + /// Convert the specified Arrow [`Schema`] to the desired Parquet [`SchemaDescriptor`] + /// + /// See example in [`ArrowSchemaConverter`] + pub fn convert(&self, schema: &Schema) -> Result { + let fields = schema + .fields() + .iter() + .map(|field| arrow_to_parquet_type(field, self.coerce_types).map(Arc::new)) + .collect::>()?; + let group = Type::group_type_builder(self.schema_root) + .with_fields(fields) + .build()?; + Ok(SchemaDescriptor::new(Arc::new(group))) + } +} + +/// Convert arrow schema to parquet schema +/// +/// The name of the root schema element defaults to `"arrow_schema"`, this can be +/// overridden with [`ArrowSchemaConverter`] +#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")] +pub fn arrow_to_parquet_schema(schema: &Schema) -> Result { + ArrowSchemaConverter::new().convert(schema) } fn parse_key_value_metadata( @@ -1488,7 +1589,10 @@ mod tests { "; let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema) + .unwrap(); assert_eq!( parquet_schema.columns().len(), converted_arrow_schema.columns().len() @@ -1512,7 +1616,10 @@ mod tests { "; let parquet_group_type = parse_message_type(message_type).unwrap(); let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(false) + .convert(&arrow_schema) + .unwrap(); assert_eq!( parquet_schema.columns().len(), converted_arrow_schema.columns().len() @@ -1668,7 +1775,7 @@ mod tests { Field::new("decimal256", DataType::Decimal256(39, 2), false), ]; let arrow_schema = Schema::new(arrow_fields); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap(); + let converted_arrow_schema = ArrowSchemaConverter::new().convert(&arrow_schema).unwrap(); assert_eq!( parquet_schema.columns().len(), @@ -1705,9 +1812,10 @@ mod tests { 
false, )]; let arrow_schema = Schema::new(arrow_fields); - let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true); + let converted_arrow_schema = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema); - assert!(converted_arrow_schema.is_err()); converted_arrow_schema.unwrap(); } @@ -1978,7 +2086,9 @@ mod tests { // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?; - let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema, true)?; + let parq_schema_descr = ArrowSchemaConverter::new() + .with_coerce_types(true) + .convert(&arrow_schema)?; let parq_fields = parq_schema_descr.root_schema().get_fields(); assert_eq!(parq_fields.len(), 2); assert_eq!(parq_fields[0].get_basic_info().id(), 1); diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index aac450acd82..7b688333e54 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -16,14 +16,13 @@ // under the License. //! Configuration via [`WriterProperties`] and [`ReaderProperties`] -use std::str::FromStr; -use std::{collections::HashMap, sync::Arc}; - use crate::basic::{Compression, Encoding}; use crate::compression::{CodecOptions, CodecOptionsBuilder}; use crate::file::metadata::KeyValue; use crate::format::SortingColumn; use crate::schema::types::ColumnPath; +use std::str::FromStr; +use std::{collections::HashMap, sync::Arc}; /// Default value for [`WriterProperties::data_page_size_limit`] pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; @@ -780,22 +779,24 @@ impl WriterPropertiesBuilder { self } - /// Sets flag to control if type coercion is enabled (defaults to `false`). + /// Should the writer coerce types to parquet native types (defaults to `false`). /// - /// # Notes - /// Some Arrow types do not have a corresponding Parquet logical type. - /// Affected Arrow data types include `Date64`, `Timestamp` and `Interval`. - /// Also, for [`List`] and [`Map`] types, Parquet expects certain schema elements - /// to have specific names to be considered fully compliant. - /// Writers have the option to coerce these types and names to match those required - /// by the Parquet specification. - /// This type coercion allows for meaningful representations that do not require - /// downstream readers to consider the embedded Arrow schema, and can allow for greater - /// compatibility with other Parquet implementations. However, type - /// coercion also prevents the data from being losslessly round-tripped. - /// - /// [`List`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists - /// [`Map`]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps + /// Leaving this option the default `false` will ensure the exact same data + /// written to parquet using this library will be read. + /// + /// Setting this option to `true` will result in parquet files that can be + /// read by more readers, but potentially lose information in the process. + /// + /// * Types such as [`DataType::Date64`], which have no direct corresponding + /// Parquet type, may be stored with lower precision. + /// + /// * The internal field names of `List` and `Map` types will be renamed if + /// necessary to match what is required by the newest Parquet specification. 
+ /// + /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details + /// + /// [`DataType::Date64`]: arrow_schema::DataType::Date64 + /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types pub fn set_coerce_types(mut self, coerce_types: bool) -> Self { self.coerce_types = coerce_types; self From 9a74a2588ec99221fd66565b9099b2c0056492ec Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:52:53 -0500 Subject: [PATCH 13/29] Update sysinfo requirement from 0.32.0 to 0.33.0 (#6835) * Update sysinfo requirement from 0.32.0 to 0.33.0 Updates the requirements on [sysinfo](https://github.com/GuillaumeGomez/sysinfo) to permit the latest version. - [Changelog](https://github.com/GuillaumeGomez/sysinfo/blob/master/CHANGELOG.md) - [Commits](https://github.com/GuillaumeGomez/sysinfo/commits) --- updated-dependencies: - dependency-name: sysinfo dependency-type: direct:production ... Signed-off-by: dependabot[bot] * chore: Update example for API change --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Andrew Lamb --- parquet/Cargo.toml | 2 +- parquet/examples/write_parquet.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 4064baba094..19f89071077 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -67,7 +67,7 @@ hashbrown = { version = "0.15", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } -sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] } +sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } [dev-dependencies] diff --git a/parquet/examples/write_parquet.rs b/parquet/examples/write_parquet.rs index 1b51d40c813..ebdd9527b6f 100644 --- a/parquet/examples/write_parquet.rs +++ b/parquet/examples/write_parquet.rs @@ -28,7 +28,7 @@ use parquet::arrow::ArrowWriter as ParquetWriter; use parquet::basic::Encoding; use parquet::errors::Result; use parquet::file::properties::{BloomFilterPosition, WriterProperties}; -use sysinfo::{MemoryRefreshKind, ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System}; +use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, RefreshKind, System}; #[derive(ValueEnum, Clone)] enum BloomFilterPositionArg { @@ -97,8 +97,7 @@ fn main() -> Result<()> { let file = File::create(args.path).unwrap(); let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?; - let mut system = - System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything())); + let mut system = System::new_with_specifics(RefreshKind::everything()); eprintln!( "{} Writing {} batches of {} rows. 
RSS = {}", now(), From 123045cc766d42d1eb06ee8bb3f09e39ea995ddc Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 16 Dec 2024 16:07:57 -0800 Subject: [PATCH 14/29] deprecate max_statistics_size writer property (#6884) --- parquet/src/bin/parquet-rewrite.rs | 1 + parquet/src/file/properties.rs | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs index eaecda50375..5a1ec94d550 100644 --- a/parquet/src/bin/parquet-rewrite.rs +++ b/parquet/src/bin/parquet-rewrite.rs @@ -242,6 +242,7 @@ fn main() { if let Some(value) = args.dictionary_page_size_limit { writer_properties_builder = writer_properties_builder.set_dictionary_page_size_limit(value); } + #[allow(deprecated)] if let Some(value) = args.max_statistics_size { writer_properties_builder = writer_properties_builder.set_max_statistics_size(value); } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 7b688333e54..dc918f6b563 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -41,6 +41,7 @@ pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000; /// Default value for [`WriterProperties::statistics_enabled`] pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page; /// Default value for [`WriterProperties::max_statistics_size`] +#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; /// Default value for [`WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; @@ -350,7 +351,9 @@ impl WriterProperties { /// Returns max size for statistics. /// Only applicable if statistics are enabled. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn max_statistics_size(&self, col: &ColumnPath) -> usize { + #[allow(deprecated)] self.column_properties .get(col) .and_then(|c| c.max_statistics_size()) @@ -601,7 +604,9 @@ impl WriterPropertiesBuilder { /// Sets default max statistics size for all columns (defaults to `4096`). /// /// Applicable only if statistics are enabled. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn set_max_statistics_size(mut self, value: usize) -> Self { + #[allow(deprecated)] self.default_column_properties .set_max_statistics_size(value); self @@ -706,7 +711,9 @@ impl WriterPropertiesBuilder { /// Sets max size for statistics for a specific column. /// /// Takes precedence over [`Self::set_max_statistics_size`]. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self { + #[allow(deprecated)] self.get_mut_props(col).set_max_statistics_size(value); self } @@ -896,6 +903,7 @@ struct ColumnProperties { codec: Option, dictionary_enabled: Option, statistics_enabled: Option, + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] max_statistics_size: Option, /// bloom filter related properties bloom_filter_properties: Option, @@ -934,6 +942,8 @@ impl ColumnProperties { } /// Sets max size for statistics for this column. + #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] + #[allow(deprecated)] fn set_max_statistics_size(&mut self, value: usize) { self.max_statistics_size = Some(value); } @@ -998,7 +1008,9 @@ impl ColumnProperties { } /// Returns optional max size in bytes for statistics. 
+ #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")] fn max_statistics_size(&self) -> Option { + #[allow(deprecated)] self.max_statistics_size } @@ -1142,10 +1154,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("col")), DEFAULT_STATISTICS_ENABLED ); - assert_eq!( - props.max_statistics_size(&ColumnPath::from("col")), - DEFAULT_MAX_STATISTICS_SIZE - ); assert!(props .bloom_filter_properties(&ColumnPath::from("col")) .is_none()); @@ -1222,13 +1230,11 @@ mod tests { .set_compression(Compression::GZIP(Default::default())) .set_dictionary_enabled(false) .set_statistics_enabled(EnabledStatistics::None) - .set_max_statistics_size(50) // specific column settings .set_column_encoding(ColumnPath::from("col"), Encoding::RLE) .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY) .set_column_dictionary_enabled(ColumnPath::from("col"), true) .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk) - .set_column_max_statistics_size(ColumnPath::from("col"), 123) .set_column_bloom_filter_enabled(ColumnPath::from("col"), true) .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64) .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1) @@ -1260,7 +1266,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("a")), EnabledStatistics::None ); - assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50); assert_eq!( props.encoding(&ColumnPath::from("col")), @@ -1275,7 +1280,6 @@ mod tests { props.statistics_enabled(&ColumnPath::from("col")), EnabledStatistics::Chunk ); - assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123); assert_eq!( props.bloom_filter_properties(&ColumnPath::from("col")), Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 }) From f8f24cf58311fee7bb2843144a615a80fb281fcb Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 17 Dec 2024 20:48:34 +0200 Subject: [PATCH 15/29] docs: fix typo (#6890) --- arrow-array/src/array/dictionary_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/dictionary_array.rs b/arrow-array/src/array/dictionary_array.rs index e69de783dd5..f852b57fb65 100644 --- a/arrow-array/src/array/dictionary_array.rs +++ b/arrow-array/src/array/dictionary_array.rs @@ -249,7 +249,7 @@ pub struct DictionaryArray { /// map to the real values. keys: PrimitiveArray, - /// Array of dictionary values (can by any DataType). + /// Array of dictionary values (can be any DataType). values: ArrayRef, /// Values are ordered. 
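A minimal sketch of how downstream writer configuration reads once the `max_statistics_size` knob from PATCH 14/29 above is deprecated. This is not part of the patch series; it assumes the parquet 54.0.0 `WriterProperties` builder API, and the values shown are illustrative only.

```rust
// Minimal sketch, assuming the parquet 54.0.0 builder API; values are illustrative.
use parquet::file::properties::{EnabledStatistics, WriterProperties};

fn writer_props() -> WriterProperties {
    WriterProperties::builder()
        // Statistics generation is still controlled here.
        .set_statistics_enabled(EnabledStatistics::Page)
        // .set_max_statistics_size(4096) would now emit a deprecation warning
        // and, per the deprecation note, no longer affects the written file.
        .build()
}
```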
From 54dccadccc6b599b93a46aef5f03dfd4d9b7349e Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 17 Dec 2024 10:49:07 -0800 Subject: [PATCH 16/29] fix deprecation notice (#6889) --- parquet/src/arrow/schema/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 4ae3fdb8e5c..5d3d7b2a654 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -345,7 +345,7 @@ impl<'a> ArrowSchemaConverter<'a> { /// /// The name of the root schema element defaults to `"arrow_schema"`, this can be /// overridden with [`ArrowSchemaConverter`] -#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")] +#[deprecated(since = "54.0.0", note = "Use `ArrowSchemaConverter` instead")] pub fn arrow_to_parquet_schema(schema: &Schema) -> Result { ArrowSchemaConverter::new().convert(schema) } From 10793608c84f9d26d89d32e2cf72107e632ee84c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 04:52:44 -0500 Subject: [PATCH 17/29] Add Field::with_dict_is_ordered (#6885) --- arrow-schema/src/field.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 7d47c0ae1de..13bb7abf51b 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -426,6 +426,19 @@ impl Field { } /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type. + /// + /// # Example + /// ``` + /// # use arrow_schema::{DataType, Field}; + /// // non dictionaries do not have a dict is ordered flat + /// let field = Field::new("c1", DataType::Int64, false); + /// assert_eq!(field.dict_is_ordered(), None); + /// // by default dictionary is not ordered + /// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false); + /// assert_eq!(field.dict_is_ordered(), Some(false)); + /// let field = field.with_dict_is_ordered(true); + /// assert_eq!(field.dict_is_ordered(), Some(true)); + /// ``` #[inline] pub const fn dict_is_ordered(&self) -> Option { match self.data_type { @@ -434,6 +447,18 @@ impl Field { } } + /// Set the is ordered field for this `Field`, if it is a dictionary. + /// + /// Does nothing if this is not a dictionary type. + /// + /// See [`Field::dict_is_ordered`] for more information. + pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self { + if matches!(self.data_type, DataType::Dictionary(_, _)) { + self.dict_is_ordered = dict_is_ordered; + }; + self + } + /// Merge this field into self if it is compatible. /// /// Struct fields are merged recursively. From 3317989711a4c081f5118d8fe126280ced7a92fb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 05:52:15 -0500 Subject: [PATCH 18/29] Minor: make it easier to find fix instructions when `cargo fmt` on parquet fails (#6886) * Minor: make it easier to find instructions when fmt fails * purposely introduce a fmt issue * Revert "purposely introduce a fmt issue" This reverts commit 440e52079135df85128b15936425d2b5af488007. 
* Update .github/workflows/rust.yml Co-authored-by: Ed Seidl --------- Co-authored-by: Ed Seidl --- .github/workflows/rust.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ff5040fd294..72a53263d33 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -101,12 +101,13 @@ jobs: - name: Format arrow run: cargo fmt --all -- --check - name: Format parquet - # Many modules in parquet are skipped, so check parquet separately. If this check fails, run: - # cargo fmt -p parquet -- --config skip_children=true `find ./parquet -name "*.rs" \! -name format.rs` - # from the top level arrow-rs directory and check in the result. + # Many modules in parquet are skipped, so check parquet separately # https://github.com/apache/arrow-rs/issues/6179 working-directory: parquet - run: cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs` + run: | + # if this fails, run this from the parquet directory: + # cargo fmt -p parquet -- --config skip_children=true `find . -name "*.rs" \! -name format.rs` + cargo fmt -p parquet -- --check --config skip_children=true `find . -name "*.rs" \! -name format.rs` - name: Format object_store working-directory: object_store run: cargo fmt --all -- --check From 565c24b8071269b02c3937e34c51eacf0f4cbad6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 05:52:40 -0500 Subject: [PATCH 19/29] Minor: add comments explaining bad MSRV, output in json (#6857) * Minor: add comments explaining bad MSRV * purposely introduce msrv brek * output in JSON format * Revert "purposely introduce msrv brek" This reverts commit 61872b69a5a85748031fe852e48b8e3d3381d270. --- .github/workflows/rust.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 72a53263d33..044250b7043 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -113,7 +113,7 @@ jobs: run: cargo fmt --all -- --check msrv: - name: Verify MSRV + name: Verify MSRV (Minimum Supported Rust Version) runs-on: ubuntu-latest container: image: amd64/rust @@ -127,13 +127,19 @@ jobs: run: cargo update -p ahash --precise 0.8.7 - name: Check arrow working-directory: arrow - run: cargo msrv --log-target stdout verify + run: | + # run `cd arrow; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json - name: Check parquet working-directory: parquet - run: cargo msrv --log-target stdout verify + run: | + # run `cd parquet; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json - name: Check arrow-flight working-directory: arrow-flight - run: cargo msrv --log-target stdout verify + run: | + # run `cd arrow-flight; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json - name: Downgrade object_store dependencies working-directory: object_store # Necessary because tokio 1.30.0 updates MSRV to 1.63 @@ -143,4 +149,6 @@ jobs: cargo update -p url --precise 2.5.0 - name: Check object_store working-directory: object_store - run: cargo msrv --log-target stdout verify + run: | + # run `cd object_store; cargo msrv verify` to see problematic dependencies + cargo msrv verify --output-format=json From 61b7876d7a7d9de93e9db37bc96f87b1d646037f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 07:25:59 -0500 Subject: [PATCH 20/29] Add 53.4.0 to release 
schedule (#6896) * Add 54.4.0 to release schedule * prettoer --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 57794b1d6a4..f995ff6ad47 100644 --- a/README.md +++ b/README.md @@ -63,13 +63,14 @@ is described in the [contributing] guide. Planned Release Schedule -| Approximate Date | Version | Notes | -| ---------------- | -------- | --------------------------------------- | -| Nov 2024 | `53.3.0` | Minor, NO breaking API changes | -| Dec 2024 | `54.0.0` | Major, potentially breaking API changes | -| Jan 2025 | `54.1.0` | Minor, NO breaking API changes | -| Feb 2025 | `54.2.0` | Minor, NO breaking API changes | -| Mar 2025 | `55.0.0` | Major, potentially breaking API changes | +| Approximate Date | Version | Notes | +| ---------------- | -------- | ------------------------------------------ | +| Nov 2024 | `53.3.0` | Minor, NO breaking API changes | +| Dec 2024 | `54.0.0` | Major, potentially breaking API changes | +| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) | +| Jan 2025 | `54.1.0` | Minor, NO breaking API changes | +| Feb 2025 | `54.2.0` | Minor, NO breaking API changes | +| Mar 2025 | `55.0.0` | Major, potentially breaking API changes | [this ticket]: https://github.com/apache/arrow-rs/issues/5368 [semantic versioning]: https://semver.org/ From 13f3f213360441e6e7baaa1b98d23a0690d9572e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 18 Dec 2024 07:49:51 -0500 Subject: [PATCH 21/29] Add deprecation / API removal policy (#6852) * Add deprecation / API removal policy * Increase proposal to 2 releases * change from policy to guidelines, add flexibility * prettier * Make instructions more actionable --- README.md | 27 +++++++++++++++++++++++++++ arrow/README.md | 2 +- parquet/README.md | 2 +- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f995ff6ad47..723249ad29e 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,33 @@ versions approximately every 2 months. [`object_store`]: https://crates.io/crates/object_store +### Deprecation Guidelines + +Minor releases may deprecate, but not remove APIs. Deprecating APIs allows +downstream Rust programs to still compile, but generate compiler warnings. This +gives downstream crates time to migrate prior to API removal. + +To deprecate an API: + +- Mark the API as deprecated using `#[deprecated]` and specify the exact arrow-rs version in which it was deprecated +- Concisely describe the preferred API to help the user transition + +The deprecated version is the next version which will be released (please +consult the list above). To mark the API as deprecated, use the +`#[deprecated(since = "...", note = "...")]` attribute. + +Foe example + +```rust +#[deprecated(since = "51.0.0", note = "Use `date_part` instead")] +``` + +In general, deprecated APIs will remain in the codebase for at least two major releases after +they were deprecated (typically between 6 - 9 months later). For example, an API +deprecated in `51.3.0` can be removed in `54.0.0` (or later). Deprecated APIs +may be removed earlier or later than these guidelines at the discretion of the +maintainers. + ## Related Projects There are several related crates in different repositories diff --git a/arrow/README.md b/arrow/README.md index a1444005ec0..79aefaae905 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -37,7 +37,7 @@ This crate is tested with the latest stable version of Rust. 
We do not currently The `arrow` crate follows the [SemVer standard] defined by Cargo and works well within the Rust crate ecosystem. See the [repository README] for more details on -the release schedule and version. +the release schedule, version and deprecation policy. [SemVer standard]: https://doc.rust-lang.org/cargo/reference/semver.html [repository README]: https://github.com/apache/arrow-rs diff --git a/parquet/README.md b/parquet/README.md index e9f52ff279d..9ff1d921d69 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -36,7 +36,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The `parquet` crate follows the [SemVer standard] defined by Cargo and works well within the Rust crate ecosystem. See the [repository README] for more details on -the release schedule and version. +the release schedule, version and deprecation policy. [semver standard]: https://doc.rust-lang.org/cargo/reference/semver.html [repository readme]: https://github.com/apache/arrow-rs From cbe176533b227f31f467b9c9b512d65a8406ce10 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 18 Dec 2024 05:57:53 -0800 Subject: [PATCH 22/29] Enable string-based column projections from Parquet files (#6871) * add function to create ProjectionMask from column names * add some more tests --- parquet/src/arrow/arrow_reader/mod.rs | 68 ++++++++++ parquet/src/arrow/mod.rs | 178 +++++++++++++++++++++++++- parquet/src/arrow/schema/mod.rs | 11 ++ 3 files changed, 256 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 378884a1c43..6eba04c86f9 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -989,6 +989,21 @@ mod tests { assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]); } + #[test] + fn test_arrow_reader_single_column_by_name() { + let file = get_test_file("parquet/generated_simple_numerics/blogs.parquet"); + + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + let original_schema = Arc::clone(builder.schema()); + + let mask = ProjectionMask::columns(builder.parquet_schema(), ["blog_id"]); + let reader = builder.with_projection(mask).build().unwrap(); + + // Verify that the schema was correctly parsed + assert_eq!(1, reader.schema().fields().len()); + assert_eq!(original_schema.fields()[1], reader.schema().fields()[0]); + } + #[test] fn test_null_column_reader_test() { let mut file = tempfile::tempfile().unwrap(); @@ -2563,6 +2578,59 @@ mod tests { } } + #[test] + // same as test_read_structs but constructs projection mask via column names + fn test_read_structs_by_name() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/nested_structs.rust.parquet"); + let file = File::open(&path).unwrap(); + let record_batch_reader = ParquetRecordBatchReader::try_new(file, 60).unwrap(); + + for batch in record_batch_reader { + batch.unwrap(); + } + + let file = File::open(&path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); + + let mask = ProjectionMask::columns( + builder.parquet_schema(), + ["roll_num.count", "PC_CUR.mean", "PC_CUR.sum"], + ); + let projected_reader = builder + .with_projection(mask) + .with_batch_size(60) + .build() + .unwrap(); + + let expected_schema = Schema::new(vec![ + Field::new( + "roll_num", + ArrowDataType::Struct(Fields::from(vec![Field::new( + "count", + ArrowDataType::UInt64, + false, + )])), + false, + ), + Field::new( + "PC_CUR", + 
ArrowDataType::Struct(Fields::from(vec![ + Field::new("mean", ArrowDataType::Int64, false), + Field::new("sum", ArrowDataType::Int64, false), + ])), + false, + ), + ]); + + assert_eq!(&expected_schema, projected_reader.schema().as_ref()); + + for batch in projected_reader { + let batch = batch.unwrap(); + assert_eq!(batch.schema().as_ref(), &expected_schema); + } + } + #[test] fn test_read_maps() { let testdata = arrow::util::test_util::parquet_test_data(); diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index d77436bc1ff..6777e00fb05 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -108,12 +108,14 @@ pub mod async_writer; mod record_reader; experimental!(mod schema); +use std::sync::Arc; + pub use self::arrow_writer::ArrowWriter; #[cfg(feature = "async")] pub use self::async_reader::ParquetRecordBatchStreamBuilder; #[cfg(feature = "async")] pub use self::async_writer::AsyncArrowWriter; -use crate::schema::types::SchemaDescriptor; +use crate::schema::types::{SchemaDescriptor, Type}; use arrow_schema::{FieldRef, Schema}; // continue to export deprecated methods until they are removed @@ -210,6 +212,71 @@ impl ProjectionMask { Self { mask: Some(mask) } } + // Given a starting point in the schema, do a DFS for that node adding leaf paths to `paths`. + fn find_leaves(root: &Arc, parent: Option<&String>, paths: &mut Vec) { + let path = parent + .map(|p| [p, root.name()].join(".")) + .unwrap_or(root.name().to_string()); + if root.is_group() { + for child in root.get_fields() { + Self::find_leaves(child, Some(&path), paths); + } + } else { + // Reached a leaf, add to paths + paths.push(path); + } + } + + /// Create a [`ProjectionMask`] which selects only the named columns + /// + /// All leaf columns that fall below a given name will be selected. For example, given + /// the schema + /// ```ignore + /// message schema { + /// OPTIONAL group a (MAP) { + /// REPEATED group key_value { + /// REQUIRED BYTE_ARRAY key (UTF8); // leaf index 0 + /// OPTIONAL group value (MAP) { + /// REPEATED group key_value { + /// REQUIRED INT32 key; // leaf index 1 + /// REQUIRED BOOLEAN value; // leaf index 2 + /// } + /// } + /// } + /// } + /// REQUIRED INT32 b; // leaf index 3 + /// REQUIRED DOUBLE c; // leaf index 4 + /// } + /// ``` + /// `["a.key_value.value", "c"]` would return leaf columns 1, 2, and 4. `["a"]` would return + /// columns 0, 1, and 2. + /// + /// Note: repeated or out of order indices will not impact the final mask. + /// + /// i.e. `["b", "c"]` will construct the same mask as `["c", "b", "c"]`. 
+ pub fn columns<'a>( + schema: &SchemaDescriptor, + names: impl IntoIterator, + ) -> Self { + // first make vector of paths for leaf columns + let mut paths: Vec = vec![]; + for root in schema.root_schema().get_fields() { + Self::find_leaves(root, None, &mut paths); + } + assert_eq!(paths.len(), schema.num_columns()); + + let mut mask = vec![false; schema.num_columns()]; + for name in names { + for idx in 0..schema.num_columns() { + if paths[idx].starts_with(name) { + mask[idx] = true; + } + } + } + + Self { mask: Some(mask) } + } + /// Returns true if the leaf column `leaf_idx` is included by the mask pub fn leaf_included(&self, leaf_idx: usize) -> bool { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) @@ -246,10 +313,14 @@ mod test { use crate::arrow::ArrowWriter; use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter}; use crate::file::properties::{EnabledStatistics, WriterProperties}; + use crate::schema::parser::parse_message_type; + use crate::schema::types::SchemaDescriptor; use arrow_array::{ArrayRef, Int32Array, RecordBatch}; use bytes::Bytes; use std::sync::Arc; + use super::ProjectionMask; + #[test] // Reproducer for https://github.com/apache/arrow-rs/issues/6464 fn test_metadata_read_write_partial_offset() { @@ -375,4 +446,109 @@ mod test { .unwrap(); Bytes::from(buf) } + + #[test] + fn test_mask_from_column_names() { + let message_type = " + message test_schema { + OPTIONAL group a (MAP) { + REPEATED group key_value { + REQUIRED BYTE_ARRAY key (UTF8); + OPTIONAL group value (MAP) { + REPEATED group key_value { + REQUIRED INT32 key; + REQUIRED BOOLEAN value; + } + } + } + } + REQUIRED INT32 b; + REQUIRED DOUBLE c; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["foo", "bar"]); + assert_eq!(mask.mask.unwrap(), vec![false; 5]); + + let mask = ProjectionMask::columns(&schema, []); + assert_eq!(mask.mask.unwrap(), vec![false; 5]); + + let mask = ProjectionMask::columns(&schema, ["a", "c"]); + assert_eq!(mask.mask.unwrap(), [true, true, true, false, true]); + + let mask = ProjectionMask::columns(&schema, ["a.key_value.key", "c"]); + assert_eq!(mask.mask.unwrap(), [true, false, false, false, true]); + + let mask = ProjectionMask::columns(&schema, ["a.key_value.value", "b"]); + assert_eq!(mask.mask.unwrap(), [false, true, true, true, false]); + + let message_type = " + message test_schema { + OPTIONAL group a (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL group element (LIST) { + REPEATED group list { + OPTIONAL BYTE_ARRAY element (UTF8); + } + } + } + } + } + } + REQUIRED INT32 b; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = ProjectionMask::columns(&schema, ["a.list.element", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = + ProjectionMask::columns(&schema, ["a.list.element.list.element.list.element", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true]); + + let mask = ProjectionMask::columns(&schema, ["b"]); + assert_eq!(mask.mask.unwrap(), [false, true]); + + let message_type = " + message test_schema { + OPTIONAL INT32 a; + OPTIONAL INT32 b; + OPTIONAL INT32 c; + OPTIONAL INT32 d; + OPTIONAL 
INT32 e; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "b"]); + assert_eq!(mask.mask.unwrap(), [true, true, false, false, false]); + + let mask = ProjectionMask::columns(&schema, ["d", "b", "d"]); + assert_eq!(mask.mask.unwrap(), [false, true, false, true, false]); + + let message_type = " + message test_schema { + OPTIONAL INT32 a; + OPTIONAL INT32 b; + OPTIONAL INT32 a; + OPTIONAL INT32 d; + OPTIONAL INT32 e; + } + "; + let parquet_group_type = parse_message_type(message_type).unwrap(); + let schema = SchemaDescriptor::new(Arc::new(parquet_group_type)); + + let mask = ProjectionMask::columns(&schema, ["a", "e"]); + assert_eq!(mask.mask.unwrap(), [true, false, true, false, true]); + } } diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 5d3d7b2a654..212dec52583 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -1399,6 +1399,17 @@ mod tests { for i in 0..arrow_fields.len() { assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); } + + let mask = + ProjectionMask::columns(&parquet_schema, ["group2.leaf4", "group1.leaf1", "leaf5"]); + let converted_arrow_schema = + parquet_to_arrow_schema_by_columns(&parquet_schema, mask, None).unwrap(); + let converted_fields = converted_arrow_schema.fields(); + + assert_eq!(arrow_fields.len(), converted_fields.len()); + for i in 0..arrow_fields.len() { + assert_eq!(&arrow_fields[i], converted_fields[i].as_ref()); + } } #[test] From fc814bca6b743010bcde972e611c8ff8ea68c9f5 Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 18 Dec 2024 21:58:19 +0800 Subject: [PATCH 23/29] doc: add comment for timezone string (#6899) * doc: add comment for timezone string Signed-off-by: xxchan * Update arrow-schema/src/datatype.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Signed-off-by: xxchan Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow-schema/src/datatype.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index a6333c80480..7cd53b13c73 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -196,6 +196,14 @@ pub enum DataType { /// DataType::Timestamp(TimeUnit::Second, Some("literal".into())); /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into())); /// ``` + /// + /// Timezone string parsing + /// ----------------------- + /// When feature `chrono-tz` is not enabled, allowed timezone strings are fixed offsets of the form "+09:00", "-09" or "+0930". + /// + /// When feature `chrono-tz` is enabled, additional strings supported by [chrono_tz](https://docs.rs/chrono-tz/latest/chrono_tz/) + /// are also allowed, which include [IANA database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones) + /// timezones. Timestamp(TimeUnit, Option>), /// A signed 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) /// in days. 
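The `ProjectionMask::columns` API introduced in PATCH 22/29 above selects leaf columns by dotted path. Below is a minimal usage sketch, not part of the patch series; the file path is a placeholder and the column names assume the example map schema from that patch's doc comment.

```rust
use std::fs::File;

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;

fn main() {
    // Placeholder path: any Parquet file whose schema contains the named columns.
    let file = File::open("data.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();

    // Naming a group (e.g. "a") selects all of its leaves; dotted paths select
    // individual leaves. Repeated or out-of-order names do not change the mask.
    let mask = ProjectionMask::columns(builder.parquet_schema(), ["b", "a.key_value.key"]);

    let reader = builder.with_projection(mask).build().unwrap();
    for batch in reader {
        println!("read {} rows", batch.unwrap().num_rows());
    }
}
```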
From 2887cc1030b2954ffcaba30f6a6d566b7017dc25 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 19 Dec 2024 08:56:05 -0500 Subject: [PATCH 24/29] Update version to 54.0.0, add CHANGELOG (#6894) * Update version to 54.0.0 * Update changelog * update notes * updtes * update --- CHANGELOG-old.md | 170 ++++++++++++++++++++++++++++++ CHANGELOG.md | 173 +++++++++++++++++-------------- Cargo.toml | 32 +++--- arrow-flight/README.md | 2 +- dev/release/update_change_log.sh | 4 +- 5 files changed, 283 insertions(+), 98 deletions(-) diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 376da627711..3fb17b390ac 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -19,6 +19,176 @@ # Historical Changelog +## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) + +- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) +- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- DictionaryHandling does not recurse into Map fields [\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Array writer output empty when no record is written [\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. 
[\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) + +**Closed issues:** + +- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) +- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) +- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix LIKE with escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- minor fix on checking index 
[\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) +- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) +- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) +- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve test\_auth error message when contains\(\) fails [\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Include offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) +- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([jonmmease](https://github.com/jonmmease)) +- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) +- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make downcast macros hygenic \(\#6400\) [\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix arrow-json writer empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) +- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) +- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) + +**Implemented enhancements:** + +- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555) +- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([waynexia](https://github.com/waynexia)) +- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker)) + +**Fixed bugs:** + +- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) +- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- DictionaryHandling does not recurse into Map fields 
[\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Array writer output empty when no record is written [\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Documentation updates:** + +- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) + +**Performance improvements:** + +- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) + +**Closed issues:** + +- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) +- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) +- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) +- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- 
Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix LIKE with escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) +- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) +- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) +- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Improve test\_auth error message when contains\(\) fails 
[\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) +- Include offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) +- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) +- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) +- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) +- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) +- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) +- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) +- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) +- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make downcast macros hygenic \(\#6400\) 
[\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix arrow-json writer empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) +- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) +- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me)) +- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev)) +- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi)) ## [53.2.0](https://github.com/apache/arrow-rs/tree/53.2.0) (2024-10-21) [Full Changelog](https://github.com/apache/arrow-rs/compare/53.1.0...53.2.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b729360608..a7f2a4ff34d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,101 +19,116 @@ # Changelog -## [53.3.0](https://github.com/apache/arrow-rs/tree/53.3.0) (2024-11-17) +## [54.0.0](https://github.com/apache/arrow-rs/tree/54.0.0) (2024-12-18) -[Full Changelog](https://github.com/apache/arrow-rs/compare/53.2.0...53.3.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/53.3.0...54.0.0) + +**Breaking changes:** + +- avoid redundant parsing of repeated value in RleDecoder [\#6834](https://github.com/apache/arrow-rs/pull/6834) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) +- Handling nullable DictionaryArray in CSV parser [\#6830](https://github.com/apache/arrow-rs/pull/6830) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([edmondop](https://github.com/edmondop)) +- fix\(flightsql\): remove Any encoding of DoPutUpdateResult [\#6825](https://github.com/apache/arrow-rs/pull/6825) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([davisp](https://github.com/davisp)) +- arrow-ipc: Default to not preserving dict IDs [\#6788](https://github.com/apache/arrow-rs/pull/6788) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) +- Remove some very old deprecated functions [\#6774](https://github.com/apache/arrow-rs/pull/6774) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- update to pyo3 0.23.0 [\#6745](https://github.com/apache/arrow-rs/pull/6745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Remove APIs deprecated since v 4.4.0 [\#6722](https://github.com/apache/arrow-rs/pull/6722) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Return `None` when Parquet page indexes are not present in file [\#6639](https://github.com/apache/arrow-rs/pull/6639) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add `ParquetError::NeedMoreData` mark `ParquetError` as `non_exhaustive` [\#6630](https://github.com/apache/arrow-rs/pull/6630) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove APIs deprecated since v 2.0.0 [\#6609](https://github.com/apache/arrow-rs/pull/6609) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) **Implemented enhancements:** -- `PartialEq` of GenericByteViewArray \(StringViewArray / ByteViewArray\) that compares on equality rather than logical value [\#6679](https://github.com/apache/arrow-rs/issues/6679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Need a mechanism to handle schema changes due to dictionary hydration in FlightSQL server implementations [\#6672](https://github.com/apache/arrow-rs/issues/6672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support encoding Utf8View columns to JSON [\#6642](https://github.com/apache/arrow-rs/issues/6642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement `append_n` for `BooleanBuilder` [\#6634](https://github.com/apache/arrow-rs/issues/6634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Some take optimizations [\#6621](https://github.com/apache/arrow-rs/issues/6621) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Error Instead of Panic On Attempting to Write More Than 32769 Row Groups [\#6591](https://github.com/apache/arrow-rs/issues/6591) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Make casting from a timestamp without timezone to a timestamp with timezone configurable [\#6555](https://github.com/apache/arrow-rs/issues/6555) -- Add `record_batch!` macro for easy record batch creation [\#6553](https://github.com/apache/arrow-rs/issues/6553) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support `Binary` --\> `Utf8View` casting [\#6531](https://github.com/apache/arrow-rs/issues/6531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- `downcast_primitive_array` and `downcast_dictionary_array` are not hygienic wrt imports [\#6400](https://github.com/apache/arrow-rs/issues/6400) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Implement interleave\_record\_batch [\#6731](https://github.com/apache/arrow-rs/pull/6731) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] 
([waynexia](https://github.com/waynexia)) -- feat: `record_batch!` macro [\#6588](https://github.com/apache/arrow-rs/pull/6588) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ByteBaker](https://github.com/ByteBaker)) +- Parquet schema hint doesn't support integer types upcasting [\#6891](https://github.com/apache/arrow-rs/issues/6891) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Parquet UTF-8 max statistics are overly pessimistic [\#6867](https://github.com/apache/arrow-rs/issues/6867) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Add builder support for Int8 keys [\#6844](https://github.com/apache/arrow-rs/issues/6844) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Formalize the name of the nested `Field` in a list [\#6784](https://github.com/apache/arrow-rs/issues/6784) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Allow disabling the writing of Parquet Offset Index [\#6778](https://github.com/apache/arrow-rs/issues/6778) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `parquet::record::make_row` is not exposed to users, leaving no option to users to manually create `Row` objects [\#6761](https://github.com/apache/arrow-rs/issues/6761) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Avoid `from_num_days_from_ce_opt` calls in `timestamp_s_to_datetime` if we don't need [\#6746](https://github.com/apache/arrow-rs/issues/6746) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Temporal -\> Utf8View casting [\#6734](https://github.com/apache/arrow-rs/issues/6734) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add Option To Coerce List Type on Parquet Write [\#6733](https://github.com/apache/arrow-rs/issues/6733) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Numeric -\> Utf8View casting [\#6714](https://github.com/apache/arrow-rs/issues/6714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support Utf8View \<=\> boolean casting [\#6713](https://github.com/apache/arrow-rs/issues/6713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Fixed bugs:** -- Signed decimal e-notation parsing bug [\#6728](https://github.com/apache/arrow-rs/issues/6728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add support for Utf8View -\> numeric in can\_cast\_types [\#6715](https://github.com/apache/arrow-rs/issues/6715) -- IPC file writer produces incorrect footer when not preserving dict ID [\#6710](https://github.com/apache/arrow-rs/issues/6710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- parquet from\_thrift\_helper incorrectly checks index [\#6693](https://github.com/apache/arrow-rs/issues/6693) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Primitive REPEATED fields not contained in LIST annotated groups aren't read as lists by record reader [\#6648](https://github.com/apache/arrow-rs/issues/6648) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- DictionaryHandling does not recurse into Map fields [\#6644](https://github.com/apache/arrow-rs/issues/6644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Array writer output empty when no record is written 
[\#6613](https://github.com/apache/arrow-rs/issues/6613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Archery Integration Test with c\# failing on main [\#6577](https://github.com/apache/arrow-rs/issues/6577) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Potential unsoundness in `filter_run_end_array` [\#6569](https://github.com/apache/arrow-rs/issues/6569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Parquet reader can generate incorrect validity buffer information for nested structures [\#6510](https://github.com/apache/arrow-rs/issues/6510) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- arrow-array ffi: FFI\_ArrowArray.null\_count is always interpreted as unsigned and initialized during conversion from C to Rust. [\#6497](https://github.com/apache/arrow-rs/issues/6497) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `Buffer::bit_slice` loses length with byte-aligned offsets [\#6895](https://github.com/apache/arrow-rs/issues/6895) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- parquet arrow writer doesn't track memory size correctly for fixed sized lists [\#6839](https://github.com/apache/arrow-rs/issues/6839) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Casting Decimal128 to Decimal128 with smaller precision produces incorrect results in some cases [\#6833](https://github.com/apache/arrow-rs/issues/6833) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Should empty nullable dictionary be parsed as null from arrow-csv? [\#6821](https://github.com/apache/arrow-rs/issues/6821) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Array take doesn't make fields nullable [\#6809](https://github.com/apache/arrow-rs/issues/6809) +- Arrow Flight Encodes a Slice's List Offsets If the slice offset is starts with zero [\#6803](https://github.com/apache/arrow-rs/issues/6803) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Parquet readers incorrectly interpret legacy nested lists [\#6756](https://github.com/apache/arrow-rs/issues/6756) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- filter\_bits under-allocates resulting boolean buffer [\#6750](https://github.com/apache/arrow-rs/issues/6750) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Multi-language support issues with Arrow FlightSQL client's execute\_update and execute\_ingest methods [\#6545](https://github.com/apache/arrow-rs/issues/6545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Documentation updates:** -- Minor: Document pattern for accessing views in StringView [\#6673](https://github.com/apache/arrow-rs/pull/6673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve Array::is\_nullable documentation [\#6615](https://github.com/apache/arrow-rs/pull/6615) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Minor: improve docs for ByteViewArray-\>ByteArray From impl [\#6610](https://github.com/apache/arrow-rs/pull/6610) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) - -**Performance improvements:** - -- Speed up `filter_run_end_array` [\#6712](https://github.com/apache/arrow-rs/pull/6712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Should we document at what 
rate deprecated APIs are removed? [\#6851](https://github.com/apache/arrow-rs/issues/6851) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Fix docstring for `Format::with_header` in `arrow-csv` [\#6856](https://github.com/apache/arrow-rs/pull/6856) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kylebarron](https://github.com/kylebarron)) +- Add deprecation / API removal policy [\#6852](https://github.com/apache/arrow-rs/pull/6852) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Minor: add example for creating `SchemaDescriptor` [\#6841](https://github.com/apache/arrow-rs/pull/6841) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- chore: enrich panic context when BooleanBuffer fails to create [\#6810](https://github.com/apache/arrow-rs/pull/6810) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) **Closed issues:** -- Incorrect like results for pattern starting/ending with `%` percent and containing escape characters [\#6702](https://github.com/apache/arrow-rs/issues/6702) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- \[FlightSQL\] GetCatalogsBuilder does not sort the catalog names [\#6807](https://github.com/apache/arrow-rs/issues/6807) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Add a lint to automatically check for unused dependencies [\#6796](https://github.com/apache/arrow-rs/issues/6796) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Merged pull requests:** -- Fix signed decimal e-notation parsing [\#6729](https://github.com/apache/arrow-rs/pull/6729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gruuya](https://github.com/gruuya)) -- Clean up some arrow-flight tests and duplicated code [\#6725](https://github.com/apache/arrow-rs/pull/6725) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) -- Update PR template section about API breaking changes [\#6723](https://github.com/apache/arrow-rs/pull/6723) ([findepi](https://github.com/findepi)) -- Support for casting `StringViewArray` to `DecimalArray` [\#6720](https://github.com/apache/arrow-rs/pull/6720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) -- File writer preserve dict bug [\#6711](https://github.com/apache/arrow-rs/pull/6711) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([brancz](https://github.com/brancz)) -- Add filter\_kernel benchmark for run array [\#6706](https://github.com/apache/arrow-rs/pull/6706) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- Fix string view ILIKE checks with NULL values [\#6705](https://github.com/apache/arrow-rs/pull/6705) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Implement logical\_null\_count for more array types [\#6704](https://github.com/apache/arrow-rs/pull/6704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Fix LIKE with 
escapes [\#6703](https://github.com/apache/arrow-rs/pull/6703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Speed up `filter_bytes` [\#6699](https://github.com/apache/arrow-rs/pull/6699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Minor: fix misleading comment in byte view [\#6695](https://github.com/apache/arrow-rs/pull/6695) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) -- minor fix on checking index [\#6694](https://github.com/apache/arrow-rs/pull/6694) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jp0317](https://github.com/jp0317)) -- Undo run end filter performance regression [\#6691](https://github.com/apache/arrow-rs/pull/6691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- Reimplement `PartialEq` of `GenericByteViewArray` compares by logical value [\#6689](https://github.com/apache/arrow-rs/pull/6689) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) -- feat: expose known\_schema from FlightDataEncoder [\#6688](https://github.com/apache/arrow-rs/pull/6688) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) -- Update hashbrown requirement from 0.14.2 to 0.15.1 [\#6684](https://github.com/apache/arrow-rs/pull/6684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Support Duration in JSON Reader [\#6683](https://github.com/apache/arrow-rs/pull/6683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([simonvandel](https://github.com/simonvandel)) -- Check predicate and values are the same length for run end array filter safety [\#6675](https://github.com/apache/arrow-rs/pull/6675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- \[ffi\] Fix arrow-array null\_count error during conversion from C to Rust [\#6674](https://github.com/apache/arrow-rs/pull/6674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([adbmal](https://github.com/adbmal)) -- Support `Utf8View` for `bit_length` kernel [\#6671](https://github.com/apache/arrow-rs/pull/6671) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([austin362667](https://github.com/austin362667)) -- Fix string view LIKE checks with NULL values [\#6662](https://github.com/apache/arrow-rs/pull/6662) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Improve documentation for `nullif` kernel [\#6658](https://github.com/apache/arrow-rs/pull/6658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Improve test\_auth error message when contains\(\) fails [\#6657](https://github.com/apache/arrow-rs/pull/6657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) -- Let std::fmt::Debug for StructArray output Null/Validity info [\#6655](https://github.com/apache/arrow-rs/pull/6655) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([XinyuZeng](https://github.com/XinyuZeng)) -- Include 
offending line number when processing CSV file fails [\#6653](https://github.com/apache/arrow-rs/pull/6653) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- feat: add write\_bytes for GenericBinaryBuilder [\#6652](https://github.com/apache/arrow-rs/pull/6652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tisonkun](https://github.com/tisonkun)) -- feat: Support Utf8View in JSON serialization [\#6651](https://github.com/apache/arrow-rs/pull/6651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jonmmease](https://github.com/jonmmease)) -- fix: include chrono-tz in flight sql cli [\#6650](https://github.com/apache/arrow-rs/pull/6650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- Handle primitive REPEATED field not contained in LIST annotated group [\#6649](https://github.com/apache/arrow-rs/pull/6649) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([zeevm](https://github.com/zeevm)) -- Implement `append_n` for `BooleanBuilder` [\#6646](https://github.com/apache/arrow-rs/pull/6646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([delamarch3](https://github.com/delamarch3)) -- fix: recurse into Map datatype when hydrating dictionaries [\#6645](https://github.com/apache/arrow-rs/pull/6645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([nathanielc](https://github.com/nathanielc)) -- fix: enable TLS roots for flight CLI client [\#6640](https://github.com/apache/arrow-rs/pull/6640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([crepererum](https://github.com/crepererum)) -- doc: Clarify take kernel semantics [\#6632](https://github.com/apache/arrow-rs/pull/6632) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Return error rather than panic when too many row groups are written [\#6629](https://github.com/apache/arrow-rs/pull/6629) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) -- Fix test feature selection so all feature combinations work as expected [\#6626](https://github.com/apache/arrow-rs/pull/6626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([itsjunetime](https://github.com/itsjunetime)) -- Add Parquet RowSelection benchmark [\#6623](https://github.com/apache/arrow-rs/pull/6623) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([XiangpengHao](https://github.com/XiangpengHao)) -- Optimize `take_bits` to optimize `take_boolean` / `take_primitive` / `take_byte_view`: up to -25% [\#6622](https://github.com/apache/arrow-rs/pull/6622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Make downcast macros hygenic \(\#6400\) [\#6620](https://github.com/apache/arrow-rs/pull/6620) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Update proc-macro2 requirement from =1.0.88 to =1.0.89 [\#6618](https://github.com/apache/arrow-rs/pull/6618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix arrow-json writer 
empty [\#6614](https://github.com/apache/arrow-rs/pull/6614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gwik](https://github.com/gwik)) -- Add `ParquetObjectReader::with_runtime` [\#6612](https://github.com/apache/arrow-rs/pull/6612) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([itsjunetime](https://github.com/itsjunetime)) -- Re-enable `C#` arrow flight integration test [\#6611](https://github.com/apache/arrow-rs/pull/6611) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) -- Add Array::logical\_null\_count for inspecting number of null values [\#6608](https://github.com/apache/arrow-rs/pull/6608) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) -- Added casting from Binary/LargeBinary to Utf8View [\#6592](https://github.com/apache/arrow-rs/pull/6592) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ngli-me](https://github.com/ngli-me)) -- Parquet AsyncReader: Don't panic when empty offset\_index is Some\(\[\]\) [\#6582](https://github.com/apache/arrow-rs/pull/6582) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jroddev](https://github.com/jroddev)) -- Skip writing down null buffers for non-nullable primitive arrays [\#6524](https://github.com/apache/arrow-rs/pull/6524) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([bkirwi](https://github.com/bkirwi)) +- doc: add comment for timezone string [\#6899](https://github.com/apache/arrow-rs/pull/6899) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([xxchan](https://github.com/xxchan)) +- docs: fix typo [\#6890](https://github.com/apache/arrow-rs/pull/6890) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([rluvaton](https://github.com/rluvaton)) +- Minor: Fix deprecation notice for `arrow_to_parquet_schema` [\#6889](https://github.com/apache/arrow-rs/pull/6889) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add Field::with\_dict\_is\_ordered [\#6885](https://github.com/apache/arrow-rs/pull/6885) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Deprecate "max statistics size" property in `WriterProperties` [\#6884](https://github.com/apache/arrow-rs/pull/6884) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add deprecation warnings for everything related to `dict_id` [\#6873](https://github.com/apache/arrow-rs/pull/6873) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([brancz](https://github.com/brancz)) +- Enable matching temporal as from\_type to Utf8View [\#6872](https://github.com/apache/arrow-rs/pull/6872) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Kev1n8](https://github.com/Kev1n8)) +- Enable string-based column projections from Parquet files [\#6871](https://github.com/apache/arrow-rs/pull/6871) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Improvements to UTF-8 statistics truncation [\#6870](https://github.com/apache/arrow-rs/pull/6870) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] 
([etseidl](https://github.com/etseidl)) +- fix: make GetCatalogsBuilder sort catalog names [\#6864](https://github.com/apache/arrow-rs/pull/6864) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([niebayes](https://github.com/niebayes)) +- add buffered data\_pages to parquet column writer total bytes estimation [\#6862](https://github.com/apache/arrow-rs/pull/6862) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([onursatici](https://github.com/onursatici)) +- Update prost-build requirement from =0.13.3 to =0.13.4 [\#6860](https://github.com/apache/arrow-rs/pull/6860) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Minor: add comments explaining bad MSRV, output in json [\#6857](https://github.com/apache/arrow-rs/pull/6857) ([alamb](https://github.com/alamb)) +- perf: Use Cow in get\_format\_string in FFI\_ArrowSchema [\#6853](https://github.com/apache/arrow-rs/pull/6853) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- chore: add cast\_decimal benchmark [\#6850](https://github.com/apache/arrow-rs/pull/6850) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- arrow-array::builder: support Int8, Int16 and Int64 keys [\#6845](https://github.com/apache/arrow-rs/pull/6845) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ajwerner](https://github.com/ajwerner)) +- Add `ArrowToParquetSchemaConverter`, deprecate `arrow_to_parquet_schema` [\#6840](https://github.com/apache/arrow-rs/pull/6840) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Remove APIs deprecated in 50.0.0 [\#6838](https://github.com/apache/arrow-rs/pull/6838) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- fix: decimal conversion looses value on lower precision [\#6836](https://github.com/apache/arrow-rs/pull/6836) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([himadripal](https://github.com/himadripal)) +- Update sysinfo requirement from 0.32.0 to 0.33.0 [\#6835](https://github.com/apache/arrow-rs/pull/6835) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Optionally coerce names of maps and lists to match Parquet specification [\#6828](https://github.com/apache/arrow-rs/pull/6828) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove deprecated unary\_dyn and try\_unary\_dyn [\#6824](https://github.com/apache/arrow-rs/pull/6824) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Remove deprecated flight\_data\_from\_arrow\_batch [\#6823](https://github.com/apache/arrow-rs/pull/6823) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- \[arrow-cast\] Support cast boolean from/to string view [\#6822](https://github.com/apache/arrow-rs/pull/6822) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- Hook up Avro Decoder [\#6820](https://github.com/apache/arrow-rs/pull/6820) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix arrow-avro compilation without default features [\#6819](https://github.com/apache/arrow-rs/pull/6819) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Support shrink to empty [\#6817](https://github.com/apache/arrow-rs/pull/6817) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- \[arrow-cast\] Support cast numeric to string view \(alternate\) [\#6816](https://github.com/apache/arrow-rs/pull/6816) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Hide implicit optional dependency features in arrow-flight [\#6806](https://github.com/apache/arrow-rs/pull/6806) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- fix: Encoding of List offsets was incorrect when slice offsets begin with zero [\#6805](https://github.com/apache/arrow-rs/pull/6805) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HawaiianSpork](https://github.com/HawaiianSpork)) +- Enable unused\_crate\_dependencies Rust lint, remove unused dependencies [\#6804](https://github.com/apache/arrow-rs/pull/6804) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Minor: Fix docstrings for `ColumnProperties::statistics_enabled` property [\#6798](https://github.com/apache/arrow-rs/pull/6798) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Add option to disable writing of Parquet offset index [\#6797](https://github.com/apache/arrow-rs/pull/6797) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Remove unused dependencies [\#6792](https://github.com/apache/arrow-rs/pull/6792) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([findepi](https://github.com/findepi)) +- Add `Array::shrink_to_fit(&mut self)` [\#6790](https://github.com/apache/arrow-rs/pull/6790) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([emilk](https://github.com/emilk)) +- Formalize the default nested list field name to `item` [\#6785](https://github.com/apache/arrow-rs/pull/6785) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([gruuya](https://github.com/gruuya)) +- Improve UnionArray logical\_nulls tests [\#6781](https://github.com/apache/arrow-rs/pull/6781) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([gstvg](https://github.com/gstvg)) +- Improve list builder usage example in docs [\#6775](https://github.com/apache/arrow-rs/pull/6775) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Update proc-macro2 requirement from =1.0.89 to =1.0.92 [\#6772](https://github.com/apache/arrow-rs/pull/6772) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Allow NullBuffer construction directly from array 
[\#6769](https://github.com/apache/arrow-rs/pull/6769) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Include license and notice files in published crates [\#6767](https://github.com/apache/arrow-rs/pull/6767) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([ankane](https://github.com/ankane)) +- fix: remove redundant `bit_util::ceil` [\#6766](https://github.com/apache/arrow-rs/pull/6766) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([miroim](https://github.com/miroim)) +- Remove 'make\_row', expose a 'Row::new' method instead. [\#6763](https://github.com/apache/arrow-rs/pull/6763) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([jonded94](https://github.com/jonded94)) +- Read nested Parquet 2-level lists correctly [\#6757](https://github.com/apache/arrow-rs/pull/6757) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Split `timestamp_s_to_datetime` to `date` and `time` to avoid unnecessary computation [\#6755](https://github.com/apache/arrow-rs/pull/6755) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jayzhan211](https://github.com/jayzhan211)) +- More trivial implementation of `Box` and `Box` [\#6748](https://github.com/apache/arrow-rs/pull/6748) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([ethe](https://github.com/ethe)) +- Update cache action to v4 [\#6744](https://github.com/apache/arrow-rs/pull/6744) ([findepi](https://github.com/findepi)) +- Remove redundant implementation of `StringArrayType` [\#6743](https://github.com/apache/arrow-rs/pull/6743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tlm365](https://github.com/tlm365)) +- Fix Dictionary logical nulls for RunArray/UnionArray Values [\#6740](https://github.com/apache/arrow-rs/pull/6740) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Allow reading Parquet maps that lack a `values` field [\#6730](https://github.com/apache/arrow-rs/pull/6730) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([etseidl](https://github.com/etseidl)) +- Improve default implementation of Array::is\_nullable [\#6721](https://github.com/apache/arrow-rs/pull/6721) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([findepi](https://github.com/findepi)) +- Fix Buffer::bit\_slice losing length with byte-aligned offsets [\#6707](https://github.com/apache/arrow-rs/pull/6707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([itsjunetime](https://github.com/itsjunetime)) diff --git a/Cargo.toml b/Cargo.toml index 375a4efac55..75ba410f12a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -62,7 +62,7 @@ exclude = [ ] [workspace.package] -version = "53.3.0" +version = "54.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -77,20 +77,20 @@ edition = "2021" rust-version = "1.62" [workspace.dependencies] -arrow = { version = "53.3.0", path = "./arrow", default-features = false } -arrow-arith = { version = "53.3.0", path = "./arrow-arith" } -arrow-array = { version = "53.3.0", path = "./arrow-array" } -arrow-buffer = { version = "53.3.0", 
path = "./arrow-buffer" } -arrow-cast = { version = "53.3.0", path = "./arrow-cast" } -arrow-csv = { version = "53.3.0", path = "./arrow-csv" } -arrow-data = { version = "53.3.0", path = "./arrow-data" } -arrow-ipc = { version = "53.3.0", path = "./arrow-ipc" } -arrow-json = { version = "53.3.0", path = "./arrow-json" } -arrow-ord = { version = "53.3.0", path = "./arrow-ord" } -arrow-row = { version = "53.3.0", path = "./arrow-row" } -arrow-schema = { version = "53.3.0", path = "./arrow-schema" } -arrow-select = { version = "53.3.0", path = "./arrow-select" } -arrow-string = { version = "53.3.0", path = "./arrow-string" } -parquet = { version = "53.3.0", path = "./parquet", default-features = false } +arrow = { version = "54.0.0", path = "./arrow", default-features = false } +arrow-arith = { version = "54.0.0", path = "./arrow-arith" } +arrow-array = { version = "54.0.0", path = "./arrow-array" } +arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" } +arrow-cast = { version = "54.0.0", path = "./arrow-cast" } +arrow-csv = { version = "54.0.0", path = "./arrow-csv" } +arrow-data = { version = "54.0.0", path = "./arrow-data" } +arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" } +arrow-json = { version = "54.0.0", path = "./arrow-json" } +arrow-ord = { version = "54.0.0", path = "./arrow-ord" } +arrow-row = { version = "54.0.0", path = "./arrow-row" } +arrow-schema = { version = "54.0.0", path = "./arrow-schema" } +arrow-select = { version = "54.0.0", path = "./arrow-select" } +arrow-string = { version = "54.0.0", path = "./arrow-string" } +parquet = { version = "54.0.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 661abfc5869..3ffc8780c2f 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -31,7 +31,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "53.3.0" +arrow-flight = "54.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index d00cc498625..4a2f5e3f198 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="53.2.0" -FUTURE_RELEASE="53.3.0" +SINCE_TAG="53.3.0" +FUTURE_RELEASE="54.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 1dbe523754eda1cefe526f00da2b92d2e83fcfa7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 19 Dec 2024 14:51:30 -0500 Subject: [PATCH 25/29] [object store] Add planned release schedule (#6904) --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 723249ad29e..ed42f630514 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,13 @@ versions approximately every 2 months. [`object_store`]: https://crates.io/crates/object_store +Planned Release Schedule + +| Approximate Date | Version | Notes | +| ---------------- | -------- | --------------------------------------- | +| Dec 2024 | `0.11.2` | Minor, NO breaking API changes | +| Feb 2025 | `0.12.0` | Major, potentially breaking API changes | + ### Deprecation Guidelines Minor releases may deprecate, but not remove APIs. 
Deprecating APIs allows

From 2908a80d9ca3e3fb0414e35b67856f1fb761304c Mon Sep 17 00:00:00 2001
From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com>
Date: Thu, 19 Dec 2024 21:52:18 +0200
Subject: [PATCH 26/29] add `extend_dictionary` in dictionary builder for improved performance (#6875)

* add `extend_dictionary` in dictionary builder for improved performance

* fix extends all nulls

* support null in mapped value

* adding comment

* run `clippy` and `fmt`

* fix ci

* Apply suggestions from code review

Co-authored-by: Andrew Lamb

---------

Co-authored-by: Andrew Lamb
---
 .../generic_bytes_dictionary_builder.rs       | 187 ++++++++++++++++-
 .../builder/primitive_dictionary_builder.rs   | 198 +++++++++++++++++-
 2 files changed, 379 insertions(+), 6 deletions(-)

diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
index bb0fb5e91be..ead151d5cee 100644
--- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs
@@ -17,7 +17,7 @@
 
 use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
 use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
-use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
+use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray};
 use arrow_buffer::ArrowNativeType;
 use arrow_schema::{ArrowError, DataType};
 use hashbrown::HashTable;
@@ -305,6 +305,63 @@ where
         };
     }
 
+    /// Extends builder with an existing dictionary array.
+    ///
+    /// This is the same as [`Self::extend`] but is faster as it translates
+    /// the dictionary values once rather than doing a lookup for each item in the iterator
+    ///
+    /// when dictionary values are null (the actual mapped values) the keys are null
+    ///
+    pub fn extend_dictionary(
+        &mut self,
+        dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
+    ) -> Result<(), ArrowError> {
+        let values = dictionary.values();
+
+        let v_len = values.len();
+        let k_len = dictionary.keys().len();
+        if v_len == 0 && k_len == 0 {
+            return Ok(());
+        }
+
+        // All nulls
+        if v_len == 0 {
+            self.append_nulls(k_len);
+            return Ok(());
+        }
+
+        if k_len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Dictionary keys should not be empty when values are not empty".to_string(),
+            ));
+        }
+
+        // Orphan values will be carried over to the new dictionary
+        let mapped_values = values
+            .iter()
+            // Dictionary values can technically be null, so we need to handle that
+            .map(|dict_value| {
+                dict_value
+                    .map(|dict_value| self.get_or_insert_key(dict_value))
+                    .transpose()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Just insert the keys without additional lookups
+        dictionary.keys().iter().for_each(|key| match key {
+            None => self.append_null(),
+            Some(original_dict_index) => {
+                let index = original_dict_index.as_usize().min(v_len - 1);
+                match mapped_values[index] {
+                    None => self.append_null(),
+                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
+                }
+            }
+        });
+
+        Ok(())
+    }
+
     /// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray { self.dedup.clear(); @@ -445,8 +502,9 @@ mod tests { use super::*; use crate::array::Int8Array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -664,4 +722,129 @@ mod tests { assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]); assert_eq!(dict.values().len(), 4); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some)); + builder.extend([None::<&str>]); + builder.extend(["c", "d", "a"].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 6); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some("e"), + Some("e"), + Some("f"), + Some("e"), + Some("d"), + Some("a"), + Some("b"), + Some("c"), + Some("a"), + Some("b"), + Some("c"), + None, + Some("c"), + Some("d"), + Some("a"), + None + ] + ); + } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = GenericByteBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value("I like worm hugs"); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(Utf8Type::DATA_TYPE), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_string::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some("I like worm hugs")] + ); + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some("I like worm hugs")]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac40f8a469d..282f0ae9d5b 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs 
+++ b/arrow-array/src/builder/primitive_dictionary_builder.rs
@@ -17,7 +17,9 @@
 
 use crate::builder::{ArrayBuilder, PrimitiveBuilder};
 use crate::types::ArrowDictionaryKeyType;
-use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray};
+use crate::{
+    Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray,
+};
 use arrow_buffer::{ArrowNativeType, ToByteSlice};
 use arrow_schema::{ArrowError, DataType};
 use std::any::Any;
@@ -44,7 +46,7 @@ impl<T: ToByteSlice> PartialEq for Value<T> {
 
 impl<T: ToByteSlice> Eq for Value<T> {}
 
-/// Builder for [`DictionaryArray`] of [`PrimitiveArray`](crate::array::PrimitiveArray)
+/// Builder for [`DictionaryArray`] of [`PrimitiveArray`]
 ///
 /// # Example:
 ///
@@ -303,6 +305,63 @@ where
         };
     }
 
+    /// Extends builder with dictionary
+    ///
+    /// This is the same as [`Self::extend`] but is faster as it translates
+    /// the dictionary values once rather than doing a lookup for each item in the iterator
+    ///
+    /// when dictionary values are null (the actual mapped values) the keys are null
+    ///
+    pub fn extend_dictionary(
+        &mut self,
+        dictionary: &TypedDictionaryArray<K, PrimitiveArray<V>>,
+    ) -> Result<(), ArrowError> {
+        let values = dictionary.values();
+
+        let v_len = values.len();
+        let k_len = dictionary.keys().len();
+        if v_len == 0 && k_len == 0 {
+            return Ok(());
+        }
+
+        // All nulls
+        if v_len == 0 {
+            self.append_nulls(k_len);
+            return Ok(());
+        }
+
+        if k_len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Dictionary keys should not be empty when values are not empty".to_string(),
+            ));
+        }
+
+        // Orphan values will be carried over to the new dictionary
+        let mapped_values = values
+            .iter()
+            // Dictionary values can technically be null, so we need to handle that
+            .map(|dict_value| {
+                dict_value
+                    .map(|dict_value| self.get_or_insert_key(dict_value))
+                    .transpose()
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Just insert the keys without additional lookups
+        dictionary.keys().iter().for_each(|key| match key {
+            None => self.append_null(),
+            Some(original_dict_index) => {
+                let index = original_dict_index.as_usize().min(v_len - 1);
+                match mapped_values[index] {
+                    None => self.append_null(),
+                    Some(mapped_value) => self.keys_builder.append_value(mapped_value),
+                }
+            }
+        });
+
+        Ok(())
+    }
+
     /// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray { self.map.clear(); @@ -368,9 +427,9 @@ impl Extend> mod tests { use super::*; - use crate::array::UInt32Array; - use crate::array::UInt8Array; + use crate::array::{Int32Array, UInt32Array, UInt8Array}; use crate::builder::Decimal128Builder; + use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] @@ -443,4 +502,135 @@ mod tests { ) ); } + + #[test] + fn test_extend_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([1, 2, 3, 1, 2, 3, 1, 2, 3].into_iter().map(Some)); + builder.extend([None::]); + builder.extend([4, 5, 1, 3, 1].into_iter().map(Some)); + builder.append_null(); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 7); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [ + Some(6), + Some(6), + Some(7), + Some(6), + Some(5), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + None, + Some(4), + Some(5), + Some(1), + Some(3), + Some(1), + None + ] + ); + } + + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = PrimitiveBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value(42); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(values.data_type().clone()), + ); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_primitive::(); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some(42)] + ); + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, Some(42)]); + } + + #[test] + fn test_extend_all_null_dictionary() { + let some_dict = { + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.append_nulls(2); + builder.finish() + }; + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 0); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!(values, [None, None]); + } } From 2c84f243b882eff69806cd7294d38bf422fdb24a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Dec 2024 16:18:55 -0500 Subject: [PATCH 27/29] [object_store]: Version and Changelog for 0.11.2 (#6908) * [object_store]: Version and Changelog for 0.11.2 * increment version * update script * changelog * tweaks * Update object_store/CHANGELOG.md Co-authored-by: 
Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- object_store/CHANGELOG-old.md | 39 ++++++++++++++ object_store/CHANGELOG.md | 51 ++++++++++--------- object_store/Cargo.toml | 2 +- object_store/dev/release/README.md | 5 +- object_store/dev/release/update_change_log.sh | 4 +- 5 files changed, 72 insertions(+), 29 deletions(-) diff --git a/object_store/CHANGELOG-old.md b/object_store/CHANGELOG-old.md index 28dbde4e7b7..c42689240dd 100644 --- a/object_store/CHANGELOG-old.md +++ b/object_store/CHANGELOG-old.md @@ -19,6 +19,45 @@ # Historical Changelog + +## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) + +**Implemented enhancements:** + +- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Merged pull requests:** + +- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) +- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) +- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) +- \[object-store\] Require tokio 1.29.0. 
[\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) +- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) +- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) +- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) +- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) +- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) +- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) +- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) +- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- docs: Add parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) +- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) +- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) + + + ## [object_store_0.11.0](https://github.com/apache/arrow-rs/tree/object_store_0.11.0) (2024-08-12) [Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.10.2...object_store_0.11.0) diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md index 95585983572..0e834c5e2ef 100644 --- a/object_store/CHANGELOG.md +++ b/object_store/CHANGELOG.md @@ -19,41 +19,42 @@ # Changelog -## [object_store_0.11.1](https://github.com/apache/arrow-rs/tree/object_store_0.11.1) (2024-10-15) +## [object_store_0.11.2](https://github.com/apache/arrow-rs/tree/object_store_0.11.2) (2024-12-20) -[Full 
Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.0...object_store_0.11.1) +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.11.1...object_store_0.11.2) **Implemented enhancements:** -- There is no way to pass object store client options as environment variables [\#6333](https://github.com/apache/arrow-rs/issues/6333) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Better Document Backoff Algorithm [\#6324](https://github.com/apache/arrow-rs/issues/6324) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Add direction to `list_with_offset` [\#6274](https://github.com/apache/arrow-rs/issues/6274) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Support server-side encryption with customer-provided keys \(SSE-C\) [\#6229](https://github.com/apache/arrow-rs/issues/6229) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object-store's AzureClient should protect against multiple streams performing put\_block in parallel for the same BLOB path [\#6868](https://github.com/apache/arrow-rs/issues/6868) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support S3 Put IfMatch [\#6799](https://github.com/apache/arrow-rs/issues/6799) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store Azure Government using OAuth [\#6759](https://github.com/apache/arrow-rs/issues/6759) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support for AWS Requester Pays buckets [\#6716](https://github.com/apache/arrow-rs/issues/6716) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[object-store\]: Implement credential\_process support for S3 [\#6422](https://github.com/apache/arrow-rs/issues/6422) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Conditional put and rename\_if\_not\_exist on S3 [\#6285](https://github.com/apache/arrow-rs/issues/6285) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Fixed bugs:** -- \[object-store\] Requested tokio version is too old - does not compile [\#6458](https://github.com/apache/arrow-rs/issues/6458) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] -- Azure SAS tokens are visible when retry errors are logged via object\_store [\#6322](https://github.com/apache/arrow-rs/issues/6322) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `object_store` errors when `reqwest` `gzip` feature is enabled [\#6842](https://github.com/apache/arrow-rs/issues/6842) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Multi-part s3 uploads fail when using checksum [\#6793](https://github.com/apache/arrow-rs/issues/6793) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- `with_unsigned_payload` shouldn't generate payload hash [\#6697](https://github.com/apache/arrow-rs/issues/6697) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- \[Object\_store\] min\_ttl is too high for GKE tokens [\#6625](https://github.com/apache/arrow-rs/issues/6625) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store `test_private_bucket` fails - store: "S3", source: BucketNotFound { bucket: "bloxbender" } [\#6600](https://github.com/apache/arrow-rs/issues/6600) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- 
S3 endpoint and trailing slash result in weird/invalid requests [\#6580](https://github.com/apache/arrow-rs/issues/6580) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] **Merged pull requests:** -- object\_store: fix typo in with\_connect\_timeout\_disabled that actually disabled non-connect timeouts [\#6563](https://github.com/apache/arrow-rs/pull/6563) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adriangb](https://github.com/adriangb)) -- object\_store: Clarify what is a prefix in list\(\) documentation [\#6520](https://github.com/apache/arrow-rs/pull/6520) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([progval](https://github.com/progval)) -- object\_store: enable lint `unreachable_pub` [\#6512](https://github.com/apache/arrow-rs/pull/6512) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) -- \[object\_store\] Retry S3 requests with 200 response with "Error" in body [\#6508](https://github.com/apache/arrow-rs/pull/6508) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([PeterKeDer](https://github.com/PeterKeDer)) -- \[object-store\] Require tokio 1.29.0. [\#6459](https://github.com/apache/arrow-rs/pull/6459) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ashtuchkin](https://github.com/ashtuchkin)) -- feat: expose HTTP/2 max frame size in `object_store` [\#6442](https://github.com/apache/arrow-rs/pull/6442) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) -- Derive `Clone` for `object_store::aws::AmazonS3` [\#6414](https://github.com/apache/arrow-rs/pull/6414) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ethe](https://github.com/ethe)) -- object\_score: Support Azure Fabric OAuth Provider [\#6382](https://github.com/apache/arrow-rs/pull/6382) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([RobinLin666](https://github.com/RobinLin666)) -- `object_store::GetOptions` derive `Clone` [\#6361](https://github.com/apache/arrow-rs/pull/6361) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([samuelcolvin](https://github.com/samuelcolvin)) -- \[object\_store\] Propagate env vars as object store client options [\#6334](https://github.com/apache/arrow-rs/pull/6334) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ccciudatu](https://github.com/ccciudatu)) -- docs\[object\_store\]: clarify the backoff strategy that is actually implemented [\#6325](https://github.com/apache/arrow-rs/pull/6325) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([westonpace](https://github.com/westonpace)) -- fix: azure sas token visible in logs [\#6323](https://github.com/apache/arrow-rs/pull/6323) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alexwilcoxson-rel](https://github.com/alexwilcoxson-rel)) -- object\_store/delimited: Fix `TrailingEscape` condition [\#6265](https://github.com/apache/arrow-rs/pull/6265) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([Turbo87](https://github.com/Turbo87)) -- fix\(object\_store\): only add encryption headers for SSE-C in get request [\#6260](https://github.com/apache/arrow-rs/pull/6260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- docs: Add 
parquet\_opendal in related projects [\#6236](https://github.com/apache/arrow-rs/pull/6236) ([Xuanwo](https://github.com/Xuanwo)) -- feat\(object\_store\): add support for server-side encryption with customer-provided keys \(SSE-C\) [\#6230](https://github.com/apache/arrow-rs/pull/6230) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jiachengdb](https://github.com/jiachengdb)) -- feat: further TLS options on ClientOptions: \#5034 [\#6148](https://github.com/apache/arrow-rs/pull/6148) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([ByteBaker](https://github.com/ByteBaker)) +- Use randomized content ID for Azure multipart uploads [\#6869](https://github.com/apache/arrow-rs/pull/6869) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avarnon](https://github.com/avarnon)) +- Always explicitly disable `gzip` automatic decompression on reqwest client used by object\_store [\#6843](https://github.com/apache/arrow-rs/pull/6843) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([phillipleblanc](https://github.com/phillipleblanc)) +- object-store: remove S3ConditionalPut::ETagPutIfNotExists [\#6802](https://github.com/apache/arrow-rs/pull/6802) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- Fix multipart uploads with checksums on object locked buckets [\#6794](https://github.com/apache/arrow-rs/pull/6794) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Add AuthorityHost to AzureConfigKey [\#6773](https://github.com/apache/arrow-rs/pull/6773) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([zadeluca](https://github.com/zadeluca)) +- object\_store: Add support for requester pays buckets [\#6768](https://github.com/apache/arrow-rs/pull/6768) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([kylebarron](https://github.com/kylebarron)) +- check sign\_payload instead of skip\_signature before computing checksum [\#6698](https://github.com/apache/arrow-rs/pull/6698) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mherrerarendon](https://github.com/mherrerarendon)) +- Update quick-xml requirement from 0.36.0 to 0.37.0 in /object\_store [\#6687](https://github.com/apache/arrow-rs/pull/6687) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([crepererum](https://github.com/crepererum)) +- Support native S3 conditional writes [\#6682](https://github.com/apache/arrow-rs/pull/6682) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([benesch](https://github.com/benesch)) +- \[object\_store\] fix S3 endpoint and trailing slash result in invalid requests [\#6641](https://github.com/apache/arrow-rs/pull/6641) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([adbmal](https://github.com/adbmal)) +- Lower GCP token min\_ttl to 4 minutes and add backoff to token refresh logic [\#6638](https://github.com/apache/arrow-rs/pull/6638) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([mwylde](https://github.com/mwylde)) +- Remove `test_private_bucket` object\_store test [\#6601](https://github.com/apache/arrow-rs/pull/6601) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 
bcc8e0b9224..bf254b3a0bb 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "object_store" -version = "0.11.1" +version = "0.11.2" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md index 912ff4cd8ba..2dd1f6243c0 100644 --- a/object_store/dev/release/README.md +++ b/object_store/dev/release/README.md @@ -24,7 +24,10 @@ This file documents the release process for the `object_store` crate. -At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule. +We release a new version of `object_store` according to the schedule listed in +the [main README.md] + +[main README.md]: https://github.com/apache/arrow-rs?tab=readme-ov-file#object_store-crate As we are still in an early phase, we use the 0.x version scheme. If any code has been merged to main that has a breaking API change, as defined in [Rust RFC 1105] diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh index 30724478ae1..2797b62c001 100755 --- a/object_store/dev/release/update_change_log.sh +++ b/object_store/dev/release/update_change_log.sh @@ -29,8 +29,8 @@ set -e -SINCE_TAG="object_store_0.11.0" -FUTURE_RELEASE="object_store_0.11.1" +SINCE_TAG="object_store_0.11.1" +FUTURE_RELEASE="object_store_0.11.2" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" From 10cf03c35808eedc191a9b3330bc7c268b7b71c1 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 24 Dec 2024 22:22:33 +0800 Subject: [PATCH 28/29] feat(parquet): Add next_row_group API for ParquetRecordBatchStream (#6907) * feat(parquet): Add next_row_group API for ParquetRecordBatchStream Signed-off-by: Xuanwo * chore: Returning error instead of using unreachable Signed-off-by: Xuanwo --------- Signed-off-by: Xuanwo --- parquet/src/arrow/async_reader/mod.rs | 132 ++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index c408456df14..96715e1164b 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -613,6 +613,9 @@ impl std::fmt::Debug for StreamState { /// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] /// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// +/// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, +/// allowing users to decode record batches separately from I/O. pub struct ParquetRecordBatchStream { metadata: Arc, @@ -654,6 +657,70 @@ impl ParquetRecordBatchStream { } } +impl ParquetRecordBatchStream +where + T: AsyncFileReader + Unpin + Send + 'static, +{ + /// Fetches the next row group from the stream. + /// + /// Users can continue to call this function to get row groups and decode them concurrently. + /// + /// ## Notes + /// + /// ParquetRecordBatchStream should be used either as a `Stream` or with `next_row_group`; they should not be used simultaneously. + /// + /// ## Returns + /// + /// - `Ok(None)` if the stream has ended. + /// - `Err(error)` if the stream has errored. All subsequent calls will return `Ok(None)`. + /// - `Ok(Some(reader))` which holds all the data for the row group. 
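
To make the documented contract concrete before the implementation that follows, here is a minimal, illustrative sketch (not part of this patch) of how `next_row_group` could be paired with `tokio::task::spawn_blocking` so that CPU-bound decoding stays off the I/O path. The file name, the batch size, and the assumption that `parquet` is built with the `arrow` and `async` features and tokio with `rt-multi-thread`, `macros`, and `fs` are hypothetical details added only for the example.

```rust
// Illustrative sketch only: fetch row groups with `next_row_group` and decode
// each one on a blocking task. Assumes parquet with the `arrow` + `async`
// features and tokio with `rt-multi-thread`, `macros`, and `fs`.
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main]
async fn main() {
    // tokio::fs::File implements AsyncFileReader, so it can feed the stream builder.
    let file = tokio::fs::File::open("data.parquet")
        .await
        .expect("open parquet file");

    let mut stream = ParquetRecordBatchStreamBuilder::new(file)
        .await
        .expect("create stream builder")
        .with_batch_size(8192)
        .build()
        .expect("build ParquetRecordBatchStream");

    // I/O side: pull one row group at a time from the stream.
    let mut decode_tasks = Vec::new();
    while let Some(reader) = stream.next_row_group().await.expect("fetch row group") {
        // CPU side: the returned reader is a synchronous iterator over
        // `Result<RecordBatch>`, so it can be drained off the async runtime.
        decode_tasks.push(tokio::task::spawn_blocking(move || {
            reader
                .map(|batch| batch.expect("decode batch"))
                .collect::<Vec<_>>()
        }));
    }

    for task in decode_tasks {
        let batches = task.await.expect("decode task panicked");
        println!("decoded {} record batches", batches.len());
    }
}
```

The integration test added later in this patch exercises the same API sequentially, collecting the readers first and decoding them afterwards.
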
+ pub async fn next_row_group(&mut self) -> Result> { + loop { + match &mut self.state { + StreamState::Decoding(_) | StreamState::Reading(_) => { + return Err(ParquetError::General( + "Cannot combine the use of next_row_group with the Stream API".to_string(), + )) + } + StreamState::Init => { + let row_group_idx = match self.row_groups.pop_front() { + Some(idx) => idx, + None => return Ok(None), + }; + + let row_count = self.metadata.row_group(row_group_idx).num_rows() as usize; + + let selection = self.selection.as_mut().map(|s| s.split_off(row_count)); + + let reader_factory = self.reader.take().expect("lost reader"); + + let (reader_factory, maybe_reader) = reader_factory + .read_row_group( + row_group_idx, + selection, + self.projection.clone(), + self.batch_size, + ) + .await + .map_err(|err| { + self.state = StreamState::Error; + err + })?; + self.reader = Some(reader_factory); + + if let Some(reader) = maybe_reader { + return Ok(Some(reader)); + } else { + // All rows skipped, read next row group + continue; + } + } + StreamState::Error => return Ok(None), // Ends the stream as error happens. + } + } + } +} + impl Stream for ParquetRecordBatchStream where T: AsyncFileReader + Unpin + Send + 'static, @@ -1020,6 +1087,71 @@ mod tests { ); } + #[tokio::test] + async fn test_async_reader_with_next_row_group() { + let testdata = arrow::util::test_util::parquet_test_data(); + let path = format!("{testdata}/alltypes_plain.parquet"); + let data = Bytes::from(std::fs::read(path).unwrap()); + + let metadata = ParquetMetaDataReader::new() + .parse_and_finish(&data) + .unwrap(); + let metadata = Arc::new(metadata); + + assert_eq!(metadata.num_row_groups(), 1); + + let async_reader = TestReader { + data: data.clone(), + metadata: metadata.clone(), + requests: Default::default(), + }; + + let requests = async_reader.requests.clone(); + let builder = ParquetRecordBatchStreamBuilder::new(async_reader) + .await + .unwrap(); + + let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1, 2]); + let mut stream = builder + .with_projection(mask.clone()) + .with_batch_size(1024) + .build() + .unwrap(); + + let mut readers = vec![]; + while let Some(reader) = stream.next_row_group().await.unwrap() { + readers.push(reader); + } + + let async_batches: Vec<_> = readers + .into_iter() + .flat_map(|r| r.map(|v| v.unwrap()).collect::>()) + .collect(); + + let sync_batches = ParquetRecordBatchReaderBuilder::try_new(data) + .unwrap() + .with_projection(mask) + .with_batch_size(104) + .build() + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(async_batches, sync_batches); + + let requests = requests.lock().unwrap(); + let (offset_1, length_1) = metadata.row_group(0).column(1).byte_range(); + let (offset_2, length_2) = metadata.row_group(0).column(2).byte_range(); + + assert_eq!( + &requests[..], + &[ + offset_1 as usize..(offset_1 + length_1) as usize, + offset_2 as usize..(offset_2 + length_2) as usize + ] + ); + } + #[tokio::test] async fn test_async_reader_with_index() { let testdata = arrow::util::test_util::parquet_test_data(); From 63899b7ad9d093e86acc6c98604aa66431102993 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 26 Dec 2024 16:49:36 +0200 Subject: [PATCH 29/29] chore(arrow-ord): move `can_rank` to the `rank` file (#6910) --- arrow-ord/src/rank.rs | 9 +++++++++ arrow-ord/src/sort.rs | 11 +---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arrow-ord/src/rank.rs b/arrow-ord/src/rank.rs index 
ecc693bab4e..e61cebef38e 100644 --- a/arrow-ord/src/rank.rs +++ b/arrow-ord/src/rank.rs @@ -24,6 +24,15 @@ use arrow_buffer::NullBuffer; use arrow_schema::{ArrowError, DataType, SortOptions}; use std::cmp::Ordering; +/// Whether `arrow_ord::rank` can rank an array of given data type. +pub(crate) fn can_rank(data_type: &DataType) -> bool { + data_type.is_primitive() + || matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary + ) +} + /// Assigns a rank to each value in `array` based on its position in the sorted order /// /// Where values are equal, they will be assigned the highest of their ranks, diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 60fc4a91852..51a6659e631 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -30,7 +30,7 @@ use arrow_select::take::take; use std::cmp::Ordering; use std::sync::Arc; -use crate::rank::rank; +use crate::rank::{can_rank, rank}; pub use arrow_schema::SortOptions; /// Sort the `ArrayRef` using `SortOptions`. @@ -190,15 +190,6 @@ fn partition_validity(array: &dyn Array) -> (Vec, Vec) { } } -/// Whether `arrow_ord::rank` can rank an array of given data type. -fn can_rank(data_type: &DataType) -> bool { - data_type.is_primitive() - || matches!( - data_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary - ) -} - /// Whether `sort_to_indices` can sort an array of given data type. fn can_sort_to_indices(data_type: &DataType) -> bool { data_type.is_primitive()