From de1591770445773ca5947f44c8b5fe58fec985d7 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 29 Sep 2023 23:30:27 +0900 Subject: [PATCH 1/7] Improve cache usage in CI (#7678) * Improve cache usage in CI * Trigger GA to check the effect of improvement --- .github/workflows/rust.yml | 77 ++++++++++---------------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fa5c56b43e03..80de6e663fcd 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -47,18 +47,24 @@ jobs: image: amd64/rust steps: - uses: actions/checkout@v4 - - name: Cache Cargo - uses: actions/cache@v3 - with: - # these represent dependencies downloaded by cargo - # and thus do not depend on the OS, arch nor rust version. - path: /github/home/.cargo - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: stable + - name: Cache Cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ./target/ + ./datafusion-cli/target/ + # this key equals the ones on `linux-build-lib` for re-use + key: cargo-cache-benchmark-${{ hashFiles('datafusion/**/Cargo.toml', 'benchmarks/Cargo.toml', 'datafusion-cli/Cargo.toml') }} + - name: Check workspace without default features run: cargo check --no-default-features -p datafusion @@ -84,12 +90,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -109,12 +109,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -211,12 +205,6 @@ jobs: image: amd64/rust steps: - uses: actions/checkout@v4 - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -241,9 +229,14 @@ jobs: - name: Cache Cargo uses: actions/cache@v3 with: - path: /github/home/.cargo + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ./target/ # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- + key: cargo-cache-benchmark-${{ hashFiles('datafusion/**/Cargo.toml', 'benchmarks/Cargo.toml') }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -377,12 +370,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - uses: actions/setup-python@v4 with: python-version: "3.8" @@ -480,12 +467,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -506,12 +487,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: 
Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -531,12 +506,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -563,12 +532,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From 70cded6e3d95036d4150d4c77b7e57caa90d7a22 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 30 Sep 2023 00:30:26 +0800 Subject: [PATCH 2/7] fix: substrait limit when fetch is None (#7669) * fix: substrait limit when fetch is None Signed-off-by: Ruihang Xia * Add comments --------- Signed-off-by: Ruihang Xia Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/limit.rs | 2 +- datafusion/substrait/src/logical_plan/consumer.rs | 9 +++++++-- datafusion/substrait/src/logical_plan/producer.rs | 3 ++- .../substrait/tests/cases/roundtrip_logical_plan.rs | 5 +++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 922c3db0efc8..31ed08399c2e 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -442,7 +442,7 @@ impl LimitStream { match &poll { Poll::Ready(Some(Ok(batch))) => { - if batch.num_rows() > 0 && self.skip == 0 { + if batch.num_rows() > 0 { break poll; } else { // continue to poll input stream diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 32b8f8ea547f..e1dde39427a5 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -227,8 +227,13 @@ pub async fn from_substrait_rel( from_substrait_rel(ctx, input, extensions).await?, ); let offset = fetch.offset as usize; - let count = fetch.count as usize; - input.limit(offset, Some(count))?.build() + // Since protobuf can't directly distinguish `None` vs `0` `None` is encoded as `MAX` + let count = if fetch.count as usize == usize::MAX { + None + } else { + Some(fetch.count as usize) + }; + input.limit(offset, count)?.build() } else { not_impl_err!("Fetch without an input is not valid") } diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index e17b022f3b53..1124ea53a557 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -193,7 +193,8 @@ pub fn to_substrait_rel( } LogicalPlan::Limit(limit) => { let input = to_substrait_rel(limit.input.as_ref(), ctx, extension_info)?; - let limit_fetch = limit.fetch.unwrap_or(0); + // Since protobuf can't directly distinguish `None` vs `0` encode `None` as `MAX` + let limit_fetch = limit.fetch.unwrap_or(usize::MAX); Ok(Box::new(Rel { rel_type: Some(RelType::Fetch(Box::new(FetchRel { common: None, diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 
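The fix above hinges on a sentinel encoding: Substrait's `FetchRel.count` is a plain integer field, so it cannot represent DataFusion's `fetch: Option<usize>` directly. The producer maps `None` to `usize::MAX` and the consumer maps `usize::MAX` back to `None`, which keeps an explicit `LIMIT 0` distinguishable from "no limit". A minimal sketch of that round trip, with hypothetical helper names rather than the actual producer/consumer code:

```rust
/// Encode an optional fetch count into a protobuf-style field that has
/// no notion of `None` (sketch; helper names are illustrative only).
fn encode_fetch_count(fetch: Option<usize>) -> u64 {
    // `None` (no limit) is represented by the sentinel `usize::MAX`
    fetch.unwrap_or(usize::MAX) as u64
}

/// Decode the sentinel back into an optional fetch count.
fn decode_fetch_count(count: u64) -> Option<usize> {
    if count as usize == usize::MAX {
        None
    } else {
        Some(count as usize)
    }
}

fn main() {
    // `None` survives the round trip ...
    assert_eq!(decode_fetch_count(encode_fetch_count(None)), None);
    // ... and stays distinguishable from an explicit `LIMIT 0`
    assert_eq!(decode_fetch_count(encode_fetch_count(Some(0))), Some(0));
    assert_eq!(decode_fetch_count(encode_fetch_count(Some(100))), Some(100));
}
```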
f4d74ae42681..2554d0667e48 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -188,6 +188,11 @@ async fn select_with_limit() -> Result<()> { roundtrip_fill_na("SELECT * FROM data LIMIT 100").await } +#[tokio::test] +async fn select_without_limit() -> Result<()> { + roundtrip_fill_na("SELECT * FROM data OFFSET 10").await +} + #[tokio::test] async fn select_with_limit_offset() -> Result<()> { roundtrip("SELECT * FROM data LIMIT 200 OFFSET 10").await From 2d6e768c6084c4955ac6dfb9c389aa8849464f05 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 30 Sep 2023 00:35:10 +0800 Subject: [PATCH 3/7] minor: revert parsing precedence between Aggr and UDAF (#7682) * minor: revert parsing precedence between Aggr and UDAF Signed-off-by: Ruihang Xia * add unit test Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- .../user_defined/user_defined_aggregates.rs | 37 +++++++++++++++++-- datafusion/sql/src/expr/function.rs | 18 ++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 64547bbdfa36..3b7b4d0e87b7 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -169,6 +169,37 @@ async fn test_udaf_returning_struct_subquery() { assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); } +#[tokio::test] +async fn test_udaf_shadows_builtin_fn() { + let TestContext { + mut ctx, + test_state, + } = TestContext::new(); + let sql = "SELECT sum(arrow_cast(time, 'Int64')) from t"; + + // compute with builtin `sum` aggregator + let expected = [ + "+-------------+", + "| SUM(t.time) |", + "+-------------+", + "| 19000 |", + "+-------------+", + ]; + assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + // Register `TimeSum` with name `sum`. 
This will shadow the builtin one + let sql = "SELECT sum(time) from t"; + TimeSum::register(&mut ctx, test_state.clone(), "sum"); + let expected = [ + "+----------------------------+", + "| sum(t.time) |", + "+----------------------------+", + "| 1970-01-01T00:00:00.000019 |", + "+----------------------------+", + ]; + assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); +} + async fn execute(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordBatch>> { ctx.sql(sql).await?.collect().await } @@ -214,7 +245,7 @@ impl TestContext { // Tell DataFusion about the "first" function FirstSelector::register(&mut ctx); // Tell DataFusion about the "time_sum" function - TimeSum::register(&mut ctx, Arc::clone(&test_state)); + TimeSum::register(&mut ctx, Arc::clone(&test_state), "time_sum"); Self { ctx, test_state } } @@ -281,7 +312,7 @@ impl TimeSum { Self { sum: 0, test_state } } - fn register(ctx: &mut SessionContext, test_state: Arc<TestState>) { + fn register(ctx: &mut SessionContext, test_state: Arc<TestState>, name: &str) { let timestamp_type = DataType::Timestamp(TimeUnit::Nanosecond, None); // Returns the same type as its input @@ -301,8 +332,6 @@ impl TimeSum { let accumulator: AccumulatorFactoryFunction = Arc::new(move |_| Ok(Box::new(Self::new(Arc::clone(&captured_state))))); - let name = "time_sum"; - let time_sum = AggregateUDF::new(name, &signature, &return_type, &accumulator, &state_type); diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 05f80fcfafa9..3861b4848d9b 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -124,6 +124,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { return Ok(expr); } } else { + // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function + if let Some(fm) = self.schema_provider.get_aggregate_meta(&name) { + let args = + self.function_args_to_expr(function.args, schema, planner_context)?; + return Ok(Expr::AggregateUDF(expr::AggregateUDF::new( + fm, args, None, None, + ))); + } + // next, aggregate built-ins if let Ok(fun) = AggregateFunction::from_str(&name) { let distinct = function.distinct; @@ -141,15 +150,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))); }; - // User defined aggregate functions (UDAF) - if let Some(fm) = self.schema_provider.get_aggregate_meta(&name) { - let args = - self.function_args_to_expr(function.args, schema, planner_context)?; - return Ok(Expr::AggregateUDF(expr::AggregateUDF::new( - fm, args, None, None, - ))); - } - // Special case arrow_cast (as its type is dependent on its argument value) if name == ARROW_CAST_NAME { let args = From 85f3578f5fb47d28a8bc3a7b9be0284b3ced0fcd Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sat, 30 Sep 2023 00:51:39 +0800 Subject: [PATCH 4/7] Minor: Move hash utils to common (#7684) * move hash utils to common Signed-off-by: jayzhan211 * support backward compatibility Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 Co-authored-by: Andrew Lamb --- datafusion-cli/Cargo.lock | 4 ++++ datafusion/common/Cargo.toml | 4 ++++ .../{physical-expr => common}/src/hash_utils.rs | 16 ++++++++-------- datafusion/common/src/lib.rs | 1 + .../physical-expr/src/expressions/in_list.rs | 2 +- datafusion/physical-expr/src/lib.rs | 4 +++- .../src/aggregates/group_values/row.rs | 2 +- datafusion/physical-plan/src/lib.rs | 3 ++- .../src/windows/bounded_window_agg_exec.rs | 3 ++- 9 files changed, 26 insertions(+), 13 deletions(-) rename datafusion/{physical-expr =>
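The function.rs change in PATCH 3 above restores a simple resolution order in the SQL planner: a user-defined aggregate is looked up before the built-in aggregates, so registering a UDAF under a built-in's name (as the `test_udaf_shadows_builtin_fn` test does with `sum`) shadows the built-in. A self-contained sketch of that lookup order, with made-up registry types standing in for the real planner machinery:

```rust
use std::collections::HashMap;

/// Stand-in for the planner's resolution result (illustrative only).
#[derive(Debug, PartialEq)]
enum Resolved {
    Udaf(String),
    BuiltIn(String),
}

/// Resolve an aggregate call the way the patched planner does:
/// user-defined aggregates first, built-in aggregates second.
fn resolve_aggregate(name: &str, udafs: &HashMap<String, String>) -> Resolved {
    if let Some(impl_name) = udafs.get(name) {
        // 1) a registered UDAF with this name wins ...
        return Resolved::Udaf(impl_name.clone());
    }
    // 2) ... otherwise fall back to the built-in aggregate
    Resolved::BuiltIn(name.to_uppercase())
}

fn main() {
    let mut udafs = HashMap::new();
    // before registration, `sum` resolves to the built-in SUM
    assert_eq!(
        resolve_aggregate("sum", &udafs),
        Resolved::BuiltIn("SUM".into())
    );
    // registering a UDAF named `sum` shadows the built-in
    udafs.insert("sum".into(), "time_sum".into());
    assert_eq!(
        resolve_aggregate("sum", &udafs),
        Resolved::Udaf("time_sum".into())
    );
}
```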
common}/src/hash_utils.rs (98%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 0ca83452bd02..775f8ec87e38 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1122,9 +1122,13 @@ dependencies = [ name = "datafusion-common" version = "31.0.0" dependencies = [ + "ahash", "arrow", "arrow-array", + "arrow-buffer", + "arrow-schema", "chrono", + "half", "num_cpus", "object_store", "parquet", diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index f2b8f1a1e4be..b5cdec1be17b 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -39,10 +39,14 @@ default = ["parquet"] pyarrow = ["pyo3", "arrow/pyarrow"] [dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } apache-avro = { version = "0.16", default-features = false, features = ["snappy"], optional = true } arrow = { workspace = true } arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } chrono = { workspace = true } +half = { version = "2.1", default-features = false } num_cpus = "1.13.0" object_store = { version = "0.7.0", default-features = false, optional = true } parquet = { workspace = true, optional = true } diff --git a/datafusion/physical-expr/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs similarity index 98% rename from datafusion/physical-expr/src/hash_utils.rs rename to datafusion/common/src/hash_utils.rs index 379e0eba5277..9198461e00bf 100644 --- a/datafusion/physical-expr/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -17,19 +17,19 @@ //! Functionality used both on logical and physical plans +use std::sync::Arc; + use ahash::RandomState; use arrow::array::*; use arrow::datatypes::*; use arrow::row::Rows; use arrow::{downcast_dictionary_array, downcast_primitive_array}; use arrow_buffer::i256; -use datafusion_common::{ - cast::{ - as_boolean_array, as_generic_binary_array, as_primitive_array, as_string_array, - }, - internal_err, DataFusionError, Result, + +use crate::cast::{ + as_boolean_array, as_generic_binary_array, as_primitive_array, as_string_array, }; -use std::sync::Arc; +use crate::error::{DataFusionError, Result, _internal_err}; // Combines two hashes into one hash #[inline] @@ -51,7 +51,7 @@ fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: } } -pub(crate) trait HashValue { +pub trait HashValue { fn hash_one(&self, state: &RandomState) -> u64; } @@ -337,7 +337,7 @@ pub fn create_hashes<'a>( } _ => { // This is internal because we should have caught this before. 
- return internal_err!( + return _internal_err!( "Unsupported data type in hasher: {}", col.data_type() ); diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index eeb5b2681370..71782f67046d 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -25,6 +25,7 @@ mod error; pub mod file_options; pub mod format; mod functional_dependencies; +pub mod hash_utils; mod join_type; pub mod parsers; #[cfg(feature = "pyarrow")] diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index c92bbbb74f16..bdc476f5b3a1 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -24,7 +24,6 @@ use std::fmt::Debug; use std::hash::{Hash, Hasher}; use std::sync::Arc; -use crate::hash_utils::HashValue; use crate::physical_expr::down_cast_any_ref; use crate::utils::expr_list_eq_any_order; use crate::PhysicalExpr; @@ -37,6 +36,7 @@ use arrow::datatypes::*; use arrow::record_batch::RecordBatch; use arrow::util::bit_iterator::BitIndexIterator; use arrow::{downcast_dictionary_array, downcast_primitive_array}; +use datafusion_common::hash_utils::HashValue; use datafusion_common::{ cast::{as_boolean_array, as_generic_binary_array, as_string_array}, internal_err, not_impl_err, DataFusionError, Result, ScalarValue, diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index e83dee2e6c80..48d5f4e1308b 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -28,7 +28,6 @@ pub mod equivalence; pub mod execution_props; pub mod expressions; pub mod functions; -pub mod hash_utils; pub mod intervals; pub mod math_expressions; mod partitioning; @@ -49,6 +48,9 @@ pub mod utils; pub mod var_provider; pub mod window; +// For backwards compatibility +pub use datafusion_common::hash_utils; + pub use aggregate::groups_accumulator::{ EmitTo, GroupsAccumulator, GroupsAccumulatorAdapter, }; diff --git a/datafusion/physical-plan/src/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs index 746537557d46..10ff9edb8912 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/row.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/row.rs @@ -22,9 +22,9 @@ use arrow::record_batch::RecordBatch; use arrow::row::{RowConverter, Rows, SortField}; use arrow_array::{Array, ArrayRef}; use arrow_schema::{DataType, SchemaRef}; +use datafusion_common::hash_utils::create_hashes; use datafusion_common::{DataFusionError, Result}; use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt}; -use datafusion_physical_expr::hash_utils::create_hashes; use datafusion_physical_expr::EmitTo; use hashbrown::raw::RawTable; diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 76adf7611d6f..aca10893db3d 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -375,10 +375,11 @@ pub mod windows; use crate::repartition::RepartitionExec; use crate::sorts::sort_preserving_merge::SortPreservingMergeExec; +pub use datafusion_common::hash_utils; pub use datafusion_common::utils::project_schema; use datafusion_execution::TaskContext; pub use datafusion_physical_expr::{ - expressions, functions, hash_utils, ordering_equivalence_properties_helper, udf, + expressions, functions, ordering_equivalence_properties_helper, udf, }; #[cfg(test)] diff --git 
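With the module moved, `datafusion_common::hash_utils` becomes the canonical path, and the `pub use` re-exports above keep the old `datafusion_physical_expr::hash_utils` and `datafusion_physical_plan::hash_utils` paths compiling for downstream crates. A usage sketch of `create_hashes` from its new location (call shape inferred from the signature in the moved file; the input column is made up for illustration):

```rust
use std::sync::Arc;

use ahash::RandomState;
use arrow_array::{ArrayRef, Int32Array};
use datafusion_common::hash_utils::create_hashes;
use datafusion_common::Result;

fn main() -> Result<()> {
    // one made-up column; in DataFusion this would be a batch's key columns
    let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));

    // fixed seeds so the hashes are reproducible across runs of this example
    let random_state = RandomState::with_seeds(0, 0, 0, 0);

    // one output slot per row; hashes are combined across all input columns
    let mut hashes_buffer = vec![0u64; col.len()];
    create_hashes(&[col], &random_state, &mut hashes_buffer)?;

    println!("row hashes: {hashes_buffer:?}");
    Ok(())
}
```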
a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 4108b4220599..dfef0ddefa03 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -43,6 +43,8 @@ use arrow::{ datatypes::{Schema, SchemaBuilder, SchemaRef}, record_batch::RecordBatch, }; + +use datafusion_common::hash_utils::create_hashes; use datafusion_common::utils::{ evaluate_partition_ranges, get_arrayref_at_indices, get_at_indices, get_record_batch_at_indices, get_row_at_idx, @@ -51,7 +53,6 @@ use datafusion_common::{exec_err, plan_err, DataFusionError, Result}; use datafusion_execution::TaskContext; use datafusion_expr::window_state::{PartitionBatchState, WindowAggState}; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::hash_utils::create_hashes; use datafusion_physical_expr::window::{ PartitionBatches, PartitionKey, PartitionWindowAggStates, WindowState, }; From 692ea24357d32b1242c476f0ed33498c815ac921 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sat, 30 Sep 2023 01:22:52 -0400 Subject: [PATCH 5/7] Update Default Parquet Write Compression (#7692) * update compression default * fix tests --------- Co-authored-by: Andrew Lamb --- datafusion/common/src/config.rs | 2 +- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- docs/source/user-guide/configs.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b34c64ff8893..261c2bf435a4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -307,7 +307,7 @@ config_namespace! { /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. /// These values are not case sensitive. If NULL, uses /// default parquet writer setting - pub compression: Option<String>, default = None + pub compression: Option<String>, default = Some("zstd(3)".into()) /// Sets if dictionary encoding is enabled. 
If NULL, uses /// default parquet writer setting diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f90901021637..12aa9089a0c9 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -156,7 +156,7 @@ datafusion.execution.parquet.bloom_filter_enabled false datafusion.execution.parquet.bloom_filter_fpp NULL datafusion.execution.parquet.bloom_filter_ndv NULL datafusion.execution.parquet.column_index_truncate_length NULL -datafusion.execution.parquet.compression NULL +datafusion.execution.parquet.compression zstd(3) datafusion.execution.parquet.created_by datafusion datafusion.execution.parquet.data_page_row_count_limit 18446744073709551615 datafusion.execution.parquet.data_pagesize_limit 1048576 diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 7fe229b4d3c6..638ac5a36b83 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -58,7 +58,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.data_pagesize_limit | 1048576 | Sets best effort maximum size of data page in bytes | | datafusion.execution.parquet.write_batch_size | 1024 | Sets write_batch_size in bytes | | datafusion.execution.parquet.writer_version | 1.0 | Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.compression | NULL | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.compression | zstd(3) | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_enabled | NULL | Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. 
If NULL, uses default parquet writer setting | From 5d522fb55628de063ea59ee738aa341cb73b71ff Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 1 Oct 2023 19:33:25 +0900 Subject: [PATCH 6/7] Stop using cache for the benchmark job (#7706) --- .github/workflows/rust.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 80de6e663fcd..37533396e0d8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -226,17 +226,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - ./target/ - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache-benchmark-${{ hashFiles('datafusion/**/Cargo.toml', 'benchmarks/Cargo.toml') }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From 14cdf72099bca3bb82c0103a5586b89ddb2feecc Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 1 Oct 2023 19:46:38 +0900 Subject: [PATCH 7/7] Change rust.yml to run benchmark (#7708) * Change rust.yml to run benchmark * Restore unrelated change --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 37533396e0d8..0f572c9687dd 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -241,7 +241,7 @@ jobs: - name: Verify that benchmark queries return expected results run: | export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data` - cargo test serde_q --profile release-nonlto --features=ci -- --test-threads=1 + cargo test plan_q --package datafusion-benchmarks --profile release-nonlto --features=ci -- --test-threads=1 INCLUDE_TPCH=true cargo test --test sqllogictests - name: Verify Working Directory Clean run: git diff --exit-code
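PATCH 5 above changes the compiled-in default for `datafusion.execution.parquet.compression` from NULL to `zstd(3)`, which the sqllogictest and docs updates reflect. A short sketch of what that means for callers (the field layout follows the config.rs hunk; the `ConfigOptions::new`/`set` calls are assumptions about typical `datafusion-common` config usage, not quoted from this patch):

```rust
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut options = ConfigOptions::new();

    // after this patch the default is Some("zstd(3)") rather than None
    assert_eq!(
        options.execution.parquet.compression.as_deref(),
        Some("zstd(3)")
    );

    // the setting can still be overridden per session, e.g. back to snappy
    options.set("datafusion.execution.parquet.compression", "snappy")?;
    assert_eq!(
        options.execution.parquet.compression.as_deref(),
        Some("snappy")
    );
    Ok(())
}
```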