From de1591770445773ca5947f44c8b5fe58fec985d7 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Fri, 29 Sep 2023 23:30:27 +0900 Subject: [PATCH 1/7] Improve cache usage in CI (#7678) * Improve cache usage in CI * Trigger GA to check the effect of improvement --- .github/workflows/rust.yml | 77 ++++++++++---------------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index fa5c56b43e03..80de6e663fcd 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -47,18 +47,24 @@ jobs: image: amd64/rust steps: - uses: actions/checkout@v4 - - name: Cache Cargo - uses: actions/cache@v3 - with: - # these represent dependencies downloaded by cargo - # and thus do not depend on the OS, arch nor rust version. - path: /github/home/.cargo - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: rust-version: stable + - name: Cache Cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ./target/ + ./datafusion-cli/target/ + # this key equals the ones on `linux-build-lib` for re-use + key: cargo-cache-benchmark-${{ hashFiles('datafusion/**/Cargo.toml', 'benchmarks/Cargo.toml', 'datafusion-cli/Cargo.toml') }} + - name: Check workspace without default features run: cargo check --no-default-features -p datafusion @@ -84,12 +90,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -109,12 +109,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -211,12 +205,6 @@ jobs: image: amd64/rust steps: - uses: actions/checkout@v4 - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -241,9 +229,14 @@ jobs: - name: Cache Cargo uses: actions/cache@v3 with: - path: /github/home/.cargo + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + ./target/ # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- + key: cargo-cache-benchmark-${{ hashFiles('datafusion/**/Cargo.toml', 'benchmarks/Cargo.toml') }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -377,12 +370,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - uses: actions/setup-python@v4 with: python-version: "3.8" @@ -480,12 +467,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -506,12 +487,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: 
Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -531,12 +506,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: @@ -563,12 +532,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From 70cded6e3d95036d4150d4c77b7e57caa90d7a22 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 30 Sep 2023 00:30:26 +0800 Subject: [PATCH 2/7] fix: substrait limit when fetch is None (#7669) * fix: substrait limit when fetch is None Signed-off-by: Ruihang Xia * Add comments --------- Signed-off-by: Ruihang Xia Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/limit.rs | 2 +- datafusion/substrait/src/logical_plan/consumer.rs | 9 +++++++-- datafusion/substrait/src/logical_plan/producer.rs | 3 ++- .../substrait/tests/cases/roundtrip_logical_plan.rs | 5 +++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 922c3db0efc8..31ed08399c2e 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -442,7 +442,7 @@ impl LimitStream { match &poll { Poll::Ready(Some(Ok(batch))) => { - if batch.num_rows() > 0 && self.skip == 0 { + if batch.num_rows() > 0 { break poll; } else { // continue to poll input stream diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 32b8f8ea547f..e1dde39427a5 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -227,8 +227,13 @@ pub async fn from_substrait_rel( from_substrait_rel(ctx, input, extensions).await?, ); let offset = fetch.offset as usize; - let count = fetch.count as usize; - input.limit(offset, Some(count))?.build() + // Since protobuf can't directly distinguish `None` vs `0` `None` is encoded as `MAX` + let count = if fetch.count as usize == usize::MAX { + None + } else { + Some(fetch.count as usize) + }; + input.limit(offset, count)?.build() } else { not_impl_err!("Fetch without an input is not valid") } diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index e17b022f3b53..1124ea53a557 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -193,7 +193,8 @@ pub fn to_substrait_rel( } LogicalPlan::Limit(limit) => { let input = to_substrait_rel(limit.input.as_ref(), ctx, extension_info)?; - let limit_fetch = limit.fetch.unwrap_or(0); + // Since protobuf can't directly distinguish `None` vs `0` encode `None` as `MAX` + let limit_fetch = limit.fetch.unwrap_or(usize::MAX); Ok(Box::new(Rel { rel_type: Some(RelType::Fetch(Box::new(FetchRel { common: None, diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 
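The fix above hinges on a sentinel encoding: Substrait's `FetchRel.count` is a plain integer field, so it cannot represent DataFusion's `fetch: Option<usize>` directly. The producer maps `None` to `usize::MAX` and the consumer maps `usize::MAX` back to `None`, which keeps an explicit `LIMIT 0` distinguishable from "no limit". A minimal sketch of that round trip, with hypothetical helper names rather than the actual producer/consumer code:

```rust
/// Encode an optional fetch count into a protobuf-style field that has
/// no notion of `None` (sketch; helper names are illustrative only).
fn encode_fetch_count(fetch: Option<usize>) -> u64 {
    // `None` (no limit) is represented by the sentinel `usize::MAX`
    fetch.unwrap_or(usize::MAX) as u64
}

/// Decode the sentinel back into an optional fetch count.
fn decode_fetch_count(count: u64) -> Option<usize> {
    if count as usize == usize::MAX {
        None
    } else {
        Some(count as usize)
    }
}

fn main() {
    // `None` survives the round trip ...
    assert_eq!(decode_fetch_count(encode_fetch_count(None)), None);
    // ... and stays distinguishable from an explicit `LIMIT 0`
    assert_eq!(decode_fetch_count(encode_fetch_count(Some(0))), Some(0));
    assert_eq!(decode_fetch_count(encode_fetch_count(Some(100))), Some(100));
}
```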
f4d74ae42681..2554d0667e48 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -188,6 +188,11 @@ async fn select_with_limit() -> Result<()> { roundtrip_fill_na("SELECT * FROM data LIMIT 100").await } +#[tokio::test] +async fn select_without_limit() -> Result<()> { + roundtrip_fill_na("SELECT * FROM data OFFSET 10").await +} + #[tokio::test] async fn select_with_limit_offset() -> Result<()> { roundtrip("SELECT * FROM data LIMIT 200 OFFSET 10").await From 2d6e768c6084c4955ac6dfb9c389aa8849464f05 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 30 Sep 2023 00:35:10 +0800 Subject: [PATCH 3/7] minor: revert parsing precedence between Aggr and UDAF (#7682) * minor: revert parsing precedence between Aggr and UDAF Signed-off-by: Ruihang Xia * add unit test Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- .../user_defined/user_defined_aggregates.rs | 37 +++++++++++++++++-- datafusion/sql/src/expr/function.rs | 18 ++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 64547bbdfa36..3b7b4d0e87b7 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -169,6 +169,37 @@ async fn test_udaf_returning_struct_subquery() { assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); } +#[tokio::test] +async fn test_udaf_shadows_builtin_fn() { + let TestContext { + mut ctx, + test_state, + } = TestContext::new(); + let sql = "SELECT sum(arrow_cast(time, 'Int64')) from t"; + + // compute with builtin `sum` aggregator + let expected = [ + "+-------------+", + "| SUM(t.time) |", + "+-------------+", + "| 19000 |", + "+-------------+", + ]; + assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); + + // Register `TimeSum` with name `sum`. 
This will shadow the builtin one + let sql = "SELECT sum(time) from t"; + TimeSum::register(&mut ctx, test_state.clone(), "sum"); + let expected = [ + "+----------------------------+", + "| sum(t.time) |", + "+----------------------------+", + "| 1970-01-01T00:00:00.000019 |", + "+----------------------------+", + ]; + assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); +} + async fn execute(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordBatch>> { ctx.sql(sql).await?.collect().await } @@ -214,7 +245,7 @@ impl TestContext { // Tell DataFusion about the "first" function FirstSelector::register(&mut ctx); // Tell DataFusion about the "time_sum" function - TimeSum::register(&mut ctx, Arc::clone(&test_state)); + TimeSum::register(&mut ctx, Arc::clone(&test_state), "time_sum"); Self { ctx, test_state } } @@ -281,7 +312,7 @@ impl TimeSum { Self { sum: 0, test_state } } - fn register(ctx: &mut SessionContext, test_state: Arc<TestState>) { + fn register(ctx: &mut SessionContext, test_state: Arc<TestState>, name: &str) { let timestamp_type = DataType::Timestamp(TimeUnit::Nanosecond, None); // Returns the same type as its input @@ -301,8 +332,6 @@ impl TimeSum { let accumulator: AccumulatorFactoryFunction = Arc::new(move |_| Ok(Box::new(Self::new(Arc::clone(&captured_state))))); - let name = "time_sum"; - let time_sum = AggregateUDF::new(name, &signature, &return_type, &accumulator, &state_type); diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 05f80fcfafa9..3861b4848d9b 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -124,6 +124,15 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { return Ok(expr); } } else { + // User defined aggregate functions (UDAF) have precedence in case it has the same name as a scalar built-in function + if let Some(fm) = self.schema_provider.get_aggregate_meta(&name) { + let args = + self.function_args_to_expr(function.args, schema, planner_context)?; + return Ok(Expr::AggregateUDF(expr::AggregateUDF::new( + fm, args, None, None, + ))); + } + // next, aggregate built-ins if let Ok(fun) = AggregateFunction::from_str(&name) { let distinct = function.distinct; @@ -141,15 +150,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))); }; - // User defined aggregate functions (UDAF) - if let Some(fm) = self.schema_provider.get_aggregate_meta(&name) { - let args = - self.function_args_to_expr(function.args, schema, planner_context)?; - return Ok(Expr::AggregateUDF(expr::AggregateUDF::new( - fm, args, None, None, - ))); - } - // Special case arrow_cast (as its type is dependent on its argument value) if name == ARROW_CAST_NAME { let args = From 85f3578f5fb47d28a8bc3a7b9be0284b3ced0fcd Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sat, 30 Sep 2023 00:51:39 +0800 Subject: [PATCH 4/7] Minor: Move hash utils to common (#7684) * move hash utils to common Signed-off-by: jayzhan211 * support backward compatibility Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 Co-authored-by: Andrew Lamb --- datafusion-cli/Cargo.lock | 4 ++++ datafusion/common/Cargo.toml | 4 ++++ .../{physical-expr => common}/src/hash_utils.rs | 16 ++++++++-------- datafusion/common/src/lib.rs | 1 + .../physical-expr/src/expressions/in_list.rs | 2 +- datafusion/physical-expr/src/lib.rs | 4 +++- .../src/aggregates/group_values/row.rs | 2 +- datafusion/physical-plan/src/lib.rs | 3 ++- .../src/windows/bounded_window_agg_exec.rs | 3 ++- 9 files changed, 26 insertions(+), 13 deletions(-) rename datafusion/{physical-expr =>
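The function.rs change in PATCH 3 above restores a simple resolution order in the SQL planner: a user-defined aggregate is looked up before the built-in aggregates, so registering a UDAF under a built-in's name (as the `test_udaf_shadows_builtin_fn` test does with `sum`) shadows the built-in. A self-contained sketch of that lookup order, with made-up registry types standing in for the real planner machinery:

```rust
use std::collections::HashMap;

/// Stand-in for the planner's resolution result (illustrative only).
#[derive(Debug, PartialEq)]
enum Resolved {
    Udaf(String),
    BuiltIn(String),
}

/// Resolve an aggregate call the way the patched planner does:
/// user-defined aggregates first, built-in aggregates second.
fn resolve_aggregate(name: &str, udafs: &HashMap<String, String>) -> Resolved {
    if let Some(impl_name) = udafs.get(name) {
        // 1) a registered UDAF with this name wins ...
        return Resolved::Udaf(impl_name.clone());
    }
    // 2) ... otherwise fall back to the built-in aggregate
    Resolved::BuiltIn(name.to_uppercase())
}

fn main() {
    let mut udafs = HashMap::new();
    // before registration, `sum` resolves to the built-in SUM
    assert_eq!(
        resolve_aggregate("sum", &udafs),
        Resolved::BuiltIn("SUM".into())
    );
    // registering a UDAF named `sum` shadows the built-in
    udafs.insert("sum".into(), "time_sum".into());
    assert_eq!(
        resolve_aggregate("sum", &udafs),
        Resolved::Udaf("time_sum".into())
    );
}
```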
common}/src/hash_utils.rs (98%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 0ca83452bd02..775f8ec87e38 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1122,9 +1122,13 @@ dependencies = [ name = "datafusion-common" version = "31.0.0" dependencies = [ + "ahash", "arrow", "arrow-array", + "arrow-buffer", + "arrow-schema", "chrono", + "half", "num_cpus", "object_store", "parquet", diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index f2b8f1a1e4be..b5cdec1be17b 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -39,10 +39,14 @@ default = ["parquet"] pyarrow = ["pyo3", "arrow/pyarrow"] [dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } apache-avro = { version = "0.16", default-features = false, features = ["snappy"], optional = true } arrow = { workspace = true } arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } chrono = { workspace = true } +half = { version = "2.1", default-features = false } num_cpus = "1.13.0" object_store = { version = "0.7.0", default-features = false, optional = true } parquet = { workspace = true, optional = true } diff --git a/datafusion/physical-expr/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs similarity index 98% rename from datafusion/physical-expr/src/hash_utils.rs rename to datafusion/common/src/hash_utils.rs index 379e0eba5277..9198461e00bf 100644 --- a/datafusion/physical-expr/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -17,19 +17,19 @@ //! Functionality used both on logical and physical plans +use std::sync::Arc; + use ahash::RandomState; use arrow::array::*; use arrow::datatypes::*; use arrow::row::Rows; use arrow::{downcast_dictionary_array, downcast_primitive_array}; use arrow_buffer::i256; -use datafusion_common::{ - cast::{ - as_boolean_array, as_generic_binary_array, as_primitive_array, as_string_array, - }, - internal_err, DataFusionError, Result, + +use crate::cast::{ + as_boolean_array, as_generic_binary_array, as_primitive_array, as_string_array, }; -use std::sync::Arc; +use crate::error::{DataFusionError, Result, _internal_err}; // Combines two hashes into one hash #[inline] @@ -51,7 +51,7 @@ fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: } } -pub(crate) trait HashValue { +pub trait HashValue { fn hash_one(&self, state: &RandomState) -> u64; } @@ -337,7 +337,7 @@ pub fn create_hashes<'a>( } _ => { // This is internal because we should have caught this before. 
- return internal_err!( + return _internal_err!( "Unsupported data type in hasher: {}", col.data_type() ); diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index eeb5b2681370..71782f67046d 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -25,6 +25,7 @@ mod error; pub mod file_options; pub mod format; mod functional_dependencies; +pub mod hash_utils; mod join_type; pub mod parsers; #[cfg(feature = "pyarrow")] diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index c92bbbb74f16..bdc476f5b3a1 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -24,7 +24,6 @@ use std::fmt::Debug; use std::hash::{Hash, Hasher}; use std::sync::Arc; -use crate::hash_utils::HashValue; use crate::physical_expr::down_cast_any_ref; use crate::utils::expr_list_eq_any_order; use crate::PhysicalExpr; @@ -37,6 +36,7 @@ use arrow::datatypes::*; use arrow::record_batch::RecordBatch; use arrow::util::bit_iterator::BitIndexIterator; use arrow::{downcast_dictionary_array, downcast_primitive_array}; +use datafusion_common::hash_utils::HashValue; use datafusion_common::{ cast::{as_boolean_array, as_generic_binary_array, as_string_array}, internal_err, not_impl_err, DataFusionError, Result, ScalarValue, diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs index e83dee2e6c80..48d5f4e1308b 100644 --- a/datafusion/physical-expr/src/lib.rs +++ b/datafusion/physical-expr/src/lib.rs @@ -28,7 +28,6 @@ pub mod equivalence; pub mod execution_props; pub mod expressions; pub mod functions; -pub mod hash_utils; pub mod intervals; pub mod math_expressions; mod partitioning; @@ -49,6 +48,9 @@ pub mod utils; pub mod var_provider; pub mod window; +// For backwards compatibility +pub use datafusion_common::hash_utils; + pub use aggregate::groups_accumulator::{ EmitTo, GroupsAccumulator, GroupsAccumulatorAdapter, }; diff --git a/datafusion/physical-plan/src/aggregates/group_values/row.rs b/datafusion/physical-plan/src/aggregates/group_values/row.rs index 746537557d46..10ff9edb8912 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/row.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/row.rs @@ -22,9 +22,9 @@ use arrow::record_batch::RecordBatch; use arrow::row::{RowConverter, Rows, SortField}; use arrow_array::{Array, ArrayRef}; use arrow_schema::{DataType, SchemaRef}; +use datafusion_common::hash_utils::create_hashes; use datafusion_common::{DataFusionError, Result}; use datafusion_execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt}; -use datafusion_physical_expr::hash_utils::create_hashes; use datafusion_physical_expr::EmitTo; use hashbrown::raw::RawTable; diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 76adf7611d6f..aca10893db3d 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -375,10 +375,11 @@ pub mod windows; use crate::repartition::RepartitionExec; use crate::sorts::sort_preserving_merge::SortPreservingMergeExec; +pub use datafusion_common::hash_utils; pub use datafusion_common::utils::project_schema; use datafusion_execution::TaskContext; pub use datafusion_physical_expr::{ - expressions, functions, hash_utils, ordering_equivalence_properties_helper, udf, + expressions, functions, ordering_equivalence_properties_helper, udf, }; #[cfg(test)] diff --git 
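With the module moved, `datafusion_common::hash_utils` becomes the canonical path, and the `pub use` re-exports above keep the old `datafusion_physical_expr::hash_utils` and `datafusion_physical_plan::hash_utils` paths compiling for downstream crates. A usage sketch of `create_hashes` from its new location (call shape inferred from the signature in the moved file; the input column is made up for illustration):

```rust
use std::sync::Arc;

use ahash::RandomState;
use arrow_array::{ArrayRef, Int32Array};
use datafusion_common::hash_utils::create_hashes;
use datafusion_common::Result;

fn main() -> Result<()> {
    // one made-up column; in DataFusion this would be a batch's key columns
    let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));

    // fixed seeds so the hashes are reproducible across runs of this example
    let random_state = RandomState::with_seeds(0, 0, 0, 0);

    // one output slot per row; hashes are combined across all input columns
    let mut hashes_buffer = vec![0u64; col.len()];
    create_hashes(&[col], &random_state, &mut hashes_buffer)?;

    println!("row hashes: {hashes_buffer:?}");
    Ok(())
}
```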
a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 4108b4220599..dfef0ddefa03 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -43,6 +43,8 @@ use arrow::{ datatypes::{Schema, SchemaBuilder, SchemaRef}, record_batch::RecordBatch, }; + +use datafusion_common::hash_utils::create_hashes; use datafusion_common::utils::{ evaluate_partition_ranges, get_arrayref_at_indices, get_at_indices, get_record_batch_at_indices, get_row_at_idx, @@ -51,7 +53,6 @@ use datafusion_common::{exec_err, plan_err, DataFusionError, Result}; use datafusion_execution::TaskContext; use datafusion_expr::window_state::{PartitionBatchState, WindowAggState}; use datafusion_expr::ColumnarValue; -use datafusion_physical_expr::hash_utils::create_hashes; use datafusion_physical_expr::window::{ PartitionBatches, PartitionKey, PartitionWindowAggStates, WindowState, }; From 692ea24357d32b1242c476f0ed33498c815ac921 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Sat, 30 Sep 2023 01:22:52 -0400 Subject: [PATCH 5/7] Update Default Parquet Write Compression (#7692) * update compression default * fix tests --------- Co-authored-by: Andrew Lamb --- datafusion/common/src/config.rs | 2 +- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- docs/source/user-guide/configs.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b34c64ff8893..261c2bf435a4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -307,7 +307,7 @@ config_namespace! { /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. /// These values are not case sensitive. If NULL, uses /// default parquet writer setting - pub compression: Option<String>, default = None + pub compression: Option<String>, default = Some("zstd(3)".into()) /// Sets if dictionary encoding is enabled. 
If NULL, uses /// default parquet writer setting diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f90901021637..12aa9089a0c9 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -156,7 +156,7 @@ datafusion.execution.parquet.bloom_filter_enabled false datafusion.execution.parquet.bloom_filter_fpp NULL datafusion.execution.parquet.bloom_filter_ndv NULL datafusion.execution.parquet.column_index_truncate_length NULL -datafusion.execution.parquet.compression NULL +datafusion.execution.parquet.compression zstd(3) datafusion.execution.parquet.created_by datafusion datafusion.execution.parquet.data_page_row_count_limit 18446744073709551615 datafusion.execution.parquet.data_pagesize_limit 1048576 diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 7fe229b4d3c6..638ac5a36b83 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -58,7 +58,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.data_pagesize_limit | 1048576 | Sets best effort maximum size of data page in bytes | | datafusion.execution.parquet.write_batch_size | 1024 | Sets write_batch_size in bytes | | datafusion.execution.parquet.writer_version | 1.0 | Sets parquet writer version valid values are "1.0" and "2.0" | -| datafusion.execution.parquet.compression | NULL | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.compression | zstd(3) | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_enabled | NULL | Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. 
If NULL, uses default parquet writer setting | From 5d522fb55628de063ea59ee738aa341cb73b71ff Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 1 Oct 2023 19:33:25 +0900 Subject: [PATCH 6/7] Stop using cache for the benchmark job (#7706) --- .github/workflows/rust.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 80de6e663fcd..37533396e0d8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -226,17 +226,6 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - ./target/ - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache-benchmark-${{ hashFiles('datafusion/**/Cargo.toml', 'benchmarks/Cargo.toml') }} - name: Setup Rust toolchain uses: ./.github/actions/setup-builder with: From 14cdf72099bca3bb82c0103a5586b89ddb2feecc Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Sun, 1 Oct 2023 19:46:38 +0900 Subject: [PATCH 7/7] Change rust.yml to run benchmark (#7708) * Change rust.yml to run benchmark * Restore unrelated change --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 37533396e0d8..0f572c9687dd 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -241,7 +241,7 @@ jobs: - name: Verify that benchmark queries return expected results run: | export TPCH_DATA=`realpath datafusion/sqllogictest/test_files/tpch/data` - cargo test serde_q --profile release-nonlto --features=ci -- --test-threads=1 + cargo test plan_q --package datafusion-benchmarks --profile release-nonlto --features=ci -- --test-threads=1 INCLUDE_TPCH=true cargo test --test sqllogictests - name: Verify Working Directory Clean run: git diff --exit-code
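PATCH 5 above changes the compiled-in default for `datafusion.execution.parquet.compression` from NULL to `zstd(3)`, which the sqllogictest and docs updates reflect. A short sketch of what that means for callers (the field layout follows the config.rs hunk; the `ConfigOptions::new`/`set` calls are assumptions about typical `datafusion-common` config usage, not quoted from this patch):

```rust
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut options = ConfigOptions::new();

    // after this patch the default is Some("zstd(3)") rather than None
    assert_eq!(
        options.execution.parquet.compression.as_deref(),
        Some("zstd(3)")
    );

    // the setting can still be overridden per session, e.g. back to snappy
    options.set("datafusion.execution.parquet.compression", "snappy")?;
    assert_eq!(
        options.execution.parquet.compression.as_deref(),
        Some("snappy")
    );
    Ok(())
}
```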