-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(df-repr/bridge): upgrade datafusion to 43.0.0 (#260)
Besides the upgrade itself: * New `create_df_context`, to be used across all crates to create a datafusion context with optd. We had too much duplicate code before to set up the context. * The main refactor is about the aggregation expressions. Datafusion has a new way of doing that. * Datafusion removed cross join. We didn't. We can eventually remove it, but for now that is blocked on two-stage cascades: if we simply treat cross join the same as inner join, we would time out. * Several other refactors to adapt to datafusion (e.g., limit node now takes i64, empty relation / placeholder row executor). * Keep as much of the original datafusion cli crate as possible. We now only patch main.rs and exec.rs. * There's one more breaking change that we might encounter later when doing sort physical properties. The datafusion logical plan now removes duplicate sorts if there are no limits present. I feel this is a bad move b/c it's not a direct mapping from the original SQL statement... --------- Signed-off-by: Alex Chi <[email protected]>
- Loading branch information
Showing
67 changed files
with
12,894 additions
and
14,378 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,52 +18,61 @@ | |
[package] | ||
name = "datafusion-optd-cli" | ||
description = "Command Line Client for DataFusion query engine." | ||
version = "32.0.0" | ||
version = "43.0.0" | ||
authors = ["Apache DataFusion <[email protected]>"] | ||
edition = "2021" | ||
keywords = ["arrow", "datafusion", "query", "sql"] | ||
license = "Apache-2.0" | ||
homepage = "https://github.com/cmu-db/optd" | ||
repository = "https://github.com/cmu-db/optd" | ||
rust-version = "1.70" | ||
# Specify MSRV here as `cargo msrv` doesn't support workspace version | ||
rust-version = "1.79" | ||
readme = "README.md" | ||
|
||
[dependencies] | ||
arrow = "47.0.0" | ||
async-trait = "0.1.41" | ||
aws-config = "0.55" | ||
aws-credential-types = "0.55" | ||
clap = { version = "3", features = ["derive", "cargo"] } | ||
datafusion = { version = "32.0.0", features = [ | ||
arrow = { version = "53.0.0" } | ||
async-trait = "0.1.73" | ||
aws-config = "1.5.5" | ||
aws-sdk-sso = "1.43.0" | ||
aws-sdk-ssooidc = "1.44.0" | ||
aws-sdk-sts = "1.43.0" | ||
# end pin aws-sdk crates | ||
aws-credential-types = "1.2.0" | ||
clap = { version = "4.5.16", features = ["derive", "cargo"] } | ||
datafusion = { version = "43.0.0", features = [ | ||
"avro", | ||
"crypto_expressions", | ||
"datetime_expressions", | ||
"encoding_expressions", | ||
"parquet", | ||
"regex_expressions", | ||
"unicode_expressions", | ||
"compression", | ||
] } | ||
dirs = "4.0.0" | ||
env_logger = "0.9" | ||
dirs = "5.0.1" | ||
env_logger = "0.11" | ||
futures = "0.3" | ||
mimalloc = { version = "0.1", default-features = false } | ||
object_store = { version = "0.7.0", features = ["aws", "gcp"] } | ||
object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] } | ||
parking_lot = { version = "0.12" } | ||
parquet = { version = "53.0.0", default-features = false } | ||
regex = "1.8" | ||
rustyline = "11.0" | ||
rustyline = "14.0" | ||
tokio = { version = "1.24", features = [ | ||
"macros", | ||
"rt", | ||
"rt-multi-thread", | ||
"sync", | ||
"parking_lot", | ||
"signal", | ||
] } | ||
url = "2.2" | ||
# begin optd-cli patch | ||
optd-datafusion-bridge = { path = "../optd-datafusion-bridge", version = "0.1" } | ||
optd-datafusion-repr-adv-cost = { path = "../optd-datafusion-repr-adv-cost", version = "0.1" } | ||
optd-datafusion-repr = { path = "../optd-datafusion-repr", version = "0.1" } | ||
tracing-subscriber = "0.3" | ||
tracing = "0.1" | ||
# end optd-cli patch | ||
|
||
[dev-dependencies] | ||
assert_cmd = "2.0" | ||
ctor = "0.2.0" | ||
predicates = "3.0" | ||
rstest = "0.17" | ||
rstest = "0.22" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Shows an example of a custom session context that unions the input plan with itself. | ||
//! To run this example, use `cargo run --example cli-session-context` from within the `datafusion-cli` directory. | ||
use std::sync::Arc; | ||
|
||
use datafusion::{ | ||
dataframe::DataFrame, | ||
error::DataFusionError, | ||
execution::{context::SessionState, TaskContext}, | ||
logical_expr::{LogicalPlan, LogicalPlanBuilder}, | ||
prelude::SessionContext, | ||
}; | ||
use datafusion_optd_cli::{ | ||
cli_context::CliSessionContext, exec::exec_from_repl, print_options::PrintOptions, | ||
}; | ||
use object_store::ObjectStore; | ||
|
||
/// This is a toy example of a custom session context that unions the input plan with itself. | ||
struct MyUnionerContext { | ||
ctx: SessionContext, | ||
} | ||
|
||
impl Default for MyUnionerContext { | ||
fn default() -> Self { | ||
Self { | ||
ctx: SessionContext::new(), | ||
} | ||
} | ||
} | ||
|
||
#[async_trait::async_trait] | ||
impl CliSessionContext for MyUnionerContext { | ||
fn task_ctx(&self) -> Arc<TaskContext> { | ||
self.ctx.task_ctx() | ||
} | ||
|
||
fn session_state(&self) -> SessionState { | ||
self.ctx.state() | ||
} | ||
|
||
fn register_object_store( | ||
&self, | ||
url: &url::Url, | ||
object_store: Arc<dyn ObjectStore>, | ||
) -> Option<Arc<dyn ObjectStore + 'static>> { | ||
self.ctx.register_object_store(url, object_store) | ||
} | ||
|
||
fn register_table_options_extension_from_scheme(&self, _scheme: &str) { | ||
unimplemented!() | ||
} | ||
|
||
async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result<DataFrame, DataFusionError> { | ||
let new_plan = LogicalPlanBuilder::from(plan.clone()) | ||
.union(plan.clone())? | ||
.build()?; | ||
|
||
self.ctx.execute_logical_plan(new_plan).await | ||
} | ||
} | ||
|
||
#[tokio::main] | ||
/// Runs the example. | ||
pub async fn main() { | ||
let my_ctx = MyUnionerContext::default(); | ||
|
||
let mut print_options = PrintOptions { | ||
format: datafusion_optd_cli::print_format::PrintFormat::Automatic, | ||
quiet: false, | ||
maxrows: datafusion_optd_cli::print_options::MaxRows::Unlimited, | ||
color: true, | ||
}; | ||
|
||
exec_from_repl(&my_ctx, &mut print_options).await.unwrap(); | ||
} |
Oops, something went wrong.