From d34c22c0c60f4e6afa7481213e696e01f9f050d5 Mon Sep 17 00:00:00 2001 From: Ritu Pathak <33733001+Sweetsuro@users.noreply.github.com> Date: Thu, 15 Feb 2024 23:16:45 -0500 Subject: [PATCH] feat: Eliminate Duplicated Expr Rule (#67) # Major Changes Add an eliminate duplicate expression rule which: 1. Removes duplicate sort expressions 2. Removes duplicate aggregate group bys Also, this PR adds derive traits for Hash, PartialEq, and Eq for RelNode. Examples: `select * from t1 order by id, name, id desc, id asc, name desc` becomes `select * from t1 order by id, name` `select * from t1 group by id, name, id` becomes `select * from t1 group by id, name` ## Rule Type Heuristics (always apply), Transformation Rule (logical to logical) --- optd-core/src/rel_node.rs | 2 +- optd-datafusion-repr/src/lib.rs | 7 +- optd-datafusion-repr/src/plan_nodes/expr.rs | 4 + optd-datafusion-repr/src/plan_nodes/sort.rs | 4 + optd-datafusion-repr/src/rules.rs | 4 + .../src/rules/eliminate_duplicated_expr.rs | 117 ++++++++++++++++++ .../eliminate_duplicated_expr.planner.sql | 108 ++++++++++++++++ .../tests/eliminate_duplicated_expr.yml | 29 +++++ 8 files changed, 272 insertions(+), 3 deletions(-) create mode 100644 optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs create mode 100644 optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql create mode 100644 optd-sqlplannertest/tests/eliminate_duplicated_expr.yml diff --git a/optd-core/src/rel_node.rs b/optd-core/src/rel_node.rs index ca3e538b..a27d2fdb 100644 --- a/optd-core/src/rel_node.rs +++ b/optd-core/src/rel_node.rs @@ -144,7 +144,7 @@ impl Value { } /// A RelNode is consisted of a plan node type and some children. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Hash, PartialEq, Eq)] pub struct RelNode { pub typ: T, pub children: Vec>, diff --git a/optd-datafusion-repr/src/lib.rs b/optd-datafusion-repr/src/lib.rs index 92898ba6..6a6f4731 100644 --- a/optd-datafusion-repr/src/lib.rs +++ b/optd-datafusion-repr/src/lib.rs @@ -11,8 +11,9 @@ use properties::{ schema::{Catalog, SchemaPropertyBuilder}, }; use rules::{ - EliminateFilterRule, EliminateJoinRule, EliminateLimitRule, HashJoinRule, JoinAssocRule, - JoinCommuteRule, PhysicalConversionRule, ProjectionPullUpJoin, + EliminateDuplicatedAggExprRule, EliminateDuplicatedSortExprRule, EliminateFilterRule, + EliminateJoinRule, EliminateLimitRule, HashJoinRule, JoinAssocRule, JoinCommuteRule, + PhysicalConversionRule, ProjectionPullUpJoin, }; pub use adaptive::PhysicalCollector; @@ -53,6 +54,8 @@ impl DatafusionOptimizer { rules.push(Arc::new(EliminateJoinRule::new())); rules.push(Arc::new(EliminateFilterRule::new())); rules.push(Arc::new(EliminateLimitRule::new())); + rules.push(Arc::new(EliminateDuplicatedSortExprRule::new())); + rules.push(Arc::new(EliminateDuplicatedAggExprRule::new())); let cost_model = AdaptiveCostModel::new(50); Self { diff --git a/optd-datafusion-repr/src/plan_nodes/expr.rs b/optd-datafusion-repr/src/plan_nodes/expr.rs index 380649f9..fbd27062 100644 --- a/optd-datafusion-repr/src/plan_nodes/expr.rs +++ b/optd-datafusion-repr/src/plan_nodes/expr.rs @@ -37,6 +37,10 @@ impl ExprList { .map(|x| Expr::from_rel_node(x.clone()).unwrap()) .collect_vec() } + + pub fn from_group(rel_node: OptRelNodeRef) -> Self { + Self(rel_node) + } } impl OptRelNode for ExprList { diff --git a/optd-datafusion-repr/src/plan_nodes/sort.rs b/optd-datafusion-repr/src/plan_nodes/sort.rs index e99c25f3..8fe205ef 100644 --- a/optd-datafusion-repr/src/plan_nodes/sort.rs +++ b/optd-datafusion-repr/src/plan_nodes/sort.rs @@ -6,6 +6,10 @@ use super::{OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode}; #[derive(Clone, Debug)] pub struct LogicalSort(pub PlanNode); +// each expression in ExprList is represented as a SortOrderExpr +// 1. nulls_first is not included from DF +// 2. node type defines sort order per expression +// 3. actual expr is stored as a child of this node define_plan_node!( LogicalSort : PlanNode, Sort, [ diff --git a/optd-datafusion-repr/src/rules.rs b/optd-datafusion-repr/src/rules.rs index 08b4c843..2e49af82 100644 --- a/optd-datafusion-repr/src/rules.rs +++ b/optd-datafusion-repr/src/rules.rs @@ -1,4 +1,5 @@ // mod filter_join; +mod eliminate_duplicated_expr; mod eliminate_filter; mod eliminate_limit; mod joins; @@ -6,6 +7,9 @@ mod macros; mod physical; // pub use filter_join::FilterJoinPullUpRule; +pub use eliminate_duplicated_expr::{ + EliminateDuplicatedAggExprRule, EliminateDuplicatedSortExprRule, +}; pub use eliminate_filter::EliminateFilterRule; pub use eliminate_limit::EliminateLimitRule; pub use joins::{ diff --git a/optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs b/optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs new file mode 100644 index 00000000..6984aa4b --- /dev/null +++ b/optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs @@ -0,0 +1,117 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use itertools::Itertools; +use optd_core::rules::{Rule, RuleMatcher}; +use optd_core::{optimizer::Optimizer, rel_node::RelNode}; + +use crate::plan_nodes::{ + Expr, ExprList, LogicalAgg, LogicalSort, OptRelNode, OptRelNodeTyp, PlanNode, SortOrderExpr, + SortOrderType, +}; + +use super::macros::define_rule; + +define_rule!( + EliminateDuplicatedSortExprRule, + apply_eliminate_duplicated_sort_expr, + (Sort, child, [exprs]) +); + +/// Removes duplicate sort expressions +/// For exmaple: +/// select * +/// from t1 +/// order by id desc, id, name, id asc +/// becomes +/// select * +/// from t1 +/// order by id desc, name +fn apply_eliminate_duplicated_sort_expr( + _optimizer: &impl Optimizer, + EliminateDuplicatedSortExprRulePicks { child, exprs }: EliminateDuplicatedSortExprRulePicks, +) -> Vec> { + let sort_keys: Vec = exprs + .children + .iter() + .map(|x| Expr::from_rel_node(x.clone()).unwrap()) + .collect_vec(); + + let normalized_sort_keys: Vec>> = exprs + .children + .iter() + .map(|x| match x.typ { + OptRelNodeTyp::SortOrder(_) => SortOrderExpr::new( + SortOrderType::Asc, + SortOrderExpr::from_rel_node(x.clone()).unwrap().child(), + ) + .into_rel_node(), + _ => x.clone(), + }) + .collect_vec(); + + let mut dedup_expr: Vec = Vec::new(); + let mut dedup_set: HashSet>> = HashSet::new(); + + sort_keys + .iter() + .zip(normalized_sort_keys.iter()) + .for_each(|(expr, normalized_expr)| { + if !dedup_set.contains(normalized_expr) { + dedup_expr.push(expr.clone()); + dedup_set.insert(normalized_expr.to_owned()); + } + }); + + if dedup_expr.len() != sort_keys.len() { + let node = LogicalSort::new( + PlanNode::from_group(child.into()), + ExprList::new(dedup_expr), + ); + return vec![node.into_rel_node().as_ref().clone()]; + } + vec![] +} + +define_rule!( + EliminateDuplicatedAggExprRule, + apply_eliminate_duplicated_agg_expr, + (Agg, child, exprs, [groups]) +); + +/// Removes duplicate group by expressions +/// For exmaple: +/// select * +/// from t1 +/// group by id, name, id, id +/// becomes +/// select * +/// from t1 +/// group by id, name +fn apply_eliminate_duplicated_agg_expr( + _optimizer: &impl Optimizer, + EliminateDuplicatedAggExprRulePicks { + child, + exprs, + groups, + }: EliminateDuplicatedAggExprRulePicks, +) -> Vec> { + let mut dedup_expr: Vec = Vec::new(); + let mut dedup_set: HashSet>> = HashSet::new(); + groups.children.iter().for_each(|expr| { + if !dedup_set.contains(expr) { + dedup_expr.push(Expr::from_rel_node(expr.clone()).unwrap()); + dedup_set.insert(expr.clone()); + } + }); + + if dedup_expr.len() != groups.children.len() { + let node = LogicalAgg::new( + PlanNode::from_group(child.into()), + ExprList::from_group(exprs.into()), + ExprList::new(dedup_expr), + ); + return vec![node.into_rel_node().as_ref().clone()]; + } + vec![] +} diff --git a/optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql b/optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql new file mode 100644 index 00000000..b31e774f --- /dev/null +++ b/optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql @@ -0,0 +1,108 @@ +-- (no id or description) +create table t1(v1 int, v2 int); +insert into t1 values (0, 0), (1, 1), (5, 2), (2, 4), (0, 2); + +/* +5 +*/ + +-- Test without sorts/aggs. +select * from t1; + +/* +LogicalProjection { exprs: [ #0, #1 ] } +└── LogicalScan { table: t1 } +PhysicalProjection { exprs: [ #0, #1 ] } +└── PhysicalScan { table: t1 } +0 0 +1 1 +5 2 +2 4 +0 2 +*/ + +-- Test whether the optimizer handles duplicate sort expressions correctly. +select * from t1 order by v1, v2, v1 desc, v2 desc, v1 asc; + +/* +LogicalSort +├── exprs: +│ ┌── SortOrder { order: Asc } +│ │ └── #0 +│ ├── SortOrder { order: Asc } +│ │ └── #1 +│ ├── SortOrder { order: Desc } +│ │ └── #0 +│ ├── SortOrder { order: Desc } +│ │ └── #1 +│ └── SortOrder { order: Asc } +│ └── #0 +└── LogicalProjection { exprs: [ #0, #1 ] } + └── LogicalScan { table: t1 } +PhysicalSort +├── exprs: +│ ┌── SortOrder { order: Asc } +│ │ └── #0 +│ └── SortOrder { order: Asc } +│ └── #1 +└── PhysicalProjection { exprs: [ #0, #1 ] } + └── PhysicalScan { table: t1 } +0 0 +0 2 +1 1 +2 4 +5 2 +*/ + +-- Test whether the optimizer handles duplicate agg expressions correctly. +select * from t1 group by v1, v2, v1; + +/* +LogicalProjection { exprs: [ #0, #1 ] } +└── LogicalAgg { exprs: [], groups: [ #0, #1, #0 ] } + └── LogicalScan { table: t1 } +PhysicalProjection { exprs: [ #0, #1 ] } +└── PhysicalAgg { aggrs: [], groups: [ #0, #1 ] } + └── PhysicalScan { table: t1 } +0 0 +1 1 +5 2 +2 4 +0 2 +*/ + +-- Test whether the optimizer handles duplicate sort and agg expressions correctly. +select * from t1 group by v1, v2, v1, v2, v2 order by v1, v2, v1 desc, v2 desc, v1 asc; + +/* +LogicalSort +├── exprs: +│ ┌── SortOrder { order: Asc } +│ │ └── #0 +│ ├── SortOrder { order: Asc } +│ │ └── #1 +│ ├── SortOrder { order: Desc } +│ │ └── #0 +│ ├── SortOrder { order: Desc } +│ │ └── #1 +│ └── SortOrder { order: Asc } +│ └── #0 +└── LogicalProjection { exprs: [ #0, #1 ] } + └── LogicalAgg { exprs: [], groups: [ #0, #1, #0, #1, #1 ] } + └── LogicalScan { table: t1 } +PhysicalSort +├── exprs: +│ ┌── SortOrder { order: Asc } +│ │ └── #0 +│ └── SortOrder { order: Asc } +│ └── #1 +└── PhysicalProjection { exprs: [ #0, #1 ] } + └── PhysicalAgg { aggrs: [], groups: [ #0, #1 ] } + └── PhysicalScan { table: t1 } +0 0 +0 2 +1 1 +2 4 +5 2 +*/ + diff --git a/optd-sqlplannertest/tests/eliminate_duplicated_expr.yml b/optd-sqlplannertest/tests/eliminate_duplicated_expr.yml new file mode 100644 index 00000000..4c0d57b4 --- /dev/null +++ b/optd-sqlplannertest/tests/eliminate_duplicated_expr.yml @@ -0,0 +1,29 @@ +- sql: | + create table t1(v1 int, v2 int); + insert into t1 values (0, 0), (1, 1), (5, 2), (2, 4), (0, 2); + tasks: + - execute +- sql: | + select * from t1; + desc: Test without sorts/aggs. + tasks: + - explain:logical_optd,physical_optd + - execute +- sql: | + select * from t1 order by v1, v2, v1 desc, v2 desc, v1 asc; + desc: Test whether the optimizer handles duplicate sort expressions correctly. + tasks: + - explain:logical_optd,physical_optd + - execute +- sql: | + select * from t1 group by v1, v2, v1; + desc: Test whether the optimizer handles duplicate agg expressions correctly. + tasks: + - explain:logical_optd,physical_optd + - execute +- sql: | + select * from t1 group by v1, v2, v1, v2, v2 order by v1, v2, v1 desc, v2 desc, v1 asc; + desc: Test whether the optimizer handles duplicate sort and agg expressions correctly. + tasks: + - explain:logical_optd,physical_optd + - execute \ No newline at end of file