From d34c22c0c60f4e6afa7481213e696e01f9f050d5 Mon Sep 17 00:00:00 2001
From: Ritu Pathak <33733001+Sweetsuro@users.noreply.github.com>
Date: Thu, 15 Feb 2024 23:16:45 -0500
Subject: [PATCH] feat: Eliminate Duplicated Expr Rule (#67)

# Major Changes
Add an eliminate duplicate expression rule which:
1. Removes duplicate sort expressions
2. Removes duplicate aggregate group bys

Also, this PR adds derive traits for Hash, PartialEq, and Eq for
RelNode.

Examples:
`select * from t1 order by id, name, id desc, id asc, name desc` becomes
`select * from t1 order by id, name`

`select * from t1 group by id, name, id` becomes
`select * from t1 group by id, name`

## Rule Type
Heuristics (always apply), Transformation Rule (logical to logical)
---
 optd-core/src/rel_node.rs                     |   2 +-
 optd-datafusion-repr/src/lib.rs               |   7 +-
 optd-datafusion-repr/src/plan_nodes/expr.rs   |   4 +
 optd-datafusion-repr/src/plan_nodes/sort.rs   |   4 +
 optd-datafusion-repr/src/rules.rs             |   4 +
 .../src/rules/eliminate_duplicated_expr.rs    | 117 ++++++++++++++++++
 .../eliminate_duplicated_expr.planner.sql     | 108 ++++++++++++++++
 .../tests/eliminate_duplicated_expr.yml       |  29 +++++
 8 files changed, 272 insertions(+), 3 deletions(-)
 create mode 100644 optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs
 create mode 100644 optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql
 create mode 100644 optd-sqlplannertest/tests/eliminate_duplicated_expr.yml
diff --git a/optd-core/src/rel_node.rs b/optd-core/src/rel_node.rs
index ca3e538b..a27d2fdb 100644
--- a/optd-core/src/rel_node.rs
+++ b/optd-core/src/rel_node.rs
@@ -144,7 +144,7 @@ impl Value {
 }
 
 /// A RelNode is consisted of a plan node type and some children.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Hash, PartialEq, Eq)]
 pub struct RelNode<T: RelNodeTyp> {
     pub typ: T,
     pub children: Vec<RelNodeRef<T>>,
diff --git a/optd-datafusion-repr/src/lib.rs b/optd-datafusion-repr/src/lib.rs
index 92898ba6..6a6f4731 100644
--- a/optd-datafusion-repr/src/lib.rs
+++ b/optd-datafusion-repr/src/lib.rs
@@ -11,8 +11,9 @@ use properties::{
     schema::{Catalog, SchemaPropertyBuilder},
 };
 use rules::{
-    EliminateFilterRule, EliminateJoinRule, EliminateLimitRule, HashJoinRule, JoinAssocRule,
-    JoinCommuteRule, PhysicalConversionRule, ProjectionPullUpJoin,
+    EliminateDuplicatedAggExprRule, EliminateDuplicatedSortExprRule, EliminateFilterRule,
+    EliminateJoinRule, EliminateLimitRule, HashJoinRule, JoinAssocRule, JoinCommuteRule,
+    PhysicalConversionRule, ProjectionPullUpJoin,
 };
 
 pub use adaptive::PhysicalCollector;
@@ -53,6 +54,8 @@ impl DatafusionOptimizer {
         rules.push(Arc::new(EliminateJoinRule::new()));
         rules.push(Arc::new(EliminateFilterRule::new()));
         rules.push(Arc::new(EliminateLimitRule::new()));
+        rules.push(Arc::new(EliminateDuplicatedSortExprRule::new()));
+        rules.push(Arc::new(EliminateDuplicatedAggExprRule::new()));
 
         let cost_model = AdaptiveCostModel::new(50);
         Self {
diff --git a/optd-datafusion-repr/src/plan_nodes/expr.rs b/optd-datafusion-repr/src/plan_nodes/expr.rs
index 380649f9..fbd27062 100644
--- a/optd-datafusion-repr/src/plan_nodes/expr.rs
+++ b/optd-datafusion-repr/src/plan_nodes/expr.rs
@@ -37,6 +37,10 @@ impl ExprList {
             .map(|x| Expr::from_rel_node(x.clone()).unwrap())
             .collect_vec()
     }
+
+    pub fn from_group(rel_node: OptRelNodeRef) -> Self {
+        Self(rel_node)
+    }
 }
 
 impl OptRelNode for ExprList {
diff --git a/optd-datafusion-repr/src/plan_nodes/sort.rs b/optd-datafusion-repr/src/plan_nodes/sort.rs
index e99c25f3..8fe205ef 100644
--- a/optd-datafusion-repr/src/plan_nodes/sort.rs
+++ b/optd-datafusion-repr/src/plan_nodes/sort.rs
@@ -6,6 +6,10 @@ use super::{OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode};
 #[derive(Clone, Debug)]
 pub struct LogicalSort(pub PlanNode);
 
+// each expression in ExprList is represented as a SortOrderExpr
+// 1. nulls_first is not included from DF
+// 2. node type defines sort order per expression
+// 3. actual expr is stored as a child of this node
 define_plan_node!(
     LogicalSort : PlanNode,
     Sort, [
diff --git a/optd-datafusion-repr/src/rules.rs b/optd-datafusion-repr/src/rules.rs
index 08b4c843..2e49af82 100644
--- a/optd-datafusion-repr/src/rules.rs
+++ b/optd-datafusion-repr/src/rules.rs
@@ -1,4 +1,5 @@
 // mod filter_join;
+mod eliminate_duplicated_expr;
 mod eliminate_filter;
 mod eliminate_limit;
 mod joins;
@@ -6,6 +7,9 @@ mod macros;
 mod physical;
 
 // pub use filter_join::FilterJoinPullUpRule;
+pub use eliminate_duplicated_expr::{
+    EliminateDuplicatedAggExprRule, EliminateDuplicatedSortExprRule,
+};
 pub use eliminate_filter::EliminateFilterRule;
 pub use eliminate_limit::EliminateLimitRule;
 pub use joins::{
diff --git a/optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs b/optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs
new file mode 100644
index 00000000..6984aa4b
--- /dev/null
+++ b/optd-datafusion-repr/src/rules/eliminate_duplicated_expr.rs
@@ -0,0 +1,117 @@
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+use itertools::Itertools;
+use optd_core::rules::{Rule, RuleMatcher};
+use optd_core::{optimizer::Optimizer, rel_node::RelNode};
+
+use crate::plan_nodes::{
+    Expr, ExprList, LogicalAgg, LogicalSort, OptRelNode, OptRelNodeTyp, PlanNode, SortOrderExpr,
+    SortOrderType,
+};
+
+use super::macros::define_rule;
+
+define_rule!(
+    EliminateDuplicatedSortExprRule,
+    apply_eliminate_duplicated_sort_expr,
+    (Sort, child, [exprs])
+);
+
+/// Removes duplicate sort expressions
+/// For exmaple:
+///     select *
+///     from t1
+///     order by id desc, id, name, id asc
+/// becomes
+///     select *
+///     from t1
+///     order by id desc, name
+fn apply_eliminate_duplicated_sort_expr(
+    _optimizer: &impl Optimizer<OptRelNodeTyp>,
+    EliminateDuplicatedSortExprRulePicks { child, exprs }: EliminateDuplicatedSortExprRulePicks,
+) -> Vec<RelNode<OptRelNodeTyp>> {
+    let sort_keys: Vec<Expr> = exprs
+        .children
+        .iter()
+        .map(|x| Expr::from_rel_node(x.clone()).unwrap())
+        .collect_vec();
+
+    let normalized_sort_keys: Vec<Arc<RelNode<OptRelNodeTyp>>> = exprs
+        .children
+        .iter()
+        .map(|x| match x.typ {
+            OptRelNodeTyp::SortOrder(_) => SortOrderExpr::new(
+                SortOrderType::Asc,
+                SortOrderExpr::from_rel_node(x.clone()).unwrap().child(),
+            )
+            .into_rel_node(),
+            _ => x.clone(),
+        })
+        .collect_vec();
+
+    let mut dedup_expr: Vec<Expr> = Vec::new();
+    let mut dedup_set: HashSet<Arc<RelNode<OptRelNodeTyp>>> = HashSet::new();
+
+    sort_keys
+        .iter()
+        .zip(normalized_sort_keys.iter())
+        .for_each(|(expr, normalized_expr)| {
+            if !dedup_set.contains(normalized_expr) {
+                dedup_expr.push(expr.clone());
+                dedup_set.insert(normalized_expr.to_owned());
+            }
+        });
+
+    if dedup_expr.len() != sort_keys.len() {
+        let node = LogicalSort::new(
+            PlanNode::from_group(child.into()),
+            ExprList::new(dedup_expr),
+        );
+        return vec![node.into_rel_node().as_ref().clone()];
+    }
+    vec![]
+}
+
+define_rule!(
+    EliminateDuplicatedAggExprRule,
+    apply_eliminate_duplicated_agg_expr,
+    (Agg, child, exprs, [groups])
+);
+
+/// Removes duplicate group by expressions
+/// For exmaple:
+///     select *
+///     from t1
+///     group by id, name, id, id
+/// becomes
+///     select *
+///     from t1
+///     group by id, name
+fn apply_eliminate_duplicated_agg_expr(
+    _optimizer: &impl Optimizer<OptRelNodeTyp>,
+    EliminateDuplicatedAggExprRulePicks {
+        child,
+        exprs,
+        groups,
+    }: EliminateDuplicatedAggExprRulePicks,
+) -> Vec<RelNode<OptRelNodeTyp>> {
+    let mut dedup_expr: Vec<Expr> = Vec::new();
+    let mut dedup_set: HashSet<Arc<RelNode<OptRelNodeTyp>>> = HashSet::new();
+    groups.children.iter().for_each(|expr| {
+        if !dedup_set.contains(expr) {
+            dedup_expr.push(Expr::from_rel_node(expr.clone()).unwrap());
+            dedup_set.insert(expr.clone());
+        }
+    });
+
+    if dedup_expr.len() != groups.children.len() {
+        let node = LogicalAgg::new(
+            PlanNode::from_group(child.into()),
+            ExprList::from_group(exprs.into()),
+            ExprList::new(dedup_expr),
+        );
+        return vec![node.into_rel_node().as_ref().clone()];
+    }
+    vec![]
+}
diff --git a/optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql b/optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql
new file mode 100644
index 00000000..b31e774f
--- /dev/null
+++ b/optd-sqlplannertest/tests/eliminate_duplicated_expr.planner.sql
@@ -0,0 +1,108 @@
+-- (no id or description)
+create table t1(v1 int, v2 int);
+insert into t1 values (0, 0), (1, 1), (5, 2), (2, 4), (0, 2);
+
+/*
+5
+*/
+
+-- Test without sorts/aggs.
+select * from t1;
+
+/*
+LogicalProjection { exprs: [ #0, #1 ] }
+└── LogicalScan { table: t1 }
+PhysicalProjection { exprs: [ #0, #1 ] }
+└── PhysicalScan { table: t1 }
+0 0
+1 1
+5 2
+2 4
+0 2
+*/
+
+-- Test whether the optimizer handles duplicate sort expressions correctly.
+select * from t1 order by v1, v2, v1 desc, v2 desc, v1 asc;
+
+/*
+LogicalSort
+├── exprs:
+│   ┌── SortOrder { order: Asc }
+│   │   └── #0
+│   ├── SortOrder { order: Asc }
+│   │   └── #1
+│   ├── SortOrder { order: Desc }
+│   │   └── #0
+│   ├── SortOrder { order: Desc }
+│   │   └── #1
+│   └── SortOrder { order: Asc }
+│       └── #0
+└── LogicalProjection { exprs: [ #0, #1 ] }
+    └── LogicalScan { table: t1 }
+PhysicalSort
+├── exprs:
+│   ┌── SortOrder { order: Asc }
+│   │   └── #0
+│   └── SortOrder { order: Asc }
+│       └── #1
+└── PhysicalProjection { exprs: [ #0, #1 ] }
+    └── PhysicalScan { table: t1 }
+0 0
+0 2
+1 1
+2 4
+5 2
+*/
+
+-- Test whether the optimizer handles duplicate agg expressions correctly.
+select * from t1 group by v1, v2, v1;
+
+/*
+LogicalProjection { exprs: [ #0, #1 ] }
+└── LogicalAgg { exprs: [], groups: [ #0, #1, #0 ] }
+    └── LogicalScan { table: t1 }
+PhysicalProjection { exprs: [ #0, #1 ] }
+└── PhysicalAgg { aggrs: [], groups: [ #0, #1 ] }
+    └── PhysicalScan { table: t1 }
+0 0
+1 1
+5 2
+2 4
+0 2
+*/
+
+-- Test whether the optimizer handles duplicate sort and agg expressions correctly.
+select * from t1 group by v1, v2, v1, v2, v2 order by v1, v2, v1 desc, v2 desc, v1 asc;
+
+/*
+LogicalSort
+├── exprs:
+│   ┌── SortOrder { order: Asc }
+│   │   └── #0
+│   ├── SortOrder { order: Asc }
+│   │   └── #1
+│   ├── SortOrder { order: Desc }
+│   │   └── #0
+│   ├── SortOrder { order: Desc }
+│   │   └── #1
+│   └── SortOrder { order: Asc }
+│       └── #0
+└── LogicalProjection { exprs: [ #0, #1 ] }
+    └── LogicalAgg { exprs: [], groups: [ #0, #1, #0, #1, #1 ] }
+        └── LogicalScan { table: t1 }
+PhysicalSort
+├── exprs:
+│   ┌── SortOrder { order: Asc }
+│   │   └── #0
+│   └── SortOrder { order: Asc }
+│       └── #1
+└── PhysicalProjection { exprs: [ #0, #1 ] }
+    └── PhysicalAgg { aggrs: [], groups: [ #0, #1 ] }
+        └── PhysicalScan { table: t1 }
+0 0
+0 2
+1 1
+2 4
+5 2
+*/
+
diff --git a/optd-sqlplannertest/tests/eliminate_duplicated_expr.yml b/optd-sqlplannertest/tests/eliminate_duplicated_expr.yml
new file mode 100644
index 00000000..4c0d57b4
--- /dev/null
+++ b/optd-sqlplannertest/tests/eliminate_duplicated_expr.yml
@@ -0,0 +1,29 @@
+- sql: |
+    create table t1(v1 int, v2 int);
+    insert into t1 values (0, 0), (1, 1), (5, 2), (2, 4), (0, 2);
+  tasks:
+    - execute
+- sql: |
+    select * from t1;
+  desc: Test without sorts/aggs.
+  tasks:
+    - explain:logical_optd,physical_optd
+    - execute
+- sql: |
+    select * from t1 order by v1, v2, v1 desc, v2 desc, v1 asc;
+  desc: Test whether the optimizer handles duplicate sort expressions correctly.
+  tasks:
+    - explain:logical_optd,physical_optd
+    - execute
+- sql: |
+    select * from t1 group by v1, v2, v1;
+  desc: Test whether the optimizer handles duplicate agg expressions correctly.
+  tasks:
+    - explain:logical_optd,physical_optd
+    - execute
+- sql: |
+    select * from t1 group by v1, v2, v1, v2, v2 order by v1, v2, v1 desc, v2 desc, v1 asc;
+  desc: Test whether the optimizer handles duplicate sort and agg expressions correctly.
+  tasks:
+    - explain:logical_optd,physical_optd
+    - execute
\ No newline at end of file