Skip to content

Commit

Permalink
feat: "hello world" selectivity computation (#70)
Browse files Browse the repository at this point in the history
**Description**: reason this PR just contains the simplest possible
selectivity computation is because computing selectivity at all requires
lots of refactoring, so I want that refactoring reviewed separately from
the "core" selectivity computation logic.

**Demo**: passing `test_colref_eq_constint_in_mcv` test
![Screenshot 2024-02-16 at 12 37
56](https://github.com/cmu-db/optd/assets/20631215/66661d41-0b3f-4148-96bf-0db2cabb00c0)
  • Loading branch information
wangpatrick57 authored Feb 19, 2024
1 parent ad37149 commit 755db92
Show file tree
Hide file tree
Showing 18 changed files with 357 additions and 148 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions ci.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# runs the stuff in CI.yaml locally
# unfortunately this needs to be updated manually. just update it if you get annoyed by GHAs failing

set -ex

cargo fmt --all -- --check
cargo clippy --workspace --all-targets --all-features --locked -- -D warnings
cargo test --no-fail-fast --workspace --all-features --locked
2 changes: 2 additions & 0 deletions optd-core/src/cascades/memo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ impl<T: RelNodeTyp> Memo<T> {
self.groups.remove(&group_id);
}

/// If group_id exists, it adds expr_id to the existing group
/// Otherwise, it creates a new group of that group_id and insert expr_id into the new group
fn add_expr_to_group(
&mut self,
expr_id: ExprId,
Expand Down
11 changes: 8 additions & 3 deletions optd-core/src/cascades/optimizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,11 @@ pub struct CascadesOptimizer<T: RelNodeTyp> {

/// `RelNode` only contains the representation of the plan nodes. Sometimes, we need more context, i.e., group id and
/// expr id, during the optimization phase. All these information are collected in this struct.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
pub struct RelNodeContext {
pub group_id: GroupId,
pub expr_id: ExprId,
pub children_group_ids: Vec<GroupId>,
}

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
Expand Down Expand Up @@ -322,9 +323,13 @@ impl<T: RelNodeTyp> CascadesOptimizer<T> {
.get_all_expr_bindings(expr_id, false, false, level)
}

pub fn get_all_group_physical_bindings(&self, group_id: GroupId) -> Vec<RelNodeRef<T>> {
pub fn get_all_group_bindings(
&self,
group_id: GroupId,
physical_only: bool,
) -> Vec<RelNodeRef<T>> {
self.memo
.get_all_group_bindings(group_id, true, true, Some(10))
.get_all_group_bindings(group_id, physical_only, true, Some(10))
}

pub(super) fn is_group_explored(&self, group_id: GroupId) -> bool {
Expand Down
28 changes: 21 additions & 7 deletions optd-core/src/cascades/tasks/optimize_inputs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
trace!(event = "task_begin", task = "optimize_inputs", expr_id = %self.expr_id, continue_from = ?self.continue_from);
let expr = optimizer.get_expr_memoed(self.expr_id);
let group_id = optimizer.get_group_id(self.expr_id);
let children = &expr.children;
let children_group_ids = &expr.children;
let cost = optimizer.cost();

if let Some(ContinueTask {
Expand All @@ -149,10 +149,17 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
let context = RelNodeContext {
expr_id: self.expr_id,
group_id,
children_group_ids: children_group_ids.clone(),
};
if self.should_terminate(
cost.sum(
&cost.compute_cost(&expr.typ, &expr.data, &input_cost, Some(context)),
&cost.compute_cost(
&expr.typ,
&expr.data,
&input_cost,
Some(context.clone()),
Some(optimizer),
),
&input_cost,
)
.0[0],
Expand All @@ -161,8 +168,8 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
trace!(event = "task_finish", task = "optimize_inputs", expr_id = %self.expr_id);
return Ok(vec![]);
}
if next_group_idx < children.len() {
let group_id = children[next_group_idx];
if next_group_idx < children_group_ids.len() {
let group_id = children_group_ids[next_group_idx];
let group_idx = next_group_idx;
let group_info = optimizer.get_group_info(group_id);
let mut has_full_winner = false;
Expand All @@ -176,7 +183,8 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
&expr.typ,
&expr.data,
&input_cost,
Some(context),
Some(context.clone()),
Some(optimizer),
),
&input_cost,
)
Expand Down Expand Up @@ -243,7 +251,13 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
} else {
self.update_winner(
&cost.sum(
&cost.compute_cost(&expr.typ, &expr.data, &input_cost, Some(context)),
&cost.compute_cost(
&expr.typ,
&expr.data,
&input_cost,
Some(context.clone()),
Some(optimizer),
),
&input_cost,
),
optimizer,
Expand All @@ -252,7 +266,7 @@ impl<T: RelNodeTyp> Task<T> for OptimizeInputsTask {
Ok(vec![])
}
} else {
let input_cost = self.first_invoke(children, optimizer);
let input_cost = self.first_invoke(children_group_ids, optimizer);
trace!(event = "task_yield", task = "optimize_inputs", expr_id = %self.expr_id);
Ok(vec![Box::new(self.continue_from(
ContinueTask {
Expand Down
4 changes: 3 additions & 1 deletion optd-core/src/cost.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{
cascades::RelNodeContext,
cascades::{CascadesOptimizer, RelNodeContext},
rel_node::{RelNode, RelNodeTyp, Value},
};

Expand All @@ -13,6 +13,8 @@ pub trait CostModel<T: RelNodeTyp>: 'static + Send + Sync {
data: &Option<Value>,
children: &[Cost],
context: Option<RelNodeContext>,
// one reason we need the optimizer is to traverse children nodes to build up an expression tree
optimizer: Option<&CascadesOptimizer<T>>,
) -> Cost;

fn compute_plan_node_cost(&self, node: &RelNode<T>) -> Cost;
Expand Down
35 changes: 8 additions & 27 deletions optd-datafusion-bridge/src/from_optd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ use datafusion::{
use optd_datafusion_repr::{
plan_nodes::{
BetweenExpr, BinOpExpr, BinOpType, CastExpr, ColumnRefExpr, ConstantExpr, ConstantType,
Expr, ExprList, FuncExpr, FuncType, JoinType, LikeExpr, LogOpExpr, LogOpType, OptRelNode,
OptRelNodeRef, OptRelNodeTyp, PhysicalAgg, PhysicalEmptyRelation, PhysicalFilter,
PhysicalHashJoin, PhysicalLimit, PhysicalNestedLoopJoin, PhysicalProjection, PhysicalScan,
PhysicalSort, PlanNode, SortOrderExpr, SortOrderType,
Expr, FuncExpr, FuncType, JoinType, LikeExpr, OptRelNode, OptRelNodeRef, OptRelNodeTyp,
PhysicalAgg, PhysicalEmptyRelation, PhysicalFilter, PhysicalHashJoin, PhysicalLimit,
PhysicalNestedLoopJoin, PhysicalProjection, PhysicalScan, PhysicalSort, PlanNode,
SortOrderExpr, SortOrderType,
},
properties::schema::Schema as OptdSchema,
PhysicalCollector,
Expand Down Expand Up @@ -191,23 +191,6 @@ impl OptdPlanContext<'_> {
}
}
OptRelNodeTyp::Sort => unreachable!(),
OptRelNodeTyp::LogOp(typ) => {
let expr = LogOpExpr::from_rel_node(expr.into_rel_node()).unwrap();
let mut children = expr.children().to_vec().into_iter();
let first_expr = Self::conv_from_optd_expr(children.next().unwrap(), context)?;
let op = match typ {
LogOpType::And => datafusion::logical_expr::Operator::And,
LogOpType::Or => datafusion::logical_expr::Operator::Or,
};
children.try_fold(first_expr, |acc, expr| {
let expr = Self::conv_from_optd_expr(expr, context)?;
Ok(
Arc::new(datafusion::physical_plan::expressions::BinaryExpr::new(
acc, op, expr,
)) as Arc<dyn PhysicalExpr>,
)
})
}
OptRelNodeTyp::BinOp(op) => {
let expr = BinOpExpr::from_rel_node(expr.into_rel_node()).unwrap();
let left = Self::conv_from_optd_expr(expr.left_child(), context)?;
Expand Down Expand Up @@ -237,12 +220,10 @@ impl OptdPlanContext<'_> {
// TODO: should we just convert between to x <= c1 and x >= c2?
let expr = BetweenExpr::from_rel_node(expr.into_rel_node()).unwrap();
Self::conv_from_optd_expr(
LogOpExpr::new(
LogOpType::And,
ExprList::new(vec![
BinOpExpr::new(expr.child(), expr.lower(), BinOpType::Geq).into_expr(),
BinOpExpr::new(expr.child(), expr.upper(), BinOpType::Leq).into_expr(),
]),
BinOpExpr::new(
BinOpExpr::new(expr.child(), expr.lower(), BinOpType::Geq).into_expr(),
BinOpExpr::new(expr.child(), expr.upper(), BinOpType::Leq).into_expr(),
BinOpType::And,
)
.into_expr(),
context,
Expand Down
26 changes: 16 additions & 10 deletions optd-datafusion-bridge/src/into_optd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ use datafusion_expr::Expr as DFExpr;
use optd_core::rel_node::RelNode;
use optd_datafusion_repr::plan_nodes::{
BetweenExpr, BinOpExpr, BinOpType, CastExpr, ColumnRefExpr, ConstantExpr, Expr, ExprList,
FuncExpr, FuncType, JoinType, LikeExpr, LogOpExpr, LogOpType, LogicalAgg, LogicalEmptyRelation,
LogicalFilter, LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan, LogicalSort,
OptRelNode, OptRelNodeRef, OptRelNodeTyp, PlanNode, SortOrderExpr, SortOrderType,
FuncExpr, FuncType, JoinType, LikeExpr, LogicalAgg, LogicalEmptyRelation, LogicalFilter,
LogicalJoin, LogicalLimit, LogicalProjection, LogicalScan, LogicalSort, OptRelNode,
OptRelNodeRef, OptRelNodeTyp, PlanNode, SortOrderExpr, SortOrderType,
};

use crate::OptdPlanContext;
Expand Down Expand Up @@ -296,13 +296,19 @@ impl OptdPlanContext<'_> {
} else if log_ops.len() == 1 {
Ok(LogicalJoin::new(left, right, log_ops.remove(0), join_type))
} else {
let expr_list = ExprList::new(log_ops);
Ok(LogicalJoin::new(
left,
right,
LogOpExpr::new(LogOpType::And, expr_list).into_expr(),
join_type,
))
// Build a left-deep tree from log_ops
// I wanted to pop from the left instead of the right to maintain the order, even if it's slower
// you can obv change log_ops to a Deque to avoid this issue but I didn't bother since I don't wanna
// do premature optimization
let left_nonlog_op = log_ops.remove(0);
let right_nonlog_op = log_ops.remove(0);
let mut cond =
BinOpExpr::new(left_nonlog_op, right_nonlog_op, BinOpType::And).into_expr();
while !log_ops.is_empty() {
cond = BinOpExpr::new(cond, log_ops.remove(0), BinOpType::And).into_expr();
}

Ok(LogicalJoin::new(left, right, cond, join_type))
}
}

Expand Down
4 changes: 2 additions & 2 deletions optd-datafusion-bridge/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use std::{
sync::{Arc, Mutex},
};

struct OptdPlanContext<'a> {
pub struct OptdPlanContext<'a> {
tables: HashMap<String, Arc<dyn TableSource>>,
session_state: &'a SessionState,
pub optimizer: Option<&'a DatafusionOptimizer>,
Expand Down Expand Up @@ -251,7 +251,7 @@ impl OptdQueryPlanner {
));
let bindings = optimizer
.optd_optimizer()
.get_all_group_physical_bindings(group_id);
.get_all_group_bindings(group_id, true);
let mut join_orders = BTreeSet::new();
let mut logical_join_orders = BTreeSet::new();
for binding in bindings {
Expand Down
1 change: 1 addition & 0 deletions optd-datafusion-repr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ optd-core = { path = "../optd-core" }
camelpaste = "0.1"
datafusion-expr = "32.0.0"
async-trait = "0.1"
datafusion = "32.0.0"
4 changes: 2 additions & 2 deletions optd-datafusion-repr/src/bin/test_optimize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use optd_core::{
rel_node::Value,
};
use optd_datafusion_repr::{
cost::OptCostModel,
cost::{OptCostModel, PerTableStats},
plan_nodes::{
BinOpExpr, BinOpType, ColumnRefExpr, ConstantExpr, JoinType, LogicalFilter, LogicalJoin,
LogicalScan, OptRelNode, OptRelNodeTyp, PlanNode,
Expand Down Expand Up @@ -36,7 +36,7 @@ pub fn main() {
Box::new(OptCostModel::new(
[("t1", 1000), ("t2", 100), ("t3", 10000)]
.into_iter()
.map(|(x, y)| (x.to_string(), y))
.map(|(x, y)| (x.to_string(), PerTableStats::new(y, vec![])))
.collect(),
)),
vec![],
Expand Down
4 changes: 3 additions & 1 deletion optd-datafusion-repr/src/cost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ mod adaptive_cost;
mod base_cost;

pub use adaptive_cost::{AdaptiveCostModel, RuntimeAdaptionStorage};
pub use base_cost::{OptCostModel, COMPUTE_COST, IO_COST, ROW_COUNT};
pub use base_cost::{
OptCostModel, PerColumnStats, PerTableStats, COMPUTE_COST, IO_COST, ROW_COUNT,
};
10 changes: 7 additions & 3 deletions optd-datafusion-repr/src/cost/adaptive_cost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::{

use crate::{cost::OptCostModel, plan_nodes::OptRelNodeTyp};
use optd_core::{
cascades::{GroupId, RelNodeContext},
cascades::{CascadesOptimizer, GroupId, RelNodeContext},
cost::{Cost, CostModel},
rel_node::{RelNode, Value},
};
Expand Down Expand Up @@ -43,6 +43,7 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
data: &Option<Value>,
children: &[Cost],
context: Option<RelNodeContext>,
optimizer: Option<&CascadesOptimizer<OptRelNodeTyp>>,
) -> Cost {
if let OptRelNodeTyp::PhysicalScan = node {
let guard = self.runtime_row_cnt.lock().unwrap();
Expand All @@ -57,8 +58,11 @@ impl CostModel<OptRelNodeTyp> for AdaptiveCostModel {
return OptCostModel::cost(1.0, 0.0, 1.0);
}
}
let (mut row_cnt, compute_cost, io_cost) =
OptCostModel::cost_tuple(&self.base_model.compute_cost(node, data, children, None));
let (mut row_cnt, compute_cost, io_cost) = OptCostModel::cost_tuple(
&self
.base_model
.compute_cost(node, data, children, context.clone(), optimizer),
);
if let Some(context) = context {
let guard = self.runtime_row_cnt.lock().unwrap();
if let Some((runtime_row_cnt, iter)) = guard.history.get(&context.group_id) {
Expand Down
Loading

0 comments on commit 755db92

Please sign in to comment.