Skip to content

Commit

Permalink
feat: using proper magic numbers in various edge cases (#143)
Browse files Browse the repository at this point in the history
**Summary**: Using magic numbers from Postgres in various selectivity
edge cases.

**Demo**:

Different (unfortunately worse) q-error on TPC-H SF1. See #127 for
per-query details on how this PR affects q-error.

![Screenshot 2024-03-30 at 11 27
24](https://github.com/cmu-db/optd/assets/20631215/b0cce5d4-6ac8-4cd5-b0cf-48f86db14d26)


**Details**:
* Fixed the cardinality of Q10!
* `INVALID_SEL` is **no longer used** at all during cardtest. It is
still used during plannertest because some plannertests use the optd
optimizer instead of the datafusion logical optimizer. The cardtest claim
can be verified by replacing all instances of `INVALID_SEL` with a
`panic!()` and seeing that cardtest still runs.
* Using the magic number from Postgres for `LIKE`.
* Using the magic number from Postgres for equality with various complex
expressions.
* Using the magic number from Postgres for range comparisons with various
complex expressions.
* Replaced `INVALID_SEL` with `panic!()` and `unreachable!()` statements
in places where it makes sense.
  • Loading branch information
wangpatrick57 authored Mar 30, 2024
1 parent bd40a10 commit f42a3cd
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 107 deletions.
235 changes: 145 additions & 90 deletions optd-datafusion-repr/src/cost/base_cost.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,16 @@ pub trait Distribution: 'static + Send + Sync {
pub const ROW_COUNT: usize = 1;
pub const COMPUTE_COST: usize = 2;
pub const IO_COST: usize = 3;
// used to indicate a combination of unimplemented!(), unreachable!(), or panic!()
// TODO: a future PR will remove this and get the code working for all of TPC-H
const INVALID_SELECTIVITY: f64 = 0.001;

// Default statistics. All are from selfuncs.h in Postgres unless specified otherwise
// These are the fallback estimates used when a predicate cannot be estimated from statistics
// Default selectivity estimate for equalities such as "A = b"
// The estimate for "A != b" is derived from this as 1.0 - DEFAULT_EQ_SEL
const DEFAULT_EQ_SEL: f64 = 0.005;
// Default selectivity estimate for inequalities such as "A < b"
// The literal is one third written out to f64 precision
const DEFAULT_INEQ_SEL: f64 = 0.3333333333333333;
// Default selectivity estimate for pattern-match operators such as LIKE
const DEFAULT_MATCH_SEL: f64 = 0.005;

// Placeholder selectivity for expression shapes that do not have a real estimate yet
// (e.g. Between, InList, Cast operands, and comparisons without any column ref)
// NOTE(review): this value does not appear to come from Postgres -- confirm before relying on it
const INVALID_SEL: f64 = 0.01;

impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
pub fn row_cnt(Cost(cost): &Cost) -> f64 {
Expand Down Expand Up @@ -421,10 +428,10 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
row_cnt.min(fetch as f64)
}
} else {
(row_cnt * INVALID_SELECTIVITY).max(1.0)
panic!("compute_cost() should not be called if optimizer is None")
}
} else {
(row_cnt * INVALID_SELECTIVITY).max(1.0)
panic!("compute_cost() should not be called if context is None")
};
Self::cost(row_cnt, compute_cost, 0.0)
}
Expand All @@ -446,13 +453,13 @@ impl<M: MostCommonValues, D: Distribution> CostModel<OptRelNodeTyp> for OptCostM
if let Some(expr_tree) = expr_trees.first() {
self.get_filter_selectivity(Arc::clone(expr_tree), &column_refs)
} else {
INVALID_SELECTIVITY
panic!("encountered a PhysicalFilter without an expression")
}
} else {
INVALID_SELECTIVITY
panic!("compute_cost() should not be called if optimizer is None")
}
}
None => INVALID_SELECTIVITY,
None => panic!("compute_cost() should not be called if context is None"),
};

Self::cost(
Expand Down Expand Up @@ -552,53 +559,73 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
column_refs: &GroupColumnRefs,
) -> f64 {
assert!(expr_tree.typ.is_expression());
match expr_tree.typ {
match &expr_tree.typ {
OptRelNodeTyp::Constant(_) => todo!("check bool type or else panic"),
OptRelNodeTyp::ColumnRef => todo!("check bool type or else panic"),
OptRelNodeTyp::UnOp(un_op_typ) => {
assert!(expr_tree.children.len() == 1);
let child = expr_tree.child(0);
match un_op_typ {
// not doesn't care about nulls so there's no complex logic. it just reverses the selectivity
// for instance, != _will not_ include nulls but "NOT ==" _will_ include nulls
UnOpType::Not => 1.0 - self.get_filter_selectivity(child, column_refs),
UnOpType::Neg => panic!(
"the selectivity of operations that return numerical values is undefined"
),
}
}
OptRelNodeTyp::BinOp(bin_op_typ) => {
assert!(expr_tree.children.len() == 2);
let left_child = expr_tree.child(0);
let right_child = expr_tree.child(1);

if bin_op_typ.is_comparison() {
self.get_comparison_op_selectivity(
bin_op_typ,
*bin_op_typ,
left_child,
right_child,
column_refs,
)
} else if bin_op_typ.is_numerical() {
INVALID_SELECTIVITY
panic!(
"the selectivity of operations that return numerical values is undefined"
)
} else {
unreachable!("all BinOpTypes should be true for at least one is_*() function")
}
}
OptRelNodeTyp::LogOp(log_op_typ) => {
self.get_log_op_selectivity(log_op_typ, &expr_tree.children, column_refs)
self.get_log_op_selectivity(*log_op_typ, &expr_tree.children, column_refs)
}
OptRelNodeTyp::UnOp(un_op_typ) => {
assert!(expr_tree.children.len() == 1);
let child = expr_tree.child(0);
match un_op_typ {
// not doesn't care about nulls so there's no complex logic. it just reverses the selectivity
// for instance, != _will not_ include nulls but "NOT ==" _will_ include nulls
UnOpType::Not => 1.0 - self.get_filter_selectivity(child, column_refs),
_ => INVALID_SELECTIVITY,
}
OptRelNodeTyp::Func(_) => todo!("check bool type or else panic"),
OptRelNodeTyp::SortOrder(_) => {
panic!("the selectivity of sort order expressions is undefined")
}
OptRelNodeTyp::Between => INVALID_SEL,
OptRelNodeTyp::Cast => todo!("check bool type or else panic"),
OptRelNodeTyp::Like => DEFAULT_MATCH_SEL,
OptRelNodeTyp::DataType(_) => {
panic!("the selectivity of a data type is not defined")
}
_ => INVALID_SELECTIVITY,
OptRelNodeTyp::InList => INVALID_SEL,
_ => unreachable!(
"all expression OptRelNodeTyp were enumerated. this should be unreachable"
),
}
}

/// Comparison operators are the base case for recursion in get_filter_selectivity()
fn get_comparison_op_selectivity(
&self,
bin_op_typ: BinOpType,
comp_bin_op_typ: BinOpType,
left: OptRelNodeRef,
right: OptRelNodeRef,
column_refs: &GroupColumnRefs,
) -> f64 {
assert!(bin_op_typ.is_comparison());
assert!(comp_bin_op_typ.is_comparison());

// the # of column refs determines how we handle the logic
// it's more convenient to refer to the children based on whether they're column nodes or not
// rather than by left/right
let mut col_ref_nodes = vec![];
let mut non_col_ref_nodes = vec![];
let is_left_col_ref;
Expand All @@ -623,8 +650,9 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
non_col_ref_nodes.push(right);
}

// handle the different cases of column nodes
if col_ref_nodes.is_empty() {
INVALID_SELECTIVITY
INVALID_SEL
} else if col_ref_nodes.len() == 1 {
let col_ref_node = col_ref_nodes
.pop()
Expand All @@ -636,79 +664,98 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
.pop()
.expect("non_col_ref_nodes should have a value since col_ref_nodes.len() == 1");

if let OptRelNodeTyp::Constant(_) = non_col_ref_node.as_ref().typ {
let value = non_col_ref_node
.as_ref()
.data
.as_ref()
.expect("constants should have data");
match match bin_op_typ {
BinOpType::Eq => {
self.get_column_equality_selectivity(table, *col_idx, value, true)
}
BinOpType::Neq => {
self.get_column_equality_selectivity(table, *col_idx, value, false)
match non_col_ref_node.as_ref().typ {
OptRelNodeTyp::Constant(_) => {
let value = non_col_ref_node
.as_ref()
.data
.as_ref()
.expect("constants should have data");
match comp_bin_op_typ {
BinOpType::Eq => {
self.get_column_equality_selectivity(table, *col_idx, value, true)
}
BinOpType::Neq => {
self.get_column_equality_selectivity(table, *col_idx, value, false)
}
BinOpType::Lt => self.get_column_range_selectivity(
table,
*col_idx,
value,
is_left_col_ref,
false,
),
BinOpType::Leq => self.get_column_range_selectivity(
table,
*col_idx,
value,
is_left_col_ref,
true,
),
BinOpType::Gt => self.get_column_range_selectivity(
table,
*col_idx,
value,
!is_left_col_ref,
false,
),
BinOpType::Geq => self.get_column_range_selectivity(
table,
*col_idx,
value,
!is_left_col_ref,
true,
),
_ => unreachable!("all comparison BinOpTypes were enumerated. this should be unreachable"),
}
BinOpType::Lt => self.get_column_range_selectivity(
table,
*col_idx,
value,
is_left_col_ref,
false,
),
BinOpType::Leq => self.get_column_range_selectivity(
table,
*col_idx,
value,
is_left_col_ref,
true,
),
BinOpType::Gt => self.get_column_range_selectivity(
table,
*col_idx,
value,
!is_left_col_ref,
false,
),
BinOpType::Geq => self.get_column_range_selectivity(
table,
*col_idx,
value,
!is_left_col_ref,
true,
),
_ => None,
} {
Some(sel) => sel,
None => INVALID_SELECTIVITY,
}
} else {
INVALID_SELECTIVITY
OptRelNodeTyp::BinOp(_) => {
Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
}
OptRelNodeTyp::Cast => INVALID_SEL,
_ => unimplemented!(
"unhandled case of comparing a column ref node to {}",
non_col_ref_node.as_ref().typ
),
}
} else {
INVALID_SELECTIVITY
unimplemented!("non base table column refs need to be implemented")
}
} else if col_ref_nodes.len() == 2 {
INVALID_SELECTIVITY
Self::get_default_comparison_op_selectivity(comp_bin_op_typ)
} else {
unreachable!("We could have at most pushed left and right into col_ref_nodes")
unreachable!("we could have at most pushed left and right into col_ref_nodes")
}
}

/// Fallback selectivity for a comparison that cannot be estimated from statistics
///
/// This applies when one side of the comparison is a column and the other side is too
/// complex or impossible to evaluate (a subquery, a UDF, another column, a column we have
/// no stats for, etc.)
///
/// Equality yields `DEFAULT_EQ_SEL`, `!=` yields its complement, and every range comparison
/// (`<`, `<=`, `>`, `>=`) yields `DEFAULT_INEQ_SEL`
///
/// # Panics
///
/// Panics if `comp_bin_op_typ` is not a comparison operator
fn get_default_comparison_op_selectivity(comp_bin_op_typ: BinOpType) -> f64 {
    assert!(comp_bin_op_typ.is_comparison());

    // all four range comparisons share a single default, so peel them off first
    let is_range_comparison = matches!(
        comp_bin_op_typ,
        BinOpType::Lt | BinOpType::Leq | BinOpType::Gt | BinOpType::Geq
    );
    if is_range_comparison {
        return DEFAULT_INEQ_SEL;
    }

    // what is left of the enumerated comparison operators is equality and its negation
    match comp_bin_op_typ {
        BinOpType::Eq => DEFAULT_EQ_SEL,
        BinOpType::Neq => 1.0 - DEFAULT_EQ_SEL,
        _ => unreachable!(
            "all comparison BinOpTypes were enumerated. this should be unreachable"
        ),
    }
}

/// Get the selectivity of an expression of the form "column equals value" (or "value equals column")
/// Computes selectivity based off of statistics
/// Will handle the case of statistics missing
/// Equality predicates are handled entirely differently from range predicates so this is its own function
/// Also, get_column_equality_selectivity is a subroutine when computing range selectivity, which is another
/// reason for separating these into two functions
/// If it is unable to find the statistics, it falls back to a default selectivity based on DEFAULT_EQ_SEL
/// is_eq means whether it's == or !=
fn get_column_equality_selectivity(
&self,
table: &str,
col_idx: usize,
value: &Value,
is_eq: bool,
) -> Option<f64> {
) -> f64 {
if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
{
Expand All @@ -722,16 +769,26 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
// note that nulls are not included in ndistinct so we don't need to do non_mcv_cnt - 1 if null_frac > 0
(non_mcv_freq - per_column_stats.null_frac) / (non_mcv_cnt as f64)
};
Some(if is_eq {
if is_eq {
eq_freq
} else {
1.0 - eq_freq - per_column_stats.null_frac
})
}
} else {
None
#[allow(clippy::collapsible_else_if)]
if is_eq {
DEFAULT_EQ_SEL
} else {
1.0 - DEFAULT_EQ_SEL
}
}
} else {
None
#[allow(clippy::collapsible_else_if)]
if is_eq {
DEFAULT_EQ_SEL
} else {
1.0 - DEFAULT_EQ_SEL
}
}
}

Expand All @@ -748,7 +805,7 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
value: &Value,
is_col_lt_val: bool,
is_col_eq_val: bool,
) -> Option<f64> {
) -> f64 {
if let Some(per_table_stats) = self.per_table_stats_map.get(table) {
if let Some(Some(per_column_stats)) = per_table_stats.per_column_stats_vec.get(col_idx)
{
Expand All @@ -764,12 +821,10 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
// depending on whether value is in mcvs or not, we use different logic to turn total_leq_cdf into total_lt_cdf
// this logic just so happens to be the exact same logic as get_column_equality_selectivity implements
let total_lt_freq = total_leq_freq
- self
.get_column_equality_selectivity(table, col_idx, value, true)
.expect("we already know that table and col_idx exist");
- self.get_column_equality_selectivity(table, col_idx, value, true);

// use either total_leq_freq or total_lt_freq to get the selectivity
Some(if is_col_lt_val {
if is_col_lt_val {
if is_col_eq_val {
// this branch means <=
total_leq_freq
Expand All @@ -788,12 +843,12 @@ impl<M: MostCommonValues, D: Distribution> OptCostModel<M, D> {
// this branch means >. same logic as above
1.0 - total_leq_freq - per_column_stats.null_frac
}
})
}
} else {
None
DEFAULT_INEQ_SEL
}
} else {
None
DEFAULT_INEQ_SEL
}
}

Expand Down
4 changes: 2 additions & 2 deletions optd-perftest/src/cardtest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ pub trait CardtestRunnerDBMSHelper {

pub async fn cardtest<P: AsRef<Path>>(
workspace_dpath: P,
use_cached_optd_stats: bool,
no_cached_optd_stats: bool,
pguser: &str,
pgpassword: &str,
tpch_config: TpchConfig,
) -> anyhow::Result<HashMap<String, Vec<Cardinfo>>> {
let pg_dbms = Box::new(PostgresDBMS::build(&workspace_dpath, pguser, pgpassword)?);
let truecard_getter = pg_dbms.clone();
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, use_cached_optd_stats).await?);
let df_dbms = Box::new(DatafusionDBMS::new(&workspace_dpath, no_cached_optd_stats).await?);
let dbmss: Vec<Box<dyn CardtestRunnerDBMSHelper>> = vec![pg_dbms, df_dbms];

let tpch_benchmark = Benchmark::Tpch(tpch_config.clone());
Expand Down
Loading

0 comments on commit f42a3cd

Please sign in to comment.