Skip to content

Commit

Permalink
feat: Seq scanner scans data by time range (#4809)
Browse files Browse the repository at this point in the history
* feat: seq scan by partition

* feat: part metrics

* chore: remove unused code

* chore: fmt stream

* feat: build ranges returns smallvec

* feat: move scan mem/file ranges to util and reuse

* feat: log metrics

* chore: correct some metrics

* feat: get explain info from ranges

* test: group tests and remove unused code

* chore: fix clippy

* feat: change PartitionRange end to exclusive

* test: add tests
  • Loading branch information
evenyag authored Oct 17, 2024
1 parent 613e07a commit e0c4157
Show file tree
Hide file tree
Showing 10 changed files with 517 additions and 1,024 deletions.
37 changes: 19 additions & 18 deletions src/mito2/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pub mod projection;
pub(crate) mod prune;
pub(crate) mod range;
pub(crate) mod scan_region;
pub(crate) mod scan_util;
pub(crate) mod seq_scan;
pub(crate) mod unordered_scan;

Expand Down Expand Up @@ -57,7 +58,6 @@ use crate::error::{
use crate::memtable::BoxedBatchIterator;
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_STAGE_ELAPSED};
use crate::read::prune::PruneReader;
use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics};

/// Storage internal representation of a batch of rows for a primary key (time series).
///
Expand Down Expand Up @@ -738,7 +738,7 @@ impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
pub(crate) struct ScannerMetrics {
/// Duration to prepare the scan task.
prepare_scan_cost: Duration,
/// Duration to build parts.
/// Duration to build file ranges.
build_parts_cost: Duration,
/// Duration to build the (merge) reader.
build_reader_cost: Duration,
Expand All @@ -758,31 +758,17 @@ pub(crate) struct ScannerMetrics {
num_mem_ranges: usize,
/// Number of file ranges scanned.
num_file_ranges: usize,
/// Filter related metrics for readers.
filter_metrics: ReaderFilterMetrics,
}

impl ScannerMetrics {
/// Sets and observes metrics on initializing parts.
fn observe_init_part(&mut self, build_parts_cost: Duration, reader_metrics: &ReaderMetrics) {
self.build_parts_cost = build_parts_cost;

// Observes metrics.
/// Observes metrics.
fn observe_metrics(&self) {
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(self.prepare_scan_cost.as_secs_f64());
READ_STAGE_ELAPSED
.with_label_values(&["build_parts"])
.observe(self.build_parts_cost.as_secs_f64());

// We only call this once so we overwrite it directly.
self.filter_metrics = reader_metrics.filter_metrics;
// Observes filter metrics.
self.filter_metrics.observe();
}

/// Observes metrics on scanner finish.
fn observe_metrics_on_finish(&self) {
READ_STAGE_ELAPSED
.with_label_values(&["build_reader"])
.observe(self.build_reader_cost.as_secs_f64());
Expand All @@ -801,6 +787,21 @@ impl ScannerMetrics {
READ_ROWS_RETURN.observe(self.num_rows as f64);
READ_BATCHES_RETURN.observe(self.num_batches as f64);
}

/// Merges metrics from another [ScannerMetrics].
fn merge_from(&mut self, other: &ScannerMetrics) {
self.prepare_scan_cost += other.prepare_scan_cost;
self.build_parts_cost += other.build_parts_cost;
self.build_reader_cost += other.build_reader_cost;
self.scan_cost += other.scan_cost;
self.convert_cost += other.convert_cost;
self.yield_cost += other.yield_cost;
self.total_cost += other.total_cost;
self.num_batches += other.num_batches;
self.num_rows += other.num_rows;
self.num_mem_ranges += other.num_mem_ranges;
self.num_file_ranges += other.num_file_ranges;
}
}

#[cfg(test)]
Expand Down
105 changes: 104 additions & 1 deletion src/mito2/src/read/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

//! Structs for partition ranges.
use common_time::Timestamp;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;

use crate::memtable::MemtableRef;
use crate::read::scan_region::ScanInput;
Expand Down Expand Up @@ -48,6 +50,26 @@ pub(crate) struct RangeMeta {
}

impl RangeMeta {
/// Builds a [PartitionRange] tagged with `identifier` from this range's metadata.
///
/// The range keeps an inclusive max timestamp, so the exclusive end of the
/// returned [PartitionRange] is that max value plus one, in the same time unit.
pub(crate) fn new_partition_range(&self, identifier: usize) -> PartitionRange {
    let (start, inclusive_max) = self.time_range;
    // Saturate rather than overflow: for `i64::MAX` the end stays equal to the max
    // timestamp, so rows at `i64::MAX` may be invisible. Supporting that value is not
    // guaranteed for now.
    let exclusive_end = inclusive_max.value().saturating_add(1);

    PartitionRange {
        start,
        end: Timestamp::new(exclusive_end, inclusive_max.unit()),
        num_rows: self.num_rows,
        identifier,
    }
}

/// Creates a list of ranges from the `input` for seq scan.
pub(crate) fn seq_scan_ranges(input: &ScanInput) -> Vec<RangeMeta> {
let mut ranges = Vec::with_capacity(input.memtables.len() + input.files.len());
Expand Down Expand Up @@ -177,7 +199,7 @@ impl RangeMeta {
}

fn push_seq_mem_ranges(memtables: &[MemtableRef], ranges: &mut Vec<RangeMeta>) {
// For non append-only mode, each range only contains one memtable.
// For non append-only mode, each range only contains one memtable by default.
for (i, memtable) in memtables.iter().enumerate() {
let stats = memtable.stats();
let Some(time_range) = stats.time_range() else {
Expand All @@ -195,6 +217,7 @@ impl RangeMeta {
}
}

// TODO(yingwen): Support multiple row groups in a range so we can split them later.
fn push_seq_file_ranges(
num_memtables: usize,
files: &[FileHandle],
Expand Down Expand Up @@ -264,3 +287,83 @@ fn maybe_split_ranges_for_seq_scan(ranges: Vec<RangeMeta>) -> Vec<RangeMeta> {

new_ranges
}

#[cfg(test)]
mod tests {
    use common_time::timestamp::TimeUnit;
    use common_time::Timestamp;

    use super::*;

    /// One expected group: its source indices plus the start and end values of its time range.
    type Output = (Vec<usize>, i64, i64);

    /// Builds one single-source [RangeMeta] per `(index, start, end)` tuple in `input`
    /// (timestamps in seconds), groups them with `group_ranges_for_seq_scan` and checks
    /// the groups against `expect`.
    fn run_group_ranges_test(input: &[(usize, i64, i64)], expect: &[Output]) {
        let seconds = |value: i64| Timestamp::new(value, TimeUnit::Second);
        let ranges = input
            .iter()
            .map(|&(source, start, end)| RangeMeta {
                time_range: (seconds(start), seconds(end)),
                indices: smallvec![source],
                row_group_indices: smallvec![RowGroupIndex {
                    index: source,
                    row_group_index: 0
                }],
                num_rows: 1,
            })
            .collect();
        let grouped = group_ranges_for_seq_scan(ranges);

        let mut actual: Vec<Output> = Vec::new();
        for group in grouped.iter() {
            let sources = group.indices.to_vec();
            let row_group_sources: Vec<_> = group
                .row_group_indices
                .iter()
                .map(|row_group| row_group.index)
                .collect();
            // Both index lists of a group must name the same sources in the same order.
            assert_eq!(sources, row_group_sources);

            let (start, end) = group.time_range;
            actual.push((sources, start.value(), end.value()));
        }
        assert_eq!(expect, actual);
    }

    #[test]
    fn test_group_ranges() {
        // A single range forms a single group.
        run_group_ranges_test(&[(1, 0, 2000)], &[(vec![1], 0, 2000)]);

        // 1, 2, 3, 4 => [3, 1, 4], [2]
        run_group_ranges_test(
            &[
                (1, 1000, 2000),
                (2, 6000, 7000),
                (3, 0, 1500),
                (4, 1500, 3000),
            ],
            &[(vec![3, 1, 4], 0, 3000), (vec![2], 6000, 7000)],
        );

        // 1, 2, 3 => [3], [1], [2]
        run_group_ranges_test(
            &[(1, 3000, 4000), (2, 4001, 6000), (3, 0, 1000)],
            &[
                (vec![3], 0, 1000),
                (vec![1], 3000, 4000),
                (vec![2], 4001, 6000),
            ],
        );

        // 1, 2, 3 => [3], [1, 2] (range 2 starts exactly where range 1 ends)
        run_group_ranges_test(
            &[(1, 3000, 4000), (2, 4000, 6000), (3, 0, 1000)],
            &[(vec![3], 0, 1000), (vec![1, 2], 3000, 6000)],
        );
    }
}
Loading

0 comments on commit e0c4157

Please sign in to comment.