Skip to content

Commit

Permalink
feat(mito): merge reader for mito2 (#2210)
Browse files Browse the repository at this point in the history
* feat: Implement slice and first/last timestamp for Batch

* feat(mito): implements sort/concat for Batch

* chore: fix typo

* chore: remove comments

* feat: sort and dedup

* test: test batch operations

* chore: cast enum to test op type

* test: test filter related api

* style: fix clippy

* feat: implement Node and CompareFirst

* feat: merge reader wip

* feat: merge wip

* feat: use batch's operation to sort and dedup

* feat: implement BatchReader for MergeReader

* feat: simplify codes

* test: test merge reader

* refactor: use test util to create batch

* refactor: remove unused imports

* feat: update comment

* chore: remove metadata() from Source

* chore: update comment

* feat: source supports batch iterator

* chore: update comment
  • Loading branch information
evenyag authored Aug 24, 2023
1 parent e5ba3d1 commit 4ee1034
Show file tree
Hide file tree
Showing 5 changed files with 596 additions and 50 deletions.
2 changes: 1 addition & 1 deletion src/mito2/src/memtable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use crate::read::Batch;
/// Should be unique under the same region.
pub type MemtableId = u32;

pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>>>;
pub type BoxedBatchIterator = Box<dyn Iterator<Item = Result<Batch>> + Send + Sync>;

/// In memory write buffer.
pub trait Memtable: Send + Sync + fmt::Debug {
Expand Down
60 changes: 18 additions & 42 deletions src/mito2/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

//! Common structs and utilities for reading data.
pub mod merge;

use std::sync::Arc;

use api::v1::OpType;
Expand All @@ -29,12 +31,12 @@ use datatypes::vectors::{
BooleanVector, Helper, UInt32Vector, UInt64Vector, UInt8Vector, Vector, VectorRef,
};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, SequenceNumber};

use crate::error::{
ComputeArrowSnafu, ComputeVectorSnafu, ConvertVectorSnafu, InvalidBatchSnafu, Result,
};
use crate::memtable::BoxedBatchIterator;

/// Storage internal representation of a batch of rows
/// for a primary key (time series).
Expand Down Expand Up @@ -109,7 +111,7 @@ impl Batch {
self.num_rows() == 0
}

/// Returns the first timestamp in the batch.
/// Returns the first timestamp in the batch or `None` if the batch is empty.
pub fn first_timestamp(&self) -> Option<Timestamp> {
if self.timestamps.is_empty() {
return None;
Expand All @@ -118,7 +120,7 @@ impl Batch {
Some(self.get_timestamp(0))
}

/// Returns the last timestamp in the batch.
/// Returns the last timestamp in the batch or `None` if the batch is empty.
pub fn last_timestamp(&self) -> Option<Timestamp> {
if self.timestamps.is_empty() {
return None;
Expand Down Expand Up @@ -554,20 +556,23 @@ pub struct SourceStats {
/// Async [Batch] reader and iterator wrapper.
///
/// This is the data source for SST writers or internal readers.
pub enum Source {}
pub enum Source {
/// Source from a [BoxedBatchReader].
///
/// `next_batch()` awaits the reader's own `next_batch()` and returns its result as is.
Reader(BoxedBatchReader),
/// Source from a [BoxedBatchIterator].
///
/// `next_batch()` pulls one item from the iterator without awaiting; an exhausted
/// iterator is reported as `Ok(None)` and an `Err` item is returned as the error.
Iter(BoxedBatchIterator),
}

impl Source {
/// Returns next [Batch] from this data source.
pub(crate) async fn next_batch(&mut self) -> Result<Option<Batch>> {
unimplemented!()
}

/// Returns the metadata of the source region.
pub(crate) fn metadata(&self) -> RegionMetadataRef {
unimplemented!()
match self {
Source::Reader(reader) => reader.next_batch().await,
Source::Iter(iter) => iter.next().transpose(),
}
}

// TODO(yingwen): Maybe remove this method.
// TODO(yingwen): Remove this method once we support collecting stats in the writer.
/// Returns statistics of fetched batches.
pub(crate) fn stats(&self) -> SourceStats {
unimplemented!()
Expand Down Expand Up @@ -603,46 +608,17 @@ impl<T: BatchReader + ?Sized> BatchReader for Box<T> {

#[cfg(test)]
mod tests {
use datatypes::arrow::array::{TimestampMillisecondArray, UInt64Array, UInt8Array};

use super::*;
use crate::error::Error;

fn new_batch_builder(
timestamps: &[i64],
sequences: &[u64],
op_types: &[OpType],
field: &[u64],
) -> BatchBuilder {
let mut builder = BatchBuilder::new(b"test".to_vec());
builder
.timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
timestamps.iter().copied(),
)))
.unwrap()
.sequences_array(Arc::new(UInt64Array::from_iter_values(
sequences.iter().copied(),
)))
.unwrap()
.op_types_array(Arc::new(UInt8Array::from_iter_values(
op_types.iter().map(|v| *v as u8),
)))
.unwrap()
.push_field_array(
1,
Arc::new(UInt64Array::from_iter_values(field.iter().copied())),
)
.unwrap();
builder
}
use crate::test_util::new_batch_builder;

fn new_batch(
timestamps: &[i64],
sequences: &[u64],
op_types: &[OpType],
field: &[u64],
) -> Batch {
new_batch_builder(timestamps, sequences, op_types, field)
new_batch_builder(b"test", timestamps, sequences, op_types, field)
.build()
.unwrap()
}
Expand Down
Loading

0 comments on commit 4ee1034

Please sign in to comment.