Skip to content

Commit

Permalink
refactor: replace some usage of MutableBitmap by BitVec (#610)
Browse files Browse the repository at this point in the history
  • Loading branch information
evenyag authored Nov 21, 2022
1 parent 62fcb54 commit 0791c65
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use bitvec::prelude as bv;
pub use bitvec::prelude;

// `Lsb0` provides the best codegen for bit manipulation,
// see https://github.com/bitvecto-rs/bitvec/blob/main/doc/order/Lsb0.md
pub type BitVec = bv::BitVec<u8, bv::Lsb0>;
pub type BitVec = prelude::BitVec<u8>;
4 changes: 2 additions & 2 deletions src/common/base/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod bitset;
pub mod bit_vec;
pub mod buffer;
pub mod bytes;

pub use bitset::BitVec;
pub use bit_vec::BitVec;
22 changes: 0 additions & 22 deletions src/datatypes/src/vectors/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ use std::sync::Arc;

use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray};
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
use arrow::bitmap::MutableBitmap;
use arrow::datatypes::DataType as ArrowDataType;
use snafu::{OptionExt, ResultExt};

use crate::data_type::ConcreteDataType;
Expand Down Expand Up @@ -75,14 +73,6 @@ impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanVector {
}
}

impl From<MutableBitmap> for BooleanVector {
fn from(bitmap: MutableBitmap) -> BooleanVector {
BooleanVector {
array: BooleanArray::new(ArrowDataType::Boolean, bitmap.into(), None),
}
}
}

impl Vector for BooleanVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::boolean_datatype()
Expand Down Expand Up @@ -351,16 +341,4 @@ mod tests {
let expect: VectorRef = Arc::new(BooleanVector::from_slice(&[true, false, true]));
assert_eq!(expect, vector);
}

#[test]
fn test_from_mutable_bitmap() {
let mut bitmap = MutableBitmap::new();
let values = [false, true, true, false, true];
for v in values {
bitmap.push(v);
}
let vector = BooleanVector::from(bitmap);
let expect = BooleanVector::from_slice(&values);
assert_eq!(expect, vector);
}
}
12 changes: 6 additions & 6 deletions src/datatypes/src/vectors/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ mod filter;
mod find_unique;
mod replicate;

use arrow::bitmap::MutableBitmap;
use common_base::BitVec;

use crate::error::Result;
use crate::types::PrimitiveElement;
Expand Down Expand Up @@ -50,7 +50,7 @@ pub trait VectorOp {
/// Panics if
/// - `selected.len() < self.len()`.
/// - `prev_vector` and `self` have different data types.
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>);
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>);

/// Filters the vector, returns elements matching the `filter` (i.e. where the values are true).
///
Expand All @@ -65,7 +65,7 @@ macro_rules! impl_scalar_vector_op {
replicate::$replicate(self, offsets)
}

fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap());
find_unique::find_unique_scalar(self, selected, prev_vector);
}
Expand All @@ -92,7 +92,7 @@ impl VectorOp for ConstantVector {
replicate::replicate_constant(self, offsets)
}

fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
find_unique::find_unique_constant(self, selected, prev_vector);
}
Expand All @@ -107,7 +107,7 @@ impl VectorOp for NullVector {
replicate::replicate_null(self, offsets)
}

fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<NullVector>());
find_unique::find_unique_null(self, selected, prev_vector);
}
Expand All @@ -125,7 +125,7 @@ where
replicate::replicate_primitive(self, offsets)
}

fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
find_unique::find_unique_scalar(self, selected, prev_vector);
Expand Down
37 changes: 15 additions & 22 deletions src/datatypes/src/vectors/operations/find_unique.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use arrow::bitmap::MutableBitmap;
use common_base::BitVec;

use crate::scalars::ScalarVector;
use crate::vectors::{ConstantVector, NullVector, Vector};
Expand All @@ -22,7 +22,7 @@ use crate::vectors::{ConstantVector, NullVector, Vector};
// in any other case.
pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(
vector: &'a T,
selected: &'a mut MutableBitmap,
selected: &'a mut BitVec,
prev_vector: Option<&'a T>,
) where
T::RefItem<'a>: PartialEq,
Expand Down Expand Up @@ -63,7 +63,7 @@ pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(

pub(crate) fn find_unique_null(
vector: &NullVector,
selected: &mut MutableBitmap,
selected: &mut BitVec,
prev_vector: Option<&NullVector>,
) {
if vector.is_empty() {
Expand All @@ -78,7 +78,7 @@ pub(crate) fn find_unique_null(

pub(crate) fn find_unique_constant(
vector: &ConstantVector,
selected: &mut MutableBitmap,
selected: &mut BitVec,
prev_vector: Option<&ConstantVector>,
) {
if vector.is_empty() {
Expand Down Expand Up @@ -107,7 +107,7 @@ mod tests {
use super::*;
use crate::vectors::{Int32Vector, StringVector, VectorOp};

fn check_bitmap(expect: &[bool], selected: &MutableBitmap) {
fn check_bitmap(expect: &[bool], selected: &BitVec) {
let actual = selected.iter().collect::<Vec<_>>();
assert_eq!(expect, actual);
}
Expand All @@ -124,7 +124,7 @@ mod tests {
let input = Int32Vector::from_iter(input);
let prev = prev.map(Int32Vector::from_slice);

let mut selected = MutableBitmap::from_len_zeroed(input.len());
let mut selected = BitVec::repeat(false, input.len());
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));

check_bitmap(expect, &selected);
Expand Down Expand Up @@ -164,7 +164,7 @@ mod tests {
let prev = Int32Vector::from_slice(&[1]);

let v1 = Int32Vector::from_slice(&[2, 3, 4]);
let mut selected = MutableBitmap::from_len_zeroed(v1.len());
let mut selected = BitVec::repeat(false, v1.len());
v1.find_unique(&mut selected, Some(&prev));

// Though element in v2 are the same as prev, but we should still keep them.
Expand All @@ -174,15 +174,8 @@ mod tests {
check_bitmap(&[true, true, true], &selected);
}

fn new_bitmap(bits: &[bool]) -> MutableBitmap {
let mut bitmap = MutableBitmap::from_len_zeroed(bits.len());
for (i, bit) in bits.iter().enumerate() {
if *bit {
bitmap.set(i, true);
}
}

bitmap
fn new_bitmap(bits: &[bool]) -> BitVec {
BitVec::from_iter(bits)
}

#[test]
Expand Down Expand Up @@ -222,7 +215,7 @@ mod tests {

fn check_find_unique_null(len: usize) {
let input = NullVector::new(len);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
let mut selected = BitVec::repeat(false, input.len());
input.find_unique(&mut selected, None);

let mut expect = vec![false; len];
Expand All @@ -231,7 +224,7 @@ mod tests {
}
check_bitmap(&expect, &selected);

let mut selected = MutableBitmap::from_len_zeroed(input.len());
let mut selected = BitVec::repeat(false, input.len());
let prev = Some(NullVector::new(1));
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
let expect = vec![false; len];
Expand Down Expand Up @@ -273,7 +266,7 @@ mod tests {

fn check_find_unique_constant(len: usize) {
let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len);
let mut selected = MutableBitmap::from_len_zeroed(len);
let mut selected = BitVec::repeat(false, len);
input.find_unique(&mut selected, None);

let mut expect = vec![false; len];
Expand All @@ -282,7 +275,7 @@ mod tests {
}
check_bitmap(&expect, &selected);

let mut selected = MutableBitmap::from_len_zeroed(len);
let mut selected = BitVec::repeat(false, len);
let prev = Some(ConstantVector::new(
Arc::new(Int32Vector::from_slice(&[8])),
1,
Expand Down Expand Up @@ -340,7 +333,7 @@ mod tests {
#[test]
fn test_find_unique_string() {
let input = StringVector::from_slice(&["a", "a", "b", "c"]);
let mut selected = MutableBitmap::from_len_zeroed(4);
let mut selected = BitVec::repeat(false, 4);
input.find_unique(&mut selected, None);
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
Expand All @@ -352,7 +345,7 @@ mod tests {
use $crate::vectors::$VectorType;

let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
let mut selected = MutableBitmap::from_len_zeroed(4);
let mut selected = BitVec::repeat(false, 4);
v.find_unique(&mut selected, None);
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
Expand Down
4 changes: 2 additions & 2 deletions src/storage/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ mod merge;
use std::cmp::Ordering;

use async_trait::async_trait;
use datatypes::arrow::bitmap::MutableBitmap;
use common_base::BitVec;
use datatypes::data_type::DataType;
use datatypes::prelude::ConcreteDataType;
use datatypes::vectors::{BooleanVector, MutableVector, VectorRef};
Expand Down Expand Up @@ -127,7 +127,7 @@ pub trait BatchOp {
/// - `batch` and `prev` have different number of columns (unless `prev` is
/// empty).
/// - `selected.len()` is less than the number of rows.
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>);
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>);

/// Filters the `batch`, returns elements matching the `filter` (i.e. where the values
/// are true).
Expand Down
17 changes: 10 additions & 7 deletions src/storage/src/read/dedup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
// limitations under the License.

use async_trait::async_trait;
use datatypes::arrow::bitmap::MutableBitmap;
use common_base::BitVec;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::BooleanVector;

use crate::error::Result;
Expand All @@ -28,6 +29,8 @@ pub struct DedupReader<R> {
reader: R,
/// Previous batch from the reader.
prev_batch: Option<Batch>,
/// Reused bitmap buffer.
selected: BitVec,
}

impl<R> DedupReader<R> {
Expand All @@ -36,6 +39,7 @@ impl<R> DedupReader<R> {
schema,
reader,
prev_batch: None,
selected: BitVec::default(),
}
}

Expand All @@ -48,12 +52,11 @@ impl<R> DedupReader<R> {
return Ok(batch);
}

// The `arrow` filter needs `BooleanArray` as input so there is no convenient
// and efficient way to reuse the bitmap. Though we could use `MutableBooleanArray`,
// but we couldn't zero all bits in the mutable array easily.
let mut selected = MutableBitmap::from_len_zeroed(batch.num_rows());
// Reinitialize the bit map to zeros.
self.selected.clear();
self.selected.resize(batch.num_rows(), false);
self.schema
.find_unique(&batch, &mut selected, self.prev_batch.as_ref());
.find_unique(&batch, &mut self.selected, self.prev_batch.as_ref());

// Store current batch to `prev_batch` so we could compare the next batch
// with this batch. We store batch before filtering it mainly for correctness, as
Expand All @@ -66,7 +69,7 @@ impl<R> DedupReader<R> {
// TODO(yingwen): To support `DELETE`, we could find all rows whose op_types are equal
// to `OpType::Delete`, mark their `selected` to false, then filter the batch.

let filter = BooleanVector::from(selected);
let filter = BooleanVector::from_iterator(self.selected.iter().by_vals());
// Filter duplicate rows.
self.schema.filter(&batch, &filter)
}
Expand Down
20 changes: 10 additions & 10 deletions src/storage/src/schema/projected.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ use std::cmp::Ordering;
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;

use common_base::BitVec;
use common_error::prelude::*;
use datatypes::arrow::bitmap::MutableBitmap;
use datatypes::schema::{SchemaBuilder, SchemaRef};
use datatypes::vectors::BooleanVector;
use store_api::storage::{Chunk, ColumnId};
Expand Down Expand Up @@ -303,7 +303,7 @@ impl BatchOp for ProjectedSchema {
})
}

fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) {
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) {
if let Some(prev) = prev {
assert_eq!(batch.num_columns(), prev.num_columns());
}
Expand Down Expand Up @@ -503,18 +503,18 @@ mod tests {
let schema = read_util::new_projected_schema();
let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);

let mut selected = MutableBitmap::from_len_zeroed(3);
let mut selected = BitVec::repeat(false, 3);
schema.find_unique(&batch, &mut selected, None);
assert!(selected.get(0));
assert!(selected.get(1));
assert!(!selected.get(2));
assert!(selected[0]);
assert!(selected[1]);
assert!(!selected[2]);

let mut selected = MutableBitmap::from_len_zeroed(3);
let mut selected = BitVec::repeat(false, 3);
let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
schema.find_unique(&batch, &mut selected, Some(&prev));
assert!(!selected.get(0));
assert!(selected.get(1));
assert!(!selected.get(2));
assert!(!selected[0]);
assert!(selected[1]);
assert!(!selected[2]);
}

#[test]
Expand Down

0 comments on commit 0791c65

Please sign in to comment.