From 0791c65149efea7fa0acec67531878dbe37186a5 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Mon, 21 Nov 2022 17:36:53 +0800 Subject: [PATCH] refactor: replace some usage of MutableBitmap by BitVec (#610) --- src/common/base/src/{bitset.rs => bit_vec.rs} | 4 +- src/common/base/src/lib.rs | 4 +- src/datatypes/src/vectors/boolean.rs | 22 ----------- src/datatypes/src/vectors/operations.rs | 12 +++--- .../src/vectors/operations/find_unique.rs | 37 ++++++++----------- src/storage/src/read.rs | 4 +- src/storage/src/read/dedup.rs | 17 +++++---- src/storage/src/schema/projected.rs | 20 +++++----- 8 files changed, 47 insertions(+), 73 deletions(-) rename src/common/base/src/{bitset.rs => bit_vec.rs} (91%) diff --git a/src/common/base/src/bitset.rs b/src/common/base/src/bit_vec.rs similarity index 91% rename from src/common/base/src/bitset.rs rename to src/common/base/src/bit_vec.rs index 3244e749e75d..bdc1e4020eda 100644 --- a/src/common/base/src/bitset.rs +++ b/src/common/base/src/bit_vec.rs @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use bitvec::prelude as bv; +pub use bitvec::prelude; // `Lsb0` provides the best codegen for bit manipulation, // see https://github.com/bitvecto-rs/bitvec/blob/main/doc/order/Lsb0.md -pub type BitVec = bv::BitVec; +pub type BitVec = prelude::BitVec; diff --git a/src/common/base/src/lib.rs b/src/common/base/src/lib.rs index 2e58f5d826f2..c3d8a047295b 100644 --- a/src/common/base/src/lib.rs +++ b/src/common/base/src/lib.rs @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod bitset; +pub mod bit_vec; pub mod buffer; pub mod bytes; -pub use bitset::BitVec; +pub use bit_vec::BitVec; diff --git a/src/datatypes/src/vectors/boolean.rs b/src/datatypes/src/vectors/boolean.rs index fc1302f2c61f..11c40bd66157 100644 --- a/src/datatypes/src/vectors/boolean.rs +++ b/src/datatypes/src/vectors/boolean.rs @@ -18,8 +18,6 @@ use std::sync::Arc; use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray}; use arrow::bitmap::utils::{BitmapIter, ZipValidity}; -use arrow::bitmap::MutableBitmap; -use arrow::datatypes::DataType as ArrowDataType; use snafu::{OptionExt, ResultExt}; use crate::data_type::ConcreteDataType; @@ -75,14 +73,6 @@ impl>> FromIterator for BooleanVector { } } -impl From for BooleanVector { - fn from(bitmap: MutableBitmap) -> BooleanVector { - BooleanVector { - array: BooleanArray::new(ArrowDataType::Boolean, bitmap.into(), None), - } - } -} - impl Vector for BooleanVector { fn data_type(&self) -> ConcreteDataType { ConcreteDataType::boolean_datatype() @@ -351,16 +341,4 @@ mod tests { let expect: VectorRef = Arc::new(BooleanVector::from_slice(&[true, false, true])); assert_eq!(expect, vector); } - - #[test] - fn test_from_mutable_bitmap() { - let mut bitmap = MutableBitmap::new(); - let values = [false, true, true, false, true]; - for v in values { - bitmap.push(v); - } - let vector = BooleanVector::from(bitmap); - let expect = BooleanVector::from_slice(&values); - assert_eq!(expect, vector); - } } diff --git a/src/datatypes/src/vectors/operations.rs b/src/datatypes/src/vectors/operations.rs index ea09c5ef7997..e63f338a0546 100644 --- a/src/datatypes/src/vectors/operations.rs +++ b/src/datatypes/src/vectors/operations.rs @@ -16,7 +16,7 @@ mod filter; mod find_unique; mod replicate; -use arrow::bitmap::MutableBitmap; +use common_base::BitVec; use crate::error::Result; use crate::types::PrimitiveElement; @@ -50,7 +50,7 @@ pub trait VectorOp { /// Panics if /// - `selected.len() < self.len()`. /// - `prev_vector` and `self` have different data types. - fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>); + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>); /// Filters the vector, returns elements matching the `filter` (i.e. where the values are true). /// @@ -65,7 +65,7 @@ macro_rules! impl_scalar_vector_op { replicate::$replicate(self, offsets) } - fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) { + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap()); find_unique::find_unique_scalar(self, selected, prev_vector); } @@ -92,7 +92,7 @@ impl VectorOp for ConstantVector { replicate::replicate_constant(self, offsets) } - fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) { + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); find_unique::find_unique_constant(self, selected, prev_vector); } @@ -107,7 +107,7 @@ impl VectorOp for NullVector { replicate::replicate_null(self, offsets) } - fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) { + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::()); find_unique::find_unique_null(self, selected, prev_vector); } @@ -125,7 +125,7 @@ where replicate::replicate_primitive(self, offsets) } - fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) { + fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) { let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::>()); find_unique::find_unique_scalar(self, selected, prev_vector); diff --git a/src/datatypes/src/vectors/operations/find_unique.rs b/src/datatypes/src/vectors/operations/find_unique.rs index 6041a9d9fb27..d63a3c66b9d2 100644 --- a/src/datatypes/src/vectors/operations/find_unique.rs +++ b/src/datatypes/src/vectors/operations/find_unique.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use arrow::bitmap::MutableBitmap; +use common_base::BitVec; use crate::scalars::ScalarVector; use crate::vectors::{ConstantVector, NullVector, Vector}; @@ -22,7 +22,7 @@ use crate::vectors::{ConstantVector, NullVector, Vector}; // in any other case. pub(crate) fn find_unique_scalar<'a, T: ScalarVector>( vector: &'a T, - selected: &'a mut MutableBitmap, + selected: &'a mut BitVec, prev_vector: Option<&'a T>, ) where T::RefItem<'a>: PartialEq, @@ -63,7 +63,7 @@ pub(crate) fn find_unique_scalar<'a, T: ScalarVector>( pub(crate) fn find_unique_null( vector: &NullVector, - selected: &mut MutableBitmap, + selected: &mut BitVec, prev_vector: Option<&NullVector>, ) { if vector.is_empty() { @@ -78,7 +78,7 @@ pub(crate) fn find_unique_null( pub(crate) fn find_unique_constant( vector: &ConstantVector, - selected: &mut MutableBitmap, + selected: &mut BitVec, prev_vector: Option<&ConstantVector>, ) { if vector.is_empty() { @@ -107,7 +107,7 @@ mod tests { use super::*; use crate::vectors::{Int32Vector, StringVector, VectorOp}; - fn check_bitmap(expect: &[bool], selected: &MutableBitmap) { + fn check_bitmap(expect: &[bool], selected: &BitVec) { let actual = selected.iter().collect::>(); assert_eq!(expect, actual); } @@ -124,7 +124,7 @@ mod tests { let input = Int32Vector::from_iter(input); let prev = prev.map(Int32Vector::from_slice); - let mut selected = MutableBitmap::from_len_zeroed(input.len()); + let mut selected = BitVec::repeat(false, input.len()); input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); check_bitmap(expect, &selected); @@ -164,7 +164,7 @@ mod tests { let prev = Int32Vector::from_slice(&[1]); let v1 = Int32Vector::from_slice(&[2, 3, 4]); - let mut selected = MutableBitmap::from_len_zeroed(v1.len()); + let mut selected = BitVec::repeat(false, v1.len()); v1.find_unique(&mut selected, Some(&prev)); // Though element in v2 are the same as prev, but we should still keep them. @@ -174,15 +174,8 @@ mod tests { check_bitmap(&[true, true, true], &selected); } - fn new_bitmap(bits: &[bool]) -> MutableBitmap { - let mut bitmap = MutableBitmap::from_len_zeroed(bits.len()); - for (i, bit) in bits.iter().enumerate() { - if *bit { - bitmap.set(i, true); - } - } - - bitmap + fn new_bitmap(bits: &[bool]) -> BitVec { + BitVec::from_iter(bits) } #[test] @@ -222,7 +215,7 @@ mod tests { fn check_find_unique_null(len: usize) { let input = NullVector::new(len); - let mut selected = MutableBitmap::from_len_zeroed(input.len()); + let mut selected = BitVec::repeat(false, input.len()); input.find_unique(&mut selected, None); let mut expect = vec![false; len]; @@ -231,7 +224,7 @@ mod tests { } check_bitmap(&expect, &selected); - let mut selected = MutableBitmap::from_len_zeroed(input.len()); + let mut selected = BitVec::repeat(false, input.len()); let prev = Some(NullVector::new(1)); input.find_unique(&mut selected, prev.as_ref().map(|v| v as _)); let expect = vec![false; len]; @@ -273,7 +266,7 @@ mod tests { fn check_find_unique_constant(len: usize) { let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len); - let mut selected = MutableBitmap::from_len_zeroed(len); + let mut selected = BitVec::repeat(false, len); input.find_unique(&mut selected, None); let mut expect = vec![false; len]; @@ -282,7 +275,7 @@ mod tests { } check_bitmap(&expect, &selected); - let mut selected = MutableBitmap::from_len_zeroed(len); + let mut selected = BitVec::repeat(false, len); let prev = Some(ConstantVector::new( Arc::new(Int32Vector::from_slice(&[8])), 1, @@ -340,7 +333,7 @@ mod tests { #[test] fn test_find_unique_string() { let input = StringVector::from_slice(&["a", "a", "b", "c"]); - let mut selected = MutableBitmap::from_len_zeroed(4); + let mut selected = BitVec::repeat(false, 4); input.find_unique(&mut selected, None); let expect = vec![true, false, true, true]; check_bitmap(&expect, &selected); @@ -352,7 +345,7 @@ mod tests { use $crate::vectors::$VectorType; let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method)); - let mut selected = MutableBitmap::from_len_zeroed(4); + let mut selected = BitVec::repeat(false, 4); v.find_unique(&mut selected, None); let expect = vec![true, false, true, true]; check_bitmap(&expect, &selected); diff --git a/src/storage/src/read.rs b/src/storage/src/read.rs index 6893060be7c5..3c81d37c151c 100644 --- a/src/storage/src/read.rs +++ b/src/storage/src/read.rs @@ -20,7 +20,7 @@ mod merge; use std::cmp::Ordering; use async_trait::async_trait; -use datatypes::arrow::bitmap::MutableBitmap; +use common_base::BitVec; use datatypes::data_type::DataType; use datatypes::prelude::ConcreteDataType; use datatypes::vectors::{BooleanVector, MutableVector, VectorRef}; @@ -127,7 +127,7 @@ pub trait BatchOp { /// - `batch` and `prev` have different number of columns (unless `prev` is /// empty). /// - `selected.len()` is less than the number of rows. - fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>); + fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>); /// Filters the `batch`, returns elements matching the `filter` (i.e. where the values /// are true). diff --git a/src/storage/src/read/dedup.rs b/src/storage/src/read/dedup.rs index 30e3c2b24ea0..5d359df3d63c 100644 --- a/src/storage/src/read/dedup.rs +++ b/src/storage/src/read/dedup.rs @@ -13,7 +13,8 @@ // limitations under the License. use async_trait::async_trait; -use datatypes::arrow::bitmap::MutableBitmap; +use common_base::BitVec; +use datatypes::prelude::ScalarVector; use datatypes::vectors::BooleanVector; use crate::error::Result; @@ -28,6 +29,8 @@ pub struct DedupReader { reader: R, /// Previous batch from the reader. prev_batch: Option, + /// Reused bitmap buffer. + selected: BitVec, } impl DedupReader { @@ -36,6 +39,7 @@ impl DedupReader { schema, reader, prev_batch: None, + selected: BitVec::default(), } } @@ -48,12 +52,11 @@ impl DedupReader { return Ok(batch); } - // The `arrow` filter needs `BooleanArray` as input so there is no convenient - // and efficient way to reuse the bitmap. Though we could use `MutableBooleanArray`, - // but we couldn't zero all bits in the mutable array easily. - let mut selected = MutableBitmap::from_len_zeroed(batch.num_rows()); + // Reinitialize the bit map to zeros. + self.selected.clear(); + self.selected.resize(batch.num_rows(), false); self.schema - .find_unique(&batch, &mut selected, self.prev_batch.as_ref()); + .find_unique(&batch, &mut self.selected, self.prev_batch.as_ref()); // Store current batch to `prev_batch` so we could compare the next batch // with this batch. We store batch before filtering it mainly for correctness, as @@ -66,7 +69,7 @@ impl DedupReader { // TODO(yingwen): To support `DELETE`, we could find all rows whose op_types are equal // to `OpType::Delete`, mark their `selected` to false, then filter the batch. - let filter = BooleanVector::from(selected); + let filter = BooleanVector::from_iterator(self.selected.iter().by_vals()); // Filter duplicate rows. self.schema.filter(&batch, &filter) } diff --git a/src/storage/src/schema/projected.rs b/src/storage/src/schema/projected.rs index 7f78bf9809e0..6e746c9ff923 100644 --- a/src/storage/src/schema/projected.rs +++ b/src/storage/src/schema/projected.rs @@ -16,8 +16,8 @@ use std::cmp::Ordering; use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; +use common_base::BitVec; use common_error::prelude::*; -use datatypes::arrow::bitmap::MutableBitmap; use datatypes::schema::{SchemaBuilder, SchemaRef}; use datatypes::vectors::BooleanVector; use store_api::storage::{Chunk, ColumnId}; @@ -303,7 +303,7 @@ impl BatchOp for ProjectedSchema { }) } - fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) { + fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) { if let Some(prev) = prev { assert_eq!(batch.num_columns(), prev.num_columns()); } @@ -503,18 +503,18 @@ mod tests { let schema = read_util::new_projected_schema(); let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]); - let mut selected = MutableBitmap::from_len_zeroed(3); + let mut selected = BitVec::repeat(false, 3); schema.find_unique(&batch, &mut selected, None); - assert!(selected.get(0)); - assert!(selected.get(1)); - assert!(!selected.get(2)); + assert!(selected[0]); + assert!(selected[1]); + assert!(!selected[2]); - let mut selected = MutableBitmap::from_len_zeroed(3); + let mut selected = BitVec::repeat(false, 3); let prev = read_util::new_kv_batch(&[(1000, Some(1))]); schema.find_unique(&batch, &mut selected, Some(&prev)); - assert!(!selected.get(0)); - assert!(selected.get(1)); - assert!(!selected.get(2)); + assert!(!selected[0]); + assert!(selected[1]); + assert!(!selected[2]); } #[test]