add special case for creating a full bitmap container
we can skip setting an initial value in that case
Dr-Emann committed Sep 7, 2024
1 parent feb538c commit 16e39fc
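
In context: a bitmap container covers 0x1_0000 values backed by 1024 u64 words, so a full container is exactly 0x1_0000 / 8 = 8192 bytes. When the input slice is exactly that long, the store can be built directly from the caller's bytes instead of zeroing the allocation first. A minimal caller-side sketch, assembled from the tests added in this commit (the slice length alone is what selects the new fast path):

    use roaring::RoaringBitmap;

    // 8192 bytes = one full bitmap container, every bit set.
    let full = vec![0xFFu8; 0x1_0000 / 8];
    // 0xFFFF0000 is the first value of the last 64 Ki chunk of the u32 range,
    // so this builds a single, completely full container.
    let rb = RoaringBitmap::from_bitmap_bytes(0xFFFF0000, &full);
    assert_eq!(rb.len(), 0x1_0000);
    assert!(rb.contains(u32::MAX));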
Showing 2 changed files with 39 additions and 19 deletions.
roaring/src/bitmap/serialization.rs: 7 additions & 1 deletion
@@ -413,11 +413,17 @@ mod test {
         assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8));
 
 
-        // Ensure we can set the last byte
+        // Ensure we can set the last byte in an array container
         let bytes = [0x80];
         let rb = RoaringBitmap::from_bitmap_bytes(0xFFFFFFF8, &bytes);
         assert_eq!(rb.len(), 1);
         assert!(rb.contains(u32::MAX));
+
+        // Ensure we can set the last byte in a bitmap container
+        let bytes = vec![0xFF; 0x1_0000 / 8];
+        let rb = RoaringBitmap::from_bitmap_bytes(0xFFFF0000, &bytes);
+        assert_eq!(rb.len(), 0x1_0000);
+        assert!(rb.contains(u32::MAX));
     }
 
     #[test]
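The offsets in these tests are easier to follow with the arithmetic written out. An illustrative snippet (assuming LSB-0 bit order within each byte, as the from_lsb0_bytes name in the next file implies):

    // Array-container case: bit k of byte 0 represents the value offset + k,
    // and 0x80 = 0b1000_0000 has only bit 7 set, so the single set bit lands
    // on 0xFFFFFFF8 + 7 = u32::MAX.
    let offset: u32 = 0xFFFF_FFF8;
    assert_eq!(offset + 7, u32::MAX);

    // Bitmap-container case: a container spans 0x1_0000 values, so filling one
    // takes 0x1_0000 / 8 = 8192 bytes, and 0xFFFF0000 starts the last container
    // (its upper 16 bits, the container key, are 0xFFFF).
    assert_eq!(0x1_0000 / 8, 8192);
    assert_eq!(0xFFFF_0000u32 >> 16, 0xFFFF);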
roaring/src/bitmap/store/bitmap_store.rs: 32 additions & 18 deletions
@@ -38,27 +38,41 @@ impl BitmapStore {
     }
 
     pub fn from_lsb0_bytes_unchecked(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self {
-        assert!(byte_offset + bytes.len() <= BITMAP_LENGTH * size_of::<u64>());
-
-        let mut bits = Box::new([0u64; BITMAP_LENGTH]);
-        // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements,
-        // and has no padding/uninitialized data.
-        let dst = unsafe {
-            std::slice::from_raw_parts_mut(
-                bits.as_mut_ptr().cast::<u8>(),
-                BITMAP_LENGTH * size_of::<u64>(),
-            )
+        const BITMAP_BYTES: usize = BITMAP_LENGTH * size_of::<u64>();
+        assert!(byte_offset.checked_add(bytes.len()).map_or(false, |sum| sum <= BITMAP_BYTES));
+
+        // If we know we're writing the full bitmap, we can avoid the initial memset to 0
+        let mut bits = if bytes.len() == BITMAP_BYTES {
+            debug_assert_eq!(byte_offset, 0); // Must be true from the above assert
+
+            // Safety: We've checked that the length is correct, and we use an unaligned load in case
+            // the bytes are not 8 byte aligned.
+            // The optimizer can see through this, and avoid the double copy to copy directly into
+            // the allocated box from bytes with memcpy
+            let bytes_as_words =
+                unsafe { bytes.as_ptr().cast::<[u64; BITMAP_LENGTH]>().read_unaligned() };
+            Box::new(bytes_as_words)
+        } else {
+            let mut bits = Box::new([0u64; BITMAP_LENGTH]);
+            // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements,
+            // and has no padding/uninitialized data.
+            let dst = unsafe {
+                std::slice::from_raw_parts_mut(bits.as_mut_ptr().cast::<u8>(), BITMAP_BYTES)
+            };
+            let dst = &mut dst[byte_offset..byte_offset + bytes.len()];
+            dst.copy_from_slice(bytes);
+            bits
         };
-        let dst = &mut dst[byte_offset..][..bytes.len()];
-        dst.copy_from_slice(bytes);
 
-        let start_word = byte_offset / size_of::<u64>();
-        let end_word = (byte_offset + bytes.len() + (size_of::<u64>() - 1)) / size_of::<u64>();
+        if !cfg!(target_endian = "little") {
+            // Convert all words we touched (even partially) to little-endian
+            let start_word = byte_offset / size_of::<u64>();
+            let end_word = (byte_offset + bytes.len() + (size_of::<u64>() - 1)) / size_of::<u64>();
 
-        // The 0th byte is the least significant byte, so we've written the bytes in little-endian
-        // order, convert to native endian. Expect this to get optimized away for little-endian.
-        for word in &mut bits[start_word..end_word] {
-            *word = u64::from_le(*word);
+            // The 0th byte is the least significant byte, so we've written the bytes in little-endian
+            for word in &mut bits[start_word..end_word] {
+                *word = u64::from_le(*word);
+            }
         }
 
         Self::from_unchecked(bits_set, bits)
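The core trick in the new fast path, loading a little-endian byte buffer as words with one unaligned read and fixing endianness afterwards, also works standalone. A simplified sketch with a hypothetical helper (N = 4 words instead of the crate's BITMAP_LENGTH of 1024):

    use std::mem::size_of;

    // Hypothetical, scaled-down version of the fast path above.
    fn words_from_lsb0_bytes(bytes: &[u8]) -> Box<[u64; 4]> {
        assert_eq!(bytes.len(), 4 * size_of::<u64>());
        // Safety: the length matches exactly, and read_unaligned tolerates
        // `bytes` not being 8-byte aligned.
        let mut words = Box::new(unsafe { bytes.as_ptr().cast::<[u64; 4]>().read_unaligned() });
        // No-op on little-endian targets; byte-swaps each word on big-endian ones.
        for word in words.iter_mut() {
            *word = u64::from_le(*word);
        }
        words
    }

    fn main() {
        let mut bytes = [0u8; 32];
        bytes[0] = 0x01; // bit 0 of word 0
        let words = words_from_lsb0_bytes(&bytes);
        assert_eq!(words[0], 1);
    }

Because the box is initialized straight from the loaded array, no separate zeroing pass over the allocation is needed, which is exactly the memset the commit avoids.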
