diff --git a/roaring/src/bitmap/serialization.rs b/roaring/src/bitmap/serialization.rs
index f38f89f6..b618866a 100644
--- a/roaring/src/bitmap/serialization.rs
+++ b/roaring/src/bitmap/serialization.rs
@@ -413,11 +413,17 @@ mod test {
 
         assert_eq!(rb.min(), Some(CONTAINER_OFFSET + 8));
 
-        // Ensure we can set the last byte
+        // Ensure we can set the last byte in an array container
         let bytes = [0x80];
         let rb = RoaringBitmap::from_bitmap_bytes(0xFFFFFFF8, &bytes);
         assert_eq!(rb.len(), 1);
         assert!(rb.contains(u32::MAX));
+
+        // Ensure we can set the last byte in a bitmap container
+        let bytes = vec![0xFF; 0x1_0000 / 8];
+        let rb = RoaringBitmap::from_bitmap_bytes(0xFFFF0000, &bytes);
+        assert_eq!(rb.len(), 0x1_0000);
+        assert!(rb.contains(u32::MAX));
     }
 
     #[test]
diff --git a/roaring/src/bitmap/store/bitmap_store.rs b/roaring/src/bitmap/store/bitmap_store.rs
index 7151f74d..f2084092 100644
--- a/roaring/src/bitmap/store/bitmap_store.rs
+++ b/roaring/src/bitmap/store/bitmap_store.rs
@@ -38,27 +38,41 @@ impl BitmapStore {
     }
 
     pub fn from_lsb0_bytes_unchecked(bytes: &[u8], byte_offset: usize, bits_set: u64) -> Self {
-        assert!(byte_offset + bytes.len() <= BITMAP_LENGTH * size_of::<u64>());
-
-        let mut bits = Box::new([0u64; BITMAP_LENGTH]);
-        // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements,
-        // and has no padding/uninitialized data.
-        let dst = unsafe {
-            std::slice::from_raw_parts_mut(
-                bits.as_mut_ptr().cast::<u8>(),
-                BITMAP_LENGTH * size_of::<u64>(),
-            )
+        const BITMAP_BYTES: usize = BITMAP_LENGTH * size_of::<u64>();
+        assert!(byte_offset.checked_add(bytes.len()).map_or(false, |sum| sum <= BITMAP_BYTES));
+
+        // If we know we're writing the full bitmap, we can avoid the initial memset to 0
+        let mut bits = if bytes.len() == BITMAP_BYTES {
+            debug_assert_eq!(byte_offset, 0); // Must be true from the above assert
+
+            // Safety: We've checked that the length is correct, and we use an unaligned load in case
+            // the bytes are not 8 byte aligned.
+            // The optimizer can see through this, and avoid the double copy to copy directly into
+            // the allocated box from bytes with memcpy
+            let bytes_as_words =
+                unsafe { bytes.as_ptr().cast::<[u64; BITMAP_LENGTH]>().read_unaligned() };
+            Box::new(bytes_as_words)
+        } else {
+            let mut bits = Box::new([0u64; BITMAP_LENGTH]);
+            // Safety: It's safe to reinterpret u64s as u8s because u8 has less alignment requirements,
+            // and has no padding/uninitialized data.
+            let dst = unsafe {
+                std::slice::from_raw_parts_mut(bits.as_mut_ptr().cast::<u8>(), BITMAP_BYTES)
+            };
+            let dst = &mut dst[byte_offset..][..bytes.len()];
+            dst.copy_from_slice(bytes);
+            bits
         };
-        let dst = &mut dst[byte_offset..][..bytes.len()];
-        dst.copy_from_slice(bytes);
 
-        let start_word = byte_offset / size_of::<u64>();
-        let end_word = (byte_offset + bytes.len() + (size_of::<u64>() - 1)) / size_of::<u64>();
+        if !cfg!(target_endian = "little") {
+            // Convert all words we touched (even partially) to little-endian
+            let start_word = byte_offset / size_of::<u64>();
+            let end_word = (byte_offset + bytes.len() + (size_of::<u64>() - 1)) / size_of::<u64>();
 
-        // The 0th byte is the least significant byte, so we've written the bytes in little-endian
-        // order, convert to native endian. Expect this to get optimized away for little-endian.
-        for word in &mut bits[start_word..end_word] {
-            *word = u64::from_le(*word);
+            // The 0th byte is the least significant byte, so we've written the bytes in little-endian
+            for word in &mut bits[start_word..end_word] {
+                *word = u64::from_le(*word);
+            }
         }
 
         Self::from_unchecked(bits_set, bits)
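Note (not part of the diff above): a minimal sketch of the LSB0 convention the endianness branch relies on. Copying bytes verbatim into a u64 word and then applying u64::from_le is equivalent to u64::from_le_bytes, so byte 0 always lands in the least significant position; on little-endian targets the fix-up is a no-op, which is why the diff can guard it with cfg!(target_endian = "little"). The test below is purely illustrative and not part of the PR:

    #[test]
    fn lsb0_word_layout_sketch() {
        // Hypothetical example: byte 0 carries bits 0..8, byte 7 carries bits 56..64.
        let bytes: [u8; 8] = [0x01, 0, 0, 0, 0, 0, 0, 0x80];
        // Raw copy of bytes into a word (what copy_from_slice / read_unaligned do),
        // followed by the same fix-up the diff applies.
        let word = u64::from_le(u64::from_ne_bytes(bytes));
        assert_eq!(word, u64::from_le_bytes(bytes)); // identical on every target
        assert_eq!(word & 1, 1); // bit 0 comes from byte 0
        assert_eq!(word >> 63, 1); // bit 63 comes from byte 7
    }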