33//!
44//! There is also an opportunistic `select` operation, but the general case has not been
55//! implemented.
6+ //!
7+ //! See also: ["Succinct data structure"](https://en.wikipedia.org/wiki/Succinct_data_structure).
68
/// Integer type that stores the bits of exactly one sub-block.
type SubblockBits = u128;

// Static sizing of the various components of the data structure.
/// Number of bits covered by one block.
const BITS_PER_BLOCK: usize = 16384;
/// Number of bits covered by one sub-block; always the width of [`SubblockBits`].
const BITS_PER_SUB_BLOCK: usize = SubblockBits::BITS as usize;
/// Number of sub-blocks that make up one block.
const SUB_BLOCKS_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_SUB_BLOCK;

// Compile-time sanity checks, so that changing one of the sizes above fails the build instead of
// silently producing wrong ranks.
const _: () = assert!(
    BITS_PER_BLOCK % BITS_PER_SUB_BLOCK == 0,
    "a block must consist of a whole number of sub-blocks"
);
const _: () = assert!(
    BITS_PER_BLOCK <= u16::MAX as usize,
    "ranks local to a block are stored as u16 and must be able to count every bit of a block"
);
1515
1616/// A container for a portion of the total bit vector and the associated indices.
/// The bits within each sub-block are stored from most significant bit (msb) to least significant bit (lsb).
@@ -30,7 +30,6 @@ const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK;
3030/// sub-block rank: [ 0 ][ 2 ]
3131/// ```
3232#[ derive( Clone , Debug ) ]
33- #[ repr( C ) ]
3433struct Block {
3534 /// Rank of the first bit in this block (that is, the number of bits set in previous blocks).
3635 rank : u64 ,
@@ -39,38 +38,29 @@ struct Block {
3938 /// sub-blocks `0..i`. `sub_blocks[0]` is always zero.
4039 sub_blocks : [ u16 ; SUB_BLOCKS_PER_BLOCK ] ,
4140 /// The bit-vector.
42- bits : [ Chunk ; BITS_PER_BLOCK / BITS_PER_CHUNK ] ,
41+ bits : [ SubblockBits ; SUB_BLOCKS_PER_BLOCK ] ,
4342}
4443
4544impl Block {
46- fn new ( rank : u64 ) -> Self {
47- Self {
48- rank,
49- sub_blocks : [ 0 ; SUB_BLOCKS_PER_BLOCK ] ,
50- bits : [ 0 ; BITS_PER_BLOCK / BITS_PER_CHUNK ] ,
51- }
52- }
53-
5445 /// Set a bit without updating `self.sub_blocks`.
5546 ///
5647 /// This panics if the bit was already set, because that indicates that the original positions
5748 /// list is invalid/had duplicates.
5849 fn set ( & mut self , index : usize ) {
5950 assert ! ( index < BITS_PER_BLOCK ) ;
60- let chunk_idx = index / BITS_PER_CHUNK ;
61- let bit_idx = index % BITS_PER_CHUNK ;
62- let mask = 1 << ( ( BITS_PER_CHUNK - 1 ) - bit_idx) ;
51+ let chunk_idx = index / BITS_PER_SUB_BLOCK ;
52+ let bit_idx = index % BITS_PER_SUB_BLOCK ;
53+ let mask = 1 << ( ( BITS_PER_SUB_BLOCK - 1 ) - bit_idx) ;
6354 assert_eq ! ( self . bits[ chunk_idx] & mask, 0 , "toggling bits off indicates that the original data was incorrect, most likely containing duplicate values." ) ;
6455 self . bits [ chunk_idx] ^= mask;
6556 }
6657
6758 /// Tests whether the bit at the given index is set.
68- #[ allow( dead_code) ]
6959 fn get ( & self , index : usize ) -> bool {
7060 assert ! ( index < BITS_PER_BLOCK ) ;
71- let chunk_idx = index / BITS_PER_CHUNK ;
72- let bit_idx = index % BITS_PER_CHUNK ;
73- let mask = 1 << ( ( BITS_PER_CHUNK - 1 ) - bit_idx) ;
61+ let chunk_idx = index / BITS_PER_SUB_BLOCK ;
62+ let bit_idx = index % BITS_PER_SUB_BLOCK ;
63+ let mask = 1 << ( ( BITS_PER_SUB_BLOCK - 1 ) - bit_idx) ;
7464 self . bits [ chunk_idx] & mask != 0
7565 }
7666
@@ -84,19 +74,13 @@ impl Block {
8474 let sub_block = local_idx / BITS_PER_SUB_BLOCK ;
8575 rank += self . sub_blocks [ sub_block] as usize ;
8676
87- if BITS_PER_CHUNK != BITS_PER_SUB_BLOCK {
88- for i in sub_block * CHUNKS_PER_SUB_BLOCK ..local_idx / BITS_PER_CHUNK {
89- rank += self . bits [ i] . count_ones ( ) as usize ;
90- }
91- }
92-
93- let remainder = local_idx % BITS_PER_CHUNK ;
77+ let remainder = local_idx % BITS_PER_SUB_BLOCK ;
9478
95- let last_chunk = local_idx / BITS_PER_CHUNK ;
79+ let last_chunk = local_idx / BITS_PER_SUB_BLOCK ;
9680 let masked = if remainder == 0 {
9781 0
9882 } else {
99- self . bits [ last_chunk] >> ( BITS_PER_CHUNK - remainder)
83+ self . bits [ last_chunk] >> ( BITS_PER_SUB_BLOCK - remainder)
10084 } ;
10185 rank += masked. count_ones ( ) as usize ;
10286 let select = if masked == 0 {
@@ -110,7 +94,7 @@ impl Block {
11094 fn total_rank ( & self ) -> usize {
11195 self . sub_blocks [ SUB_BLOCKS_PER_BLOCK - 1 ] as usize
11296 + self . rank as usize
113- + self . bits [ ( SUB_BLOCKS_PER_BLOCK - 1 ) * CHUNKS_PER_SUB_BLOCK ..]
97+ + self . bits [ SUB_BLOCKS_PER_BLOCK - 1 ..]
11498 . iter ( )
11599 . map ( |c| c. count_ones ( ) as usize )
116100 . sum :: < usize > ( )
@@ -151,24 +135,11 @@ impl Block {
151135 }
152136}
153137
154- impl Default for Block {
155- fn default ( ) -> Self {
156- Block {
157- rank : 0 ,
158- sub_blocks : [ 0u16 ; SUB_BLOCKS_PER_BLOCK ] ,
159- bits : [ 0 ; BITS_PER_BLOCK / BITS_PER_CHUNK ] ,
160- }
161- }
162- }
163-
164138/// Builder for creating a [`BitRank`].
165139///
166140/// # Examples
167141///
168142/// ```text
169- /// // Note: This should work as a doctest, except this module is not public.
170- /// let mut bytes = Vec::<u8>::new();
171- ///
172143/// let mut builder = BitRankBuilder::new();
173144/// builder.push(17);
174145/// builder.push(23);
@@ -179,9 +150,6 @@ impl Default for Block {
179150#[ derive( Default ) ]
180151pub struct BitRankBuilder {
181152 blocks : Vec < Block > ,
182- curr_rank : u64 ,
183- curr_block_id : usize ,
184- curr_block : Option < Block > ,
185153}
186154
187155impl BitRankBuilder {
@@ -190,55 +158,56 @@ impl BitRankBuilder {
190158 Self :: default ( )
191159 }
192160
193- fn push_block ( & mut self , mut block : Block ) -> u64 {
194- let mut local_rank = 0 ;
195- for ( i, chunk) in block. bits . iter ( ) . enumerate ( ) {
196- // If the settings are ever changed, CHUNKS_PER_SUB_BLOCK will likely no longer be 1, so
197- // you will need this modulo.
198- #[ expect( clippy:: modulo_one) ]
199- if i % CHUNKS_PER_SUB_BLOCK == 0 {
200- block. sub_blocks [ i / CHUNKS_PER_SUB_BLOCK ] = local_rank;
161+ /// Returns a builder that can hold integers with values `0..cap`.
162+ pub fn with_capacity ( cap : usize ) -> Self {
163+ Self {
164+ blocks : Vec :: with_capacity ( cap. div_ceil ( BITS_PER_BLOCK ) ) ,
165+ }
166+ }
167+
168+ fn finish_last_block ( & mut self ) -> u64 {
169+ if let Some ( block) = self . blocks . last_mut ( ) {
170+ let mut local_rank = 0 ;
171+ for ( i, chunk) in block. bits . iter ( ) . enumerate ( ) {
172+ block. sub_blocks [ i] = local_rank;
173+ local_rank += chunk. count_ones ( ) as u16 ;
201174 }
202- local_rank += chunk. count_ones ( ) as u16 ;
175+ block. rank + local_rank as u64
176+ } else {
177+ 0
203178 }
204- let end_rank = block. rank + local_rank as u64 ;
205- self . blocks . push ( block) ;
206- end_rank
207179 }
208180
209181 /// Adds a bit. Bits must be added in order of increasing `position`.
210182 pub fn push ( & mut self , position : usize ) {
211183 let block_id = position / BITS_PER_BLOCK ;
212184 assert ! (
213- self . curr_block_id <= block_id,
185+ self . blocks . len ( ) <= block_id + 1 ,
214186 "positions must be increasing!"
215187 ) ;
216- while block_id > self . curr_block_id {
217- let curr_block = self
218- . curr_block
219- . take ( )
220- . unwrap_or_else ( || Block :: new ( self . curr_rank ) ) ;
221- let end_rank = self . push_block ( curr_block) ;
222- self . curr_rank = end_rank;
223- self . curr_block_id += 1 ;
224- }
225- match & mut self . curr_block {
226- None => {
227- let mut block = Block :: new ( self . curr_rank ) ;
228- block. set ( position % BITS_PER_BLOCK ) ;
229- self . curr_block = Some ( block) ;
230- }
231- Some ( block) => {
232- block. set ( position % BITS_PER_BLOCK ) ;
188+ if block_id >= self . blocks . len ( ) {
189+ let curr_rank = self . finish_last_block ( ) ;
190+ while block_id >= self . blocks . len ( ) {
191+ // Without this declared as a `const`, rustc 1.82 creates the Block value on the
192+ // stack first, then `memcpy`s it into `self.blocks`.
193+ const ZERO_BLOCK : Block = Block {
194+ rank : 0 ,
195+ sub_blocks : [ 0 ; SUB_BLOCKS_PER_BLOCK ] ,
196+ bits : [ 0 ; SUB_BLOCKS_PER_BLOCK ] ,
197+ } ;
198+ self . blocks . push ( ZERO_BLOCK ) ;
199+ self . blocks . last_mut ( ) . expect ( "just inserted" ) . rank = curr_rank;
233200 }
234201 }
202+ self . blocks
203+ . last_mut ( )
204+ . expect ( "just ensured there are enough blocks" )
205+ . set ( position % BITS_PER_BLOCK ) ;
235206 }
236207
237208 /// Finishes the `BitRank` by writing the last block of data.
238209 pub fn finish ( mut self ) -> BitRank {
239- if let Some ( last_block) = self . curr_block . take ( ) {
240- self . push_block ( last_block) ;
241- }
210+ self . finish_last_block ( ) ;
242211 BitRank {
243212 blocks : self . blocks ,
244213 }
@@ -256,8 +225,8 @@ impl BitRank {
256225 ///
257226 /// # Panics
258227 /// This may panic if the values produced by `iter` are not strictly increasing.
259- #[ allow( clippy:: should_implement_trait) ]
260228 #[ allow( dead_code) ]
229+ #[ allow( clippy:: should_implement_trait) ]
261230 pub fn from_iter < I : IntoIterator < Item = usize > > ( iter : I ) -> BitRank {
262231 let mut builder = BitRankBuilder :: new ( ) ;
263232 for position in iter {
@@ -457,7 +426,7 @@ mod tests {
457426 let mut rank = 0 ;
458427 let mut select = None ;
459428 for i in 0 ..random_bits. capacity ( ) {
460- if i % BITS_PER_CHUNK == 0 {
429+ if i % BITS_PER_SUB_BLOCK == 0 {
461430 select = None ;
462431 }
463432 assert_eq ! ( br. rank_select( i) , ( rank, select) ) ;
@@ -501,4 +470,30 @@ mod tests {
501470 }
502471 }
503472 }
473+
474+ #[ test]
475+ fn test_large_gap ( ) {
476+ let br = BitRank :: from_iter ( ( 3 ..4 ) . chain ( BITS_PER_BLOCK * 15 ..BITS_PER_BLOCK * 15 + 17 ) ) ;
477+ for i in 1 ..15 {
478+ assert_eq ! ( br. rank( BITS_PER_BLOCK * i) , 1 ) ;
479+ }
480+ for i in 0 ..18 {
481+ assert_eq ! ( br. rank( BITS_PER_BLOCK * 15 + i) , 1 + i) ;
482+ }
483+ }
484+
485+ #[ test]
486+ fn test_with_capacity ( ) {
487+ let mut b = BitRankBuilder :: with_capacity ( BITS_PER_BLOCK * 3 - 1 ) ;
488+ let initial_capacity = b. blocks . capacity ( ) ;
489+ assert ! ( initial_capacity >= 3 ) ;
490+ b. push ( BITS_PER_BLOCK * 3 - 2 ) ; // should not have to grow
491+ assert_eq ! ( b. blocks. capacity( ) , initial_capacity) ;
492+
493+ let mut b = BitRankBuilder :: with_capacity ( BITS_PER_BLOCK * 3 + 1 ) ;
494+ let initial_capacity = b. blocks . capacity ( ) ;
495+ assert ! ( initial_capacity >= 4 ) ;
496+ b. push ( BITS_PER_BLOCK * 3 ) ; // should not have to grow
497+ assert_eq ! ( b. blocks. capacity( ) , initial_capacity) ;
498+ }
504499}
0 commit comments