Skip to content

Commit 4683f6d

Browse files
committed
Update with the latest from upstream
1 parent 91ef54d commit 4683f6d

File tree

3 files changed

+200
-200
lines changed

3 files changed

+200
-200
lines changed

crates/string-offsets/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
[package]
22
authors = ["The blackbird team <support@github.com>"]
33
edition = "2021"
4-
name = "string-offests"
4+
name = "string-offsets"
55
version = "0.1.0"
66

7-
[dependencies]
7+
[dev-dependencies]
88
itertools = "0.13"
99
rand = "0.8"
1010
rand_chacha = "0.3"

crates/string-offsets/src/bitrank.rs

Lines changed: 77 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
//!
44
//! There is also an opportunistic `select` operation, but the general case has not been
55
//! implemented.
6+
//!
7+
//! See also: ["Succinct data structure"](https://en.wikipedia.org/wiki/Succinct_data_structure).
68
7-
type Chunk = u128;
9+
type SubblockBits = u128;
810

911
// Static sizing of the various components of the data structure.
1012
const BITS_PER_BLOCK: usize = 16384;
11-
const BITS_PER_SUB_BLOCK: usize = 128;
13+
const BITS_PER_SUB_BLOCK: usize = SubblockBits::BITS as usize;
1214
const SUB_BLOCKS_PER_BLOCK: usize = BITS_PER_BLOCK / BITS_PER_SUB_BLOCK;
13-
const BITS_PER_CHUNK: usize = 128;
14-
const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK;
1515

1616
/// A container for a portion of the total bit vector and the associated indices.
1717
/// The bits within each chunk are stored from most significant bit (msb) to least significant bit (lsb).
@@ -30,7 +30,6 @@ const CHUNKS_PER_SUB_BLOCK: usize = BITS_PER_SUB_BLOCK / BITS_PER_CHUNK;
3030
/// sub-block rank: [ 0 ][ 2 ]
3131
/// ```
3232
#[derive(Clone, Debug)]
33-
#[repr(C)]
3433
struct Block {
3534
/// Rank of the first bit in this block (that is, the number of bits set in previous blocks).
3635
rank: u64,
@@ -39,38 +38,29 @@ struct Block {
3938
/// sub-blocks `0..i`. `sub_blocks[0]` is always zero.
4039
sub_blocks: [u16; SUB_BLOCKS_PER_BLOCK],
4140
/// The bit-vector.
42-
bits: [Chunk; BITS_PER_BLOCK / BITS_PER_CHUNK],
41+
bits: [SubblockBits; SUB_BLOCKS_PER_BLOCK],
4342
}
4443

4544
impl Block {
46-
fn new(rank: u64) -> Self {
47-
Self {
48-
rank,
49-
sub_blocks: [0; SUB_BLOCKS_PER_BLOCK],
50-
bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK],
51-
}
52-
}
53-
5445
/// Set a bit without updating `self.sub_blocks`.
5546
///
5647
/// This panics if the bit was already set, because that indicates that the original positions
5748
/// list is invalid/had duplicates.
5849
fn set(&mut self, index: usize) {
5950
assert!(index < BITS_PER_BLOCK);
60-
let chunk_idx = index / BITS_PER_CHUNK;
61-
let bit_idx = index % BITS_PER_CHUNK;
62-
let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx);
51+
let chunk_idx = index / BITS_PER_SUB_BLOCK;
52+
let bit_idx = index % BITS_PER_SUB_BLOCK;
53+
let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx);
6354
assert_eq!(self.bits[chunk_idx] & mask, 0, "toggling bits off indicates that the original data was incorrect, most likely containing duplicate values.");
6455
self.bits[chunk_idx] ^= mask;
6556
}
6657

6758
/// Tests whether the bit at the given index is set.
68-
#[allow(dead_code)]
6959
fn get(&self, index: usize) -> bool {
7060
assert!(index < BITS_PER_BLOCK);
71-
let chunk_idx = index / BITS_PER_CHUNK;
72-
let bit_idx = index % BITS_PER_CHUNK;
73-
let mask = 1 << ((BITS_PER_CHUNK - 1) - bit_idx);
61+
let chunk_idx = index / BITS_PER_SUB_BLOCK;
62+
let bit_idx = index % BITS_PER_SUB_BLOCK;
63+
let mask = 1 << ((BITS_PER_SUB_BLOCK - 1) - bit_idx);
7464
self.bits[chunk_idx] & mask != 0
7565
}
7666

@@ -84,19 +74,13 @@ impl Block {
8474
let sub_block = local_idx / BITS_PER_SUB_BLOCK;
8575
rank += self.sub_blocks[sub_block] as usize;
8676

87-
if BITS_PER_CHUNK != BITS_PER_SUB_BLOCK {
88-
for i in sub_block * CHUNKS_PER_SUB_BLOCK..local_idx / BITS_PER_CHUNK {
89-
rank += self.bits[i].count_ones() as usize;
90-
}
91-
}
92-
93-
let remainder = local_idx % BITS_PER_CHUNK;
77+
let remainder = local_idx % BITS_PER_SUB_BLOCK;
9478

95-
let last_chunk = local_idx / BITS_PER_CHUNK;
79+
let last_chunk = local_idx / BITS_PER_SUB_BLOCK;
9680
let masked = if remainder == 0 {
9781
0
9882
} else {
99-
self.bits[last_chunk] >> (BITS_PER_CHUNK - remainder)
83+
self.bits[last_chunk] >> (BITS_PER_SUB_BLOCK - remainder)
10084
};
10185
rank += masked.count_ones() as usize;
10286
let select = if masked == 0 {
@@ -110,7 +94,7 @@ impl Block {
11094
fn total_rank(&self) -> usize {
11195
self.sub_blocks[SUB_BLOCKS_PER_BLOCK - 1] as usize
11296
+ self.rank as usize
113-
+ self.bits[(SUB_BLOCKS_PER_BLOCK - 1) * CHUNKS_PER_SUB_BLOCK..]
97+
+ self.bits[SUB_BLOCKS_PER_BLOCK - 1..]
11498
.iter()
11599
.map(|c| c.count_ones() as usize)
116100
.sum::<usize>()
@@ -151,24 +135,11 @@ impl Block {
151135
}
152136
}
153137

154-
impl Default for Block {
155-
fn default() -> Self {
156-
Block {
157-
rank: 0,
158-
sub_blocks: [0u16; SUB_BLOCKS_PER_BLOCK],
159-
bits: [0; BITS_PER_BLOCK / BITS_PER_CHUNK],
160-
}
161-
}
162-
}
163-
164138
/// Builder for creating a [`BitRank`].
165139
///
166140
/// # Examples
167141
///
168142
/// ```text
169-
/// // Note: This should work as a doctest, except this module is not public.
170-
/// let mut bytes = Vec::<u8>::new();
171-
///
172143
/// let mut builder = BitRankBuilder::new();
173144
/// builder.push(17);
174145
/// builder.push(23);
@@ -179,9 +150,6 @@ impl Default for Block {
179150
#[derive(Default)]
180151
pub struct BitRankBuilder {
181152
blocks: Vec<Block>,
182-
curr_rank: u64,
183-
curr_block_id: usize,
184-
curr_block: Option<Block>,
185153
}
186154

187155
impl BitRankBuilder {
@@ -190,55 +158,56 @@ impl BitRankBuilder {
190158
Self::default()
191159
}
192160

193-
fn push_block(&mut self, mut block: Block) -> u64 {
194-
let mut local_rank = 0;
195-
for (i, chunk) in block.bits.iter().enumerate() {
196-
// If the settings are ever changed, CHUNKS_PER_SUB_BLOCK will likely no longer be 1, so
197-
// you will need this modulo.
198-
#[expect(clippy::modulo_one)]
199-
if i % CHUNKS_PER_SUB_BLOCK == 0 {
200-
block.sub_blocks[i / CHUNKS_PER_SUB_BLOCK] = local_rank;
161+
/// Returns a builder that can hold integers with values `0..cap`.
162+
pub fn with_capacity(cap: usize) -> Self {
163+
Self {
164+
blocks: Vec::with_capacity(cap.div_ceil(BITS_PER_BLOCK)),
165+
}
166+
}
167+
168+
fn finish_last_block(&mut self) -> u64 {
169+
if let Some(block) = self.blocks.last_mut() {
170+
let mut local_rank = 0;
171+
for (i, chunk) in block.bits.iter().enumerate() {
172+
block.sub_blocks[i] = local_rank;
173+
local_rank += chunk.count_ones() as u16;
201174
}
202-
local_rank += chunk.count_ones() as u16;
175+
block.rank + local_rank as u64
176+
} else {
177+
0
203178
}
204-
let end_rank = block.rank + local_rank as u64;
205-
self.blocks.push(block);
206-
end_rank
207179
}
208180

209181
/// Adds a bit. Bits must be added in order of increasing `position`.
210182
pub fn push(&mut self, position: usize) {
211183
let block_id = position / BITS_PER_BLOCK;
212184
assert!(
213-
self.curr_block_id <= block_id,
185+
self.blocks.len() <= block_id + 1,
214186
"positions must be increasing!"
215187
);
216-
while block_id > self.curr_block_id {
217-
let curr_block = self
218-
.curr_block
219-
.take()
220-
.unwrap_or_else(|| Block::new(self.curr_rank));
221-
let end_rank = self.push_block(curr_block);
222-
self.curr_rank = end_rank;
223-
self.curr_block_id += 1;
224-
}
225-
match &mut self.curr_block {
226-
None => {
227-
let mut block = Block::new(self.curr_rank);
228-
block.set(position % BITS_PER_BLOCK);
229-
self.curr_block = Some(block);
230-
}
231-
Some(block) => {
232-
block.set(position % BITS_PER_BLOCK);
188+
if block_id >= self.blocks.len() {
189+
let curr_rank = self.finish_last_block();
190+
while block_id >= self.blocks.len() {
191+
// Without this declared as a `const`, rustc 1.82 creates the Block value on the
192+
// stack first, then `memcpy`s it into `self.blocks`.
193+
const ZERO_BLOCK: Block = Block {
194+
rank: 0,
195+
sub_blocks: [0; SUB_BLOCKS_PER_BLOCK],
196+
bits: [0; SUB_BLOCKS_PER_BLOCK],
197+
};
198+
self.blocks.push(ZERO_BLOCK);
199+
self.blocks.last_mut().expect("just inserted").rank = curr_rank;
233200
}
234201
}
202+
self.blocks
203+
.last_mut()
204+
.expect("just ensured there are enough blocks")
205+
.set(position % BITS_PER_BLOCK);
235206
}
236207

237208
/// Finishes the `BitRank` by writing the last block of data.
238209
pub fn finish(mut self) -> BitRank {
239-
if let Some(last_block) = self.curr_block.take() {
240-
self.push_block(last_block);
241-
}
210+
self.finish_last_block();
242211
BitRank {
243212
blocks: self.blocks,
244213
}
@@ -256,8 +225,8 @@ impl BitRank {
256225
///
257226
/// # Panics
258227
/// This may panic if the values produced by `iter` are not strictly increasing.
259-
#[allow(clippy::should_implement_trait)]
260228
#[allow(dead_code)]
229+
#[allow(clippy::should_implement_trait)]
261230
pub fn from_iter<I: IntoIterator<Item = usize>>(iter: I) -> BitRank {
262231
let mut builder = BitRankBuilder::new();
263232
for position in iter {
@@ -457,7 +426,7 @@ mod tests {
457426
let mut rank = 0;
458427
let mut select = None;
459428
for i in 0..random_bits.capacity() {
460-
if i % BITS_PER_CHUNK == 0 {
429+
if i % BITS_PER_SUB_BLOCK == 0 {
461430
select = None;
462431
}
463432
assert_eq!(br.rank_select(i), (rank, select));
@@ -501,4 +470,30 @@ mod tests {
501470
}
502471
}
503472
}
473+
474+
#[test]
475+
fn test_large_gap() {
476+
let br = BitRank::from_iter((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17));
477+
for i in 1..15 {
478+
assert_eq!(br.rank(BITS_PER_BLOCK * i), 1);
479+
}
480+
for i in 0..18 {
481+
assert_eq!(br.rank(BITS_PER_BLOCK * 15 + i), 1 + i);
482+
}
483+
}
484+
485+
#[test]
486+
fn test_with_capacity() {
487+
let mut b = BitRankBuilder::with_capacity(BITS_PER_BLOCK * 3 - 1);
488+
let initial_capacity = b.blocks.capacity();
489+
assert!(initial_capacity >= 3);
490+
b.push(BITS_PER_BLOCK * 3 - 2); // should not have to grow
491+
assert_eq!(b.blocks.capacity(), initial_capacity);
492+
493+
let mut b = BitRankBuilder::with_capacity(BITS_PER_BLOCK * 3 + 1);
494+
let initial_capacity = b.blocks.capacity();
495+
assert!(initial_capacity >= 4);
496+
b.push(BITS_PER_BLOCK * 3); // should not have to grow
497+
assert_eq!(b.blocks.capacity(), initial_capacity);
498+
}
504499
}

0 commit comments

Comments
 (0)