Skip to content

Commit 3d63504

Browse files
committed
update README, docs, and include dropout into benchmarks
1 parent 5493b12 commit 3d63504

File tree

3 files changed

+38
-0
lines changed

3 files changed

+38
-0
lines changed

crates/bpe/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ Two additional encoders are included that are faster but deviate from the origin
219219

220220
- The greedy encoder picks the left-longest token.
221221
- The minimal encoder computes an encoding with the minimal number of tokens.
222+
- The minimal_dropout encoder implements the BPE-Dropout [algorithm](https://arxiv.org/abs/1910.13267), randomly ignoring some multi-byte tokens at runtime.
222223

223224
The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
224225
(All encodings were computed from scratch for each slice.)

crates/bpe/benchmarks/performance.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,17 @@ use bpe_benchmarks::*;
99
use criterion::{
1010
criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
1111
};
12+
use rand::rngs::StdRng;
13+
use rand::SeedableRng;
1214
use rand::{rng, Rng};
1315

16+
fn get_rng(seed: u64) -> StdRng {
17+
// Expand the u64 seed to 32 bytes
18+
let mut seed_bytes = [0u8; 32];
19+
seed_bytes[..8].copy_from_slice(&seed.to_le_bytes());
20+
StdRng::from_seed(seed_bytes)
21+
}
22+
1423
fn counting_benchmark(c: &mut Criterion) {
1524
for (name, bpe, _, _) in TOKENIZERS.iter() {
1625
let input = create_test_string(&bpe.bpe, 80_000);
@@ -92,6 +101,20 @@ fn encoding_benchmark(c: &mut Criterion) {
92101
criterion::BatchSize::SmallInput,
93102
)
94103
});
104+
group.bench_with_input(
105+
BenchmarkId::new("minimal_dropout", bytes),
106+
&bytes,
107+
|b, bytes| {
108+
b.iter_batched(
109+
|| select_test_string(&text, *bytes),
110+
|text| {
111+
bpe.bpe
112+
.encode_minimal_dropout(text.as_bytes(), 0.1, get_rng(0))
113+
},
114+
criterion::BatchSize::SmallInput,
115+
)
116+
},
117+
);
95118
group.bench_with_input(
96119
BenchmarkId::new("huggingface", bytes),
97120
&bytes,

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,20 @@ impl BytePairEncoding {
554554
/// This function computes the encoding while randomly rejecting some merges.
555555
/// Result of the encoding will be non-deterministic unless `seed` is provided.
556556
/// Implementation loosely follows original BPE dropout paper: https://arxiv.org/abs/1910.13267
557+
///
558+
/// In more detail: the tokenization uses dynamic programming, i.e. it models the tokenization as a graph,
559+
/// where every position between text bytes is a node and two nodes are connected when the text slice between those two nodes matches a token.
560+
/// It then tries to find the shortest possible path from the beginning of the text to the end, i.e. it finds the shortest possible encoding.
561+
/// For this it processes the nodes from left to right and visits all edges to the left. Then, it picks the edge which results in the shortest path.
562+
/// The length of the shortest path is stored as second value, the edge (or rather token) is stored as first value.
563+
///
564+
/// For the dropout (when dropout > 0.0), we uniformly drop edges from the graph, but always keep the one-byte tokens such that the graph stays connected.
565+
/// Note: this is very different from how BPE works and cannot produce the same output as the algorithm
566+
/// in the [paper's repository](https://github.com/VProv/BPE-Dropout/blob/master/bpe.py#L98), for three main reasons:
567+
/// - `encode_minimal` already doesn't follow the original heap-based BPE procedure
568+
/// - randomness source in dropout works differently in rust and python
569+
/// - BPE-dropout authors discard all multi-byte tokens for each word separately, while this implementation does not split the "sentence" into words first
570+
///   and hence may include previously discarded token later down the byte stream. At the sentence level though we don't expect it to make much difference.
557571
#[cfg(feature = "rand")]
558572
pub fn encode_minimal_dropout<R: rand::Rng>(
559573
&self,

0 commit comments

Comments
 (0)