Skip to content

Commit 3d63504

Browse files
committed
update README, docs, and include dropout into benchmarks
1 parent 5493b12 commit 3d63504

File tree

3 files changed

+38
-0
lines changed

3 files changed

+38
-0
lines changed

crates/bpe/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ Two additional encoders are included that are faster but deviate from the origin
219219

220220
- The greedy encoder picks the left-longest token.
221221
- The minimal encoder computes an encoding with the minimal number of tokens.
222+
- The minimal_dropout encoder implements the BPE-Dropout [algorithm](https://arxiv.org/abs/1910.13267), randomly ignoring some multi-byte tokens at runtime.
222223

223224
The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
224225
(All encodings were computed from scratch for each slice.)

crates/bpe/benchmarks/performance.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,17 @@ use bpe_benchmarks::*;
99
use criterion::{
1010
criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
1111
};
12+
use rand::rngs::StdRng;
13+
use rand::SeedableRng;
1214
use rand::{rng, Rng};
1315

16+
fn get_rng(seed: u64) -> StdRng {
17+
// Expand the u64 seed to 32 bytes
18+
let mut seed_bytes = [0u8; 32];
19+
seed_bytes[..8].copy_from_slice(&seed.to_le_bytes());
20+
StdRng::from_seed(seed_bytes)
21+
}
22+
1423
fn counting_benchmark(c: &mut Criterion) {
1524
for (name, bpe, _, _) in TOKENIZERS.iter() {
1625
let input = create_test_string(&bpe.bpe, 80_000);
@@ -92,6 +101,20 @@ fn encoding_benchmark(c: &mut Criterion) {
92101
criterion::BatchSize::SmallInput,
93102
)
94103
});
104+
group.bench_with_input(
105+
BenchmarkId::new("minimal_dropout", bytes),
106+
&bytes,
107+
|b, bytes| {
108+
b.iter_batched(
109+
|| select_test_string(&text, *bytes),
110+
|text| {
111+
bpe.bpe
112+
.encode_minimal_dropout(text.as_bytes(), 0.1, get_rng(0))
113+
},
114+
criterion::BatchSize::SmallInput,
115+
)
116+
},
117+
);
95118
group.bench_with_input(
96119
BenchmarkId::new("huggingface", bytes),
97120
&bytes,

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,20 @@ impl BytePairEncoding {
554554
/// This function computes the encoding while randomly rejecting some merges.
555555
/// Result of the encoding will be non-deterministic unless `seed` is provided.
556556
/// Implementation loosely follows original BPE dropout paper: https://arxiv.org/abs/1910.13267
557+
///
558+
/// In more detail: the tokenization uses dynamic programming, i.e. it models the tokenization as a graph,
559+
/// where every position between text bytes is a node and two nodes are connected when the text slice between those two nodes matches a token.
560+
/// It then tries to find the shortest possible path from the beginning of the text to the end, i.e. it finds the shortest possible encoding.
561+
/// For this it processes the nodes from left to right and visits all edges to the left. Then, it picks the edge which results in the shortest path.
562+
/// The length of the shortest path is stored as second value, the edge (or rather token) is stored as first value.
563+
///
564+
/// For the dropout (when dropout > 0.0), we uniformly drop edges from the graph, but always keep the one-byte tokens such that the graph stays connected.
565+
/// Note: this is very different from how BPE works and cannot produce the same output as the algorithm
566+
/// in the [paper's repository](https://github.com/VProv/BPE-Dropout/blob/master/bpe.py#L98), for three main reasons:
567+
/// - `encode_minimal` already doesn't follow the original heap-based BPE procedure
568+
/// - randomness source in dropout works differently in rust and python
569+
/// - BPE-dropout authors discard all multi-byte tokens for each word separately, while this implementation does not split the "sentence" into words first
570+
///   and hence may include previously discarded token later down the byte stream. At the sentence level though we don't expect it to make much difference.
557571
#[cfg(feature = "rand")]
558572
pub fn encode_minimal_dropout<R: rand::Rng>(
559573
&self,

0 commit comments

Comments
 (0)