Skip to content

Commit 8c9fda4

Browse files
authored
Merge pull request #98 from marinegor/feature/add-dropout
Implement dropout for `encode_minimal`
2 parents 49c1f0d + 83dd2f1 commit 8c9fda4

File tree

9 files changed

+387
-378
lines changed

9 files changed

+387
-378
lines changed

crates/bpe/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ We benchmarked the following scenarios:
203203
The data structure we built specifically for this purpose can answer those interval counting requests in typically constant times after the initial linear preprocessing of the text.
204204
This mode is not available in tiktoken, which only supports counting/encoding a complete text.
205205

206-
All benchmarks were run single-threaded on a MacBook Pro M1.
206+
All benchmarks were run single-threaded on a MacBook Air M4.
207207

208208
### Encoding
209209

@@ -219,6 +219,7 @@ Two additional encoders are included that are faster but deviate from the origin
219219

220220
- The greedy encoder picks the left-longest token.
221221
- The minimal encoder computes an encoding with the minimal number of tokens.
222+
- The minimal_dropout encoder implements the BPE-Dropout [algorithm](https://arxiv.org/abs/1910.13267), randomly ignoring some multi-byte tokens at runtime. Note that this implementation differs from the paper, and **has not** been tested in an actual language-model training pipeline.
222223

223224
The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
224225
(All encodings were computed from scratch for each slice.)

crates/bpe/benchmarks/performance.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use bpe_benchmarks::*;
99
use criterion::{
1010
criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
1111
};
12+
use rand::rngs::StdRng;
13+
use rand::SeedableRng;
1214
use rand::{rng, Rng};
1315

1416
fn counting_benchmark(c: &mut Criterion) {
@@ -92,6 +94,17 @@ fn encoding_benchmark(c: &mut Criterion) {
9294
criterion::BatchSize::SmallInput,
9395
)
9496
});
97+
group.bench_with_input(
98+
BenchmarkId::new("minimal_dropout", bytes),
99+
&bytes,
100+
|b, bytes| {
101+
b.iter_batched(
102+
|| select_test_string(&text, *bytes),
103+
|text| bpe.bpe.encode_minimal_dropout(text.as_bytes(), 0.1, rng()),
104+
criterion::BatchSize::SmallInput,
105+
)
106+
},
107+
);
95108
group.bench_with_input(
96109
BenchmarkId::new("huggingface", bytes),
97110
&bytes,

crates/bpe/images/performance-appending.svg

Lines changed: 35 additions & 62 deletions
Loading

crates/bpe/images/performance-comparison.svg

Lines changed: 44 additions & 74 deletions
Loading

crates/bpe/images/performance-counting.svg

Lines changed: 35 additions & 62 deletions
Loading

crates/bpe/images/performance-encoding.svg

Lines changed: 74 additions & 86 deletions
Loading

crates/bpe/images/performance-worstcase.svg

Lines changed: 73 additions & 89 deletions
Loading

crates/bpe/src/byte_pair_encoding.rs

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -526,9 +526,9 @@ impl BytePairEncoding {
526526
/// tokenization produced by the original BPE algorithm.
527527
pub fn encode_minimal(&self, text: &[u8]) -> Vec<u32> {
528528
let mut last_token: Vec<(u32, u32)> = Vec::with_capacity(text.len());
529-
let mut state = self.overlapping_searcher.start_state();
530-
for (pos, c) in text.iter().enumerate() {
531-
let (s, iter) = self.overlapping_searcher.consume(state, pos + 1, *c);
529+
let mut state = self.overlapping_searcher_rev.start_state();
530+
for (pos, c) in text.iter().rev().enumerate() {
531+
let (s, iter) = self.overlapping_searcher_rev.consume(state, pos + 1, *c);
532532
state = s;
533533
let mut best = (0, u32::MAX);
534534
for m in iter {
@@ -548,7 +548,66 @@ impl BytePairEncoding {
548548
encoded.push(token);
549549
pos -= self.token_len(token);
550550
}
551-
encoded.reverse();
551+
encoded
552+
}
553+
554+
/// This function computes the encoding while randomly rejecting some merges.
555+
/// Result of the encoding will be non-deterministic unless `seed` is provided.
556+
/// Implementation loosely follows original BPE dropout paper: https://arxiv.org/abs/1910.13267
557+
///
558+
/// In more detail: the tokenization uses dynamic programming, i.e. it models the tokenization as a graph,
559+
/// where every position between text bytes is a node and two nodes are connected when the text slice between those two nodes matches a token.
560+
// It then tries to find the shortest possible path from the beginning of the text till the end, i.e. it finds the shortest possible encoding.
561+
// For this nodes are processed from right to left. At each node, edges starting at that node and ending on the right are tested and
562+
// the one producing the shortest path is stored together with the length of the shortest path to that node.
563+
// The length of the shortest path is stored as second value, the edge (or rather token) is stored as first value.
564+
// Then, we walk in reverse direction through the table along the shortest path.
565+
// Note: the reason for constructing the table from back to front is that
566+
// the reconstruction outputs the path from start till end (i.e. we don't have to reverse the path afterwards).
567+
//
568+
// For the dropout (when dropout > 0.0), we uniformly drop edges from the graph, but always keep the one-byte tokens such that the graph stays connected.
569+
// Note: this is very different from how BPE works and cannot produce the same output as the algorithm
570+
// in the [paper's repository](https://github.com/VProv/BPE-Dropout/blob/master/bpe.py#L98), for two main reasons:
571+
// - `encode_minimal` already doesn't follow the original heap-based BPE procedure
572+
// - BPE-dropout authors discard all multi-byte tokens for each word separately, while this implementation does not split the "sentence" into words first
573+
// and hence may include previously discarded token later down the byte stream. At the sentence level though we don't expect it to make much difference.
574+
// Also, this implementation of BPE constructs merges on the fly from the set of tokens, hence might come up with a different set of merges with the same dictionary.
575+
#[cfg(feature = "rand")]
576+
pub fn encode_minimal_dropout<R: rand::Rng>(
577+
&self,
578+
text: &[u8],
579+
dropout: f32,
580+
mut rng: R,
581+
) -> Vec<u32> {
582+
assert!(0.0 <= dropout);
583+
assert!(dropout <= 1.0);
584+
585+
let mut last_token: Vec<(u32, u32)> = Vec::with_capacity(text.len());
586+
let mut state = self.overlapping_searcher_rev.start_state();
587+
for (pos, c) in text.iter().rev().enumerate() {
588+
let (s, iter) = self.overlapping_searcher_rev.consume(state, pos + 1, *c);
589+
state = s;
590+
let mut best = (0, u32::MAX);
591+
for m in iter {
592+
if m.end() > m.start() + 1 && dropout >= rng.random() {
593+
continue;
594+
}
595+
if m.start() == 0 {
596+
best = (m.value(), 1);
597+
break;
598+
} else if last_token[m.start() - 1].1 + 1 < best.1 {
599+
best = (m.value(), last_token[m.start() - 1].1 + 1);
600+
}
601+
}
602+
last_token.push(best);
603+
}
604+
let mut encoded = Vec::with_capacity(last_token.last().map(|l| l.1 as usize).unwrap_or(0));
605+
let mut pos = text.len();
606+
while pos > 0 {
607+
let token = last_token[pos - 1].0;
608+
encoded.push(token);
609+
pos -= self.token_len(token);
610+
}
552611
encoded
553612
}
554613
}

crates/bpe/tests/src/lib.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,4 +141,52 @@ mod tests {
141141
assert_eq!(enc.token_count(), bpe.count(&input[i..]));
142142
}
143143
}
144+
145+
#[test]
146+
fn test_bpe_dropout() {
147+
use rand::rngs::StdRng;
148+
use rand::SeedableRng;
149+
150+
fn get_rng(seed: u64) -> StdRng {
151+
// Expand the u64 seed to 32 bytes
152+
let mut seed_bytes = [0u8; 32];
153+
seed_bytes[..8].copy_from_slice(&seed.to_le_bytes());
154+
StdRng::from_seed(seed_bytes)
155+
}
156+
157+
let bpe = &cl100k_base().bpe;
158+
let bytes = 10000;
159+
for _ in 0..8 {
160+
let input = create_test_bytes(bpe, bytes);
161+
let encoded = bpe.encode_minimal(&input);
162+
let encoded_d_0_2 = bpe.encode_minimal_dropout(&input, 0.2, get_rng(0));
163+
let encoded_d_0_9 = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
164+
let encoded_d_1_0 = bpe.encode_minimal_dropout(&input, 1.0, get_rng(1));
165+
let encoded_d_0_9_again = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
166+
let decoded = bpe.decode_tokens(&encoded);
167+
let decoded_min = bpe.decode_tokens(&encoded_d_0_2);
168+
let decoded_max = bpe.decode_tokens(&encoded_d_0_9);
169+
let decoded_max_again = bpe.decode_tokens(&encoded_d_0_9_again);
170+
println!(
171+
"Input length: {}, Encoded length: {}, Encoded with dropout length: {}-{}, max {}",
172+
input.len(),
173+
encoded.len(),
174+
encoded_d_0_2.len(),
175+
encoded_d_0_9.len(),
176+
encoded_d_0_9_again.len()
177+
);
178+
assert_eq!(encoded_d_0_9, encoded_d_0_9_again);
179+
assert_eq!(input, decoded);
180+
assert_eq!(input, decoded_min);
181+
assert_eq!(input, decoded_max);
182+
assert_eq!(input, decoded_max_again);
183+
assert_eq!(input.len(), encoded_d_1_0.len());
184+
assert!(encoded_d_0_2.len() >= encoded.len());
185+
assert!(encoded_d_0_9.len() > encoded.len());
186+
187+
assert_ne!(encoded, encoded_d_0_2);
188+
assert_ne!(encoded, encoded_d_0_9);
189+
assert_ne!(encoded_d_0_9, encoded_d_1_0);
190+
}
191+
}
144192
}

0 commit comments

Comments
 (0)