Very fast BPE dropout implementation

aneubeck · aneubeck · commit c2152103c8ca · 2026-02-27T08:46:52.000+01:00
diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs
@@ -526,9 +526,9 @@ impl BytePairEncoding {
     /// tokenization produced by the original BPE algorithm.
     pub fn encode_minimal(&self, text: &[u8]) -> Vec<u32> {
         let mut last_token: Vec<(u32, u32)> = Vec::with_capacity(text.len());
-        let mut state = self.overlapping_searcher.start_state();
-        for (pos, c) in text.iter().enumerate() {
-            let (s, iter) = self.overlapping_searcher.consume(state, pos + 1, *c);
+        let mut state = self.overlapping_searcher_rev.start_state();
+        for (pos, c) in text.iter().rev().enumerate() {
+            let (s, iter) = self.overlapping_searcher_rev.consume(state, pos + 1, *c);
             state = s;
             let mut best = (0, u32::MAX);
             for m in iter {
@@ -548,7 +548,43 @@ impl BytePairEncoding {
             encoded.push(token);
             pos -= self.token_len(token);
         }
-        encoded.reverse();
+        encoded
+    }
+
+    /// Encodes `text` like `encode_minimal`, skipping each multi-byte match with probability `dropout`.
+    /// Panics unless `0.0 <= dropout <= 1.0`. Output is reproducible only for a deterministically seeded `rng`.
+    /// Implementation loosely follows original BPE dropout paper: https://arxiv.org/abs/1910.13267
+    #[cfg(feature = "rand")]
+    pub fn encode_minimal_dropout<R: rand::Rng>(&self, text: &[u8], dropout: f32, mut rng: R) -> Vec<u32> {
+        assert!((0.0..=1.0).contains(&dropout), "dropout must be in [0.0, 1.0]");
+        // `last_token[i]` = (token, token count) chosen for the `i + 1` bytes consumed so far, from the end.
+        let mut last_token: Vec<(u32, u32)> = Vec::with_capacity(text.len());
+        let mut state = self.overlapping_searcher_rev.start_state();
+        for (pos, c) in text.iter().rev().enumerate() {
+            let (s, iter) = self.overlapping_searcher_rev.consume(state, pos + 1, *c);
+            state = s;
+            let mut best = (0, u32::MAX);
+            for m in iter {
+                // Strict `<` against a sample from `[0, 1)`: 0.0 never drops, 1.0 always drops.
+                if m.end() > m.start() + 1 && rng.random::<f32>() < dropout {
+                    continue;
+                }
+                if m.start() == 0 {
+                    best = (m.value(), 1);
+                    break;
+                } else if last_token[m.start() - 1].1 + 1 < best.1 {
+                    best = (m.value(), last_token[m.start() - 1].1 + 1);
+                }
+            }
+            last_token.push(best);
+        }
+        let mut encoded = Vec::with_capacity(last_token.last().map(|l| l.1 as usize).unwrap_or(0));
+        let mut pos = text.len();
+        while pos > 0 {
+            let token = last_token[pos - 1].0;
+            encoded.push(token);
+            pos -= self.token_len(token);
+        }
         encoded
     }
 }
diff --git a/crates/bpe/tests/Cargo.toml b/crates/bpe/tests/Cargo.toml
@@ -8,3 +8,6 @@ bpe-openai = { path = "../../bpe-openai" }
 itertools = "0.14"
 rand = "0.9"
 tiktoken-rs = "0.9"
+
+[dev-dependencies]
+rand_chacha = { version = "0.9" }
diff --git a/crates/bpe/tests/src/lib.rs b/crates/bpe/tests/src/lib.rs
@@ -1,5 +1,7 @@
 #[cfg(test)]
 mod tests {
+    use std::time;
+
     use itertools::Itertools;
     use rand::{rng, Rng};
     use tiktoken_rs::cl100k_base_singleton;
@@ -141,4 +143,45 @@ mod tests {
             assert_eq!(enc.token_count(), bpe.count(&input[i..]));
         }
     }
+
+    #[test]
+    fn test_bpe_dropout() {
+        use rand::rngs::StdRng;
+        use rand::SeedableRng;
+
+        fn get_rng(seed: u64) -> StdRng {
+            // `seed_from_u64` expands the seed itself, so small values like 0 and 1 are fine.
+            StdRng::seed_from_u64(seed)
+        }
+
+        let bpe = &cl100k_base().bpe;
+        for bytes in [10000, 20000] {
+            for _ in 0..8 {
+                let input = create_test_bytes(bpe, bytes);
+                let encoded = bpe.encode_minimal(&input);
+                let encoded_d_min = bpe.encode_minimal_dropout(&input, 0.2, get_rng(0));
+                let encoded_d_max = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
+                let encoded_d_1_0 = bpe.encode_minimal_dropout(&input, 1.0, get_rng(2));
+                let decoded = bpe.decode_tokens(&encoded);
+                let decoded_min = bpe.decode_tokens(&encoded_d_min);
+                let decoded_max = bpe.decode_tokens(&encoded_d_max);
+                let decoded_1_0 = bpe.decode_tokens(&encoded_d_1_0);
+                println!("Input length: {}, Encoded length: {}, Encoded with dropout length: {}-{}, max {}",
+                    input.len(), encoded.len(), encoded_d_min.len(), encoded_d_max.len(), encoded_d_1_0.len());
+                assert_eq!(input, decoded);
+                assert_eq!(input, decoded_min);
+                assert_eq!(input, decoded_max);
+                assert_eq!(input, decoded_1_0);
+                // With dropout 1.0 every multi-byte match is rejected: one token per byte.
+                assert_eq!(input.len(), encoded_d_1_0.len());
+                // Dropout only removes candidate tokens, so it cannot beat the minimal encoding.
+                assert!(encoded_d_min.len() >= encoded.len());
+                assert!(encoded_d_max.len() > encoded.len());
+
+                assert_ne!(encoded, encoded_d_min);
+                assert_ne!(encoded, encoded_d_max);
+                assert_ne!(encoded_d_max, encoded_d_1_0);
+            }
+        }
+    }
 }