Skip to content

Commit 42a11fb

Browse files
author
Hendrik van Antwerpen
committed
Move equivalence tests to bpe-openai
1 parent 5b127c9 commit 42a11fb

6 files changed

Lines changed: 120 additions & 128 deletions

File tree

crates/bpe-openai/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ rmp-serde = "1"
2020

2121
[dev-dependencies]
2222
tiktoken-rs = "0.6"
23+
bpe = { version = "0.1.0", path = "../bpe", features = ["rand"] }
2324

2425
[build-dependencies]
2526
base64 = "0.22.1"

crates/bpe-openai/src/lib.rs

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ use bpe::byte_pair_encoding::BytePairEncoding;
44
use either::Either;
55
use fancy_regex::Regex;
66

7+
// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
8+
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
9+
// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
10+
711
static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
812
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
913
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
@@ -56,6 +60,7 @@ pub struct Tokenizer {
5660
}
5761

5862
impl Tokenizer {
63+
/// Build a tokenizer with an optional pretokenization regex pattern.
5964
#[allow(clippy::result_large_err)]
6065
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
6166
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
@@ -109,45 +114,44 @@ pub fn o200k_base() -> &'static Tokenizer {
109114

110115
#[cfg(test)]
111116
mod tests {
112-
use tiktoken_rs::cl100k_base_singleton;
117+
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
118+
use tiktoken_rs::{
119+
cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton, r50k_base_singleton,
120+
CoreBPE,
121+
};
113122

114123
use super::*;
115124

116125
#[test]
117-
fn can_load_r50k() {
118-
r50k_base().count("");
126+
fn test_r50k() {
127+
test_equivalence(r50k_base(), &r50k_base_singleton().lock());
119128
}
120129

121130
#[test]
122-
fn can_load_p50k() {
123-
p50k_base().count("");
131+
fn test_p50k() {
132+
test_equivalence(p50k_base(), &p50k_base_singleton().lock());
124133
}
125134

126135
#[test]
127-
fn can_load_cl100k() {
128-
cl100k_base().count("");
136+
fn test_cl100k() {
137+
test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
129138
}
130139

131140
#[test]
132-
fn can_load_o200k() {
133-
o200k_base().count("");
141+
fn test_o200k() {
142+
test_equivalence(o200k_base(), &o200k_base_singleton().lock());
134143
}
135144

136-
/// Test demonstrating a case where input splitting makes a difference.
137-
#[test]
138-
fn splitting_difference() {
139-
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
140-
let input = text.as_bytes();
141-
let expected: Vec<_> = cl100k_base_singleton()
142-
.lock()
143-
.encode_ordinary(text)
144-
.into_iter()
145-
.collect();
146-
147-
let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
148-
assert_ne!(without_splitting, expected);
149-
150-
let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
151-
assert_eq!(with_splitting, expected);
145+
#[track_caller]
146+
fn test_equivalence(tok: &Tokenizer, tiktoken: &CoreBPE) {
147+
let text = create_test_string(&tok.bpe, 80_000);
148+
for bytes in [10, 100, 1000, 10_000] {
149+
for _ in 0..32 {
150+
let text = select_test_string(&text, bytes);
151+
let tokens = tok.encode(text);
152+
let tiktokens = tiktoken.encode_ordinary(text).to_vec();
153+
assert_eq!(tokens, tiktokens, "encoding mismatch for {text:?}");
154+
}
155+
}
152156
}
153157
}

crates/bpe/benchmarks/equivalence.rs

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1+
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
12
use bpe_benchmarks::*;
23

34
#[cfg(test)]
45
const N: usize = 32;
56

67
#[test]
7-
fn test_encoding_equivalence_without_pretokenization() {
8+
fn test_huggingface_encoding_equivalence_without_pretokenization() {
89
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
910
let huggingface = without_pretokenizer(huggingface);
10-
let text = create_test_string(&bpe.bpe, 20000);
11-
let inputs = (0..N)
12-
.map(|_| select_test_bytes(text.as_bytes(), 100))
11+
let text = create_test_string(&bpe.bpe, 80_000);
12+
let texts = (0..N)
13+
.map(|_| select_test_string(&text, 100))
1314
.chain(std::iter::once(
14-
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
15+
"You should see the Greek word 'kosme': \"κόσμε\"",
1516
));
16-
for input in inputs {
17-
let text = std::str::from_utf8(input).unwrap();
18-
let out = bpe.bpe.encode_via_backtracking(input);
17+
for text in texts {
18+
let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
1919
let huggingface_out = huggingface
2020
.encode_fast(text, false)
2121
.unwrap()
@@ -41,48 +41,35 @@ fn test_encoding_equivalence_without_pretokenization() {
4141
}
4242

4343
#[test]
44-
fn test_encoding_equivalence_with_pretokenization() {
45-
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
46-
let text = create_test_string(&bpe.bpe, 20000);
47-
let inputs = (0..N)
48-
.map(|_| select_test_bytes(text.as_bytes(), 100))
44+
fn test_huggingface_encoding_equivalence_with_pretokenization() {
45+
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
46+
let text = create_test_string(&bpe.bpe, 80_000);
47+
let texts = (0..N)
48+
.map(|_| select_test_string(&text, 100))
4949
.chain(std::iter::once(
50-
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
50+
"You should see the Greek word 'kosme': \"κόσμε\"",
5151
));
52-
for input in inputs {
53-
let text = std::str::from_utf8(input).unwrap();
52+
for text in texts {
5453
let out = bpe.encode(text);
55-
let tiktoken_out = tiktoken.encode_ordinary(text);
56-
let tiktoken_out2 = tiktoken_out.to_vec();
57-
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
5854
let huggingface_out = huggingface
5955
.encode_fast(text, false)
6056
.unwrap()
6157
.get_ids()
6258
.to_vec();
63-
if tiktoken_out2 != huggingface_out {
59+
60+
if huggingface_out != out {
61+
let text = bpe.decode(&out).unwrap();
6462
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
65-
if tiktoken_text != huggingface_text {
63+
if huggingface_text != text {
6664
panic!(
6765
"huggingface tokens and text differ: {:?} != {:?}",
68-
huggingface_text, tiktoken_text
66+
text, huggingface_text
6967
);
7068
} else {
7169
panic!(
7270
"huggingface tokens differ: {:?} != {:?}",
73-
huggingface_out, tiktoken_out2
74-
);
75-
}
76-
}
77-
if tiktoken_out2 != out {
78-
let text = bpe.decode(&out).unwrap();
79-
if tiktoken_text != text {
80-
panic!(
81-
"bpe tokens and text differ: {:?} != {:?}",
82-
text, tiktoken_text
71+
out, huggingface_out
8372
);
84-
} else {
85-
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
8673
}
8774
}
8875
}

crates/bpe/benchmarks/lib.rs

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
use std::sync::LazyLock;
22

3-
use bpe::byte_pair_encoding::BytePairEncoding;
43
use bpe_openai::Tokenizer;
5-
use rand::{thread_rng, Rng};
64
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
75
use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel;
86
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;
@@ -31,46 +29,6 @@ pub static TOKENIZERS: LazyLock<
3129
]
3230
});
3331

34-
pub fn is_char_boundary(b: u8) -> bool {
35-
// Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
36-
// Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
37-
// The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192
38-
// When interpreting the byte representation as signed integers, then numbers in the range 128..192
39-
// correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and [192, 256) can
40-
// be tested with a single signed comparison.
41-
b as i8 >= -0x40 // NB: b < 128 || b >= 192
42-
}
43-
44-
pub fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
45-
use rand::{thread_rng, Rng};
46-
let mut text = String::new();
47-
for _ in 0..tokens {
48-
loop {
49-
let i = thread_rng().gen_range(0..bpe.num_tokens());
50-
let s = bpe.token_bytes(i as u32);
51-
if s.iter().all(|b| is_char_boundary(*b)) {
52-
if let Ok(s) = std::str::from_utf8(s) {
53-
text.push_str(s);
54-
break;
55-
}
56-
}
57-
}
58-
}
59-
text
60-
}
61-
62-
pub fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
63-
let mut start = thread_rng().gen_range(0..input.len() - bytes);
64-
while start > 0 && !is_char_boundary(input[start]) {
65-
start -= 1;
66-
}
67-
let mut end = start + bytes;
68-
while end < input.len() && !is_char_boundary(input[end]) {
69-
end += 1;
70-
}
71-
&input[start..end]
72-
}
73-
7432
pub fn without_pretokenizer(enc: &HuggingfaceTokenizer) -> HuggingfaceTokenizer {
7533
let mut enc = enc.clone();
7634
// boolean values taken from Xenova's tokenizer config

0 commit comments

Comments (0)