Skip to content

Commit 42a11fb

Browse files
author
Hendrik van Antwerpen
committed
Move equivalence tests to bpe-openai
1 parent 5b127c9 commit 42a11fb

6 files changed

Lines changed: 120 additions & 128 deletions

File tree

crates/bpe-openai/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ rmp-serde = "1"
2020

2121
[dev-dependencies]
2222
tiktoken-rs = "0.6"
23+
bpe = { version = "0.1.0", path = "../bpe", features = ["rand"] }
2324

2425
[build-dependencies]
2526
base64 = "0.22.1"

crates/bpe-openai/src/lib.rs

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ use bpe::byte_pair_encoding::BytePairEncoding;
44
use either::Either;
55
use fancy_regex::Regex;
66

7+
// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
8+
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
9+
// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
10+
711
static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
812
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
913
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
@@ -56,6 +60,7 @@ pub struct Tokenizer {
5660
}
5761

5862
impl Tokenizer {
63+
/// Build a tokenizer with an optional pretokenization regex pattern.
5964
#[allow(clippy::result_large_err)]
6065
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
6166
let pat = pat.map(fancy_regex::Regex::new).transpose()?;
@@ -109,45 +114,44 @@ pub fn o200k_base() -> &'static Tokenizer {
109114

110115
#[cfg(test)]
111116
mod tests {
112-
use tiktoken_rs::cl100k_base_singleton;
117+
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
118+
use tiktoken_rs::{
119+
cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton, r50k_base_singleton,
120+
CoreBPE,
121+
};
113122

114123
use super::*;
115124

116125
#[test]
117-
fn can_load_r50k() {
118-
r50k_base().count("");
126+
fn test_r50k() {
127+
test_equivalence(r50k_base(), &r50k_base_singleton().lock());
119128
}
120129

121130
#[test]
122-
fn can_load_p50k() {
123-
p50k_base().count("");
131+
fn test_p50k() {
132+
test_equivalence(p50k_base(), &p50k_base_singleton().lock());
124133
}
125134

126135
#[test]
127-
fn can_load_cl100k() {
128-
cl100k_base().count("");
136+
fn test_cl100k() {
137+
test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());
129138
}
130139

131140
#[test]
132-
fn can_load_o200k() {
133-
o200k_base().count("");
141+
fn test_o200k() {
142+
test_equivalence(o200k_base(), &o200k_base_singleton().lock());
134143
}
135144

136-
/// Test demonstrating a case where input splitting makes a difference.
137-
#[test]
138-
fn splitting_difference() {
139-
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
140-
let input = text.as_bytes();
141-
let expected: Vec<_> = cl100k_base_singleton()
142-
.lock()
143-
.encode_ordinary(text)
144-
.into_iter()
145-
.collect();
146-
147-
let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
148-
assert_ne!(without_splitting, expected);
149-
150-
let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
151-
assert_eq!(with_splitting, expected);
145+
#[track_caller]
146+
fn test_equivalence(tok: &Tokenizer, tiktoken: &CoreBPE) {
147+
let text = create_test_string(&tok.bpe, 80_000);
148+
for bytes in [10, 100, 1000, 10_000] {
149+
for _ in 0..32 {
150+
let text = select_test_string(&text, bytes);
151+
let tokens = tok.encode(text);
152+
let tiktokens = tiktoken.encode_ordinary(text).to_vec();
153+
assert_eq!(tokens, tiktokens, "encoding mismatch for {text:?}");
154+
}
155+
}
152156
}
153157
}

crates/bpe/benchmarks/equivalence.rs

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1+
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
12
use bpe_benchmarks::*;
23

34
#[cfg(test)]
45
const N: usize = 32;
56

67
#[test]
7-
fn test_encoding_equivalence_without_pretokenization() {
8+
fn test_huggingface_encoding_equivalence_without_pretokenization() {
89
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
910
let huggingface = without_pretokenizer(huggingface);
10-
let text = create_test_string(&bpe.bpe, 20000);
11-
let inputs = (0..N)
12-
.map(|_| select_test_bytes(text.as_bytes(), 100))
11+
let text = create_test_string(&bpe.bpe, 80_000);
12+
let texts = (0..N)
13+
.map(|_| select_test_string(&text, 100))
1314
.chain(std::iter::once(
14-
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
15+
"You should see the Greek word 'kosme': \"κόσμε\"",
1516
));
16-
for input in inputs {
17-
let text = std::str::from_utf8(input).unwrap();
18-
let out = bpe.bpe.encode_via_backtracking(input);
17+
for text in texts {
18+
let out = bpe.bpe.encode_via_backtracking(text.as_bytes());
1919
let huggingface_out = huggingface
2020
.encode_fast(text, false)
2121
.unwrap()
@@ -41,48 +41,35 @@ fn test_encoding_equivalence_without_pretokenization() {
4141
}
4242

4343
#[test]
44-
fn test_encoding_equivalence_with_pretokenization() {
45-
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
46-
let text = create_test_string(&bpe.bpe, 20000);
47-
let inputs = (0..N)
48-
.map(|_| select_test_bytes(text.as_bytes(), 100))
44+
fn test_huggingface_encoding_equivalence_with_pretokenization() {
45+
for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
46+
let text = create_test_string(&bpe.bpe, 80_000);
47+
let texts = (0..N)
48+
.map(|_| select_test_string(&text, 100))
4949
.chain(std::iter::once(
50-
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
50+
"You should see the Greek word 'kosme': \"κόσμε\"",
5151
));
52-
for input in inputs {
53-
let text = std::str::from_utf8(input).unwrap();
52+
for text in texts {
5453
let out = bpe.encode(text);
55-
let tiktoken_out = tiktoken.encode_ordinary(text);
56-
let tiktoken_out2 = tiktoken_out.to_vec();
57-
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
5854
let huggingface_out = huggingface
5955
.encode_fast(text, false)
6056
.unwrap()
6157
.get_ids()
6258
.to_vec();
63-
if tiktoken_out2 != huggingface_out {
59+
60+
if huggingface_out != out {
61+
let text = bpe.decode(&out).unwrap();
6462
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
65-
if tiktoken_text != huggingface_text {
63+
if huggingface_text != text {
6664
panic!(
6765
"huggingface tokens and text differ: {:?} != {:?}",
68-
huggingface_text, tiktoken_text
66+
text, huggingface_text
6967
);
7068
} else {
7169
panic!(
7270
"huggingface tokens differ: {:?} != {:?}",
73-
huggingface_out, tiktoken_out2
74-
);
75-
}
76-
}
77-
if tiktoken_out2 != out {
78-
let text = bpe.decode(&out).unwrap();
79-
if tiktoken_text != text {
80-
panic!(
81-
"bpe tokens and text differ: {:?} != {:?}",
82-
text, tiktoken_text
71+
out, huggingface_out
8372
);
84-
} else {
85-
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
8673
}
8774
}
8875
}

crates/bpe/benchmarks/lib.rs

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
use std::sync::LazyLock;
22

3-
use bpe::byte_pair_encoding::BytePairEncoding;
43
use bpe_openai::Tokenizer;
5-
use rand::{thread_rng, Rng};
64
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
75
use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel;
86
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;
@@ -31,46 +29,6 @@ pub static TOKENIZERS: LazyLock<
3129
]
3230
});
3331

34-
pub fn is_char_boundary(b: u8) -> bool {
35-
// Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128
36-
// Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192
37-
// The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192
38-
// When interpreting the byte representation as signed integers, then numbers in the range 128..192
39-
// correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and [192, 256) can
40-
// be tested with a single signed comparison.
41-
b as i8 >= -0x40 // NB: b < 128 || b >= 192
42-
}
43-
44-
pub fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String {
45-
use rand::{thread_rng, Rng};
46-
let mut text = String::new();
47-
for _ in 0..tokens {
48-
loop {
49-
let i = thread_rng().gen_range(0..bpe.num_tokens());
50-
let s = bpe.token_bytes(i as u32);
51-
if s.iter().all(|b| is_char_boundary(*b)) {
52-
if let Ok(s) = std::str::from_utf8(s) {
53-
text.push_str(s);
54-
break;
55-
}
56-
}
57-
}
58-
}
59-
text
60-
}
61-
62-
pub fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
63-
let mut start = thread_rng().gen_range(0..input.len() - bytes);
64-
while start > 0 && !is_char_boundary(input[start]) {
65-
start -= 1;
66-
}
67-
let mut end = start + bytes;
68-
while end < input.len() && !is_char_boundary(input[end]) {
69-
end += 1;
70-
}
71-
&input[start..end]
72-
}
73-
7432
pub fn without_pretokenizer(enc: &HuggingfaceTokenizer) -> HuggingfaceTokenizer {
7533
let mut enc = enc.clone();
7634
// boolean values taken from Xenova's tokenizer config

0 commit comments

Comments (0)