Skip to content

Commit d86dcde

Browse files
aneubeck authored and Hendrik van Antwerpen committed
fix eof negative look-ahead
1 parent 35c047d commit d86dcde

2 files changed

Lines changed: 4 additions & 2 deletions

File tree

crates/bpe-openai/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,10 @@ use regex_automata::{meta::Regex, util::captures::Captures, Anchored, Input};
1111
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
1212
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
1313
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
14-
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
14+
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
1515
// Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
1616
// The look-ahead character is dropped from the match by the SpecialRegexp iterator.
17+
// Note: The negative look-ahead requires also the pattern `\\s+$` to handle end of file without dropping a character!
1718
let pat2 = "\\s+\\s";
1819
let pat3 = "\\s+";
1920
Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -28,6 +29,7 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
2829
"\\p{N}{1,3}",
2930
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
3031
"\\s*[\\r\\n]+",
32+
"\\s+$",
3133
].join("|");
3234
let pat2 = "\\s+\\s";
3335
let pat3 = "\\s+";

crates/bpe/benchmarks/equivalence.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ fn test_huggingface_encoding_equivalence_with_pretokenization() {
4747
let texts = (0..N)
4848
.map(|_| select_test_string(&text, 100))
4949
.chain(std::iter::once(
50-
"You should see the Greek word 'kosme': \"κόσμε\"",
50+
"You should see the Greek word 'kosme': \"κόσμε\" ",
5151
));
5252
for text in texts {
5353
let out = bpe.encode(text);

0 commit comments

Comments
 (0)