@@ -2,7 +2,11 @@ use std::sync::LazyLock;
22
33use bpe:: byte_pair_encoding:: BytePairEncoding ;
44use either:: Either ;
5- use fancy_regex:: Regex ;
5+ use regex_automata:: {
6+ meta:: { BuildError , Regex } ,
7+ util:: captures:: Captures ,
8+ Anchored , Input ,
9+ } ;
610
711// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
812// The look-ahead character is dropped from the match by the Pretokenizer iterator.
@@ -11,23 +15,28 @@ use fancy_regex::Regex;
// Lazily built tokenizer for the cl100k_base vocabulary. The BPE dictionary is read from the
// bpe_cl100k_base.dict file under OUT_DIR, embedded at compile time and decoded with rmp_serde.
1115static BPE_CL100K_BASE : LazyLock < Tokenizer > = LazyLock :: new ( || {
1216 let bytes = include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/bpe_cl100k_base.dict" ) ) ;
1317 let bpe = rmp_serde:: from_slice ( bytes) . expect ( "valid bpe data" ) ;
14- let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+|\\ p{N}{1,3}| ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*|\\ s*[\\ r\\ n]+|\\ s+(?!\\ S)|\\ s+" ;
15- Tokenizer :: new ( bpe, Some ( pat) ) . expect ( "valid regex" )
// The single pattern with a negative look-ahead (removed above) is split into three patterns:
//   pat1 - every alternative that needs no look-ahead; its last branch is whitespace anchored
//          to the end of the input.
//   pat2 - whitespace followed by one more whitespace character. It is flagged true, so that
//          final character is treated as look-ahead and is not part of the emitted piece.
//   pat3 - plain whitespace fallback.
18+ let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+|\\ p{N}{1,3}| ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*|\\ s*[\\ r\\ n]+|\\ s+$" ;
19+ let pat2 = "\\ s+\\ s" ;
20+ let pat3 = "\\ s+" ;
21+ Tokenizer :: new_lookahead ( bpe, & [ ( pat1, false ) , ( pat2, true ) , ( pat3, false ) ] )
22+ . expect ( "valid regex" )
1623} ) ;
1724
// Lazily built tokenizer for the o200k_base vocabulary. The BPE dictionary is read from the
// bpe_o200k_base.dict file under OUT_DIR, embedded at compile time and decoded with rmp_serde.
1825static BPE_O200K_BASE : LazyLock < Tokenizer > = LazyLock :: new ( || {
1926 let bytes = include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/bpe_o200k_base.dict" ) ) ;
2027 let bpe = rmp_serde:: from_slice ( bytes) . expect ( "valid bpe data" ) ;
21- let pat = [
// pat1 is the alternation of all branches that need no look-ahead, joined with a pipe below.
28+ let pat1 = [
2229 "[^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}]*[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?" ,
2330 "[^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}]+[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?" ,
2431 "\\ p{N}{1,3}" ,
2532 " ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n/]*" ,
2633 "\\ s*[\\ r\\ n]+" ,
27- "\\ s+(?!\\ S)" ,
28- "\\ s+" ,
// Whitespace anchored to the end of the input replaces the removed look-ahead branch here;
// the remaining whitespace cases are covered by pat2 and pat3 below.
34+ "\\ s+$" ,
2935 ] . join ( "|" ) ;
30- Tokenizer :: new ( bpe, Some ( & pat) ) . expect ( "valid regex" )
// pat2 is flagged true: its trailing whitespace character is look-ahead only and is not part
// of the emitted piece. pat3 is the plain whitespace fallback.
36+ let pat2 = "\\ s+\\ s" ;
37+ let pat3 = "\\ s+" ;
38+ Tokenizer :: new_lookahead ( bpe, & [ ( & pat1, false ) , ( pat2, true ) , ( pat3, false ) ] )
39+ . expect ( "valid regex" )
3140} ) ;
3241
3342pub use bpe:: * ;
@@ -42,15 +51,33 @@ pub struct Tokenizer {
4251 /// The byte-pair encoding for this tokenizer.
4352 pub bpe : BytePairEncoding ,
4453 /// The pattern regex used to split the input.
45- pub pat : Option < Regex > ,
54+ pub pre : Option < Pretokenizer > ,
55+ }
56+
57+ pub struct Pretokenizer {
58+ /// The pattern regex used to split the input.
59+ pat : Regex ,
60+ /// For each pattern in the regex a boolean whether the last character is a look-ahead.
61+ lookahead : Vec < bool > ,
4662}
4763
4864impl Tokenizer {
4965 /// Build a tokenizer with an optional pretokenization regex pattern.
     ///
     /// With `None` the tokenizer has no pretokenizer. With `Some(pattern)` the pattern is
     /// compiled through `Pretokenizer::new`, and a compilation failure is returned to the caller.
5066 #[ allow( clippy:: result_large_err) ]
51- pub fn new ( bpe : BytePairEncoding , pat : Option < & str > ) -> fancy_regex:: Result < Self > {
52- let pat = pat. map ( fancy_regex:: Regex :: new) . transpose ( ) ?;
53- Ok ( Self { bpe, pat } )
     // NOTE(review): the error type changes from the fancy_regex result to BuildError, which is
     // visible to callers. Patterns that relied on fancy_regex look-around presumably no longer
     // compile with regex_automata - confirm against existing callers.
67+ pub fn new ( bpe : BytePairEncoding , pat : Option < & str > ) -> Result < Self , BuildError > {
68+ let pre = pat. map ( Pretokenizer :: new) . transpose ( ) ?;
69+ Ok ( Self { bpe, pre } )
70+ }
71+
72+ /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
73+ /// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
74+ #[ allow( clippy:: result_large_err) ]
75+ pub fn new_lookahead (
76+ bpe : BytePairEncoding ,
77+ patterns : & [ ( & str , bool ) ] ,
78+ ) -> Result < Self , BuildError > {
79+ let pre = Some ( Pretokenizer :: new_lookahead ( patterns) ?) ;
80+ Ok ( Self { bpe, pre } )
5481 }
5582
5683 pub fn count ( & self , text : & str ) -> usize {
@@ -70,18 +97,83 @@ impl Tokenizer {
7097 }
7198
/// Splits `text` into the pieces produced by the pretokenizer.
///
/// When no pretokenizer is configured, the iterator yields `text` exactly once, unchanged.
7299 pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
73- match & self . pat {
74- Some ( pat) => Either :: Left ( pat. find_iter ( text) . scan ( 0 , |start, m| {
75- let m = m. expect ( "match succeeded" ) ;
76- assert_eq ! ( * start, m. start( ) , "pattern should match all input text" ) ;
77- * start = m. end ( ) ;
78- Some ( m. as_str ( ) )
79- } ) ) ,
     // NOTE(review): the removed branch above asserted that consecutive matches cover the whole
     // input. After this change any such guarantee is the responsibility of Pretokenizer::split.
100+ match & self . pre {
101+ Some ( pre) => Either :: Left ( pre. split ( text) ) ,
80102 None => Either :: Right ( std:: iter:: once ( text) ) ,
81103 }
82104 }
83105}
84106
107+ impl Pretokenizer {
108+ /// Build a pretokenizer from the given regex pattern.
109+ #[ allow( clippy:: result_large_err) ]
110+ fn new ( pat : & str ) -> Result < Self , BuildError > {
111+ let pat = Regex :: new ( pat) ?;
112+ Ok ( Self {
113+ pat,
114+ lookahead : vec ! [ false ] ,
115+ } )
116+ }
117+
118+ /// Build a pretokenizer from the given regex patterns. If the boolean for a pattern is true,
119+ /// the pattern is assumed to be a look-ahead pattern with exactly one look-ahead character!
120+ #[ allow( clippy:: result_large_err) ]
121+ fn new_lookahead ( pats : & [ ( & str , bool ) ] ) -> Result < Self , BuildError > {
122+ let ( pats, lookahead) : ( Vec < _ > , _ ) = pats. iter ( ) . copied ( ) . unzip ( ) ;
123+ let pat = Regex :: new_many ( & pats) ?;
124+ Ok ( Self { pat, lookahead } )
125+ }
126+
127+ pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
128+ Splits {
129+ pat : & self . pat ,
130+ lookahead : & self . lookahead ,
131+ text,
132+ last : 0 ,
133+ caps : Captures :: matches ( self . pat . group_info ( ) . clone ( ) ) ,
134+ }
135+ }
136+ }
137+
138+ /// This is a small wrapper around the regex which emulates the behaviour of look-ahead by
139+ /// dropping the look-ahead character from the match. The assumption here is that the
140+ /// second pattern is always a look-ahead pattern, and that just a single character needs
141+ /// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
142+ /// but achieve a >3x speedup.
143+ ///
144+ /// Alternatively, this could have been implemented with capture groups, but those were ~30%
145+ /// slower than this approach with multiple patterns.
146+ struct Splits < ' a > {
147+ pat : & ' a Regex ,
148+ lookahead : & ' a [ bool ] ,
149+ text : & ' a str ,
150+ last : usize ,
151+ caps : Captures ,
152+ }
153+
154+ impl < ' a > Iterator for Splits < ' a > {
155+ type Item = & ' a str ;
156+
157+ fn next ( & mut self ) -> Option < Self :: Item > {
158+ let input = Input :: new ( & self . text [ self . last ..] ) . anchored ( Anchored :: Yes ) ;
159+ self . caps . clear ( ) ;
160+ self . pat . captures ( input, & mut self . caps ) ;
161+ let m = self . caps . get_match ( ) ?;
162+ let start = self . last ;
163+ let mut end = self . last + m. range ( ) . end ;
164+ if self . lookahead [ m. pattern ( ) . as_usize ( ) ] {
165+ let last = self . text [ start..end]
166+ . chars ( )
167+ . next_back ( )
168+ . expect ( "Expected at least a look-ahead character!" ) ;
169+ end -= last. len_utf8 ( ) ;
170+ assert_ne ! ( end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!" ) ;
171+ }
172+ self . last = end;
173+ Some ( & self . text [ start..end] )
174+ }
175+ }
176+
85177pub fn cl100k_base ( ) -> & ' static Tokenizer {
86178 & BPE_CL100K_BASE
87179}
0 commit comments