@@ -8,20 +8,6 @@ use fancy_regex::Regex;
88// The look-ahead character is dropped from the match by the Pretokenizer iterator.
99// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
1010
11- static BPE_R50K_BASE : LazyLock < Tokenizer > = LazyLock :: new ( || {
12- let bytes = include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/bpe_r50k_base.dict" ) ) ;
13- let bpe = rmp_serde:: from_slice ( bytes) . expect ( "valid bpe data" ) ;
14- let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\ p{L}+| ?\\ p{N}+| ?[^\\ s\\ p{L}\\ p{N}]+|\\ s+(?!\\ S)|\\ s+" ;
15- Tokenizer :: new ( bpe, Some ( pat) ) . expect ( "valid regex" )
16- } ) ;
17-
18- static BPE_P50K_BASE : LazyLock < Tokenizer > = LazyLock :: new ( || {
19- let bytes = include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/bpe_p50k_base.dict" ) ) ;
20- let bpe = rmp_serde:: from_slice ( bytes) . expect ( "valid bpe data" ) ;
21- let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\ p{L}+| ?\\ p{N}+| ?[^\\ s\\ p{L}\\ p{N}]+|\\ s+(?!\\ S)|\\ s+" ;
22- Tokenizer :: new ( bpe, Some ( pat) ) . expect ( "valid regex" )
23- } ) ;
24-
2511static BPE_CL100K_BASE : LazyLock < Tokenizer > = LazyLock :: new ( || {
2612 let bytes = include_bytes ! ( concat!( env!( "OUT_DIR" ) , "/bpe_cl100k_base.dict" ) ) ;
2713 let bpe = rmp_serde:: from_slice ( bytes) . expect ( "valid bpe data" ) ;
@@ -96,14 +82,6 @@ impl Tokenizer {
9682 }
9783}
9884
99- pub fn r50k_base ( ) -> & ' static Tokenizer {
100- & BPE_R50K_BASE
101- }
102-
103- pub fn p50k_base ( ) -> & ' static Tokenizer {
104- & BPE_P50K_BASE
105- }
106-
10785pub fn cl100k_base ( ) -> & ' static Tokenizer {
10886 & BPE_CL100K_BASE
10987}
@@ -115,23 +93,10 @@ pub fn o200k_base() -> &'static Tokenizer {
11593#[ cfg( test) ]
11694mod tests {
11795 use bpe:: byte_pair_encoding:: { create_test_string, select_test_string} ;
118- use tiktoken_rs:: {
119- cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton, r50k_base_singleton,
120- CoreBPE ,
121- } ;
96+ use tiktoken_rs:: { cl100k_base_singleton, o200k_base_singleton, CoreBPE } ;
12297
12398 use super :: * ;
12499
125- #[ test]
126- fn test_r50k ( ) {
127- test_equivalence ( r50k_base ( ) , & r50k_base_singleton ( ) . lock ( ) ) ;
128- }
129-
130- #[ test]
131- fn test_p50k ( ) {
132- test_equivalence ( p50k_base ( ) , & p50k_base_singleton ( ) . lock ( ) ) ;
133- }
134-
135100 #[ test]
136101 fn test_cl100k ( ) {
137102 test_equivalence ( cl100k_base ( ) , & cl100k_base_singleton ( ) . lock ( ) ) ;
0 commit comments