@@ -2,7 +2,6 @@ use std::cmp::Reverse;
22use std:: collections:: BinaryHeap ;
33use std:: hash:: { Hash , Hasher } ;
44use std:: ops:: Range ;
5- use std:: sync:: LazyLock ;
65
76use aneubeck_daachorse:: { DoubleArrayAhoCorasick , DoubleArrayAhoCorasickBuilder } ;
87use fnv:: { FnvHashMap , FnvHasher } ;
@@ -12,19 +11,26 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer};
1211
1312use crate :: backtrack_encoder:: BacktrackEncoder ;
1413use crate :: bitfield:: BitField ;
15- use crate :: byte_pair_encoding:: data:: TokenDict ;
1614
17- static BPE_CL100K : LazyLock < BytePairEncoding > = LazyLock :: new ( || {
18- let bytes = include_bytes ! ( "data/bpe_cl100k.dict" ) ;
19- let dict: TokenDict = rmp_serde:: from_slice ( bytes) . expect ( "" ) ;
20- dict. into_bpe ( )
21- } ) ;
/// Lazily constructed cl100k tokenizer, built from the `tiktoken-rs`
/// cl100k vocabulary singleton. Compiled for tests only; other callers
/// construct their own instance via [`BytePairEncoding::from_tiktoken`].
#[cfg(test)]
pub(crate) static BPE_CL100K: std::sync::LazyLock<BytePairEncoding> =
    std::sync::LazyLock::new(|| {
        let tiktoken = tiktoken_rs::cl100k_base_singleton();
        let core_bpe = tiktoken.lock();
        // The hash factor prevents hash collisions for this token set
        // (see the note on `from_tiktoken` in this file); 100256 is the
        // token count passed through to the constructor.
        BytePairEncoding::from_tiktoken(&core_bpe, 100256, Some(17846336922010275747))
    });
2224
23- static BPE_O200K : LazyLock < BytePairEncoding > = LazyLock :: new ( || {
24- let bytes = include_bytes ! ( "data/bpe_o200k.dict" ) ;
25- let dict: TokenDict = rmp_serde:: from_slice ( bytes) . expect ( "" ) ;
26- dict. into_bpe ( )
27- } ) ;
/// Lazily constructed o200k tokenizer, built from the `tiktoken-rs`
/// o200k vocabulary singleton. Compiled for tests only; other callers
/// construct their own instance via [`BytePairEncoding::from_tiktoken`].
#[cfg(test)]
pub(crate) static BPE_O200K: std::sync::LazyLock<BytePairEncoding> =
    std::sync::LazyLock::new(|| {
        let tiktoken = tiktoken_rs::o200k_base_singleton();
        let core_bpe = tiktoken.lock();
        // The hash factor prevents hash collisions for this token set
        // (see the note on `from_tiktoken` in this file); 199998 is the
        // token count passed through to the constructor.
        BytePairEncoding::from_tiktoken(&core_bpe, 199998, Some(17846336922010275747))
    });
2834
2935/// Representation of the byte pair dictionary.
3036/// This struct provides various conversions.
@@ -215,14 +221,6 @@ fn find_token_by_bytes(
215221}
216222
217223impl BytePairEncoding {
218- pub fn cl100k ( ) -> & ' static Self {
219- & BPE_CL100K
220- }
221-
222- pub fn o200k ( ) -> & ' static Self {
223- & BPE_O200K
224- }
225-
226224 /// Construct a BytePairEncoding instance from a tiktoken dictionary.
227225 /// A suitable hash factor may be necessary to prevent hash collisions,
228226 /// which can by found using [`find_hash_factor_for_tiktoken`].
@@ -572,7 +570,7 @@ mod tests {
572570 use itertools:: Itertools ;
573571 use tiktoken_rs:: { cl100k_base_singleton, o200k_base_singleton} ;
574572
575- use crate :: byte_pair_encoding:: { create_test_bytes, BytePairEncoding } ;
573+ use crate :: byte_pair_encoding:: { create_test_bytes, BPE_CL100K , BPE_O200K } ;
576574
577575 #[ test]
578576 fn test_correctness_cl100k ( ) {
@@ -585,9 +583,9 @@ mod tests {
585583 ] )
586584 . unwrap ( ) ;
587585 let time = Instant :: now ( ) ;
588- let bpe = BytePairEncoding :: o200k ( ) ;
586+ let bpe = & BPE_CL100K ;
589587 println ! ( "{:?}" , time. elapsed( ) ) ;
590- let encoded1 = o200k_base_singleton ( )
588+ let encoded1 = cl100k_base_singleton ( )
591589 . lock ( )
592590 . encode_ordinary ( test_string)
593591 . into_iter ( )
@@ -612,9 +610,9 @@ mod tests {
612610 ] )
613611 . unwrap ( ) ;
614612 let time = Instant :: now ( ) ;
615- let bpe = BytePairEncoding :: cl100k ( ) ;
613+ let bpe = & BPE_O200K ;
616614 println ! ( "{:?}" , time. elapsed( ) ) ;
617- let encoded1 = cl100k_base_singleton ( )
615+ let encoded1 = o200k_base_singleton ( )
618616 . lock ( )
619617 . encode_ordinary ( test_string)
620618 . into_iter ( )
@@ -630,7 +628,7 @@ mod tests {
630628
631629 #[ test]
632630 fn test_bpe_equivalence ( ) {
633- let bpe = BytePairEncoding :: cl100k ( ) ;
631+ let bpe = & BPE_CL100K ;
634632 for tokens in [ 10 , 1000 , 10000 ] {
635633 for _ in 0 ..5 {
636634 let test_input = create_test_bytes ( bpe, tokens) ;
@@ -641,68 +639,3 @@ mod tests {
641639 }
642640 }
643641}
644-
645- mod data {
646- use serde:: { Deserialize , Serialize } ;
647-
648- use crate :: byte_pair_encoding:: BytePairEncoding ;
649-
650- #[ derive( Serialize , Deserialize ) ]
651- pub ( crate ) struct TokenDict {
652- tokens : Vec < Vec < u8 > > ,
653- hash_factor : u64 ,
654- }
655-
656- impl TokenDict {
657- pub ( crate ) fn into_bpe ( self ) -> BytePairEncoding {
658- BytePairEncoding :: from_dictionary ( self . tokens , Some ( self . hash_factor ) )
659- }
660- }
661-
662- #[ test]
663- fn update_token_dicts ( ) {
664- serialize_tokens (
665- "cl100k" ,
666- & tiktoken_rs:: cl100k_base ( ) . expect ( "tiktoken initialization must not fail!" ) ,
667- 100256 ,
668- 17846336922010275747 ,
669- ) ;
670- serialize_tokens (
671- "o200k" ,
672- & tiktoken_rs:: o200k_base ( ) . expect ( "tiktoken initialization must not fail!" ) ,
673- 199998 ,
674- 17846336922010275747 ,
675- ) ;
676- }
677-
678- #[ cfg( test) ]
679- #[ track_caller]
680- fn serialize_tokens (
681- name : & str ,
682- bpe : & tiktoken_rs:: CoreBPE ,
683- num_tokens : usize ,
684- hash_factor : u64 ,
685- ) {
686- use std:: fs:: File ;
687- use std:: path:: PathBuf ;
688-
689- use itertools:: Itertools ;
690- use serde:: Serialize ;
691-
692- let path = PathBuf :: from ( file ! ( ) ) ;
693- let dir = path. parent ( ) . unwrap ( ) ;
694- let data_file = dir. join ( format ! ( "data/bpe_{name}.dict" ) ) ;
695- let current_dir = std:: env:: current_dir ( ) . unwrap ( ) ;
696- let abs_path = current_dir. parent ( ) . unwrap ( ) . parent ( ) . unwrap ( ) ;
697- let file = File :: create ( abs_path. join ( data_file) ) . unwrap ( ) ;
698- let mut serializer = rmp_serde:: Serializer :: new ( file) ;
699- let tokens = ( 0 ..num_tokens)
700- . map ( |i| bpe. _decode_native ( & [ i] ) )
701- . collect_vec ( ) ;
702- let dict = TokenDict {
703- tokens,
704- hash_factor,
705- } ;
706- dict. serialize ( & mut serializer) . unwrap ( ) ;
707- }
708- }
0 commit comments