@@ -2,24 +2,23 @@ use std::cmp::Reverse;
22use std:: collections:: BinaryHeap ;
33use std:: hash:: { Hash , Hasher } ;
44use std:: ops:: Range ;
5+ use std:: sync:: LazyLock ;
56
67use aneubeck_daachorse:: { DoubleArrayAhoCorasick , DoubleArrayAhoCorasickBuilder } ;
78use fnv:: { FnvHashMap , FnvHasher } ;
89use itertools:: Itertools ;
9- use once_cell:: sync:: Lazy ;
1010use serde:: de:: Visitor ;
1111use serde:: { Deserialize , Deserializer , Serialize , Serializer } ;
12- use tiktoken_rs:: CoreBPE ;
1312
1413use crate :: backtrack_encoder:: BacktrackEncoder ;
1514use crate :: bitfield:: BitField ;
1615
17- static BPE_CL100K : Lazy < BytePairEncoding > = Lazy :: new ( || {
16+ static BPE_CL100K : LazyLock < BytePairEncoding > = LazyLock :: new ( || {
1817 let bytes = include_bytes ! ( "data/bpe_cl100k.dict" ) ;
1918 rmp_serde:: from_slice ( bytes) . expect ( "" )
2019} ) ;
2120
22- static BPE_O200K : Lazy < BytePairEncoding > = Lazy :: new ( || {
21+ static BPE_O200K : LazyLock < BytePairEncoding > = LazyLock :: new ( || {
2322 let bytes = include_bytes ! ( "data/bpe_o200k.dict" ) ;
2423 rmp_serde:: from_slice ( bytes) . expect ( "" )
2524} ) ;
@@ -194,7 +193,8 @@ impl BytePairEncoding {
194193 }
195194
196195 /// Construct a BytePairEncoding instance from a tiktoken dictionary.
197- pub fn from_tiktoken ( tiktoken_bpe : & CoreBPE , num_tokens : usize ) -> Self {
196+ #[ cfg( feature = "tiktoken-rs" ) ]
197+ pub fn from_tiktoken ( tiktoken_bpe : & tiktoken_rs:: CoreBPE , num_tokens : usize ) -> Self {
198198 Self :: from_dictionary ( ( 0 ..num_tokens) . map ( |i| tiktoken_bpe. _decode_native ( & [ i] ) ) )
199199 }
200200
@@ -492,6 +492,7 @@ impl BytePairEncoding {
492492 }
493493}
494494
495+ #[ cfg( feature = "rand" ) ]
495496pub fn create_test_bytes ( bpe : & BytePairEncoding , tokens : usize ) -> Vec < u8 > {
496497 use rand:: { thread_rng, Rng } ;
497498 let mut text = vec ! [ ] ;
@@ -576,7 +577,7 @@ mod data {
576577 #[ test]
577578 #[ ignore = "run manually to find a suitable hash factor" ]
578579 fn find_hash_factor ( ) {
579- let bpes: & mut [ ( CoreBPE , usize ) ] = & mut [
580+ let bpes = & mut [
580581 ( cl100k_base ( ) . unwrap ( ) , BPE_CL100K_LEN ) ,
581582 ( o200k_base ( ) . unwrap ( ) , BPE_O200K_LEN ) ,
582583 ] ;
@@ -609,7 +610,7 @@ mod data {
609610 }
610611
611612 #[ track_caller]
612- fn serialize_tokens ( dict : & CoreBPE , num_tokens : usize , name : & str ) {
613+ fn serialize_tokens ( dict : & tiktoken_rs :: CoreBPE , num_tokens : usize , name : & str ) {
613614 let path = PathBuf :: from ( file ! ( ) ) ;
614615 let dir = path. parent ( ) . unwrap ( ) ;
615616 let data_file = dir. join ( format ! ( "data/bpe_{name}.dict" ) ) ;
0 commit comments