@@ -2,7 +2,7 @@ use std::sync::LazyLock;
22
33use bpe:: byte_pair_encoding:: BytePairEncoding ;
44use either:: Either ;
5- use regex_automata:: { meta:: Regex , util:: captures:: Captures , Anchored , Input } ;
5+ use regex_automata:: { meta:: { BuildError , Regex } , util:: captures:: Captures , Anchored , Input } ;
66
77// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
88// The look-ahead character is dropped from the match by the Pretokenizer iterator.
@@ -54,15 +54,15 @@ pub struct Tokenizer {
5454impl Tokenizer {
5555 /// Build a tokenizer with an optional pretokenization regex pattern.
     ///
     /// When `pat` is `None`, no regex is compiled and the tokenizer stores `None` as its pattern.
     ///
     /// # Errors
     ///
     /// With this change, a pattern that fails to compile is reported through the returned
     /// `BuildError` (propagated via `?`) instead of being collapsed into the unit error `()`.
5656 #[ allow( clippy:: result_large_err) ]
57- pub fn new ( bpe : BytePairEncoding , pat : Option < & str > ) -> Result < Self , ( ) > {
58- let pat = pat. map ( Regex :: new) . transpose ( ) . map_err ( |_| ( ) ) ?;
57+ pub fn new ( bpe : BytePairEncoding , pat : Option < & str > ) -> Result < Self , BuildError > {
58+ let pat = pat. map ( Regex :: new) . transpose ( ) ?;
5959 Ok ( Self { bpe, pat } )
6060 }
6161
6262 /// When using multiple patterns, the second pattern is assumed to be a look-ahead pattern with
6363 /// exactly one look-ahead character!
     ///
     /// # Errors
     ///
     /// With this change, a failure of `Regex::new_many` on `patterns` is propagated to the caller
     /// as a `BuildError` instead of being collapsed into the unit error `()`.
64- pub fn with_many ( bpe : BytePairEncoding , patterns : & [ & str ] ) -> Result < Self , ( ) > {
65- let pat = Some ( Regex :: new_many ( patterns) . map_err ( |_| ( ) ) ?) ;
64+ pub fn with_many ( bpe : BytePairEncoding , patterns : & [ & str ] ) -> Result < Self , BuildError > {
65+ let pat = Some ( Regex :: new_many ( patterns) ?) ;
6666 Ok ( Self { bpe, pat } )
6767 }
6868
@@ -121,7 +121,7 @@ impl<'a> Iterator for SpecialRegexp<'a> {
121121 let start = self . last ;
122122 let mut end = self . last + m. range ( ) . end ;
123123 if m. pattern ( ) == 1 . into ( ) {
124- let last = self . input [ start..end] . chars ( ) . rev ( ) . next ( ) . unwrap ( ) ;
124+ let last = self . input [ start..end] . chars ( ) . next_back ( ) . expect ( "Expected at least a look-ahead character!" ) ;
125125 end -= last. len_utf8 ( ) ;
126126 assert_ne ! ( end, start, "a look-ahead pattern must ALWAYS consume at least one character excluding the look-ahead character!" ) ;
127127 }
0 commit comments