Skip to content

Commit f183341

Browse files
author
Hendrik van Antwerpen
committed
Drop legacy token sets
1 parent ad953d9 commit f183341

File tree

5 files changed

+1
-40
lines changed

5 files changed

+1
-40
lines changed

crates/bpe-openai/README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ For convenience it re-exports the `bpe` crate so that depending on this crate i
77

88
Supported tokenizers:
99

10-
- r50k
11-
- p50k
1210
- cl100k
1311
- o200k
1412

crates/bpe-openai/build.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
77
use serde::Serialize;
88

99
fn main() {
10-
serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
11-
serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
1210
serialize_tiktoken_bpe(
1311
"cl100k_base",
1412
include_bytes!("data/cl100k_base.tiktoken.gz"),
-359 KB
Binary file not shown.
-359 KB
Binary file not shown.

crates/bpe-openai/src/lib.rs

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,6 @@ use fancy_regex::Regex;
88
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
99
// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
1010

11-
static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
12-
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
13-
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
14-
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
15-
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
16-
});
17-
18-
static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
19-
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
20-
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
21-
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
22-
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
23-
});
24-
2511
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
2612
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
2713
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
@@ -96,14 +82,6 @@ impl Tokenizer {
9682
}
9783
}
9884

99-
pub fn r50k_base() -> &'static Tokenizer {
100-
&BPE_R50K_BASE
101-
}
102-
103-
pub fn p50k_base() -> &'static Tokenizer {
104-
&BPE_P50K_BASE
105-
}
106-
10785
pub fn cl100k_base() -> &'static Tokenizer {
10886
&BPE_CL100K_BASE
10987
}
@@ -115,23 +93,10 @@ pub fn o200k_base() -> &'static Tokenizer {
11593
#[cfg(test)]
11694
mod tests {
11795
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
118-
use tiktoken_rs::{
119-
cl100k_base_singleton, o200k_base_singleton, p50k_base_singleton, r50k_base_singleton,
120-
CoreBPE,
121-
};
96+
use tiktoken_rs::{cl100k_base_singleton, o200k_base_singleton, CoreBPE};
12297

12398
use super::*;
12499

125-
#[test]
126-
fn test_r50k() {
127-
test_equivalence(r50k_base(), &r50k_base_singleton().lock());
128-
}
129-
130-
#[test]
131-
fn test_p50k() {
132-
test_equivalence(p50k_base(), &p50k_base_singleton().lock());
133-
}
134-
135100
#[test]
136101
fn test_cl100k() {
137102
test_equivalence(cl100k_base(), &cl100k_base_singleton().lock());

0 commit comments

Comments
 (0)