|
1 | 1 | use std::collections::{HashMap, HashSet}; |
| 2 | +use std::sync::LazyLock; |
2 | 3 |
|
3 | 4 | use crate::utils::SysRegex; |
4 | 5 | use serde::{Deserialize, Serialize}; |
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> { |
37 | 38 | .collect() |
38 | 39 | } |
39 | 40 |
|
40 | | -lazy_static! { |
41 | | - /// Regex that matches exactly one token. |
42 | | - /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98 |
43 | | - static ref RE: SysRegex = SysRegex::new( |
44 | | - r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+" |
45 | | - ) |
46 | | - .unwrap(); |
47 | | - static ref BYTES_CHAR: HashMap<u8, char> = bytes_char(); |
48 | | - static ref CHAR_BYTES: HashMap<char, u8> = |
49 | | - bytes_char().into_iter().map(|(c, b)| (b, c)).collect(); |
50 | | -} |
| 41 | +/// Regex that matches exactly one token.
| 42 | +/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
| 43 | +static RE: LazyLock<SysRegex> = LazyLock::new(|| {
| 44 | +    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
| 45 | +        .unwrap()
| 46 | +}); // compiled once, on first access; replaces the former `lazy_static!` block
| 47 | +static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char); // byte -> char table from `bytes_char()`, built lazily
| 48 | +static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
| 49 | +    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect()); // inverse of BYTES_CHAR; NOTE(review): `bytes_char()` yields (u8, char), so `c` here binds the byte and `b` the char — names are swapped, behavior is correct
51 | 50 |
|
52 | 51 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] |
53 | 52 | /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care |
|
0 commit comments