Commit 759d7aa

replace lazy_static with stabilized std::sync::LazyLock in 1.80 (#1739)
1 parent: 4383a25 · commit: 759d7aa

6 files changed: +20 -28 lines
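
The change is mechanical across all six files: remove the lazy_static dependency and its #[macro_use] extern crate declaration, and rewrite each "static ref NAME: T = init;" item as "static NAME: LazyLock<T> = LazyLock::new(|| init);". For readers new to either API, here is a minimal before/after sketch (illustrative names and table, not code from this diff):

    use std::collections::HashMap;
    use std::sync::LazyLock;

    // Before (lazy_static 1.x), this would have been:
    //
    //     lazy_static! {
    //         static ref TABLE: HashMap<u8, char> = build_table();
    //     }
    //
    // After (std only, Rust 1.80+): the value is built on first access,
    // exactly once, and the static is safe to share across threads.
    static TABLE: LazyLock<HashMap<u8, char>> = LazyLock::new(build_table);

    fn build_table() -> HashMap<u8, char> {
        (0u8..=255).map(|b| (b, char::from(b))).collect()
    }

    fn main() {
        // LazyLock derefs to its payload; the first deref runs build_table.
        assert_eq!(TABLE.get(&65), Some(&'A'));
    }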

tokenizers/Cargo.toml

Lines changed: 0 additions & 1 deletion

@@ -42,7 +42,6 @@ required-features = ["http"]
 harness = false
 
 [dependencies]
-lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
 regex = "1.10"

tokenizers/src/lib.rs

Lines changed: 0 additions & 2 deletions

@@ -130,8 +130,6 @@
 
 #[macro_use]
 extern crate log;
-#[macro_use]
-extern crate lazy_static;
 
 #[macro_use]
 extern crate derive_builder;

tokenizers/src/normalizers/byte_level.rs

Lines changed: 2 additions & 5 deletions

@@ -2,16 +2,13 @@ use crate::processors::byte_level::bytes_char;
 use crate::tokenizer::{NormalizedString, Normalizer, Result};
 use crate::utils::macro_rules_attribute;
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 #[derive(Clone, Debug)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct ByteLevel;
 
-lazy_static! {
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
 
 impl Default for ByteLevel {
     fn default() -> Self {
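
Two details worth noting here. First, the CHAR_BYTES map is deleted outright rather than migrated; nothing in this file re-adds it, so it was presumably unused by the normalizer. Second, LazyLock::new(bytes_char) passes the function itself as the initializer, while the other call sites wrap theirs in closures; both forms work because LazyLock<T, F> defaults F to fn() -> T. A sketch of the two equivalent spellings (illustrative, not from this diff):

    use std::sync::LazyLock;

    fn forty_two() -> u64 {
        42
    }

    // Function-pointer form: forty_two coerces to fn() -> u64, the
    // default initializer type of LazyLock.
    static A: LazyLock<u64> = LazyLock::new(forty_two);

    // Closure form: needed once the initializer does more than call a
    // single function; non-capturing closures also coerce to fn pointers.
    static B: LazyLock<u64> = LazyLock::new(|| forty_two() + 1);

    fn main() {
        assert_eq!(*A + 1, *B);
    }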

tokenizers/src/pre_tokenizers/byte_level.rs

Lines changed: 10 additions & 11 deletions

@@ -1,4 +1,5 @@
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
@@ -37,17 +38,15 @@ pub(crate) fn bytes_char() -> HashMap<u8, char> {
     .collect()
 }
 
-lazy_static! {
-    /// Regex that matches exactly one token.
-    /// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
-    static ref RE: SysRegex = SysRegex::new(
-        r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
-    )
-    .unwrap();
-    static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
-    static ref CHAR_BYTES: HashMap<char, u8> =
-        bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
-}
+/// Regex that matches exactly one token.
+/// See https://github.com/openai/gpt-2/blob/master/src/encoder.py#L98
+static RE: LazyLock<SysRegex> = LazyLock::new(|| {
+    SysRegex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+        .unwrap()
+});
+static BYTES_CHAR: LazyLock<HashMap<u8, char>> = LazyLock::new(bytes_char);
+static CHAR_BYTES: LazyLock<HashMap<char, u8>> =
+    LazyLock::new(|| bytes_char().into_iter().map(|(c, b)| (b, c)).collect());
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
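
As before the change, the unwrap on the pattern does not run at program start: with LazyLock, as with lazy_static, a failing initializer panics on the first dereference of the static. (The GPT-2 pattern also contains a negative lookahead, \s+(?!\S), which the plain regex crate does not support; presumably that is why this file goes through SysRegex rather than regex::Regex.) A minimal sketch of the lazy-failure behavior (illustrative):

    use std::sync::LazyLock;

    // The unwrap inside the closure runs on first access, not at startup:
    // a bad input would panic the first time PORT is dereferenced.
    static PORT: LazyLock<u16> = LazyLock::new(|| "8080".parse().unwrap());

    fn main() {
        println!("starting up");      // runs before any initialization
        println!("port = {}", *PORT); // first deref: parse + unwrap here
    }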

tokenizers/src/pre_tokenizers/whitespace.rs

Lines changed: 3 additions & 3 deletions

@@ -1,3 +1,5 @@
+use std::sync::LazyLock;
+
 use regex::Regex;
 
 use crate::tokenizer::{
@@ -17,9 +19,7 @@ impl Default for Whitespace {
 
 impl PreTokenizer for Whitespace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
-        lazy_static! {
-            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
-        }
+        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w+|[^\w\s]+").unwrap());
         let re_ref: &Regex = &RE;
 
         pretokenized.split(|_, normalized| {
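
Unlike the other files, this LazyLock lives inside pre_tokenize. A static declared in a function body is still initialized once for the whole process; the function scope only limits who can name it. The existing "let re_ref: &Regex = &RE;" line then forces the deref from the LazyLock to the inner Regex once, up front. A sketch of the function-local-static pattern (illustrative):

    use std::sync::LazyLock;

    fn squares_upto(n: u64) -> u64 {
        // Built once for the whole process, on the first call; later
        // calls reuse the same table. Only this function can name it.
        static TABLE: LazyLock<Vec<u64>> =
            LazyLock::new(|| (0..1000).map(|i| i * i).collect());
        TABLE.iter().take(n as usize).sum()
    }

    fn main() {
        assert_eq!(squares_upto(4), 0 + 1 + 4 + 9);
        assert_eq!(squares_upto(3), 0 + 1 + 4); // no re-initialization
    }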

tokenizers/src/tokenizer/added_vocabulary.rs

Lines changed: 5 additions & 6 deletions

@@ -5,6 +5,7 @@ use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
 use regex::Regex;
 use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
 use std::collections::{HashMap, HashSet};
+use std::sync::LazyLock;
 
 /// Represent a token added by the user on top of the existing Model vocabulary.
 /// AddedToken can be configured to specify the behavior they should have in various situations
@@ -94,12 +95,10 @@ impl std::hash::Hash for AddedToken {
 
 type MatchingSet = (AhoCorasick, Vec<u32>);
 
-lazy_static! {
-    static ref STARTS_WITH_WORD: Regex = Regex::new(r"^\w").unwrap();
-    static ref ENDS_WITH_WORD: Regex = Regex::new(r"\w$").unwrap();
-    static ref RIGHTMOST_SPACE_AT_START: Regex = Regex::new(r"^\s*").unwrap();
-    static ref LEFTMOST_SPACE_AT_END: Regex = Regex::new(r"\s*$").unwrap();
-}
+static STARTS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\w").unwrap());
+static ENDS_WITH_WORD: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\w$").unwrap());
+static RIGHTMOST_SPACE_AT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*").unwrap());
+static LEFTMOST_SPACE_AT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s*$").unwrap());
 
 fn ends_with_word(sentence: &str) -> bool {
     ENDS_WITH_WORD.is_match(sentence)
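
One consequence of the commit as a whole: std::sync::LazyLock was stabilized in Rust 1.80, so this change effectively raises the crate's minimum supported Rust version to 1.80. Whether the crate also updates a rust-version field in Cargo.toml is not shown in this diff.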
