From a4988290d43e57281be3927587a41a14f73b6a82 Mon Sep 17 00:00:00 2001 From: Stephan Wolski Date: Tue, 6 May 2025 11:13:43 -0700 Subject: [PATCH] use fancy-regex instead of onig as tokenizers regex library Oniguruma doesn't build on GCC 15 and the project got archived last week, so this PR switches to the fancy-regex backend. `fancy-regex` also requires flipping on the `unstable_wasm` feature until huggingface/tokenizers#1772 lands. Right now that flag doesn't have any ill effects since everything WASM related downstream is behind `target_arch` checks. --- Cargo.lock | 46 +++++++------------------------- toktrie_hf_tokenizers/Cargo.toml | 5 +++- 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fe7d81d6..7c0d6de5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -150,12 +150,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.0" @@ -672,8 +666,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", ] [[package]] @@ -1160,7 +1156,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.9.0", + "bitflags", "libc", ] @@ -1446,35 +1442,13 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" -[[package]] -name = "onig" -version = "6.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" -dependencies = [ - "bitflags 1.3.2", - "libc", - "once_cell", - "onig_sys", -] - -[[package]] -name = "onig_sys" -version = "69.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "openssl" version = "0.10.72" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da" dependencies = [ - "bitflags 2.9.0", + "bitflags", "cfg-if", "foreign-types", "libc", @@ -1747,7 +1721,7 @@ version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3" dependencies = [ - "bitflags 2.9.0", + "bitflags", ] [[package]] @@ -1897,7 +1871,7 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" dependencies = [ - "bitflags 2.9.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -1992,7 +1966,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.9.0", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -2185,7 +2159,7 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ - "bitflags 2.9.0", + "bitflags", "core-foundation", "system-configuration-sys", ] @@ -2278,13 +2252,13 @@ dependencies = [ "aho-corasick", "derive_builder", "esaxx-rs", + "fancy-regex", "getrandom 0.2.15", "itertools 0.13.0", "lazy_static", "log", "macro_rules_attribute", "monostate", - "onig", "paste", "rand", "rayon", @@ -2987,7 +2961,7 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.0", + "bitflags", ] [[package]] diff --git a/toktrie_hf_tokenizers/Cargo.toml b/toktrie_hf_tokenizers/Cargo.toml index 150c6a5a..387d15f4 100644 --- a/toktrie_hf_tokenizers/Cargo.toml +++ b/toktrie_hf_tokenizers/Cargo.toml @@ -11,5 +11,8 @@ toktrie = { workspace = true } serde = { version = "1.0.217", features = ["derive"] } serde_json = "1.0.138" anyhow = "1.0.95" -tokenizers = { version = ">=0.20.0, <1.0.0", default-features = false, features = ["onig"] } +tokenizers = { version = ">=0.20.0, <1.0.0", default-features = false, features = [ + "unstable_wasm", + "fancy-regex", +] } log = "0.4.25"