From 377b4125a5e7e91d5231ae28a1ac57abe0c4ef89 Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:09:23 +0300
Subject: [PATCH 01/10] Update tinyvec dependency

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index ab5cb0c..776dd3e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,5 +21,5 @@ Unicode Standard Annex #15.
 exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt", "tests/*" ]
 
 [dependencies.tinyvec]
-version = "0.3.2"
+version = "0.3.3"
 features = ["alloc"]

From 744c255fb9b8348e46ac4694d89094a5041921ab Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:10:25 +0300
Subject: [PATCH 02/10] Migrate to Rust 2018 edition

---
 Cargo.toml         |  2 ++
 src/lib.rs         | 14 +++++++-------
 src/lookups.rs     |  4 ++--
 src/normalize.rs   |  6 +++---
 src/quick_check.rs |  8 ++++----
 src/recompose.rs   |  2 +-
 src/stream_safe.rs | 10 +++++-----
 src/tables.rs      |  4 ++--
 tests/tests.rs     |  2 +-
 9 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 776dd3e..423eeef 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,8 @@ Decomposition and Recomposition, as described in
 Unicode Standard Annex #15.
 """
 
+edition = "2018"
+
 exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt", "tests/*" ]
 
 [dependencies.tinyvec]
diff --git a/src/lib.rs b/src/lib.rs
index 56142a2..f9890ef 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,9 +43,9 @@
 
 extern crate tinyvec;
 
-pub use tables::UNICODE_VERSION;
-pub use decompose::Decompositions;
-pub use quick_check::{
+pub use crate::tables::UNICODE_VERSION;
+pub use crate::decompose::Decompositions;
+pub use crate::quick_check::{
     IsNormalized,
     is_nfc,
     is_nfc_quick,
@@ -60,8 +60,8 @@ pub use quick_check::{
     is_nfd_stream_safe,
     is_nfd_stream_safe_quick,
 };
-pub use recompose::Recompositions;
-pub use stream_safe::StreamSafe;
+pub use crate::recompose::Recompositions;
+pub use crate::stream_safe::StreamSafe;
 use std::str::Chars;
 
 mod decompose;
@@ -80,9 +80,9 @@ pub mod __test_api;
 
 /// Methods for composing and decomposing characters.
 pub mod char {
-    pub use normalize::{decompose_canonical, decompose_compatible, compose};
+    pub use crate::normalize::{decompose_canonical, decompose_compatible, compose};
 
-    pub use lookups::{canonical_combining_class, is_combining_mark};
+    pub use crate::lookups::{canonical_combining_class, is_combining_mark};
 }
 
 
diff --git a/src/lookups.rs b/src/lookups.rs
index edaa0a0..49578b6 100644
--- a/src/lookups.rs
+++ b/src/lookups.rs
@@ -10,8 +10,8 @@
 
 //! Lookups of unicode properties using minimal perfect hashing.
 
-use perfect_hash::mph_lookup;
-use tables::*;
+use crate::perfect_hash::mph_lookup;
+use crate::tables::*;
 
 /// Look up the canonical combining class for a codepoint.
 /// 
diff --git a/src/normalize.rs b/src/normalize.rs
index 87456df..af0007c 100644
--- a/src/normalize.rs
+++ b/src/normalize.rs
@@ -11,7 +11,7 @@
 //! Functions for computing canonical and compatible decompositions for Unicode characters.
 use std::char;
 use std::ops::FnMut;
-use lookups::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
+use crate::lookups::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
 
 /// Compute canonical Unicode decomposition for character.
 /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
@@ -74,8 +74,8 @@
 const T_BASE: u32 = 0x11A7;
 const L_COUNT: u32 = 19;
 const V_COUNT: u32 = 21;
 const T_COUNT: u32 = 28;
-const N_COUNT: u32 = (V_COUNT * T_COUNT);
-const S_COUNT: u32 = (L_COUNT * N_COUNT);
+const N_COUNT: u32 = V_COUNT * T_COUNT;
+const S_COUNT: u32 = L_COUNT * N_COUNT;
 const S_LAST: u32 = S_BASE + S_COUNT - 1;
 const L_LAST: u32 = L_BASE + L_COUNT - 1;
diff --git a/src/quick_check.rs b/src/quick_check.rs
index 49b1460..c953f5f 100644
--- a/src/quick_check.rs
+++ b/src/quick_check.rs
@@ -1,7 +1,7 @@
-use UnicodeNormalization;
-use lookups::canonical_combining_class;
-use stream_safe;
-use tables;
+use crate::UnicodeNormalization;
+use crate::lookups::canonical_combining_class;
+use crate::stream_safe;
+use crate::tables;
 
 /// The QuickCheck algorithm can quickly determine if a text is or isn't
 /// normalized without any allocations in many cases, but it has to be able to
diff --git a/src/recompose.rs b/src/recompose.rs
index 40b20dc..e74107b 100644
--- a/src/recompose.rs
+++ b/src/recompose.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-use decompose::Decompositions;
+use crate::decompose::Decompositions;
 use tinyvec::TinyVec;
 use std::fmt::{self, Write};
 
diff --git a/src/stream_safe.rs b/src/stream_safe.rs
index 2cfcc36..74d8e8d 100644
--- a/src/stream_safe.rs
+++ b/src/stream_safe.rs
@@ -1,12 +1,12 @@
-use normalize::{
+use crate::normalize::{
     hangul_decomposition_length,
     is_hangul_syllable,
 };
-use lookups::{
+use crate::lookups::{
     canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
     stream_safe_trailing_nonstarters,
 };
-use tables::stream_safe_leading_nonstarters;
+use crate::tables::stream_safe_leading_nonstarters;
 
 pub(crate) const MAX_NONSTARTERS: usize = 30;
 const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
@@ -111,8 +111,8 @@ mod tests {
         classify_nonstarters,
     };
     use std::char;
-    use normalize::decompose_compatible;
-    use lookups::canonical_combining_class;
+    use crate::normalize::decompose_compatible;
+    use crate::lookups::canonical_combining_class;
 
     fn stream_safe(s: &str) -> String {
         StreamSafe::new(s.chars()).collect()
diff --git a/src/tables.rs b/src/tables.rs
index f92898d..368e6bb 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -11,8 +11,8 @@
 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
 
 #![allow(missing_docs)]
-use quick_check::IsNormalized;
-use quick_check::IsNormalized::*;
+use crate::quick_check::IsNormalized;
+use crate::quick_check::IsNormalized::*;
 
 #[allow(unused)]
 pub const UNICODE_VERSION: (u64, u64, u64) = (9, 0, 0);
diff --git a/tests/tests.rs b/tests/tests.rs
index 03531b0..399af7f 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -7,7 +7,7 @@ use unicode_normalization::__test_api::{
 mod data {
     pub mod normalization_tests;
 }
-use data::normalization_tests::NORMALIZATION_TESTS;
+use crate::data::normalization_tests::NORMALIZATION_TESTS;
 
 #[test]
 fn test_normalization_tests_unaffected() {

From cb9fb4d57e522079302874e730408eff11d981d4 Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Fri, 10 Apr 2020 21:48:08 +0300
Subject: [PATCH 03/10] Fix generated import statements in tables.rs

---
 scripts/unicode.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/unicode.py b/scripts/unicode.py
index d67fa6e..a5bba96 100644
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -477,8 +477,8 @@ def minimal_perfect_hash(d):
     data = UnicodeData()
     with open("tables.rs", "w", newline = "\n") as out:
         out.write(PREAMBLE)
-        out.write("use quick_check::IsNormalized;\n")
-        out.write("use quick_check::IsNormalized::*;\n")
+        out.write("use crate::quick_check::IsNormalized;\n")
+        out.write("use crate::quick_check::IsNormalized::*;\n")
         out.write("\n")
 
         version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))

From 3e2fe6474c94e697e776a62bf9b12315b02698e4 Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:13:08 +0300
Subject: [PATCH 04/10] Remove ignored inline annotations

---
 src/lib.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index f9890ef..fcb1fcc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -92,27 +92,22 @@ pub mod char {
 pub trait UnicodeNormalization<I: Iterator<Item=char>> {
     /// Returns an iterator over the string in Unicode Normalization Form D
     /// (canonical decomposition).
-    #[inline]
     fn nfd(self) -> Decompositions<I>;
 
     /// Returns an iterator over the string in Unicode Normalization Form KD
     /// (compatibility decomposition).
-    #[inline]
     fn nfkd(self) -> Decompositions<I>;
 
     /// An Iterator over the string in Unicode Normalization Form C
     /// (canonical decomposition followed by canonical composition).
-    #[inline]
     fn nfc(self) -> Recompositions<I>;
 
     /// An Iterator over the string in Unicode Normalization Form KC
     /// (compatibility decomposition followed by canonical composition).
-    #[inline]
     fn nfkc(self) -> Recompositions<I>;
 
     /// An Iterator over the string with Conjoining Grapheme Joiner characters
     /// inserted according to the Stream-Safe Text Process (UAX15-D4)
-    #[inline]
     fn stream_safe(self) -> StreamSafe<I>;
 }
 

From 1dbcb5fa0a14259501f51cec0ebec4484fc5c670 Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Fri, 10 Apr 2020 21:32:54 +0300
Subject: [PATCH 05/10] Do not automatically format tables

---
 src/lib.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index fcb1fcc..b4a0416 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -71,6 +71,8 @@ mod perfect_hash;
 mod recompose;
 mod quick_check;
 mod stream_safe;
+
+#[rustfmt::skip]
 mod tables;
 
 #[cfg(test)]

From 84ca78b38257facd621c1072e8a9e5db7a49630a Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:14:15 +0300
Subject: [PATCH 06/10] Apply rustfmt

---
 benches/bench.rs    |  2 +-
 src/__test_api.rs   |  2 +-
 src/decompose.rs    | 10 +++---
 src/lib.rs          | 35 ++++++++------------
 src/lookups.rs      | 63 ++++++++++++++++++++++++++++--------
 src/normalize.rs    | 40 +++++++++++++++--------
 src/perfect_hash.rs | 17 +++++++---
 src/quick_check.rs  | 27 +++++++---------
 src/recompose.rs    | 78 +++++++++++++++++++++------------------------
 src/stream_safe.rs  | 37 ++++++++++-----------
 src/test.rs         | 29 ++++++++++-------
 tests/tests.rs      |  8 ++---
 12 files changed, 195 insertions(+), 153 deletions(-)

diff --git a/benches/bench.rs b/benches/bench.rs
index b3ea836..3f529ec 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -1,7 +1,7 @@
 #![feature(test)]
 #![feature(iterator_step_by)]
-extern crate unicode_normalization;
 extern crate test;
+extern crate unicode_normalization;
 
 use std::fs;
 use test::Bencher;
diff --git a/src/__test_api.rs b/src/__test_api.rs
index 9deff6b..934fa72 100644
--- a/src/__test_api.rs
+++ b/src/__test_api.rs
@@ -6,7 +6,7 @@
 
 use crate::stream_safe::StreamSafe;
 pub fn stream_safe(s: &str) -> String {
-  StreamSafe::new(s.chars()).collect()
+    StreamSafe::new(s.chars()).collect()
 }
 pub mod quick_check {
     pub use crate::quick_check::*;
diff --git a/src/decompose.rs b/src/decompose.rs
index 6533c0c..0ba15d0 100644
--- a/src/decompose.rs
+++ b/src/decompose.rs
@@ -7,10 +7,10 @@
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
-use tinyvec::TinyVec;
 use std::fmt::{self, Write};
 use std::iter::Fuse;
 use std::ops::Range;
+use tinyvec::TinyVec;
 
 #[derive(Clone)]
 enum DecompositionType {
@@ -37,7 +37,7 @@ pub struct Decompositions<I> {
 }
 
 #[inline]
-pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
+pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
     Decompositions {
         kind: self::DecompositionType::Canonical,
         iter: iter.fuse(),
@@ -47,7 +47,7 @@ pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
 }
 
 #[inline]
-pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
+pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
     Decompositions {
         kind: self::DecompositionType::Compatible,
         iter: iter.fuse(),
@@ -99,7 +99,7 @@ impl<I> Decompositions<I> {
     }
 }
 
-impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
+impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
     type Item = char;
 
     #[inline]
@@ -149,7 +149,7 @@ impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
     }
 }
 
-impl<I: Iterator<Item=char> + Clone> fmt::Display for Decompositions<I> {
+impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         for c in self.clone() {
             f.write_char(c)?;
diff --git a/src/lib.rs b/src/lib.rs
index b4a0416..dc58c50 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -38,60 +38,51 @@
 //! ```
 
 #![deny(missing_docs, unsafe_code)]
-#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
-       html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
+#![doc(
+    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
+    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
+)]
 
 extern crate tinyvec;
 
-pub use crate::tables::UNICODE_VERSION;
 pub use crate::decompose::Decompositions;
 pub use crate::quick_check::{
+    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
+    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
     IsNormalized,
-    is_nfc,
-    is_nfc_quick,
-    is_nfkc,
-    is_nfkc_quick,
-    is_nfc_stream_safe,
-    is_nfc_stream_safe_quick,
-    is_nfd,
-    is_nfd_quick,
-    is_nfkd,
-    is_nfkd_quick,
-    is_nfd_stream_safe,
-    is_nfd_stream_safe_quick,
 };
 pub use crate::recompose::Recompositions;
 pub use crate::stream_safe::StreamSafe;
+pub use crate::tables::UNICODE_VERSION;
 use std::str::Chars;
 
 mod decompose;
 mod lookups;
 mod normalize;
 mod perfect_hash;
-mod recompose;
 mod quick_check;
+mod recompose;
 mod stream_safe;
 
 #[rustfmt::skip]
 mod tables;
 
-#[cfg(test)]
-mod test;
 #[doc(hidden)]
 pub mod __test_api;
+#[cfg(test)]
+mod test;
 
 /// Methods for composing and decomposing characters.
 pub mod char {
-    pub use crate::normalize::{decompose_canonical, decompose_compatible, compose};
+    pub use crate::normalize::{compose, decompose_canonical, decompose_compatible};
 
     pub use crate::lookups::{canonical_combining_class, is_combining_mark};
 }
 
-
 /// Methods for iterating over strings while applying Unicode normalizations
 /// as described in
 /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
-pub trait UnicodeNormalization<I: Iterator<Item=char>> {
+pub trait UnicodeNormalization<I: Iterator<Item = char>> {
     /// Returns an iterator over the string in Unicode Normalization Form D
     /// (canonical decomposition).
     fn nfd(self) -> Decompositions<I>;
@@ -140,7 +131,7 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
     }
 }
 
-impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
+impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
     #[inline]
     fn nfd(self) -> Decompositions<I> {
         decompose::new_canonical(self)
diff --git a/src/lookups.rs b/src/lookups.rs
index 49578b6..5bf5090 100644
--- a/src/lookups.rs
+++ b/src/lookups.rs
@@ -14,42 +14,77 @@ use crate::perfect_hash::mph_lookup;
 use crate::tables::*;
 
 /// Look up the canonical combining class for a codepoint.
-/// 
+///
 /// The value returned is as defined in the Unicode Character Database.
 pub fn canonical_combining_class(c: char) -> u8 {
-    mph_lookup(c.into(), CANONICAL_COMBINING_CLASS_SALT, CANONICAL_COMBINING_CLASS_KV,
-               u8_lookup_fk, u8_lookup_fv, 0)
+    mph_lookup(
+        c.into(),
+        CANONICAL_COMBINING_CLASS_SALT,
+        CANONICAL_COMBINING_CLASS_KV,
+        u8_lookup_fk,
+        u8_lookup_fv,
+        0,
+    )
 }
 
 pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
     if c1 < '\u{10000}' && c2 < '\u{10000}' {
-        mph_lookup((c1 as u32) << 16 | (c2 as u32),
-                   COMPOSITION_TABLE_SALT, COMPOSITION_TABLE_KV,
-                   pair_lookup_fk, pair_lookup_fv_opt, None)
+        mph_lookup(
+            (c1 as u32) << 16 | (c2 as u32),
+            COMPOSITION_TABLE_SALT,
+            COMPOSITION_TABLE_KV,
+            pair_lookup_fk,
+            pair_lookup_fv_opt,
+            None,
+        )
     } else {
         composition_table_astral(c1, c2)
     }
 }
 
 pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
-    mph_lookup(c.into(), CANONICAL_DECOMPOSED_SALT, CANONICAL_DECOMPOSED_KV,
-               pair_lookup_fk, pair_lookup_fv_opt, None)
+    mph_lookup(
+        c.into(),
+        CANONICAL_DECOMPOSED_SALT,
+        CANONICAL_DECOMPOSED_KV,
+        pair_lookup_fk,
+        pair_lookup_fv_opt,
+        None,
+    )
 }
 
 pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
-    mph_lookup(c.into(), COMPATIBILITY_DECOMPOSED_SALT, COMPATIBILITY_DECOMPOSED_KV,
-               pair_lookup_fk, pair_lookup_fv_opt, None)
+    mph_lookup(
+        c.into(),
+        COMPATIBILITY_DECOMPOSED_SALT,
+        COMPATIBILITY_DECOMPOSED_KV,
+        pair_lookup_fk,
+        pair_lookup_fv_opt,
+        None,
+    )
}
 
 /// Return whether the given character is a combining mark (`General_Category=Mark`)
 pub fn is_combining_mark(c: char) -> bool {
-    mph_lookup(c.into(), COMBINING_MARK_SALT, COMBINING_MARK_KV,
-               bool_lookup_fk, bool_lookup_fv, false)
+    mph_lookup(
+        c.into(),
+        COMBINING_MARK_SALT,
+        COMBINING_MARK_KV,
+        bool_lookup_fk,
+        bool_lookup_fv,
+        false,
+    )
 }
 
 pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
-    mph_lookup(c.into(), TRAILING_NONSTARTERS_SALT, TRAILING_NONSTARTERS_KV,
-               u8_lookup_fk, u8_lookup_fv, 0) as usize
+    mph_lookup(
+        c.into(),
+        TRAILING_NONSTARTERS_SALT,
+        TRAILING_NONSTARTERS_KV,
+        u8_lookup_fk,
+        u8_lookup_fv,
+        0,
+    ) as usize
 }
 
 /// Extract the key in a 24 bit key and 8 bit value packed in a u32.
diff --git a/src/normalize.rs b/src/normalize.rs
index af0007c..3d54360 100644
--- a/src/normalize.rs
+++ b/src/normalize.rs
@@ -9,15 +9,20 @@
 // except according to those terms.
 
 //! Functions for computing canonical and compatible decompositions for Unicode characters.
+use crate::lookups::{
+    canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
+};
 use std::char;
 use std::ops::FnMut;
-use crate::lookups::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
 
 /// Compute canonical Unicode decomposition for character.
 /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
 /// for more information.
 #[inline]
-pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
+pub fn decompose_canonical<F>(c: char, emit_char: F)
+where
+    F: FnMut(char),
+{
     decompose(c, canonical_fully_decomposed, emit_char)
 }
 
@@ -26,14 +31,16 @@ pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
 /// for more information.
 #[inline]
 pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
-    let decompose_char = |c| compatibility_fully_decomposed(c)
-        .or_else(|| canonical_fully_decomposed(c));
+    let decompose_char =
+        |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
     decompose(c, decompose_char, emit_char)
 }
 
 #[inline]
 fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
-    where D: Fn(char) -> Option<&'static [char]>, F: FnMut(char)
+where
+    D: Fn(char) -> Option<&'static [char]>,
+    F: FnMut(char),
 {
     // 7-bit ASCII never decomposes
     if c <= '\x7f' {
@@ -93,7 +100,10 @@ pub(crate) fn is_hangul_syllable(c: char) -> bool {
 // Decompose a precomposed Hangul syllable
 #[allow(unsafe_code)]
 #[inline(always)]
-fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
+fn decompose_hangul<F>(s: char, mut emit_char: F)
+where
+    F: FnMut(char),
+{
     let s_index = s as u32 - S_BASE;
     let l_index = s_index / N_COUNT;
     unsafe {
@@ -113,7 +123,11 @@ fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
 pub(crate) fn hangul_decomposition_length(s: char) -> usize {
     let si = s as u32 - S_BASE;
     let ti = si % T_COUNT;
-    if ti > 0 { 3 } else { 2 }
+    if ti > 0 {
+        3
+    } else {
+        2
+    }
 }
 
 // Compose a pair of Hangul Jamo
@@ -124,17 +138,17 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
     let (a, b) = (a as u32, b as u32);
     match (a, b) {
         // Compose a leading consonant and a vowel together into an LV_Syllable
-        (L_BASE ... L_LAST, V_BASE ... V_LAST) => {
+        (L_BASE...L_LAST, V_BASE...V_LAST) => {
             let l_index = a - L_BASE;
             let v_index = b - V_BASE;
             let lv_index = l_index * N_COUNT + v_index * T_COUNT;
             let s = S_BASE + lv_index;
-            Some(unsafe {char::from_u32_unchecked(s)})
-        },
+            Some(unsafe { char::from_u32_unchecked(s) })
+        }
         // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
-        (S_BASE ... S_LAST, T_FIRST ... T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
-            Some(unsafe {char::from_u32_unchecked(a + (b - T_BASE))})
-        },
+        (S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
+            Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
+        }
         _ => None,
     }
 }
diff --git a/src/perfect_hash.rs b/src/perfect_hash.rs
index 0a81714..3dbc166 100644
--- a/src/perfect_hash.rs
+++ b/src/perfect_hash.rs
@@ -20,16 +20,25 @@ fn my_hash(key: u32, salt: u32, n: usize) -> usize {
 }
 
 /// Do a lookup using minimal perfect hashing.
-/// 
+///
 /// The table is stored as a sequence of "salt" values, then a sequence of
 /// values that contain packed key/value pairs. The strategy is to hash twice.
 /// The first hash retrieves a salt value that makes the second hash unique.
 /// The hash function doesn't have to be very good, just good enough that the
 /// resulting map is unique.
 #[inline]
-pub(crate) fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
-                                        default: V) -> V
-    where KV: Copy, FK: Fn(KV) -> u32, FV: Fn(KV) -> V
+pub(crate) fn mph_lookup<KV, V, FK, FV>(
+    x: u32,
+    salt: &[u16],
+    kv: &[KV],
+    fk: FK,
+    fv: FV,
+    default: V,
+) -> V
+where
+    KV: Copy,
+    FK: Fn(KV) -> u32,
+    FV: Fn(KV) -> V,
 {
     let s = salt[my_hash(x, 0, salt.len())] as u32;
     let key_val = kv[my_hash(x, s, salt.len())];
diff --git a/src/quick_check.rs b/src/quick_check.rs
index c953f5f..4507b2a 100644
--- a/src/quick_check.rs
+++ b/src/quick_check.rs
@@ -1,7 +1,7 @@
-use crate::UnicodeNormalization;
 use crate::lookups::canonical_combining_class;
 use crate::stream_safe;
 use crate::tables;
+use crate::UnicodeNormalization;
 
 /// The QuickCheck algorithm can quickly determine if a text is or isn't
 /// normalized without any allocations in many cases, but it has to be able to
@@ -19,7 +19,9 @@ pub enum IsNormalized {
 // https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
 #[inline]
 fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
-    where I: Iterator<Item=char>, F: Fn(char) -> IsNormalized
+where
+    I: Iterator<Item = char>,
+    F: Fn(char) -> IsNormalized,
 {
     let mut last_cc = 0u8;
     let mut nonstarter_count = 0;
@@ -42,7 +44,7 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
             IsNormalized::No => return IsNormalized::No,
             IsNormalized::Maybe => {
                 result = IsNormalized::Maybe;
-            },
+            }
         }
         if stream_safe {
             let decomp = stream_safe::classify_nonstarters(ch);
@@ -67,38 +69,37 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
 /// `IsNormalized::Maybe` if further checks are necessary. In this case a check
 /// like `s.chars().nfc().eq(s.chars())` should suffice.
 #[inline]
-pub fn is_nfc_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
+pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
     quick_check(s, tables::qc_nfc, false)
 }
 
-
 /// Quickly check if a string is in NFKC.
 #[inline]
-pub fn is_nfkc_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
+pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
     quick_check(s, tables::qc_nfkc, false)
 }
 
 /// Quickly check if a string is in NFD.
 #[inline]
-pub fn is_nfd_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
+pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
     quick_check(s, tables::qc_nfd, false)
 }
 
 /// Quickly check if a string is in NFKD.
 #[inline]
-pub fn is_nfkd_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
+pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
     quick_check(s, tables::qc_nfkd, false)
 }
 
 /// Quickly check if a string is Stream-Safe NFC.
 #[inline]
-pub fn is_nfc_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
+pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
     quick_check(s, tables::qc_nfc, true)
 }
 
 /// Quickly check if a string is Stream-Safe NFD.
 #[inline]
-pub fn is_nfd_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
+pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
     quick_check(s, tables::qc_nfd, true)
 }
 
@@ -164,11 +165,7 @@ pub fn is_nfd_stream_safe(s: &str) -> bool {
 
 #[cfg(test)]
 mod tests {
-    use super::{
-        IsNormalized,
-        is_nfc_stream_safe_quick,
-        is_nfd_stream_safe_quick,
-    };
+    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};
 
     #[test]
     fn test_stream_safe_nfd() {
diff --git a/src/recompose.rs b/src/recompose.rs
index e74107b..29022e2 100644
--- a/src/recompose.rs
+++ b/src/recompose.rs
@@ -9,8 +9,8 @@
 // except according to those terms.
 
 use crate::decompose::Decompositions;
-use tinyvec::TinyVec;
 use std::fmt::{self, Write};
+use tinyvec::TinyVec;
 
 #[derive(Clone)]
 enum RecompositionState {
@@ -30,7 +30,7 @@ pub struct Recompositions<I> {
 }
 
 #[inline]
-pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
+pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
     Recompositions {
         iter: super::decompose::new_canonical(iter),
         state: self::RecompositionState::Composing,
@@ -41,7 +41,7 @@ pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
 }
 
 #[inline]
-pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
+pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
     Recompositions {
         iter: super::decompose::new_compatible(iter),
         state: self::RecompositionState::Composing,
@@ -51,7 +51,7 @@ pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
     }
 }
 
-impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
+impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
     type Item = char;
 
     #[inline]
@@ -70,26 +70,24 @@ impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
                                 }
                                 self.composee = Some(ch);
                                 continue;
-                            },
+                            }
                             Some(k) => k,
                         };
                         match self.last_ccc {
-                            None => {
-                                match super::char::compose(k, ch) {
-                                    Some(r) => {
-                                        self.composee = Some(r);
-                                        continue;
-                                    }
-                                    None => {
-                                        if ch_class == 0 {
-                                            self.composee = Some(ch);
-                                            return Some(k);
-                                        }
-                                        self.buffer.push(ch);
-                                        self.last_ccc = Some(ch_class);
+                            None => match super::char::compose(k, ch) {
+                                Some(r) => {
+                                    self.composee = Some(r);
+                                    continue;
+                                }
+                                None => {
+                                    if ch_class == 0 {
+                                        self.composee = Some(ch);
+                                        return Some(k);
                                     }
+                                    self.buffer.push(ch);
+                                    self.last_ccc = Some(ch_class);
                                 }
-                            }
+                            },
                             Some(l_class) => {
                                 if l_class >= ch_class {
                                     // `ch` is blocked from `composee`
@@ -121,36 +119,32 @@ impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
                        return self.composee.take();
                    }
                }
-                Purging(next) => {
-                    match self.buffer.get(next).cloned() {
-                        None => {
-                            self.buffer.clear();
-                            self.state = Composing;
-                        }
-                        s => {
-                            self.state = Purging(next + 1);
-                            return s
-                        }
+                Purging(next) => match self.buffer.get(next).cloned() {
+                    None => {
+                        self.buffer.clear();
+                        self.state = Composing;
                    }
-                }
-                Finished(next) => {
-                    match self.buffer.get(next).cloned() {
-                        None => {
-                            self.buffer.clear();
-                            return self.composee.take()
-                        }
-                        s => {
-                            self.state = Finished(next + 1);
-                            return s
-                        }
+                    s => {
+                        self.state = Purging(next + 1);
+                        return s;
                    }
-                }
+                },
+                Finished(next) => match self.buffer.get(next).cloned() {
+                    None => {
+                        self.buffer.clear();
+                        return self.composee.take();
+                    }
+                    s => {
+                        self.state = Finished(next + 1);
+                        return s;
+                    }
+                },
            }
        }
    }
 
-impl<I: Iterator<Item=char> + Clone> fmt::Display for Recompositions<I> {
+impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         for c in self.clone() {
             f.write_char(c)?;
diff --git a/src/stream_safe.rs b/src/stream_safe.rs
index 74d8e8d..80123f0 100644
--- a/src/stream_safe.rs
+++ b/src/stream_safe.rs
@@ -1,11 +1,8 @@
-use crate::normalize::{
-    hangul_decomposition_length,
-    is_hangul_syllable,
-};
 use crate::lookups::{
     canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
     stream_safe_trailing_nonstarters,
 };
+use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
 use crate::tables::stream_safe_leading_nonstarters;
 
 pub(crate) const MAX_NONSTARTERS: usize = 30;
@@ -22,11 +19,15 @@ pub struct StreamSafe<I> {
 
 impl<I> StreamSafe<I> {
     pub(crate) fn new(iter: I) -> Self {
-        Self { iter, nonstarter_count: 0, buffer: None }
+        Self {
+            iter,
+            nonstarter_count: 0,
+            buffer: None,
+        }
     }
 }
 
-impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
+impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
     type Item = char;
 
     #[inline]
@@ -72,7 +73,7 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
             leading_nonstarters: 0,
             trailing_nonstarters: 0,
             decomposition_len: 1,
-        }
+        };
     }
     // Next, special case Hangul, since it's not handled by our tables.
     if is_hangul_syllable(c) {
@@ -82,15 +83,12 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
             decomposition_len: hangul_decomposition_length(c),
         };
     }
-    let decomp = compatibility_fully_decomposed(c)
-        .or_else(|| canonical_fully_decomposed(c));
+    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
     match decomp {
-        Some(decomp) => {
-            Decomposition {
-                leading_nonstarters: stream_safe_leading_nonstarters(c),
-                trailing_nonstarters: stream_safe_trailing_nonstarters(c),
-                decomposition_len: decomp.len(),
-            }
+        Some(decomp) => Decomposition {
+            leading_nonstarters: stream_safe_leading_nonstarters(c),
+            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
+            decomposition_len: decomp.len(),
         },
         None => {
             let is_nonstarter = canonical_combining_class(c) != 0;
@@ -106,13 +104,10 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
 
 #[cfg(test)]
 mod tests {
-    use super::{
-        StreamSafe,
-        classify_nonstarters,
-    };
-    use std::char;
-    use crate::normalize::decompose_compatible;
-    use crate::lookups::canonical_combining_class;
+    use super::{classify_nonstarters, StreamSafe};
+    use crate::lookups::canonical_combining_class;
+    use crate::normalize::decompose_compatible;
+    use std::char;
 
     fn stream_safe(s: &str) -> String {
         StreamSafe::new(s.chars()).collect()
diff --git a/src/test.rs b/src/test.rs
index 8aaadba..1a0f13a 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -8,11 +8,9 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-
-use std::char;
-use super::UnicodeNormalization;
 use super::char::is_combining_mark;
-
+use super::UnicodeNormalization;
+use std::char;
 
 #[test]
 fn test_nfd() {
@@ -21,8 +19,11 @@ fn test_nfd() {
             assert_eq!($input.nfd().to_string(), $expected);
             // A dummy iterator that is not std::str::Chars directly;
             // note that `id_func` is used to ensure `Clone` implementation
-            assert_eq!($input.chars().map(|c| c).nfd().collect::<String>(), $expected);
-        }
+            assert_eq!(
+                $input.chars().map(|c| c).nfd().collect::<String>(),
+                $expected
+            );
+        };
     }
     t!("abc", "abc");
     t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
@@ -41,7 +42,7 @@ fn test_nfkd() {
     macro_rules! t {
         ($input: expr, $expected: expr) => {
             assert_eq!($input.nfkd().to_string(), $expected);
-        }
+        };
     }
     t!("abc", "abc");
     t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
@@ -60,7 +61,7 @@ fn test_nfc() {
     macro_rules! t {
         ($input: expr, $expected: expr) => {
             assert_eq!($input.nfc().to_string(), $expected);
-        }
+        };
     }
     t!("abc", "abc");
     t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
@@ -72,7 +73,10 @@ fn test_nfc() {
     t!("\u{301}a", "\u{301}a");
     t!("\u{d4db}", "\u{d4db}");
     t!("\u{ac1c}", "\u{ac1c}");
-    t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
+    t!(
+        "a\u{300}\u{305}\u{315}\u{5ae}b",
+        "\u{e0}\u{5ae}\u{305}\u{315}b"
+    );
 }
 
 #[test]
@@ -80,7 +84,7 @@ fn test_nfkc() {
     macro_rules! t {
         ($input: expr, $expected: expr) => {
             assert_eq!($input.nfkc().to_string(), $expected);
-        }
+        };
     }
     t!("abc", "abc");
     t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
@@ -92,7 +96,10 @@ fn test_nfkc() {
     t!("\u{301}a", "\u{301}a");
     t!("\u{d4db}", "\u{d4db}");
     t!("\u{ac1c}", "\u{ac1c}");
-    t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
+    t!(
+        "a\u{300}\u{305}\u{315}\u{5ae}b",
+        "\u{e0}\u{5ae}\u{305}\u{315}b"
+    );
 }
 
 #[test]
diff --git a/tests/tests.rs b/tests/tests.rs
index 399af7f..9aefd97 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -1,8 +1,6 @@
 extern crate unicode_normalization;
 use unicode_normalization::UnicodeNormalization;
-use unicode_normalization::__test_api::{
-    stream_safe,
-};
+use unicode_normalization::__test_api::stream_safe;
 
 mod data {
     pub mod normalization_tests;
 }
@@ -21,7 +19,9 @@ fn test_normalization_tests_unaffected() {
 #[test]
 fn test_official() {
     macro_rules! normString {
-        ($method: ident, $input: expr) => { $input.$method().collect::<String>() }
+        ($method: ident, $input: expr) => {
+            $input.$method().collect::<String>()
+        };
     }
 
     for test in NORMALIZATION_TESTS {

From 8ebdf9097cd5a880f2725506fe934414025051a2 Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:20:10 +0300
Subject: [PATCH 07/10] Add no_std + alloc support

---
 .travis.yml           |  2 ++
 Cargo.toml            |  5 +++++
 src/__test_api.rs     |  5 +++++
 src/decompose.rs      |  6 +++---
 src/lib.rs            | 11 ++++++++++-
 src/no_std_prelude.rs |  6 ++++++
 src/normalize.rs      |  4 ++--
 src/recompose.rs      |  2 +-
 src/stream_safe.rs    |  8 ++++++--
 src/test.rs           |  5 ++++-
 10 files changed, 44 insertions(+), 10 deletions(-)
 create mode 100755 src/no_std_prelude.rs

diff --git a/.travis.yml b/.travis.yml
index a736bf7..f1132c9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,9 +7,11 @@ sudo: false
 script:
   - cargo build --verbose
   - cargo test --verbose
+  - cargo test --verbose --no-default-features
   - cargo package
   - cd target/package/unicode-normalization-*
   - cargo test --verbose
+  - cargo test --verbose --no-default-features
 notifications:
   email:
     on_success: never
diff --git a/Cargo.toml b/Cargo.toml
index 423eeef..6cf8fa7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,3 +25,8 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt", "tests/*" ]
 [dependencies.tinyvec]
 version = "0.3.3"
 features = ["alloc"]
+
+
+[features]
+default = ["std"]
+std = []
diff --git a/src/__test_api.rs b/src/__test_api.rs
index 934fa72..f1a3f92 100644
--- a/src/__test_api.rs
+++ b/src/__test_api.rs
@@ -4,10 +4,15 @@
 //
 // If you're caught using this outside this crates tests/, you get to clean up the mess.
 
+#[cfg(not(feature = "std"))]
+use crate::no_std_prelude::*;
+
 use crate::stream_safe::StreamSafe;
+
 pub fn stream_safe(s: &str) -> String {
     StreamSafe::new(s.chars()).collect()
 }
+
 pub mod quick_check {
     pub use crate::quick_check::*;
 }
diff --git a/src/decompose.rs b/src/decompose.rs
index 0ba15d0..f228023 100644
--- a/src/decompose.rs
+++ b/src/decompose.rs
@@ -7,9 +7,9 @@
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
-use std::fmt::{self, Write};
-use std::iter::Fuse;
-use std::ops::Range;
+use core::fmt::{self, Write};
+use core::iter::Fuse;
+use core::ops::Range;
 use tinyvec::TinyVec;
 
 #[derive(Clone)]
diff --git a/src/lib.rs b/src/lib.rs
index dc58c50..6749adc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -42,6 +42,13 @@
     html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
     html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
 )]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+#[cfg(not(feature = "std"))]
+extern crate alloc;
+
+#[cfg(feature = "std")]
+extern crate core;
 
 extern crate tinyvec;
 
@@ -54,7 +61,9 @@ pub use crate::quick_check::{
 pub use crate::recompose::Recompositions;
 pub use crate::stream_safe::StreamSafe;
 pub use crate::tables::UNICODE_VERSION;
-use std::str::Chars;
+use core::str::Chars;
+
+mod no_std_prelude;
 
 mod decompose;
 mod lookups;
diff --git a/src/no_std_prelude.rs b/src/no_std_prelude.rs
new file mode 100755
index 0000000..838d122
--- /dev/null
+++ b/src/no_std_prelude.rs
@@ -0,0 +1,6 @@
+#[cfg(not(feature = "std"))]
+pub use alloc::{
+    str::Chars,
+    string::{String, ToString},
+    vec::Vec,
+};
diff --git a/src/normalize.rs b/src/normalize.rs
index 3d54360..1097c42 100644
--- a/src/normalize.rs
+++ b/src/normalize.rs
@@ -12,8 +12,8 @@
 use crate::lookups::{
     canonical_fully_decomposed, compatibility_fully_decomposed, composition_table,
 };
-use std::char;
-use std::ops::FnMut;
+
+use core::{char, ops::FnMut};
 
 /// Compute canonical Unicode decomposition for character.
 /// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
diff --git a/src/recompose.rs b/src/recompose.rs
index 29022e2..2a1960a 100644
--- a/src/recompose.rs
+++ b/src/recompose.rs
@@ -9,7 +9,7 @@
 // except according to those terms.
 
 use crate::decompose::Decompositions;
-use std::fmt::{self, Write};
+use core::fmt::{self, Write};
 use tinyvec::TinyVec;
 
 #[derive(Clone)]
diff --git a/src/stream_safe.rs b/src/stream_safe.rs
index 80123f0..1ba7d76 100644
--- a/src/stream_safe.rs
+++ b/src/stream_safe.rs
@@ -107,7 +107,11 @@ mod tests {
     use super::{classify_nonstarters, StreamSafe};
     use crate::lookups::canonical_combining_class;
     use crate::normalize::decompose_compatible;
-    use std::char;
+
+    #[cfg(not(feature = "std"))]
+    use crate::no_std_prelude::*;
+
+    use core::char;
 
     fn stream_safe(s: &str) -> String {
         StreamSafe::new(s.chars()).collect()
@@ -131,7 +135,7 @@ mod tests {
             None => continue,
         };
         let c = classify_nonstarters(ch);
-        let mut s = vec![];
+        let mut s = Vec::new();
         decompose_compatible(ch, |c| s.push(c));
 
         assert_eq!(s.len(), c.decomposition_len);
diff --git a/src/test.rs b/src/test.rs
index 1a0f13a..2e87a87 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -10,7 +10,10 @@
 
 use super::char::is_combining_mark;
 use super::UnicodeNormalization;
-use std::char;
+use core::char;
+
+#[cfg(not(feature = "std"))]
+use crate::no_std_prelude::*;
 
 #[test]
 fn test_nfd() {

From 679f93670e78b14a52f29940dc0d77c7a83a63cd Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:21:27 +0300
Subject: [PATCH 08/10] Remove stable feature flag iterator_step_by

---
 benches/bench.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benches/bench.rs b/benches/bench.rs
index 3f529ec..a977156 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -1,5 +1,5 @@
 #![feature(test)]
-#![feature(iterator_step_by)]
+
 extern crate test;
 extern crate unicode_normalization;
 

From 8d96c349ff09fbb53b3330e02d9bfa3c0d217e83 Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:22:42 +0300
Subject: [PATCH 09/10] Add note about no_std support to readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 4dbeb3d..f5dd539 100644
--- a/README.md
+++ b/README.md
@@ -33,3 +33,7 @@ to your `Cargo.toml`:
 [dependencies]
 unicode-normalization = "0.1.8"
 ```
+
+## `no_std` + `alloc` support
+
+This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`.

From aacee6eda04a3a4ca56592e81899cf35319a768d Mon Sep 17 00:00:00 2001
From: Hannes Karppila
Date: Thu, 9 Apr 2020 19:24:01 +0300
Subject: [PATCH 10/10] Bump version

---
 Cargo.toml | 2 +-
 README.md  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 6cf8fa7..cca619b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 
 name = "unicode-normalization"
-version = "0.1.12"
+version = "0.1.13"
 authors = ["kwantam <kwantam@gmail.com>"]
 
 homepage = "https://github.com/unicode-rs/unicode-normalization"
diff --git a/README.md b/README.md
index f5dd539..0c63c3a 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ to your `Cargo.toml`:
 
 ```toml
 [dependencies]
-unicode-normalization = "0.1.8"
+unicode-normalization = "0.1.13"
 ```
 
 ## `no_std` + `alloc` support
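
For anyone trying the series out locally, patches 07, 09, and 10 together boil down to a one-line change in a consuming project. Below is a minimal sketch of a hypothetical downstream `Cargo.toml` (the consumer crate itself is an assumption; the `std` feature comes from patch 07's `[features]` table and the version from patch 10's bump):

```toml
# Hypothetical consumer manifest: opt out of the default "std" feature so
# unicode-normalization builds against core + alloc only, as described in
# the README section added by patch 09.
[dependencies]
unicode-normalization = { version = "0.1.13", default-features = false }
```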