Skip to content

Commit e00ec65

Browse files
committed
Avoid surrogates when generating char using Standard distribution
1 parent b7b1176 commit e00ec65

2 files changed

Lines changed: 22 additions & 8 deletions

File tree

benches/misc.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,14 @@ fn gen_1k_fill(b: &mut Bencher) {
150150
});
151151
b.bytes = 1024;
152152
}
153+
154+
#[bench]
155+
fn misc_gen_chars(b: &mut Bencher) {
156+
use std::iter;
157+
let mut rng = StdRng::from_rng(&mut thread_rng()).unwrap();
158+
b.iter(|| {
159+
let v: Vec<char> = iter::repeat(()).map(|()| rng.gen()).take(128).collect();
160+
v
161+
});
162+
b.bytes = 512;
163+
}

src/distributions/other.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,18 @@ pub struct Alphanumeric;
4444
impl Distribution<char> for Standard {
4545
#[inline]
4646
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
47-
let range = Uniform::new(0u32, 0x11_0000);
48-
loop {
49-
match char::from_u32(range.sample(rng)) {
50-
Some(c) => return c,
51-
// About 0.2% of numbers in the range 0..0x110000 are invalid
52-
// codepoints (surrogates).
53-
None => {}
54-
}
47+
// A valid `char` is either in the interval `[0, 0xD800)` or
48+
// `(0xDFFF, 0x11_0000)`. All `char`s must therefore be in
49+
// `[0, 0x11_0000)` but not in the "gap" `[0xD800, 0xDFFF]` which is
50+
// reserved for surrogates. This is the size of that gap.
51+
const GAP_SIZE: u32 = 0xDFFF - 0xD800 + 1;
52+
53+
let range = Uniform::new(GAP_SIZE, 0x11_0000);
54+
let mut n = range.sample(rng);
55+
if n <= 0xDFFF {
56+
n -= GAP_SIZE;
5557
}
58+
unsafe { char::from_u32_unchecked(n) }
5659
}
5760
}
5861

0 commit comments

Comments
 (0)