Skip to content

Commit 508dae6

Browse files
DonIsaac and Boshen authored
perf(lexer): dedupe numeric separator check (#3283)
## What This PR Does

Updates numeric literal token lexing to record when separator characters (`_`) are found in a new `Token` flag. This flag is then passed to `parse_int` and `parse_float`, removing the need for a second `_` check in those two functions.

When run locally, I see no change to lexer benchmarks and minor improvements to codegen benchmarks. For some reason, semantic and source map benches seem to be doing slightly worse.

Note that I attempted to implement this with `bitflags!` (making `escaped` and `is_on_newline` flags as well) and this caused performance degradation. My best guess is that it turned reads of these flags from a single `mov` into a `mov` plus a bitwise AND.

---------

Co-authored-by: Boshen <boshenc@gmail.com>
1 parent dad47a5 commit 508dae6

File tree

4 files changed

+67
-10
lines changed

4 files changed

+67
-10
lines changed

crates/oxc_parser/src/js/expression.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,12 @@ impl<'a> ParserImpl<'a> {
286286
let token = self.cur_token();
287287
let src = self.cur_src();
288288
let value = match token.kind {
289-
Kind::Decimal | Kind::Binary | Kind::Octal | Kind::Hex => parse_int(src, token.kind),
290-
Kind::Float | Kind::PositiveExponential | Kind::NegativeExponential => parse_float(src),
289+
Kind::Decimal | Kind::Binary | Kind::Octal | Kind::Hex => {
290+
parse_int(src, token.kind, token.has_separator())
291+
}
292+
Kind::Float | Kind::PositiveExponential | Kind::NegativeExponential => {
293+
parse_float(src, token.has_separator())
294+
}
291295
_ => unreachable!(),
292296
}
293297
.map_err(|err| diagnostics::invalid_number(err, token.span()))?;

crates/oxc_parser/src/lexer/number.rs

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,35 @@ use std::borrow::Cow;
66

77
use super::kind::Kind;
88

9-
// the string passed in has `_` removed from the lexer
10-
pub fn parse_int(s: &str, kind: Kind) -> Result<f64, &'static str> {
9+
pub fn parse_int(s: &str, kind: Kind, has_sep: bool) -> Result<f64, &'static str> {
10+
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
11+
let s = s.as_ref();
12+
debug_assert!(!s.contains('_'));
13+
14+
// SAFETY: we just checked that `s` has no `_` characters
15+
unsafe { parse_int_without_underscores_unchecked(s, kind) }
16+
}
17+
18+
pub fn parse_float(s: &str, has_sep: bool) -> Result<f64, &'static str> {
19+
let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
20+
debug_assert!(!s.contains('_'));
21+
22+
// SAFETY: we just checked that `s` has no `_` characters
23+
unsafe { parse_float_without_underscores_unchecked(&s) }
24+
}
25+
26+
/// # Safety
27+
///
28+
/// This function assumes that all `_` characters have been stripped from `s`.
29+
/// Violating this assumption does _not_ cause UB. However, this function is
30+
/// marked as unsafe to ensure consumers are aware of the assumption.
31+
unsafe fn parse_int_without_underscores_unchecked(
32+
s: &str,
33+
kind: Kind,
34+
) -> Result<f64, &'static str> {
1135
if kind == Kind::Decimal {
12-
return parse_float(s);
36+
return parse_float_without_underscores_unchecked(s);
1337
}
14-
let s = if s.contains('_') { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
15-
let s = s.as_ref();
1638
match kind {
1739
Kind::Binary => Ok(parse_binary(&s[2..])),
1840
Kind::Octal => {
@@ -28,8 +50,12 @@ pub fn parse_int(s: &str, kind: Kind) -> Result<f64, &'static str> {
2850
}
2951
}
3052

31-
pub fn parse_float(s: &str) -> Result<f64, &'static str> {
32-
let s = if s.contains('_') { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
53+
/// Parses `s` as an `f64`, mapping any parse failure to `"invalid float"`.
///
/// # Safety
///
/// The caller is expected to have removed every `_` character from `s`.
/// Breaking that expectation does _not_ cause UB; the function is marked
/// `unsafe` only so that callers have to acknowledge the assumption.
unsafe fn parse_float_without_underscores_unchecked(s: &str) -> Result<f64, &'static str> {
    match s.parse::<f64>() {
        Ok(value) => Ok(value),
        Err(_) => Err("invalid float"),
    }
}
3561

crates/oxc_parser/src/lexer/numeric.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ impl<'a> Lexer<'a> {
5353
match c {
5454
'_' => {
5555
self.consume_char();
56+
// NOTE: it looks like invalid numeric tokens are still parsed.
57+
// This seems to be a waste. It also requires us to put this
58+
// call here instead of after we ensure the next character
59+
// is a number character
60+
self.token.set_has_separator();
5661
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
5762
self.consume_char();
5863
} else {
@@ -134,6 +139,11 @@ impl<'a> Lexer<'a> {
134139
match c {
135140
'_' => {
136141
self.consume_char();
142+
// NOTE: it looks like invalid numeric tokens are still parsed.
143+
// This seems to be a waste. It also requires us to put this
144+
// call here instead of after we ensure the next character
145+
// is an ASCII digit
146+
self.token.set_has_separator();
137147
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
138148
self.consume_char();
139149
} else {

crates/oxc_parser/src/lexer/token.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,16 @@ pub struct Token {
2626
/// [Lexer::escaped_templates]: [super::Lexer::escaped_templates]
2727
pub escaped: bool,
2828

29+
/// True for numeric literal tokens that contain separator characters (`_`).
30+
///
31+
/// Numeric literals are defined in Section 12.9.3 of the ECMAScript
32+
/// standard and include [`Kind::Decimal`], [`Kind::Binary`],
33+
/// [`Kind::Octal`], [`Kind::Hex`], etc.
34+
has_separator: bool,
35+
2936
// Padding to fill to 16 bytes.
3037
// This makes copying a `Token` 1 x xmmword load & store, rather than 1 x dword + 1 x qword
3138
// and `Token::default()` is 1 x xmmword store, rather than 1 x dword + 1 x qword.
32-
_padding1: u8,
3339
_padding2: u32,
3440
}
3541

@@ -50,4 +56,15 @@ impl Token {
5056
pub fn escaped(&self) -> bool {
5157
self.escaped
5258
}
59+
60+
#[inline]
61+
pub fn has_separator(&self) -> bool {
62+
debug_assert!(!self.has_separator || self.kind.is_number());
63+
self.has_separator
64+
}
65+
66+
pub(crate) fn set_has_separator(&mut self) {
67+
debug_assert!(!self.has_separator || self.kind.is_number() || self.kind == Kind::default());
68+
self.has_separator = true;
69+
}
5370
}

0 commit comments

Comments
 (0)