diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index f1ab54714c1cb..880516a35dbd2 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -286,8 +286,12 @@ impl<'a> ParserImpl<'a> { let token = self.cur_token(); let src = self.cur_src(); let value = match token.kind { - Kind::Decimal | Kind::Binary | Kind::Octal | Kind::Hex => parse_int(src, token.kind), - Kind::Float | Kind::PositiveExponential | Kind::NegativeExponential => parse_float(src), + Kind::Decimal | Kind::Binary | Kind::Octal | Kind::Hex => { + parse_int(src, token.kind, token.has_separator()) + } + Kind::Float | Kind::PositiveExponential | Kind::NegativeExponential => { + parse_float(src, token.has_separator()) + } _ => unreachable!(), } .map_err(|err| diagnostics::invalid_number(err, token.span()))?; diff --git a/crates/oxc_parser/src/lexer/number.rs b/crates/oxc_parser/src/lexer/number.rs index c1e746325788c..438a278a8ee99 100644 --- a/crates/oxc_parser/src/lexer/number.rs +++ b/crates/oxc_parser/src/lexer/number.rs @@ -6,13 +6,35 @@ use std::borrow::Cow; use super::kind::Kind; -// the string passed in has `_` removed from the lexer -pub fn parse_int(s: &str, kind: Kind) -> Result { +pub fn parse_int(s: &str, kind: Kind, has_sep: bool) -> Result { + let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; + let s = s.as_ref(); + debug_assert!(!s.contains('_')); + + // SAFETY: we just checked that `s` has no `_` characters + unsafe { parse_int_without_underscores_unchecked(s, kind) } +} + +pub fn parse_float(s: &str, has_sep: bool) -> Result { + let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; + debug_assert!(!s.contains('_')); + + // SAFETY: we just checked that `s` has no `_` characters + unsafe { parse_float_without_underscores_unchecked(&s) } +} + +/// # Safety +/// +/// This function assumes that all `_` characters have been stripped from `s`. 
+/// Violating this assumption does _not_ cause UB. However, this function is +/// marked as unsafe to ensure consumers are aware of the assumption. +unsafe fn parse_int_without_underscores_unchecked( + s: &str, + kind: Kind, +) -> Result { if kind == Kind::Decimal { - return parse_float(s); + return parse_float_without_underscores_unchecked(s); } - let s = if s.contains('_') { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; - let s = s.as_ref(); match kind { Kind::Binary => Ok(parse_binary(&s[2..])), Kind::Octal => { @@ -28,8 +50,12 @@ pub fn parse_int(s: &str, kind: Kind) -> Result { } } -pub fn parse_float(s: &str) -> Result { - let s = if s.contains('_') { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; +/// # Safety +/// +/// This function assumes that all `_` characters have been stripped from `s`. +/// Violating this assumption does _not_ cause UB. However, this function is +/// marked as unsafe to ensure consumers are aware of the assumption. +unsafe fn parse_float_without_underscores_unchecked(s: &str) -> Result { s.parse::().map_err(|_| "invalid float") } diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs index 963bc063e40e6..513d630bd2b70 100644 --- a/crates/oxc_parser/src/lexer/numeric.rs +++ b/crates/oxc_parser/src/lexer/numeric.rs @@ -53,6 +53,11 @@ impl<'a> Lexer<'a> { match c { '_' => { self.consume_char(); + // NOTE: it looks like invalid numeric tokens are still parsed. + // This seems to be a waste. It also requires us to put this + // call here instead of after we ensure the next character + // is a number character + self.token.set_has_separator(); if self.peek().is_some_and(|c| kind.matches_number_char(c)) { self.consume_char(); } else { @@ -134,6 +139,11 @@ impl<'a> Lexer<'a> { match c { '_' => { self.consume_char(); + // NOTE: it looks like invalid numeric tokens are still parsed. + // This seems to be a waste. 
It also requires us to put this + // call here instead of after we ensure the next character + // is an ASCII digit + self.token.set_has_separator(); if self.peek().is_some_and(|c| c.is_ascii_digit()) { self.consume_char(); } else { diff --git a/crates/oxc_parser/src/lexer/token.rs b/crates/oxc_parser/src/lexer/token.rs index 84119516e5939..9b92e5591e422 100644 --- a/crates/oxc_parser/src/lexer/token.rs +++ b/crates/oxc_parser/src/lexer/token.rs @@ -26,10 +26,16 @@ pub struct Token { /// [Lexer::escaped_templates]: [super::Lexer::escaped_templates] pub escaped: bool, + /// True for numeric literal tokens that contain separator characters (`_`). + /// + /// Numeric literals are defined in Section 12.9.3 of the ECMAScript + /// standard and include [`Kind::Decimal`], [`Kind::Binary`], + /// [`Kind::Octal`], [`Kind::Hex`], etc. + has_separator: bool, + // Padding to fill to 16 bytes. // This makes copying a `Token` 1 x xmmword load & store, rather than 1 x dword + 1 x qword // and `Token::default()` is 1 x xmmword store, rather than 1 x dword + 1 x qword. - _padding1: u8, _padding2: u32, } @@ -50,4 +56,15 @@ impl Token { pub fn escaped(&self) -> bool { self.escaped } + + #[inline] + pub fn has_separator(&self) -> bool { + debug_assert!(!self.has_separator || self.kind.is_number()); + self.has_separator + } + + pub(crate) fn set_has_separator(&mut self) { + debug_assert!(!self.has_separator || self.kind.is_number() || self.kind == Kind::default()); + self.has_separator = true; + } }