diff --git a/Cargo.lock b/Cargo.lock index 6b10b5a418a96..ce2977fcdcf7c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1506,6 +1506,7 @@ dependencies = [ "bitflags 2.5.0", "memchr", "num-bigint", + "num-traits", "ouroboros", "oxc_allocator", "oxc_ast", diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index 4106ff1267c69..c907dbd998500 100644 --- a/crates/oxc_parser/Cargo.toml +++ b/crates/oxc_parser/Cargo.toml @@ -24,12 +24,13 @@ oxc_ast = { workspace = true } oxc_syntax = { workspace = true } oxc_diagnostics = { workspace = true } -static_assertions = { workspace = true } assert-unchecked = { workspace = true } bitflags = { workspace = true } -rustc-hash = { workspace = true } num-bigint = { workspace = true } +num-traits = { workspace = true } +rustc-hash = { workspace = true } seq-macro = { workspace = true } +static_assertions = { workspace = true } memchr = { workspace = true } diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index 880516a35dbd2..6984722ea8ad7 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -326,7 +326,7 @@ impl<'a> ParserImpl<'a> { let token = self.cur_token(); let raw = self.cur_src(); let src = raw.strip_suffix('n').unwrap(); - let _value = parse_big_int(src, token.kind) + let _value = parse_big_int(src, token.kind, token.has_separator()) .map_err(|err| diagnostics::invalid_number(err, token.span()))?; self.bump_any(); Ok(self.ast.bigint_literal(self.end_span(span), Atom::from(raw), base)) diff --git a/crates/oxc_parser/src/lexer/number.rs b/crates/oxc_parser/src/lexer/number.rs index 438a278a8ee99..82e4b0703a426 100644 --- a/crates/oxc_parser/src/lexer/number.rs +++ b/crates/oxc_parser/src/lexer/number.rs @@ -2,38 +2,30 @@ //! code copied from [jsparagus](https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/numeric_value.rs) use num_bigint::BigInt; +use num_traits::Num as _; use std::borrow::Cow; use super::kind::Kind; pub fn parse_int(s: &str, kind: Kind, has_sep: bool) -> Result { let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; - let s = s.as_ref(); debug_assert!(!s.contains('_')); - // SAFETY: we just checked that `s` has no `_` characters - unsafe { parse_int_without_underscores_unchecked(s, kind) } + parse_int_without_underscores(&s, kind) } pub fn parse_float(s: &str, has_sep: bool) -> Result { let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; debug_assert!(!s.contains('_')); - // SAFETY: we just checked that `s` has no `_` characters - unsafe { parse_float_without_underscores_unchecked(&s) } + parse_float_without_underscores(&s) } -/// # Safety -/// -/// This function assumes that all `_` characters have been stripped from `s`. -/// Violating this assumption does _not_ cause UB. However, this function is -/// marked as unsafe to ensure consumers are aware of the assumption. -unsafe fn parse_int_without_underscores_unchecked( - s: &str, - kind: Kind, -) -> Result { +/// This function assumes `s` has had all numeric separators (`_`) removed. +/// Parsing will fail if this assumption is violated. +fn parse_int_without_underscores(s: &str, kind: Kind) -> Result { if kind == Kind::Decimal { - return parse_float_without_underscores_unchecked(s); + return parse_float_without_underscores(s); } match kind { Kind::Binary => Ok(parse_binary(&s[2..])), @@ -50,64 +42,84 @@ unsafe fn parse_int_without_underscores_unchecked( } } -/// # Safety -/// -/// This function assumes that all `_` characters have been stripped from `s`. -/// Violating this assumption does _not_ cause UB. However, this function is -/// marked as unsafe to ensure consumers are aware of the assumption. -unsafe fn parse_float_without_underscores_unchecked(s: &str) -> Result { +/// This function assumes `s` has had all numeric separators (`_`) removed. +/// Parsing will fail if this assumption is violated. +fn parse_float_without_underscores(s: &str) -> Result { s.parse::().map_err(|_| "invalid float") } +/// NOTE: bit shifting here is is safe and much faster than f64::mul_add. +/// It's safe because we're sure this number is an integer - if it wasn't, it +/// would be a [`Kind::Float`] instead. It's fast because shifting usually takes +/// 1 clock cycle on the ALU, while multiplication+addition uses the FPU and is +/// much slower. Addtiionally, this loop often gets unrolled by rustc since +/// these numbers are usually not long. On x84_64, FMUL has a latency of 4 clock +/// cycles, which doesn't include addition. Some platorms support mul + add in a +/// single instruction, but many others do not. +#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)] fn parse_binary(s: &str) -> f64 { debug_assert!(!s.is_empty()); - let mut result = 0_f64; + let mut result = 0_u64; - for c in s.as_bytes().iter().filter(|s| s != &&b'_') { + for c in s.as_bytes() { + debug_assert!(c != &b'_'); #[allow(clippy::cast_lossless)] - let value = (c - b'0') as f64; - result = result.mul_add(2.0, value); + let value = (c - b'0') as u64; + result <<= 1; + result |= value; } - result + result as f64 } +#[allow(clippy::cast_precision_loss)] fn parse_octal(s: &str) -> f64 { debug_assert!(!s.is_empty()); - let mut result = 0_f64; + let mut result = 0_u64; - for c in s.as_bytes().iter().filter(|s| s != &&b'_') { + for c in s.as_bytes() { + debug_assert!(c != &b'_'); #[allow(clippy::cast_lossless)] - let value = (c - b'0') as f64; - result = result.mul_add(8.0, value); + let value = (c - b'0') as u64; + result <<= 3; + result |= value; } - result + result as f64 } +#[allow(clippy::cast_precision_loss, clippy::cast_lossless)] fn parse_hex(s: &str) -> f64 { debug_assert!(!s.is_empty()); - let mut result = 0_f64; + let mut result = 0_u64; - for c in s.as_bytes().iter().filter(|s| s != &&b'_') { + for c in s.as_bytes() { + debug_assert!(c != &b'_'); let value = match c { b'0'..=b'9' => c - b'0', b'A'..=b'F' => c - b'A' + 10, b'a'..=b'f' => c - b'a' + 10, _ => unreachable!("invalid hex syntax {}", s), }; - #[allow(clippy::cast_lossless)] - let value = value as f64; - result = result.mul_add(16.0, value); + result <<= 4; + result |= value as u64; } - result + result as f64 +} + +pub fn parse_big_int(s: &str, kind: Kind, has_sep: bool) -> Result { + let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) }; + debug_assert!(!s.contains('_')); + parse_big_int_without_underscores(&s, kind) } -pub fn parse_big_int(s: &str, kind: Kind) -> Result { +/// This function assumes `s` has had all numeric separators (`_`) removed. +/// Parsing will fail if this assumption is violated. +fn parse_big_int_without_underscores(s: &str, kind: Kind) -> Result { let s = match kind { Kind::Decimal => s, Kind::Binary | Kind::Octal | Kind::Hex => &s[2..], @@ -120,5 +132,198 @@ pub fn parse_big_int(s: &str, kind: Kind) -> Result { Kind::Hex => 16, _ => unreachable!(), }; - BigInt::parse_bytes(s.as_bytes(), radix).ok_or("invalid bigint") + // NOTE: BigInt::from_bytes does a utf8 check, then uses from_str_radix + // under the hood. We already have a string, so we can just use that + // directly. + BigInt::from_str_radix(s, radix).map_err(|_| "invalid bigint") +} + +#[cfg(test)] +#[allow(clippy::unreadable_literal, clippy::mixed_case_hex_literals)] +mod test { + + use super::{parse_float, parse_int, Kind}; + + #[allow(clippy::cast_precision_loss)] + fn assert_all_ints_eq(test_cases: I, kind: Kind, has_sep: bool) + where + I: IntoIterator, + { + for (s, expected) in test_cases { + let parsed = parse_int(s, kind, has_sep); + assert_eq!( + parsed, + Ok(expected as f64), + "expected {s} to parse to {expected}, but got {parsed:?}" + ); + } + } + fn assert_all_floats_eq(test_cases: I, has_sep: bool) + where + I: IntoIterator, + { + for (s, expected) in test_cases { + let parsed = parse_float(s, has_sep); + assert_eq!( + parsed, + Ok(expected), + "expected {s} to parse to {expected}, but got {parsed:?}" + ); + } + } + + #[test] + #[allow(clippy::excessive_precision)] + fn test_int_precision() { + assert_eq!(parse_int("9007199254740991", Kind::Decimal, false), Ok(9007199254740991.0)); + } + + #[test] + #[allow(clippy::excessive_precision)] + fn test_float_precision() { + let cases = vec![ + ("1.7976931348623157e+308", 1.7976931348623157e+308), + ("0.000000001", 0.000_000_001), + ]; + assert_all_floats_eq(cases, false); + } + + #[test] + fn test_parse_int_no_sep() { + let decimal: Vec<(&str, i64)> = vec![ + // normal + ("0", 0), + ("-0", 0), + ("1", 1), + ("-1", -1), + ("000000000000", 0), + ("-000000000000", 0), + ("9007199254740991", 9007199254740991), // max safe integer, 2^53 - 1 + ("-9007199254740990", -9007199254740990), // min safe integer, -(2^53 - 1) + ]; + let binary = vec![ + ("0b0", 0b0), + ("0b1", 0b1), + ("0b10", 0b10), + ("0b110001001000100", 0b110001001000100), + ("0b110001001000100", 0b110001001000100), + ]; + let octal = vec![("0o0", 0o0), ("0o1", 0o1), ("0o10", 0o10), ("0o777", 0o777)]; + let hex: Vec<(&str, i64)> = vec![ + ("0x0", 0x0), + ("0X0", 0x0), + ("0xFF", 0xFF), + ("0xc", 0xc), // :) + ("0xdeadbeef", 0xdeadbeef), + ("0xFfEeDdCcBbAa", 0xFfEeDdCcBbAa), + ]; + + assert_all_ints_eq(decimal, Kind::Decimal, false); + assert_all_ints_eq(binary, Kind::Binary, false); + assert_all_ints_eq(octal, Kind::Octal, false); + assert_all_ints_eq(hex, Kind::Hex, false); + } + + #[test] + fn test_parse_int_with_sep() { + let decimal: Vec<(&str, i64)> = vec![ + // still works without separators + ("0", 0), + ("-0", 0), + ("1", 1), + ("-1", -1), + ("1_000_000", 1_000_000), + ("-1_000_000", -1_000_000), + ("000000000000", 0), + ("-000000000000", 0), + ("9_007_199_254_740_991", 9_007_199_254_740_991), // max safe integer, 2^53 - 1 + ("-9_007_199_254_740_990", -9_007_199_254_740_990), // min safe integer, -(2^53 - 1) + // still works for illegal tokens + ("1___000_000", 1_000_000), + ("1_", 1), + ("_1", 1), + ]; + + let binary = vec![ + ("0b0", 0b0), + ("0b1", 0b1), + ("0b10", 0b10), + ("0b110001001000100", 0b110001001000100), + ("0b110001001000100", 0b110001001000100), + ("0b1_1000_1001_0001_0000", 0b1_1000_1001_0001_0000), + // still works for illegal tokens + ("0b1_0000__0000", 0b1_0000_0000), + ("0b1_", 0b1), + ("0b_0", 0b0), + ]; + + let octal = vec![ + ("0o0", 0o0), + ("0o1", 0o1), + ("0o10", 0o10), + ("0o777", 0o777), + ("0o7_7_7", 0o777), + ("0o77_73_72", 0o77_73_72), + // still works for illegal tokens + ("0o1_0000__0000", 0o100_000_000), + ("0o1_", 0o1), + ("0o_0", 0o0), + ]; + + let hex: Vec<(&str, i64)> = vec![ + // still works without separators + ("0x0", 0x0), + ("0X0", 0x0), + ("0xFF", 0xFF), + ("0xFF_AA_11", 0xFFAA11), + ("0xdead_beef", 0xdead_beef), + ("0xFf_Ee_Dd_Cc_Bb_Aa", 0xFfEe_DdCc_BbAa), + ("0xFfEe_DdCc_BbAa", 0xFfEe_DdCc_BbAa), + // still works for illegal tokens + ("0x1_0000__0000", 0x100_000_000), + ("0x1_", 0x1), + ("0x_0", 0x0), + ]; + + assert_all_ints_eq(decimal, Kind::Decimal, true); + assert_all_ints_eq(binary, Kind::Binary, true); + assert_all_ints_eq(octal, Kind::Octal, true); + assert_all_ints_eq(hex, Kind::Hex, true); + } + + #[test] + fn test_decimal() { + let no_sep: Vec<(&'static str, f64)> = + vec![("0", 0.0), ("1.0", 1.0), ("1.1", 1.1), ("25.125", 25.125)]; + + let sep: Vec<(&'static str, f64)> = vec![ + ("1_000.0", 1000.0), + ("1.5_000", 1.5), + // works on invalid tokens + ("_0._5", 0.5), + ("0._5", 0.5), + ("0.5_", 0.5), + ]; + + // parse_int() handles Kind::Decimal as a float. Should we check if + // a '.' is encountered during lexing and pick which parser to use? + assert_all_floats_eq(no_sep.clone(), false); + assert_all_floats_eq(sep.clone(), true); + for (s, expected) in no_sep { + let parsed = parse_int(s, Kind::Decimal, false); + assert_eq!( + parsed, + Ok(expected), + "expected {s} to parse to {expected}, but got {parsed:?}" + ); + } + for (s, expected) in sep { + let parsed = parse_int(s, Kind::Decimal, true); + assert_eq!( + parsed, + Ok(expected), + "expected {s} to parse to {expected}, but got {parsed:?}" + ); + } + } }