oxc-project · Boshen · May 17, 2024 · May 15, 2024 · May 15, 2024 · May 16, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml
@@ -24,12 +24,13 @@ oxc_ast         = { workspace = true }
 oxc_syntax      = { workspace = true }
 oxc_diagnostics = { workspace = true }
 
-static_assertions = { workspace = true }
 assert-unchecked  = { workspace = true }
 bitflags          = { workspace = true }
-rustc-hash        = { workspace = true }
 num-bigint        = { workspace = true }
+num-traits        = { workspace = true }
+rustc-hash        = { workspace = true }
 seq-macro         = { workspace = true }
+static_assertions = { workspace = true }
 
 memchr = { workspace = true }
 

diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs
@@ -326,7 +326,7 @@ impl<'a> ParserImpl<'a> {
         let token = self.cur_token();
         let raw = self.cur_src();
         let src = raw.strip_suffix('n').unwrap();
-        let _value = parse_big_int(src, token.kind)
+        let _value = parse_big_int(src, token.kind, token.has_separator())
             .map_err(|err| diagnostics::invalid_number(err, token.span()))?;
         self.bump_any();
         Ok(self.ast.bigint_literal(self.end_span(span), Atom::from(raw), base))

diff --git a/crates/oxc_parser/src/lexer/number.rs b/crates/oxc_parser/src/lexer/number.rs
@@ -2,38 +2,30 @@
 //! code copied from [jsparagus](https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/numeric_value.rs)
 
 use num_bigint::BigInt;
+use num_traits::Num as _;
 use std::borrow::Cow;
 
 use super::kind::Kind;
 
 pub fn parse_int(s: &str, kind: Kind, has_sep: bool) -> Result<f64, &'static str> {
     let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
-    let s = s.as_ref();
     debug_assert!(!s.contains('_'));
 
-    // SAFETY: we just checked that `s` has no `_` characters
-    unsafe { parse_int_without_underscores_unchecked(s, kind) }
+    parse_int_without_underscores(&s, kind)
 }
 
 pub fn parse_float(s: &str, has_sep: bool) -> Result<f64, &'static str> {
     let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
     debug_assert!(!s.contains('_'));
 
-    // SAFETY: we just checked that `s` has no `_` characters
-    unsafe { parse_float_without_underscores_unchecked(&s) }
+    parse_float_without_underscores(&s)
 }
 
-/// # Safety
-///
-/// This function assumes that all `_` characters have been stripped from `s`.
-/// Violating this assumption does _not_ cause UB. However, this function is
-/// marked as unsafe to ensure consumers are aware of the assumption.
-unsafe fn parse_int_without_underscores_unchecked(
-    s: &str,
-    kind: Kind,
-) -> Result<f64, &'static str> {
+/// This function assumes `s` has had all numeric separators (`_`) removed.
+/// Parsing will fail if this assumption is violated.
+fn parse_int_without_underscores(s: &str, kind: Kind) -> Result<f64, &'static str> {
     if kind == Kind::Decimal {
-        return parse_float_without_underscores_unchecked(s);
+        return parse_float_without_underscores(s);
     }
     match kind {
         Kind::Binary => Ok(parse_binary(&s[2..])),
@@ -50,64 +42,84 @@ unsafe fn parse_int_without_underscores_unchecked(
     }
 }
 
-/// # Safety
-///
-/// This function assumes that all `_` characters have been stripped from `s`.
-/// Violating this assumption does _not_ cause UB. However, this function is
-/// marked as unsafe to ensure consumers are aware of the assumption.
-unsafe fn parse_float_without_underscores_unchecked(s: &str) -> Result<f64, &'static str> {
+/// This function assumes `s` has had all numeric separators (`_`) removed.
+/// Parsing will fail if this assumption is violated.
+fn parse_float_without_underscores(s: &str) -> Result<f64, &'static str> {
     s.parse::<f64>().map_err(|_| "invalid float")
 }
 
+/// NOTE: bit shifting here is is safe and much faster than f64::mul_add.
+/// It's safe because we're sure this number is an integer - if it wasn't, it
+/// would be a [`Kind::Float`] instead. It's fast because shifting usually takes
+/// 1 clock cycle on the ALU, while multiplication+addition uses the FPU and is
+/// much slower. Addtiionally, this loop often gets unrolled by rustc since
+/// these numbers are usually not long. On x84_64, FMUL has a latency of 4 clock
+/// cycles, which doesn't include addition. Some platorms support mul + add in a
+/// single instruction, but many others do not.
+#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
 fn parse_binary(s: &str) -> f64 {
     debug_assert!(!s.is_empty());
 
-    let mut result = 0_f64;
+    let mut result = 0_u64;
 
-    for c in s.as_bytes().iter().filter(|s| s != &&b'_') {
+    for c in s.as_bytes() {
+        debug_assert!(c != &b'_');
         #[allow(clippy::cast_lossless)]
-        let value = (c - b'0') as f64;
-        result = result.mul_add(2.0, value);
+        let value = (c - b'0') as u64;
+        result <<= 1;
+        result |= value;
     }
 
-    result
+    result as f64
 }
 
+#[allow(clippy::cast_precision_loss)]
 fn parse_octal(s: &str) -> f64 {
     debug_assert!(!s.is_empty());
 
-    let mut result = 0_f64;
+    let mut result = 0_u64;
 
-    for c in s.as_bytes().iter().filter(|s| s != &&b'_') {
+    for c in s.as_bytes() {
+        debug_assert!(c != &b'_');
         #[allow(clippy::cast_lossless)]
-        let value = (c - b'0') as f64;
-        result = result.mul_add(8.0, value);
+        let value = (c - b'0') as u64;
+        result <<= 3;
+        result |= value;
     }
 
-    result
+    result as f64
 }
 
+#[allow(clippy::cast_precision_loss, clippy::cast_lossless)]
 fn parse_hex(s: &str) -> f64 {
     debug_assert!(!s.is_empty());
 
-    let mut result = 0_f64;
+    let mut result = 0_u64;
 
-    for c in s.as_bytes().iter().filter(|s| s != &&b'_') {
+    for c in s.as_bytes() {
+        debug_assert!(c != &b'_');
         let value = match c {
             b'0'..=b'9' => c - b'0',
             b'A'..=b'F' => c - b'A' + 10,
             b'a'..=b'f' => c - b'a' + 10,
             _ => unreachable!("invalid hex syntax {}", s),
         };
-        #[allow(clippy::cast_lossless)]
-        let value = value as f64;
-        result = result.mul_add(16.0, value);
+        result <<= 4;
+        result |= value as u64;
     }
 
-    result
+    result as f64
+}
+
+pub fn parse_big_int(s: &str, kind: Kind, has_sep: bool) -> Result<BigInt, &'static str> {
+    let s = if has_sep { Cow::Owned(s.replace('_', "")) } else { Cow::Borrowed(s) };
+    debug_assert!(!s.contains('_'));
+    parse_big_int_without_underscores(&s, kind)
 }
 
-pub fn parse_big_int(s: &str, kind: Kind) -> Result<BigInt, &'static str> {
+/// This function assumes `s` has had all numeric separators (`_`) removed.
+/// Parsing will fail if this assumption is violated.
+fn parse_big_int_without_underscores(s: &str, kind: Kind) -> Result<BigInt, &'static str> {
     let s = match kind {
         Kind::Decimal => s,
         Kind::Binary | Kind::Octal | Kind::Hex => &s[2..],
@@ -120,5 +132,198 @@ pub fn parse_big_int(s: &str, kind: Kind) -> Result<BigInt, &'static str> {
         Kind::Hex => 16,
         _ => unreachable!(),
     };
-    BigInt::parse_bytes(s.as_bytes(), radix).ok_or("invalid bigint")
+    // NOTE: BigInt::from_bytes does a utf8 check, then uses from_str_radix
+    // under the hood. We already have a string, so we can just use that
+    // directly.
+    BigInt::from_str_radix(s, radix).map_err(|_| "invalid bigint")
+}
+
+#[cfg(test)]
+#[allow(clippy::unreadable_literal, clippy::mixed_case_hex_literals)]
+mod test {
+
+    use super::{parse_float, parse_int, Kind};
+
+    #[allow(clippy::cast_precision_loss)]
+    fn assert_all_ints_eq<I>(test_cases: I, kind: Kind, has_sep: bool)
+    where
+        I: IntoIterator<Item = (&'static str, i64)>,
+    {
+        for (s, expected) in test_cases {
+            let parsed = parse_int(s, kind, has_sep);
+            assert_eq!(
+                parsed,
+                Ok(expected as f64),
+                "expected {s} to parse to {expected}, but got {parsed:?}"
+            );
+        }
+    }
+    fn assert_all_floats_eq<I>(test_cases: I, has_sep: bool)
+    where
+        I: IntoIterator<Item = (&'static str, f64)>,
+    {
+        for (s, expected) in test_cases {
+            let parsed = parse_float(s, has_sep);
+            assert_eq!(
+                parsed,
+                Ok(expected),
+                "expected {s} to parse to {expected}, but got {parsed:?}"
+            );
+        }
+    }
+
+    #[test]
+    #[allow(clippy::excessive_precision)]
+    fn test_int_precision() {
+        assert_eq!(parse_int("9007199254740991", Kind::Decimal, false), Ok(9007199254740991.0));
+    }
+
+    #[test]
+    #[allow(clippy::excessive_precision)]
+    fn test_float_precision() {
+        let cases = vec![
+            ("1.7976931348623157e+308", 1.7976931348623157e+308),
+            ("0.000000001", 0.000_000_001),
+        ];
+        assert_all_floats_eq(cases, false);
+    }
+
+    #[test]
+    fn test_parse_int_no_sep() {
+        let decimal: Vec<(&str, i64)> = vec![
+            // normal
+            ("0", 0),
+            ("-0", 0),
+            ("1", 1),
+            ("-1", -1),
+            ("000000000000", 0),
+            ("-000000000000", 0),
+            ("9007199254740991", 9007199254740991), // max safe integer, 2^53 - 1
+            ("-9007199254740990", -9007199254740990), // min safe integer, -(2^53 - 1)
+        ];
+        let binary = vec![
+            ("0b0", 0b0),
+            ("0b1", 0b1),
+            ("0b10", 0b10),
+            ("0b110001001000100", 0b110001001000100),
+            ("0b110001001000100", 0b110001001000100),
+        ];
+        let octal = vec![("0o0", 0o0), ("0o1", 0o1), ("0o10", 0o10), ("0o777", 0o777)];
+        let hex: Vec<(&str, i64)> = vec![
+            ("0x0", 0x0),
+            ("0X0", 0x0),
+            ("0xFF", 0xFF),
+            ("0xc", 0xc), // :)
+            ("0xdeadbeef", 0xdeadbeef),
+            ("0xFfEeDdCcBbAa", 0xFfEeDdCcBbAa),
+        ];
+
+        assert_all_ints_eq(decimal, Kind::Decimal, false);
+        assert_all_ints_eq(binary, Kind::Binary, false);
+        assert_all_ints_eq(octal, Kind::Octal, false);
+        assert_all_ints_eq(hex, Kind::Hex, false);
+    }
+
+    #[test]
+    fn test_parse_int_with_sep() {
+        let decimal: Vec<(&str, i64)> = vec![
+            // still works without separators
+            ("0", 0),
+            ("-0", 0),
+            ("1", 1),
+            ("-1", -1),
+            ("1_000_000", 1_000_000),
+            ("-1_000_000", -1_000_000),
+            ("000000000000", 0),
+            ("-000000000000", 0),
+            ("9_007_199_254_740_991", 9_007_199_254_740_991), // max safe integer, 2^53 - 1
+            ("-9_007_199_254_740_990", -9_007_199_254_740_990), // min safe integer, -(2^53 - 1)
+            // still works for illegal tokens
+            ("1___000_000", 1_000_000),
+            ("1_", 1),
+            ("_1", 1),
+        ];
+
+        let binary = vec![
+            ("0b0", 0b0),
+            ("0b1", 0b1),
+            ("0b10", 0b10),
+            ("0b110001001000100", 0b110001001000100),
+            ("0b110001001000100", 0b110001001000100),
+            ("0b1_1000_1001_0001_0000", 0b1_1000_1001_0001_0000),
+            // still works for illegal tokens
+            ("0b1_0000__0000", 0b1_0000_0000),
+            ("0b1_", 0b1),
+            ("0b_0", 0b0),
+        ];
+
+        let octal = vec![
+            ("0o0", 0o0),
+            ("0o1", 0o1),
+            ("0o10", 0o10),
+            ("0o777", 0o777),
+            ("0o7_7_7", 0o777),
+            ("0o77_73_72", 0o77_73_72),
+            // still works for illegal tokens
+            ("0o1_0000__0000", 0o100_000_000),
+            ("0o1_", 0o1),
+            ("0o_0", 0o0),
+        ];
+
+        let hex: Vec<(&str, i64)> = vec![
+            // still works without separators
+            ("0x0", 0x0),
+            ("0X0", 0x0),
+            ("0xFF", 0xFF),
+            ("0xFF_AA_11", 0xFFAA11),
+            ("0xdead_beef", 0xdead_beef),
+            ("0xFf_Ee_Dd_Cc_Bb_Aa", 0xFfEe_DdCc_BbAa),
+            ("0xFfEe_DdCc_BbAa", 0xFfEe_DdCc_BbAa),
+            // still works for illegal tokens
+            ("0x1_0000__0000", 0x100_000_000),
+            ("0x1_", 0x1),
+            ("0x_0", 0x0),
+        ];
+
+        assert_all_ints_eq(decimal, Kind::Decimal, true);
+        assert_all_ints_eq(binary, Kind::Binary, true);
+        assert_all_ints_eq(octal, Kind::Octal, true);
+        assert_all_ints_eq(hex, Kind::Hex, true);
+    }
+
+    #[test]
+    fn test_decimal() {
+        let no_sep: Vec<(&'static str, f64)> =
+            vec![("0", 0.0), ("1.0", 1.0), ("1.1", 1.1), ("25.125", 25.125)];
+
+        let sep: Vec<(&'static str, f64)> = vec![
+            ("1_000.0", 1000.0),
+            ("1.5_000", 1.5),
+            // works on invalid tokens
+            ("_0._5", 0.5),
+            ("0._5", 0.5),
+            ("0.5_", 0.5),
+        ];
+
+        // parse_int() handles Kind::Decimal as a float. Should we check if
+        // a '.' is encountered during lexing and pick which parser to use?
+        assert_all_floats_eq(no_sep.clone(), false);
+        assert_all_floats_eq(sep.clone(), true);
+        for (s, expected) in no_sep {
+            let parsed = parse_int(s, Kind::Decimal, false);
+            assert_eq!(
+                parsed,
+                Ok(expected),
+                "expected {s} to parse to {expected}, but got {parsed:?}"
+            );
+        }
+        for (s, expected) in sep {
+            let parsed = parse_int(s, Kind::Decimal, true);
+            assert_eq!(
+                parsed,
+                Ok(expected),
+                "expected {s} to parse to {expected}, but got {parsed:?}"
+            );
+        }
+    }
 }