Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 75 additions & 83 deletions crates/oxc_parser/src/lexer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,154 +4,149 @@ use oxc_span::Span;

use super::kind::Kind;

#[derive(Debug, Clone, Copy, Default)]
pub struct Token {
/// Token Kind
kind: Kind,

/// Start offset in source
start: u32,

/// End offset in source
end: u32,

/// Indicates the token is on a newline
is_on_new_line: bool,

/// True if the identifier / string / template kinds has escaped strings.
/// The escaped strings are saved in [Lexer::escaped_strings] and [Lexer::escaped_templates] by
/// [Token::start].
///
/// [Lexer::escaped_strings]: [super::Lexer::escaped_strings]
/// [Lexer::escaped_templates]: [super::Lexer::escaped_templates]
escaped: bool,

/// True if a string contains lone surrogates.
lone_surrogates: bool,

/// True if for numeric literal tokens that contain separator characters (`_`).
///
/// Numeric literals are defined in Section 12.9.3 of the ECMAScript
/// standard and include [`Kind::Decimal`], [`Kind::Binary`],
/// [`Kind::Octal`], [`Kind::Hex`], etc.
has_separator: bool,

// Padding to fill to 16 bytes.
// This makes copying a `Token` 1 x xmmword load & store, rather than 1 x dword + 1 x qword
// and `Token::default()` is 1 x xmmword store, rather than 1 x dword + 1 x qword.
_padding2: u16,
// Bit layout for u128:
// - Bits 0-31 (32 bits): `start`
// - Bits 32-63 (32 bits): `end`
// - Bits 64-71 (8 bits): `kind` (as u8)
// - Bit 72 (1 bit): `is_on_new_line`
// - Bit 73 (1 bit): `escaped`
// - Bit 74 (1 bit): `lone_surrogates`
// - Bit 75 (1 bit): `has_separator`

const START_SHIFT: u32 = 0;
const END_SHIFT: u32 = 32;
const KIND_SHIFT: u32 = 64;
const IS_ON_NEW_LINE_SHIFT: u32 = 72;
const ESCAPED_SHIFT: u32 = 73;
const LONE_SURROGATES_SHIFT: u32 = 74;
const HAS_SEPARATOR_SHIFT: u32 = 75;

const START_MASK: u128 = 0xFFFF_FFFF; // 32 bits
const KIND_MASK: u128 = 0xFF; // 8 bits
const END_MASK: u128 = 0xFFFF_FFFF; // 32 bits

const IS_ON_NEW_LINE_FLAG: u128 = 1 << IS_ON_NEW_LINE_SHIFT;
const ESCAPED_FLAG: u128 = 1 << ESCAPED_SHIFT;
const LONE_SURROGATES_FLAG: u128 = 1 << LONE_SURROGATES_SHIFT;
const HAS_SEPARATOR_FLAG: u128 = 1 << HAS_SEPARATOR_SHIFT;

#[derive(Debug, Clone, Copy)]
#[repr(transparent)]
pub struct Token(u128);

impl Default for Token {
fn default() -> Self {
let mut token = Self(0);
// Kind::default() is Kind::Eof. Kind::Eof as u8 needs to be set.
// Assuming Kind::Eof will be 1 after #[repr(u8)] (Undetermined = 0, Eof = 1)
token.set_kind(Kind::default());
token
}
}

impl Token {
#[inline]
pub(super) fn new_on_new_line() -> Self {
Self { is_on_new_line: true, ..Self::default() }
// Start with a default token, then set the flag.
let mut token = Self::default();
token.0 |= IS_ON_NEW_LINE_FLAG;
token
}

#[inline]
pub fn kind(&self) -> Kind {
self.kind
// SAFETY: This conversion is safe because Kind is #[repr(u8)] and we ensure the value stored is a valid Kind variant.
unsafe { std::mem::transmute(((self.0 >> KIND_SHIFT) & KIND_MASK) as u8) }
}

#[inline]
pub fn start(&self) -> u32 {
self.start
}

#[cfg(test)]
#[inline]
fn len(self) -> u32 {
self.end - self.start
((self.0 >> START_SHIFT) & START_MASK) as u32
}

#[inline]
pub fn end(&self) -> u32 {
self.end
((self.0 >> END_SHIFT) & END_MASK) as u32
}

#[inline]
pub fn is_on_new_line(&self) -> bool {
self.is_on_new_line
(self.0 & IS_ON_NEW_LINE_FLAG) != 0
}

#[inline]
pub fn escaped(&self) -> bool {
self.escaped
(self.0 & ESCAPED_FLAG) != 0
}

#[inline]
pub fn lone_surrogates(&self) -> bool {
self.lone_surrogates
(self.0 & LONE_SURROGATES_FLAG) != 0
}

#[inline]
pub fn span(&self) -> Span {
Span::new(self.start, self.end)
Span::new(self.start(), self.end())
}

#[inline]
pub(crate) fn has_separator(&self) -> bool {
self.has_separator
pub fn has_separator(&self) -> bool {
(self.0 & HAS_SEPARATOR_FLAG) != 0
}

#[inline]
pub(crate) fn set_has_separator(&mut self) {
self.has_separator = true;
self.0 |= HAS_SEPARATOR_FLAG;
}

// Helper to set kind, used in Default and potentially elsewhere
#[inline]
pub(crate) fn set_kind(&mut self, kind: Kind) {
self.kind = kind;
self.0 &= !(KIND_MASK << KIND_SHIFT); // Clear current kind bits
self.0 |= u128::from(kind as u8) << KIND_SHIFT;
}

#[inline]
pub(crate) fn set_start(&mut self, start: u32) {
self.start = start;
self.0 &= !(START_MASK << START_SHIFT); // Clear current start bits
self.0 |= u128::from(start) << START_SHIFT;
}

#[inline]
pub(crate) fn set_is_on_new_line(&mut self, value: bool) {
self.is_on_new_line = value;
self.0 = (self.0 & !IS_ON_NEW_LINE_FLAG) | (u128::from(value) * IS_ON_NEW_LINE_FLAG);
}

#[inline]
pub(crate) fn set_escaped(&mut self, escaped: bool) {
self.escaped = escaped;
self.0 = (self.0 & !ESCAPED_FLAG) | (u128::from(escaped) * ESCAPED_FLAG);
}

#[inline]
pub(crate) fn set_lone_surrogates(&mut self, value: bool) {
self.lone_surrogates = value;
self.0 = (self.0 & !LONE_SURROGATES_FLAG) | (u128::from(value) * LONE_SURROGATES_FLAG);
}

#[inline]
pub(crate) fn set_end(&mut self, end: u32) {
debug_assert!(end >= self.start, "end should not be before start");
self.end = end;
}

#[cfg(test)]
#[inline]
fn set_len(&mut self, len: u32) {
self.end = self.start + len;
let start = self.start();
debug_assert!(end >= start, "Token end ({end}) cannot be less than start ({start})");
self.0 &= !(END_MASK << END_SHIFT); // Clear current end bits
self.0 |= u128::from(end) << END_SHIFT;
}
}

#[cfg(test)]
mod test {
mod size_asserts {
use super::Kind;
use super::Token;

// Test token size
// Test new size
const _: () = assert!(std::mem::size_of::<Token>() == 16);

// Test default token values
#[test]
fn default_token_values() {
let token = Token::default();
assert_eq!(token.start(), 0);
assert_eq!(token.len(), 0u32);
assert_eq!(token.end(), 0);
assert_eq!(token.kind(), Kind::Eof); // Kind::default() is Eof
assert!(!token.is_on_new_line());
Expand All @@ -164,7 +159,6 @@ mod test {
fn new_on_new_line_token_values() {
let token = Token::new_on_new_line();
assert_eq!(token.start(), 0);
assert_eq!(token.len(), 0u32);
assert_eq!(token.end(), 0);
assert_eq!(token.kind(), Kind::Eof);
assert!(token.is_on_new_line());
Expand All @@ -177,7 +171,7 @@ mod test {
fn token_creation_and_retrieval() {
let kind = Kind::Ident;
let start = 100u32;
let len = 5u32;
let end = start + 5u32;
let is_on_new_line = true;
let escaped = false;
let lone_surrogates = true;
Expand All @@ -186,7 +180,7 @@ mod test {
let mut token = Token::default();
token.set_kind(kind);
token.set_start(start);
token.set_len(len);
token.set_end(end);
token.set_is_on_new_line(is_on_new_line);
token.set_escaped(escaped);
token.set_lone_surrogates(lone_surrogates);
Expand All @@ -197,8 +191,7 @@ mod test {

assert_eq!(token.kind(), kind);
assert_eq!(token.start(), start);
assert_eq!(token.len(), len);
assert_eq!(token.end(), start + len);
assert_eq!(token.end(), end);
assert_eq!(token.is_on_new_line(), is_on_new_line);
assert_eq!(token.escaped(), escaped);
assert_eq!(token.lone_surrogates(), lone_surrogates);
Expand All @@ -210,7 +203,7 @@ mod test {
let mut token = Token::default();
token.set_kind(Kind::Ident);
token.set_start(10);
token.set_len(5);
token.set_end(15);
// is_on_new_line, escaped, lone_surrogates, has_separator are false by default from Token::default()

assert_eq!(token.start(), 10);
Expand All @@ -222,19 +215,18 @@ mod test {
let mut token_for_set_end = Token::default();
token_for_set_end.set_kind(Kind::Ident);
token_for_set_end.set_start(10);
token_for_set_end.set_len(5);
token_for_set_end.set_end(15);

assert_eq!(token_for_set_end.end(), 15);
token_for_set_end.set_end(30);
assert_eq!(token_for_set_end.start(), 10);
assert_eq!(token_for_set_end.len(), 20u32);
assert_eq!(token_for_set_end.end(), 30);

// Test that other flags are not affected by set_start
let mut token_with_flags = Token::default();
token_with_flags.set_kind(Kind::Str);
token_with_flags.set_start(30);
token_with_flags.set_len(3);
token_with_flags.set_end(33);
token_with_flags.set_is_on_new_line(true);
token_with_flags.set_escaped(true);
token_with_flags.set_lone_surrogates(true);
Expand All @@ -251,7 +243,7 @@ mod test {
let mut token_with_flags2 = Token::default();
token_with_flags2.set_kind(Kind::Str);
token_with_flags2.set_start(50);
token_with_flags2.set_len(2);
token_with_flags2.set_end(52);
token_with_flags2.set_is_on_new_line(true);
// escaped is false by default
token_with_flags2.set_lone_surrogates(true);
Expand All @@ -273,7 +265,7 @@ mod test {
let mut token_flags_test_newline = Token::default();
token_flags_test_newline.set_kind(Kind::Str);
token_flags_test_newline.set_start(60);
token_flags_test_newline.set_len(2);
token_flags_test_newline.set_end(62);
// is_on_new_line is false by default
token_flags_test_newline.set_escaped(true);
token_flags_test_newline.set_lone_surrogates(true);
Expand All @@ -295,7 +287,7 @@ mod test {
let mut token_flags_test_lone_surrogates = Token::default();
token_flags_test_lone_surrogates.set_kind(Kind::Str);
token_flags_test_lone_surrogates.set_start(70);
token_flags_test_lone_surrogates.set_len(2);
token_flags_test_lone_surrogates.set_end(72);
token_flags_test_lone_surrogates.set_is_on_new_line(true);
token_flags_test_lone_surrogates.set_escaped(true);
// lone_surrogates is false by default
Expand Down
Loading