Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ name of compatibility, perhaps with a warning.)
broken-looking AST like `(macrocall (. A (quote (. B @x))))`. It should
probably be rejected.
* Operator prefix call syntax doesn't work in the cases like `+(a;b,c)` where
keyword parameters are separated by commas. A tuple is produced instead.
keyword parameters are separated by commas. A tuple is produced instead.
* `const` and `global` allow chained assignment, but the right hand side is not
constant. `a` is const here but not `b`.
```
Expand Down Expand Up @@ -698,7 +698,7 @@ interface. Could we have `Expr2` wrap `SyntaxNode`?
tree library (rowan) for representing a non-Rust toy language is here
https://dev.to/cad97/lossless-syntax-trees-280c

Not all the design decisions in `rust-analyzer` are finalized but the
Not all the design decisions in `rust-analyzer` are finalized but the
[architecture document](https://github.com/rust-analyzer/rust-analyzer/blob/master/docs/dev/architecture.md)
is a fantastic source of design inspiration.

Expand Down Expand Up @@ -772,7 +772,7 @@ The tree datastructure design here is tricky:
parentheses in `2*(x + y)` and the explicit vs implicit multiplication
symbol in `2*x` vs `2x`.

2. There are various types of *analyses*
2. There are various types of *analyses*
- There are many useful ways to augment a syntax tree depending on use case.
- Analysis algorithms should be able to act on any tree type, ignoring
but carrying augmentations which they don't know about.
Expand Down Expand Up @@ -983,4 +983,3 @@ indentation from the syntax tree? Source formatting involves a big pile of
heuristics to get something which "looks nice"... and ML systems have become
very good at heuristics. Also, we've got huge piles of training data — just
choose some high quality, tastefully hand-formatted libraries.

2 changes: 1 addition & 1 deletion Tokenize/benchmark/lex_base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ function speed_test()
while !Tokenize.Lexers.eof(l)
t = Tokenize.Lexers.next_token(l)
tot_tokens += 1
if t.kind == Tokens.ERROR
if Tokens.iserror(t.kind)
tot_errors += 1
end
end
Expand Down
19 changes: 11 additions & 8 deletions Tokenize/src/lexer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module Lexers
include("utilities.jl")

import ..Tokens
import ..Tokens: Token, Kind, TokenError, UNICODE_OPS, EMPTY_TOKEN, isliteral
import ..Tokens: Token, Kind, UNICODE_OPS, EMPTY_TOKEN, isliteral

import ..Tokens: FUNCTION, ABSTRACT, IDENTIFIER, BAREMODULE, BEGIN, BREAK, CATCH, CONST, CONTINUE,
DO, ELSE, ELSEIF, END, EXPORT, FALSE, FINALLY, FOR, FUNCTION, GLOBAL, LET, LOCAL, IF,
Expand Down Expand Up @@ -52,6 +52,7 @@ mutable struct Lexer{IO_t <: IO}
charspos::Tuple{Int,Int,Int,Int}
doread::Bool
dotop::Bool
errored::Bool
end

function Lexer(io::IO)
Expand Down Expand Up @@ -80,7 +81,7 @@ function Lexer(io::IO)
end
Lexer(io, position(io), 1, 1, position(io), 1, 1, position(io),
Tokens.ERROR, Vector{StringState}(), IOBuffer(),
(c1,c2,c3,c4), (p1,p2,p3,p4), false, false)
(c1,c2,c3,c4), (p1,p2,p3,p4), false, false, false)
end
Lexer(str::AbstractString) = Lexer(IOBuffer(str))

Expand Down Expand Up @@ -243,11 +244,11 @@ Consumes all following characters until `accept(l, f)` is `false`.
end

"""
emit(l::Lexer, kind::Kind, err::TokenError=Tokens.NO_ERR)
emit(l::Lexer, kind::Kind)

Returns a `Token` of kind `kind` with contents `str` and starts a new `Token`.
"""
function emit(l::Lexer, kind::Kind, err::TokenError = Tokens.NO_ERR)
function emit(l::Lexer, kind::Kind)
suffix = false
if optakessuffix(kind)
while isopsuffix(peekchar(l))
Expand All @@ -256,20 +257,22 @@ function emit(l::Lexer, kind::Kind, err::TokenError = Tokens.NO_ERR)
end
end

tok = Token(kind, startpos(l), position(l) - 1, err, l.dotop, suffix)
tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix)

l.dotop = false
l.last_token = kind
return tok
end

"""
emit_error(l::Lexer, err::TokenError=Tokens.UNKNOWN)
emit_error(l::Lexer, err::Kind=Tokens.ERROR)

Returns an `ERROR` token with error `err` and starts a new `Token`.
"""
function emit_error(l::Lexer, err::TokenError = Tokens.UNKNOWN)
return emit(l, Tokens.ERROR, err)
function emit_error(l::Lexer, err::Kind = Tokens.ERROR)
l.errored = true
@assert Tokens.iserror(err)
return emit(l, err)
end


Expand Down
23 changes: 6 additions & 17 deletions Tokenize/src/token.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ include("token_kinds.jl")
# Classification predicates over `Kind`.
#
# Each predicate reports whether `k` lies strictly between the pair of
# `begin_*` / `end_*` marker values named in its body; the markers themselves
# never satisfy the predicate because both comparisons are strict.

# `true` when `k` is strictly between `begin_keywords` and `end_keywords`.
function iskeyword(k::Kind)
    return begin_keywords < k && k < end_keywords
end

# `true` when `k` is strictly between `begin_literal` and `end_literal`.
function isliteral(k::Kind)
    return begin_literal < k && k < end_literal
end

# `true` when `k` is strictly between `begin_ops` and `end_ops`.
function isoperator(k::Kind)
    return begin_ops < k && k < end_ops
end

# `true` when `k` is strictly between `begin_errors` and `end_errors`.
function iserror(k::Kind)
    return begin_errors < k && k < end_errors
end

# `true` when `k` is strictly between `begin_contextual_keywords` and
# `end_contextual_keywords`.
function iscontextualkeyword(k::Kind)
    return begin_contextual_keywords < k && k < end_contextual_keywords
end

function iswordoperator(k::Kind)
Expand All @@ -32,40 +32,28 @@ function _add_kws()
end
_add_kws()

# TODO: more
@enum(TokenError,
NO_ERR,
EOF_MULTICOMMENT,
EOF_CHAR,
INVALID_NUMERIC_CONSTANT,
INVALID_OPERATOR,
INVALID_INTERPOLATION_TERMINATOR,
UNKNOWN,
)

# Error kind => description
TOKEN_ERROR_DESCRIPTION = Dict{TokenError, String}(
TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
EOF_MULTICOMMENT => "unterminated multi-line comment #= ... =#",
EOF_CHAR => "unterminated character literal",
INVALID_NUMERIC_CONSTANT => "invalid numeric constant",
INVALID_OPERATOR => "invalid operator",
INVALID_INTERPOLATION_TERMINATOR => "interpolated variable ends with invalid character; use `\$(...)` instead",
UNKNOWN => "unknown",
ERROR => "unknown error",
)

struct Token
kind::Kind
# Offsets into a string or buffer
startbyte::Int # The byte where the token start in the buffer
endbyte::Int # The byte where the token ended in the buffer
token_error::TokenError
dotop::Bool
suffix::Bool
end
function Token(kind::Kind, startbyte::Int, endbyte::Int)
Token(kind, startbyte, endbyte, NO_ERR, false, false)
Token(kind, startbyte, endbyte, false, false)
end
Token() = Token(ERROR, 0, 0, UNKNOWN, false, false)
Token() = Token(ERROR, 0, 0, false, false)


const _EMPTY_RAWTOKEN = Token()
Expand All @@ -74,6 +62,7 @@ EMPTY_TOKEN(::Type{Token}) = _EMPTY_RAWTOKEN
# Return the coarse category of token `t`:
#   * `OP`      when the token's kind satisfies `isoperator`,
#   * `KEYWORD` when it satisfies `iskeyword`,
#   * `ERROR`   when it satisfies `iserror`,
#   * otherwise the token's own kind, unchanged.
# The checks are tried in that order and the first match wins.
function kind(t::Token)
    k = t.kind
    if isoperator(k)
        return OP
    elseif iskeyword(k)
        return KEYWORD
    elseif iserror(k)
        return ERROR
    end
    return k
end

# Return the kind stored in the token itself, without collapsing it into a
# category the way `kind` does.
function exactkind(t::Token)
    return t.kind
end
Expand Down
10 changes: 9 additions & 1 deletion Tokenize/src/token_kinds.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
@enum(Kind,
NONE, # Placeholder; never emitted by lexer
ENDMARKER, # EOF
ERROR,
COMMENT, # aadsdsa, #= fdsf #=
WHITESPACE, # '\n \t'
IDENTIFIER, # foo, Σxx
AT_SIGN, # @
COMMA, #,
SEMICOLON, # ;

begin_errors,
EOF_MULTICOMMENT,
EOF_CHAR,
INVALID_NUMERIC_CONSTANT,
INVALID_OPERATOR,
INVALID_INTERPOLATION_TERMINATOR,
ERROR,
end_errors,

begin_keywords,
KEYWORD, # general
BAREMODULE,
Expand Down
44 changes: 25 additions & 19 deletions Tokenize/test/lexer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ end
test_roundtrip("1234.0 .+1", Tokens.FLOAT, "1234.0")
test_roundtrip("1234.f(a)", Tokens.FLOAT, "1234.")
test_roundtrip("1234 .f(a)", Tokens.INTEGER, "1234")
test_roundtrip("1234.0.f(a)", Tokens.ERROR, "1234.0.")
test_roundtrip("1234.0.f(a)", Tokens.INVALID_NUMERIC_CONSTANT, "1234.0.")
test_roundtrip("1234.0 .f(a)", Tokens.FLOAT, "1234.0")
end

Expand Down Expand Up @@ -280,9 +280,9 @@ end
end

@testset "errors" begin
@test tok("#= #= =#", 1).kind == T.ERROR
@test tok("'dsadsa", 1).kind == T.ERROR
@test tok("aa **", 3).kind == T.ERROR
@test tok("#= #= =#", 1).kind == T.EOF_MULTICOMMENT
@test tok("'dsadsa", 1).kind == T.EOF_CHAR
@test tok("aa **", 3).kind == T.INVALID_OPERATOR
end

@testset "xor_eq" begin
Expand Down Expand Up @@ -501,9 +501,10 @@ end
str = """ "\$x෴" """
ts = collect(tokenize(str))
@test ts[4] ~ (T.IDENTIFIER , "x" , str)
@test ts[5] ~ (T.ERROR , "" , str)
@test ts[5] ~ (T.INVALID_INTERPOLATION_TERMINATOR , "" , str)
@test ts[6] ~ (T.STRING , "෴" , str)
@test ts[5].token_error == Tokens.INVALID_INTERPOLATION_TERMINATOR
@test Tokens.iserror(ts[5].kind)
@test ts[5].kind == Tokens.INVALID_INTERPOLATION_TERMINATOR
end
end

Expand Down Expand Up @@ -650,10 +651,10 @@ end
end

@testset "hex/bin/octal errors" begin
@test tok("0x").kind == T.ERROR
@test tok("0b").kind == T.ERROR
@test tok("0o").kind == T.ERROR
@test tok("0x 2", 1).kind == T.ERROR
@test tok("0x").kind == T.INVALID_NUMERIC_CONSTANT
@test tok("0b").kind == T.INVALID_NUMERIC_CONSTANT
@test tok("0o").kind == T.INVALID_NUMERIC_CONSTANT
@test tok("0x 2", 1).kind == T.INVALID_NUMERIC_CONSTANT
@test tok("0x.1p1").kind == T.FLOAT
end

Expand Down Expand Up @@ -716,15 +717,20 @@ end
@test tok("outer", 1).kind==T.OUTER
end

# Test helper: records two `@test` results for the token `tok`.
#   tok  - a token value exposing a `kind` field
#   kind - the specific kind the token is expected to carry
function test_error(tok, kind)
# The token's kind must be classified as an error by `Tokens.iserror`...
@test Tokens.iserror(tok.kind)
# ...and must be exactly the kind the caller asked for.
@test tok.kind == kind
end

@testset "token errors" begin
@test tok("1.2e2.3",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
@test tok("1.2.",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
@test tok("1.2.f",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
@test tok("0xv",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
@test tok("0b3",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
@test tok("0op",1).token_error === Tokens.INVALID_NUMERIC_CONSTANT
@test tok("--",1).token_error === Tokens.INVALID_OPERATOR
@test tok("1**2",2).token_error === Tokens.INVALID_OPERATOR
test_error(tok("1.2e2.3",1), Tokens.INVALID_NUMERIC_CONSTANT)
test_error(tok("1.2.",1), Tokens.INVALID_NUMERIC_CONSTANT)
test_error(tok("1.2.f",1), Tokens.INVALID_NUMERIC_CONSTANT)
test_error(tok("0xv",1), Tokens.INVALID_NUMERIC_CONSTANT)
test_error(tok("0b3",1), Tokens.INVALID_NUMERIC_CONSTANT)
test_error(tok("0op",1), Tokens.INVALID_NUMERIC_CONSTANT)
test_error(tok("--",1), Tokens.INVALID_OPERATOR)
test_error(tok("1**2",2), Tokens.INVALID_OPERATOR)
end

@testset "hat suffix" begin
Expand Down Expand Up @@ -765,7 +771,7 @@ end

@testset "invalid float" begin
s = ".0."
@test collect(tokenize(s))[1].kind == Tokens.ERROR
@test collect(tokenize(s))[1].kind == Tokens.INVALID_NUMERIC_CONSTANT
end

@testset "allow prime after end" begin
Expand Down
2 changes: 1 addition & 1 deletion src/parse_stream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ function Base.summary(head::SyntaxHead)
end

function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true)
str = untokenize(kind(head); unique=unique)
str = is_error(kind(head)) ? "error" : untokenize(kind(head); unique=unique)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems potentially dubious to special-case this here rather than in untokenize(::Kind). I assume something fails somehow without this?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now, the parser expects any error to untokenize to "error". But now we have many error kinds. I could just add all of them to https://github.com/c42f/JuliaSyntax.jl/blob/77b4044218a7c7c5e34f8b1459a88fe6d405b64c/src/token_kinds.jl#L6 and have them all stringify to "error".

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I think they should be added to _str_to_kind — this allows them to be accessed with the K macro.

if is_dotted(head)
str = "."*str
end
Expand Down
1 change: 1 addition & 0 deletions src/tokens.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ kind(raw::TzTokens.Token) = TzTokens.exactkind(raw)
# Underscore-named aliases for the `TzTokens` predicates, for naming
# consistency. Each one passes its argument through `kind` first and then
# forwards the result to the corresponding `TzTokens` predicate.

function is_literal(k)
    return TzTokens.isliteral(kind(k))
end

function is_keyword(k)
    return TzTokens.iskeyword(kind(k))
end

function is_error(k)
    return TzTokens.iserror(kind(k))
end

function is_contextual_keyword(k)
    return TzTokens.iscontextualkeyword(kind(k))
end

function is_operator(k)
    return TzTokens.isoperator(kind(k))
end

function is_word_operator(k)
    return TzTokens.iswordoperator(kind(k))
end
Expand Down