diff --git a/docs/src/reference.md b/docs/src/reference.md index aeb1b44a..be6ff90a 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -80,6 +80,7 @@ class of tokenization errors and lets the parser deal with them. * We use flags rather than child nodes to represent the difference between `struct` and `mutable struct`, `module` and `baremodule` (#220) * Iterations are represented with the `iteration` and `in` heads rather than `=` within the header of a `for`. Thus `for i=is ; body end` parses to `(for (iteration (in i is)) (block body))`. Cartesian iteration as in `for a=as, b=bs body end` are represented with a nested `(iteration (in a as) (in b bs))` rather than a `block` containing `=` because these lists of iterators are neither semantically nor syntactically a sequence of statements, unlike other uses of `block`. Generators also use the `iteration` head - see information on that below. * Short form functions like `f(x) = x + 1` are represented with the `function` head rather than the `=` head. In this case the `SHORT_FORM_FUNCTION_FLAG` flag is set to allow the surface syntactic form to be easily distinguished from long form functions. +* All kinds of updating assignment operators like `+=` are represented with a single `K"op="` head, with the operator itself in infix position. For example, `x += 1` is `(op= x + 1)`, where the plus token is of kind `K"Identifier"`. This greatly reduces the number of distinct forms here from a rather big list (`$=` `%=` `&=` `*=` `+=` `-=` `//=` `/=` `<<=` `>>=` `>>>=` `\=` `^=` `|=` `÷=` `⊻=`) and makes the operator itself appear in the AST as kind `K"Identifier"`, as it should. It also makes it possible to add further unicode updating operators while keeping the AST stable. 
## More detail on tree differences diff --git a/src/expr.jl b/src/expr.jl index f7832f1e..638e0b75 100644 --- a/src/expr.jl +++ b/src/expr.jl @@ -232,6 +232,16 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads, if k == K"?" headsym = :if + elseif k == K"op=" && length(args) == 3 + lhs = args[1] + op = args[2] + rhs = args[3] + headstr = string(args[2], '=') + if is_dotted(head) + headstr = '.'*headstr + end + headsym = Symbol(headstr) + args = Any[lhs, rhs] elseif k == K"macrocall" if length(args) >= 2 a2 = args[2] diff --git a/src/kinds.jl b/src/kinds.jl index dafc91de..21328c1a 100644 --- a/src/kinds.jl +++ b/src/kinds.jl @@ -293,23 +293,8 @@ register_kinds!(JuliaSyntax, 0, [ "BEGIN_ASSIGNMENTS" "BEGIN_SYNTACTIC_ASSIGNMENTS" "=" - "+=" - "-=" # Also used for "−=" - "*=" - "/=" - "//=" - "|=" - "^=" - "÷=" - "%=" - "<<=" - ">>=" - ">>>=" - "\\=" - "&=" + "op=" # Updating assignment operator ( $= %= &= *= += -= //= /= <<= >>= >>>= \= ^= |= ÷= ⊻= ) ":=" - "\$=" - "⊻=" "END_SYNTACTIC_ASSIGNMENTS" "~" "≔" diff --git a/src/parse_stream.jl b/src/parse_stream.jl index 42bedc49..33e029c6 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -871,8 +871,9 @@ end Bump the next token, splitting it into several pieces Tokens are defined by a number of `token_spec` of shape `(nbyte, kind, flags)`. -The number of input bytes of the last spec is taken from the remaining bytes of -the input token, with the associated `nbyte` ignored. +If all `nbyte` are positive, the sum must equal the token length. If one +`nbyte` is negative, that token is given `tok_len + nbyte` bytes and the sum of +all `nbyte` must equal zero. This is a hack which helps resolves the occasional lexing ambiguity. 
For example @@ -887,12 +888,14 @@ function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N} tok = stream.lookahead[stream.lookahead_index] stream.lookahead_index += 1 b = _next_byte(stream) + toklen = tok.next_byte - b for (i, (nbyte, k, f)) in enumerate(split_spec) h = SyntaxHead(k, f) - b = (i == length(split_spec)) ? tok.next_byte : b + nbyte + b += nbyte < 0 ? (toklen + nbyte) : nbyte orig_k = k == K"." ? K"." : kind(tok) push!(stream.tokens, SyntaxToken(h, orig_k, false, b)) end + @assert tok.next_byte == b stream.peek_count = 0 return position(stream) end diff --git a/src/parser.jl b/src/parser.jl index cbe69856..0b66547f 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -340,7 +340,7 @@ function bump_dotsplit(ps, flags=EMPTY_FLAGS; bump_trivia(ps) mark = position(ps) k = remap_kind != K"None" ? remap_kind : kind(t) - pos = bump_split(ps, (1, K".", TRIVIA_FLAG), (0, k, flags)) + pos = bump_split(ps, (1, K".", TRIVIA_FLAG), (-1, k, flags)) if emit_dot_node pos = emit(ps, mark, K".") end @@ -626,7 +626,22 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where { # a += b ==> (+= a b) # a .= b ==> (.= a b) is_short_form_func = k == K"=" && !is_dotted(t) && was_eventually_call(ps) - bump(ps, TRIVIA_FLAG) + if k == K"op=" + # x += y ==> (op= x + y) + # x .+= y ==> (.op= x + y) + bump_trivia(ps) + if is_dotted(t) + bump_split(ps, (1, K".", TRIVIA_FLAG), + (-2, K"Identifier", EMPTY_FLAGS), # op + (1, K"=", TRIVIA_FLAG)) + else + bump_split(ps, + (-1, K"Identifier", EMPTY_FLAGS), # op + (1, K"=", TRIVIA_FLAG)) + end + else + bump(ps, TRIVIA_FLAG) + end bump_trivia(ps) # Syntax Edition TODO: We'd like to call `down` here when # is_short_form_func is true, to prevent `f() = 1 = 2` from parsing. 
@@ -1843,7 +1858,7 @@ function parse_resword(ps::ParseState) # let x::1 ; end ==> (let (block (::-i x 1)) (block)) # let x ; end ==> (let (block x) (block)) # let x=1,y=2 ; end ==> (let (block (= x 1) (= y 2) (block))) - # let x+=1 ; end ==> (let (block (+= x 1)) (block)) + # let x+=1 ; end ==> (let (block (op= x + 1)) (block)) parse_comma_separated(ps, parse_eq_star) end emit(ps, m, K"block") @@ -2571,7 +2586,7 @@ function parse_import_path(ps::ParseState) # Modules with operator symbol names # import .⋆ ==> (import (importpath . ⋆)) bump_trivia(ps) - bump_split(ps, (1,K".",EMPTY_FLAGS), (1,peek(ps),EMPTY_FLAGS)) + bump_split(ps, (1,K".",EMPTY_FLAGS), (-1,peek(ps),EMPTY_FLAGS)) else # import @x ==> (import (importpath @x)) # import $A ==> (import (importpath ($ A))) @@ -2599,7 +2614,12 @@ function parse_import_path(ps::ParseState) warning="space between dots in import path") end bump_trivia(ps) - bump_split(ps, (1,K".",TRIVIA_FLAG), (1,k,EMPTY_FLAGS)) + m = position(ps) + bump_split(ps, (1,K".",TRIVIA_FLAG), (-1,k,EMPTY_FLAGS)) + if is_syntactic_operator(k) + # import A.= ==> (import (importpath A (error =))) + emit(ps, m, K"error", error="syntactic operators not allowed in import") + end elseif k == K"..." # Import the .. operator # import A... ==> (import (importpath A ..)) @@ -3550,13 +3570,13 @@ function parse_atom(ps::ParseState, check_identifiers=true) bump_dotsplit(ps, emit_dot_node=true, remap_kind= is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier") if check_identifiers && !is_valid_identifier(leading_kind) - # += ==> (error +=) + # += ==> (error (op= +)) # ? ==> (error ?) - # .+= ==> (error (. +=)) + # .+= ==> (error (. 
(op= +))) emit(ps, mark, K"error", error="invalid identifier") else # Quoted syntactic operators allowed - # :+= ==> (quote-: +=) + # :+= ==> (quote-: (op= +)) end elseif is_keyword(leading_kind) if leading_kind == K"var" && (t = peek_token(ps,2); diff --git a/src/tokenize.jl b/src/tokenize.jl index af78bee4..b1e2325b 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -93,6 +93,7 @@ function _nondot_symbolic_operator_kinds() K"isa" K"in" K".'" + K"op=" ]) end @@ -527,14 +528,14 @@ function _next_token(l::Lexer, c) elseif c == '-' return lex_minus(l); elseif c == '−' # \minus '−' treated as hyphen '-' - return emit(l, accept(l, '=') ? K"-=" : K"-") + return emit(l, accept(l, '=') ? K"op=" : K"-") elseif c == '`' return lex_backtick(l); elseif is_identifier_start_char(c) return lex_identifier(l, c) elseif isdigit(c) return lex_digit(l, K"Integer") - elseif (k = get(_unicode_ops, c, K"error")) != K"error" + elseif (k = get(_unicode_ops, c, K"None")) != K"None" return emit(l, k) else emit(l, @@ -797,12 +798,12 @@ function lex_greater(l::Lexer) if accept(l, '>') if accept(l, '>') if accept(l, '=') - return emit(l, K">>>=") + return emit(l, K"op=") else # >>>?, ? 
not a = return emit(l, K">>>") end elseif accept(l, '=') - return emit(l, K">>=") + return emit(l, K"op=") else return emit(l, K">>") end @@ -819,7 +820,7 @@ end function lex_less(l::Lexer) if accept(l, '<') if accept(l, '=') - return emit(l, K"<<=") + return emit(l, K"op=") else # '<') return emit(l, K"|>") elseif accept(l, '|') @@ -910,7 +911,7 @@ function lex_plus(l::Lexer) if accept(l, '+') return emit(l, K"++") elseif accept(l, '=') - return emit(l, K"+=") + return emit(l, K"op=") end return emit(l, K"+") end @@ -925,7 +926,7 @@ function lex_minus(l::Lexer) elseif !l.dotop && accept(l, '>') return emit(l, K"->") elseif accept(l, '=') - return emit(l, K"-=") + return emit(l, K"op=") end return emit(l, K"-") end @@ -934,35 +935,35 @@ function lex_star(l::Lexer) if accept(l, '*') return emit(l, K"Error**") # "**" is an invalid operator use ^ elseif accept(l, '=') - return emit(l, K"*=") + return emit(l, K"op=") end return emit(l, K"*") end function lex_circumflex(l::Lexer) if accept(l, '=') - return emit(l, K"^=") + return emit(l, K"op=") end return emit(l, K"^") end function lex_division(l::Lexer) if accept(l, '=') - return emit(l, K"÷=") + return emit(l, K"op=") end return emit(l, K"÷") end function lex_dollar(l::Lexer) if accept(l, '=') - return emit(l, K"$=") + return emit(l, K"op=") end return emit(l, K"$") end function lex_xor(l::Lexer) if accept(l, '=') - return emit(l, K"⊻=") + return emit(l, K"op=") end return emit(l, K"⊻") end @@ -1110,7 +1111,7 @@ function lex_amper(l::Lexer) if accept(l, '&') return emit(l, K"&&") elseif accept(l, '=') - return emit(l, K"&=") + return emit(l, K"op=") else return emit(l, K"&") end @@ -1148,12 +1149,12 @@ end function lex_forwardslash(l::Lexer) if accept(l, '/') if accept(l, '=') - return emit(l, K"//=") + return emit(l, K"op=") else return emit(l, K"//") end elseif accept(l, '=') - return emit(l, K"/=") + return emit(l, K"op=") else return emit(l, K"/") end @@ -1161,7 +1162,7 @@ end function lex_backslash(l::Lexer) if 
accept(l, '=') - return emit(l, K"\=") + return emit(l, K"op=") end return emit(l, K"\\") end @@ -1193,7 +1194,7 @@ function lex_dot(l::Lexer) elseif pc == '−' l.dotop = true readchar(l) - return emit(l, accept(l, '=') ? K"-=" : K"-") + return emit(l, accept(l, '=') ? K"op=" : K"-") elseif pc =='*' l.dotop = true readchar(l) @@ -1222,7 +1223,7 @@ function lex_dot(l::Lexer) l.dotop = true readchar(l) if accept(l, '=') - return emit(l, K"&=") + return emit(l, K"op=") else if accept(l, '&') return emit(l, K"&&") diff --git a/test/expr.jl b/test/expr.jl index eb998229..200e8764 100644 --- a/test/expr.jl +++ b/test/expr.jl @@ -501,6 +501,16 @@ @test parsestmt("./x", ignore_errors=true) == Expr(:call, Expr(:error, Expr(:., :/)), :x) end + @testset "syntactic update-assignment operators" begin + @test parsestmt("x += y") == Expr(:(+=), :x, :y) + @test parsestmt("x .+= y") == Expr(:(.+=), :x, :y) + @test parsestmt(":+=") == QuoteNode(Symbol("+=")) + @test parsestmt(":(+=)") == QuoteNode(Symbol("+=")) + @test parsestmt(":.+=") == QuoteNode(Symbol(".+=")) + @test parsestmt(":(.+=)") == QuoteNode(Symbol(".+=")) + @test parsestmt("x \u2212= y") == Expr(:(-=), :x, :y) + end + @testset "let" begin @test parsestmt("let x=1\n end") == Expr(:let, Expr(:(=), :x, 1), Expr(:block, LineNumberNode(2))) diff --git a/test/parser.jl b/test/parser.jl index a747e1c7..9eb7caa2 100644 --- a/test/parser.jl +++ b/test/parser.jl @@ -62,8 +62,8 @@ tests = [ # parse_assignment "a = b" => "(= a b)" "a .= b" => "(.= a b)" - "a += b" => "(+= a b)" - "a .+= b" => "(.+= a b)" + "a += b" => "(op= a + b)" + "a .+= b" => "(.op= a + b)" "a, b = c, d" => "(= (tuple a b) (tuple c d))" "x, = xs" => "(= (tuple x) xs)" "[a ~b]" => "(hcat a (call-pre ~ b))" @@ -497,7 +497,7 @@ tests = [ "let x ; end" => "(let (block x) (block))" "let x::1 ; end" => "(let (block (::-i x 1)) (block))" "let x=1,y=2 end" => "(let (block (= x 1) (= y 2)) (block))" - "let x+=1 ; end" => "(let (block (+= x 1)) (block))" + "let x+=1 ; 
end" => "(let (block (op= x + 1)) (block))" "let ; end" => "(let (block) (block))" "let ; body end" => "(let (block) (block body))" "let\na\nb\nend" => "(let (block) (block a b))" @@ -576,7 +576,7 @@ tests = [ "const x = 1" => "(const (= x 1))" "const x .= 1" => "(error (const (.= x 1)))" "global x ~ 1" => "(global (call-i x ~ 1))" - "global x += 1" => "(global (+= x 1))" + "global x += 1" => "(global (op= x + 1))" "const x" => "(error (const x))" "global const x" => "(global (error (const x)))" "const global x" => "(error (const (global x)))" @@ -715,6 +715,7 @@ tests = [ "using :A" => "(using (importpath (error (quote-: A))))" "using A: :b" => "(using (: (importpath A) (importpath (error (quote-: b)))))" "using A: b.:c" => "(using (: (importpath A) (importpath b (quote-: c))))" + # Syntactic operators not allowed in import ], JuliaSyntax.parse_iteration_specs => [ "i = rhs" => "(iteration (in i rhs))" @@ -832,6 +833,7 @@ tests = [ "≕" => "≕" # Quoted syntactic operators allowed ":+=" => "(quote-: +=)" + ":.+=" => "(quote-: (. +=))" ":.=" => "(quote-: (. =))" ":.&&" => "(quote-: (. 
&&))" # Special symbols quoted @@ -1023,7 +1025,7 @@ tests = [ JuliaSyntax.parse_stmts => with_version.(v"1.11", [ "function f(public)\n public + 3\nend" => "(function (call f public) (block (call-i public + 3)))" "public A, B" => "(public A B)" - "if true \n public *= 4 \n end" => "(if true (block (*= public 4)))" + "if true \n public *= 4 \n end" => "(if true (block (op= public * 4)))" "module Mod\n public A, B \n end" => "(module Mod (block (public A B)))" "module Mod2\n a = 3; b = 6; public a, b\n end" => "(module Mod2 (block (= a 3) (= b 6) (public a b)))" "a = 3; b = 6; public a, b" => "(toplevel-; (= a 3) (= b 6) (public a b))" @@ -1141,6 +1143,12 @@ parsestmt_with_kind_tests = [ ":(<:)" => "(quote-: (parens <:::<:))" ":(&&)" => "(quote-: (parens &&::&&))" ":(=)" => "(quote-: (parens =::=))" + "a := b" => "(:= a::Identifier b::Identifier)" + "a += b" => "(op= a::Identifier +::Identifier b::Identifier)" + "a .+= b" => "(.op= a::Identifier +::Identifier b::Identifier)" + "a >>= b" => "(op= a::Identifier >>::Identifier b::Identifier)" + ":+=" => "(quote-: +=::op=)" + ":.+=" => "(quote-: (. 
+=::op=))" ] @testset "parser `Kind` remapping" begin @@ -1174,10 +1182,10 @@ end # · and · normalize to ⋅ @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a \u00B7 b") == "(call-i a \u22C5 b)" @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a \u0387 b") == "(call-i a \u22C5 b)" - # − normalizes to - + # − ('\u2212') normalizes to - ('\u002d') @test parse_to_sexpr_str(JuliaSyntax.parse_expr, "a \u2212 b") == "(call-i a - b)" - @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a \u2212= b") == "(-= a b)" - @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a .\u2212= b") == "(.-= a b)" + @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a \u2212= b") == "(op= a - b)" + @test parse_to_sexpr_str(JuliaSyntax.parse_eq, "a .\u2212= b") == "(.op= a - b)" end @testset "Unbalanced bidirectional unicode" begin diff --git a/test/tokenize.jl b/test/tokenize.jl index e2d069da..2a2309bc 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -175,14 +175,15 @@ end end @testset "test added operators" begin - @test tok("1+=2", 2).kind == K"+=" - @test tok("1-=2", 2).kind == K"-=" + @test tok("1+=2", 2).kind == K"op=" + @test tok("1-=2", 2).kind == K"op=" + @test tok("1*=2", 2).kind == K"op=" + @test tok("1^=2", 2).kind == K"op=" + @test tok("1÷=2", 2).kind == K"op=" + @test tok("1\\=2", 2).kind == K"op=" + @test tok("1\$=2", 2).kind == K"op=" + @test tok("1⊻=2", 2).kind == K"op=" @test tok("1:=2", 2).kind == K":=" - @test tok("1*=2", 2).kind == K"*=" - @test tok("1^=2", 2).kind == K"^=" - @test tok("1÷=2", 2).kind == K"÷=" - @test tok("1\\=2", 2).kind == K"\=" - @test tok("1\$=2", 2).kind == K"$=" @test tok("1-->2", 2).kind == K"-->" @test tok("1<--2", 2).kind == K"<--" @test tok("1<-->2", 2).kind == K"<-->" @@ -342,10 +343,6 @@ end @test length(collect(tokenize("x)"))) == 3 end -@testset "xor_eq" begin - @test tok("1 ⊻= 2", 3).kind==K"⊻=" -end - @testset "lex binary" begin @test tok("0b0101").kind==K"BinInt" end @@ -824,6 +821,9 @@ for opkind in Tokenize._nondot_symbolic_operator_kinds() 
tokens = collect(tokenize(str)) exop = expr.head == :call ? expr.args[1] : expr.head #println(str) + if Symbol(Tokenize.untokenize(tokens[arity == 1 ? 1 : 3], str)) != exop + @info "" arity str exop + end @test Symbol(Tokenize.untokenize(tokens[arity == 1 ? 1 : 3], str)) == exop else break @@ -842,7 +842,7 @@ end # https://github.com/JuliaLang/julia/pull/40948 @test tok("−").kind == K"-" - @test tok("−=").kind == K"-=" + @test tok("−=").kind == K"op=" @test tok(".−").dotop end