1 change: 1 addition & 0 deletions docs/src/reference.md
@@ -80,6 +80,7 @@ class of tokenization errors and lets the parser deal with them.
* We use flags rather than child nodes to represent the difference between `struct` and `mutable struct`, `module` and `baremodule` (#220)
* Iterations are represented with the `iteration` and `in` heads rather than `=` within the header of a `for`. Thus `for i=is ; body end` parses to `(for (iteration (in i is)) (block body))`. Cartesian iteration as in `for a=as, b=bs body end` are represented with a nested `(iteration (in a as) (in b bs))` rather than a `block` containing `=` because these lists of iterators are neither semantically nor syntactically a sequence of statements, unlike other uses of `block`. Generators also use the `iteration` head - see information on that below.
* Short form functions like `f(x) = x + 1` are represented with the `function` head rather than the `=` head. In this case the `SHORT_FORM_FUNCTION_FLAG` flag is set to allow the surface syntactic form to be easily distinguished from long form functions.
* All updating assignment operators like `+=` are represented with a single `K"op="` head, with the operator itself in infix position. For example, `x += 1` is `(op= x + 1)`, where the plus token has kind `K"Identifier"`. This collapses a rather long list of distinct heads (`$=` `%=` `&=` `*=` `+=` `-=` `//=` `/=` `<<=` `>>=` `>>>=` `\=` `^=` `|=` `÷=` `⊻=`) into one, makes the operator itself appear in the AST as a `K"Identifier"` as it should, and makes it possible to add further Unicode updating operators while keeping the AST stable.
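
A minimal sketch of the resulting trees, assuming the package's public `parsestmt` entry point (exact tree printing may differ):

```julia
using JuliaSyntax

# Raw tree: one `op=` head, with the operator stored as a `K"Identifier"` child
parsestmt(SyntaxNode, "x += 1")   # roughly (op= x + 1)

# Conversion to Expr restores the familiar per-operator heads for compatibility
parsestmt(Expr, "x += 1")         # Expr(:+=, :x, 1)
parsestmt(Expr, "x .+= 1")        # Expr(:.+=, :x, 1)
```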

## More detail on tree differences

10 changes: 10 additions & 0 deletions src/expr.jl
@@ -232,6 +232,16 @@ function _internal_node_to_Expr(source, srcrange, head, childranges, childheads,

if k == K"?"
headsym = :if
elseif k == K"op=" && length(args) == 3
lhs = args[1]
op = args[2]
rhs = args[3]
headstr = string(args[2], '=')
if is_dotted(head)
headstr = '.'*headstr
end
headsym = Symbol(headstr)
args = Any[lhs, rhs]
elseif k == K"macrocall"
if length(args) >= 2
a2 = args[2]
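For reference, the head reconstruction above is plain string building; a small sketch of the values involved (nothing here is part of the diff itself):

```julia
# The operator child converts to a Symbol such as :+ ; appending '=' (plus a
# leading '.' for dotted forms) rebuilds the legacy Expr head.
Symbol(string(:+, '='))        # :+=
Symbol('.' * string(:+, '='))  # Symbol(".+=")
```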
17 changes: 1 addition & 16 deletions src/kinds.jl
@@ -293,23 +293,8 @@ register_kinds!(JuliaSyntax, 0, [
"BEGIN_ASSIGNMENTS"
"BEGIN_SYNTACTIC_ASSIGNMENTS"
"="
"+="
"-=" # Also used for "−="
"*="
"/="
"//="
"|="
"^="
"÷="
"%="
"<<="
">>="
">>>="
"\\="
"&="
"op=" # Updating assignment operator ( $= %= &= *= += -= //= /= <<= >>= >>>= \= ^= |= ÷= ⊻= )
":="
"\$="
"⊻="
"END_SYNTACTIC_ASSIGNMENTS"
"~"
"≔"
9 changes: 6 additions & 3 deletions src/parse_stream.jl
@@ -871,8 +871,9 @@
Bump the next token, splitting it into several pieces

The pieces are defined by a number of `token_spec`s, each of shape `(nbyte, kind, flags)`.
The number of input bytes of the last spec is taken from the remaining bytes of
the input token, with the associated `nbyte` ignored.
If all `nbyte` are positive, they must sum to the length of the token being
split. If one `nbyte` is negative, that piece is given `tok_len + nbyte` bytes
and all the `nbyte` must sum to zero.

This is a hack which helps resolve the occasional lexing ambiguity. For
example
@@ -887,12 +888,14 @@ function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N}
tok = stream.lookahead[stream.lookahead_index]
stream.lookahead_index += 1
b = _next_byte(stream)
toklen = tok.next_byte - b
for (i, (nbyte, k, f)) in enumerate(split_spec)
h = SyntaxHead(k, f)
b = (i == length(split_spec)) ? tok.next_byte : b + nbyte
b += nbyte < 0 ? (toklen + nbyte) : nbyte
orig_k = k == K"." ? K"." : kind(tok)
push!(stream.tokens, SyntaxToken(h, orig_k, false, b))
end
@assert tok.next_byte == b
stream.peek_count = 0
return position(stream)
end
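To make the new `nbyte` convention concrete, here is a small worked sketch (a hypothetical helper, not part of JuliaSyntax): splitting the 3-byte token `.+=` with specs of widths `(1, -2, 1)` gives the negative piece `toklen + nbyte = 3 - 2 = 1` byte, so the pieces cover the whole token:

```julia
# Hypothetical mirror of the rule in `bump_split`: a negative nbyte means
# "token length + nbyte" bytes for that piece.
function split_widths(toklen, nbytes)
    @assert sum(nbytes) == (any(<(0), nbytes) ? 0 : toklen)
    return [n < 0 ? toklen + n : n for n in nbytes]
end

split_widths(3, [1, -2, 1])  # [1, 1, 1] -- ".", "+", "=" pieces of ".+="
split_widths(2, [-1, 1])     # [1, 1]    -- "+", "=" pieces of "+="
```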
36 changes: 28 additions & 8 deletions src/parser.jl
@@ -340,7 +340,7 @@ function bump_dotsplit(ps, flags=EMPTY_FLAGS;
bump_trivia(ps)
mark = position(ps)
k = remap_kind != K"None" ? remap_kind : kind(t)
pos = bump_split(ps, (1, K".", TRIVIA_FLAG), (0, k, flags))
pos = bump_split(ps, (1, K".", TRIVIA_FLAG), (-1, k, flags))
if emit_dot_node
pos = emit(ps, mark, K".")
end
@@ -626,7 +626,22 @@ function parse_assignment_with_initial_ex(ps::ParseState, mark, down::T) where {
# a += b ==> (+= a b)
# a .= b ==> (.= a b)
is_short_form_func = k == K"=" && !is_dotted(t) && was_eventually_call(ps)
bump(ps, TRIVIA_FLAG)
if k == K"op="
# x += y ==> (op= x + y)
# x .+= y ==> (.op= x + y)
bump_trivia(ps)
if is_dotted(t)
bump_split(ps, (1, K".", TRIVIA_FLAG),
(-2, K"Identifier", EMPTY_FLAGS), # op
(1, K"=", TRIVIA_FLAG))
else
bump_split(ps,
(-1, K"Identifier", EMPTY_FLAGS), # op
(1, K"=", TRIVIA_FLAG))
end
else
bump(ps, TRIVIA_FLAG)
end
bump_trivia(ps)
# Syntax Edition TODO: We'd like to call `down` here when
# is_short_form_func is true, to prevent `f() = 1 = 2` from parsing.
@@ -1843,7 +1858,7 @@ function parse_resword(ps::ParseState)
# let x::1 ; end ==> (let (block (::-i x 1)) (block))
# let x ; end ==> (let (block x) (block))
# let x=1,y=2 ; end ==> (let (block (= x 1) (= y 2) (block)))
# let x+=1 ; end ==> (let (block (+= x 1)) (block))
# let x+=1 ; end ==> (let (block (op= x + 1)) (block))
parse_comma_separated(ps, parse_eq_star)
end
emit(ps, m, K"block")
@@ -2571,7 +2586,7 @@ function parse_import_path(ps::ParseState)
# Modules with operator symbol names
# import .⋆ ==> (import (importpath . ⋆))
bump_trivia(ps)
bump_split(ps, (1,K".",EMPTY_FLAGS), (1,peek(ps),EMPTY_FLAGS))
bump_split(ps, (1,K".",EMPTY_FLAGS), (-1,peek(ps),EMPTY_FLAGS))
else
# import @x ==> (import (importpath @x))
# import $A ==> (import (importpath ($ A)))
@@ -2599,7 +2614,12 @@
warning="space between dots in import path")
end
bump_trivia(ps)
bump_split(ps, (1,K".",TRIVIA_FLAG), (1,k,EMPTY_FLAGS))
m = position(ps)
bump_split(ps, (1,K".",TRIVIA_FLAG), (-1,k,EMPTY_FLAGS))
if is_syntactic_operator(k)
# import A.= ==> (import (importpath A (error =)))
emit(ps, m, K"error", error="syntactic operators not allowed in import")
end
elseif k == K"..."
# Import the .. operator
# import A... ==> (import (importpath A ..))
@@ -3550,13 +3570,13 @@ function parse_atom(ps::ParseState, check_identifiers=true)
bump_dotsplit(ps, emit_dot_node=true, remap_kind=
is_syntactic_operator(leading_kind) ? leading_kind : K"Identifier")
if check_identifiers && !is_valid_identifier(leading_kind)
# += ==> (error +=)
# += ==> (error (op= +))
# ? ==> (error ?)
# .+= ==> (error (. +=))
# .+= ==> (error (. (op= +)))
emit(ps, mark, K"error", error="invalid identifier")
else
# Quoted syntactic operators allowed
# :+= ==> (quote-: +=)
# :+= ==> (quote-: (op= +))
end
elseif is_keyword(leading_kind)
if leading_kind == K"var" && (t = peek_token(ps,2);
41 changes: 21 additions & 20 deletions src/tokenize.jl
@@ -93,6 +93,7 @@ function _nondot_symbolic_operator_kinds()
K"isa"
K"in"
K".'"
K"op="
])
end

@@ -527,14 +528,14 @@ function _next_token(l::Lexer, c)
elseif c == '-'
return lex_minus(l);
elseif c == '−' # \minus '−' treated as hyphen '-'
return emit(l, accept(l, '=') ? K"-=" : K"-")
return emit(l, accept(l, '=') ? K"op=" : K"-")
elseif c == '`'
return lex_backtick(l);
elseif is_identifier_start_char(c)
return lex_identifier(l, c)
elseif isdigit(c)
return lex_digit(l, K"Integer")
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
elseif (k = get(_unicode_ops, c, K"None")) != K"None"
return emit(l, k)
else
emit(l,
@@ -797,12 +798,12 @@ function lex_greater(l::Lexer)
if accept(l, '>')
if accept(l, '>')
if accept(l, '=')
return emit(l, K">>>=")
return emit(l, K"op=")
else # >>>?, ? not a =
return emit(l, K">>>")
end
elseif accept(l, '=')
return emit(l, K">>=")
return emit(l, K"op=")
else
return emit(l, K">>")
end
@@ -819,7 +820,7 @@ end
function lex_less(l::Lexer)
if accept(l, '<')
if accept(l, '=')
return emit(l, K"<<=")
return emit(l, K"op=")
else # '<<?', ? not =, ' '
return emit(l, K"<<")
end
@@ -888,15 +889,15 @@ end

function lex_percent(l::Lexer)
if accept(l, '=')
return emit(l, K"%=")
return emit(l, K"op=")
else
return emit(l, K"%")
end
end

function lex_bar(l::Lexer)
if accept(l, '=')
return emit(l, K"|=")
return emit(l, K"op=")
elseif accept(l, '>')
return emit(l, K"|>")
elseif accept(l, '|')
@@ -910,7 +911,7 @@ function lex_plus(l::Lexer)
if accept(l, '+')
return emit(l, K"++")
elseif accept(l, '=')
return emit(l, K"+=")
return emit(l, K"op=")
end
return emit(l, K"+")
end
@@ -925,7 +926,7 @@
elseif !l.dotop && accept(l, '>')
return emit(l, K"->")
elseif accept(l, '=')
return emit(l, K"-=")
return emit(l, K"op=")
end
return emit(l, K"-")
end
@@ -934,35 +935,35 @@ function lex_star(l::Lexer)
if accept(l, '*')
return emit(l, K"Error**") # "**" is an invalid operator use ^
elseif accept(l, '=')
return emit(l, K"*=")
return emit(l, K"op=")
end
return emit(l, K"*")
end

function lex_circumflex(l::Lexer)
if accept(l, '=')
return emit(l, K"^=")
return emit(l, K"op=")
end
return emit(l, K"^")
end

function lex_division(l::Lexer)
if accept(l, '=')
return emit(l, K"÷=")
return emit(l, K"op=")
end
return emit(l, K"÷")
end

function lex_dollar(l::Lexer)
if accept(l, '=')
return emit(l, K"$=")
return emit(l, K"op=")
end
return emit(l, K"$")
end

function lex_xor(l::Lexer)
if accept(l, '=')
return emit(l, K"=")
return emit(l, K"op=")
end
return emit(l, K"⊻")
end
@@ -1110,7 +1111,7 @@ function lex_amper(l::Lexer)
if accept(l, '&')
return emit(l, K"&&")
elseif accept(l, '=')
return emit(l, K"&=")
return emit(l, K"op=")
else
return emit(l, K"&")
end
@@ -1148,20 +1149,20 @@ end
function lex_forwardslash(l::Lexer)
if accept(l, '/')
if accept(l, '=')
return emit(l, K"//=")
return emit(l, K"op=")
else
return emit(l, K"//")
end
elseif accept(l, '=')
return emit(l, K"/=")
return emit(l, K"op=")
else
return emit(l, K"/")
end
end

function lex_backslash(l::Lexer)
if accept(l, '=')
return emit(l, K"\=")
return emit(l, K"op=")
end
return emit(l, K"\\")
end
@@ -1193,7 +1194,7 @@ function lex_dot(l::Lexer)
elseif pc == '−'
l.dotop = true
readchar(l)
return emit(l, accept(l, '=') ? K"-=" : K"-")
return emit(l, accept(l, '=') ? K"op=" : K"-")
elseif pc =='*'
l.dotop = true
readchar(l)
@@ -1222,7 +1223,7 @@
l.dotop = true
readchar(l)
if accept(l, '=')
return emit(l, K"&=")
return emit(l, K"op=")
else
if accept(l, '&')
return emit(l, K"&&")
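As a quick sanity check of the lexer changes, every spelled-out updating operator now lexes to the single `K"op="` kind. A sketch assuming the package's `tokenize` and `kind` helpers (output shape may differ slightly):

```julia
using JuliaSyntax

# Each updating operator surfaces as one K"op=" token; the parser later splits
# it into the operator identifier plus a trivia `=`.
for src in ["x += 1", "x >>>= 1", "x ⊻= 1"]
    ks = [kind(t) for t in JuliaSyntax.tokenize(src)]
    @assert K"op=" in ks
end
```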
10 changes: 10 additions & 0 deletions test/expr.jl
@@ -501,6 +501,16 @@
@test parsestmt("./x", ignore_errors=true) == Expr(:call, Expr(:error, Expr(:., :/)), :x)
end

@testset "syntactic update-assignment operators" begin
@test parsestmt("x += y") == Expr(:(+=), :x, :y)
@test parsestmt("x .+= y") == Expr(:(.+=), :x, :y)
@test parsestmt(":+=") == QuoteNode(Symbol("+="))
@test parsestmt(":(+=)") == QuoteNode(Symbol("+="))
@test parsestmt(":.+=") == QuoteNode(Symbol(".+="))
@test parsestmt(":(.+=)") == QuoteNode(Symbol(".+="))
@test parsestmt("x \u2212= y") == Expr(:(-=), :x, :y)
end

@testset "let" begin
@test parsestmt("let x=1\n end") ==
Expr(:let, Expr(:(=), :x, 1), Expr(:block, LineNumberNode(2)))