diff --git a/src/kinds.jl b/src/kinds.jl index bf837716..f206f372 100644 --- a/src/kinds.jl +++ b/src/kinds.jl @@ -212,6 +212,7 @@ register_kinds!(JuliaSyntax, 0, [ "ErrorOverLongCharacter" "ErrorInvalidUTF8" "ErrorInvisibleChar" + "ErrorIdentifierStart" "ErrorUnknownCharacter" "ErrorBidiFormatting" # Generic error @@ -1175,6 +1176,7 @@ const _token_error_descriptions = Dict{Kind, String}( K"ErrorOverLongCharacter"=>"character literal contains multiple characters", K"ErrorInvalidUTF8"=>"invalid UTF-8 sequence", K"ErrorInvisibleChar"=>"invisible character", + K"ErrorIdentifierStart" => "identifier cannot begin with character", K"ErrorUnknownCharacter"=>"unknown unicode character", K"ErrorBidiFormatting"=>"unbalanced bidirectional unicode formatting", K"ErrorInvalidOperator" => "invalid operator", diff --git a/src/parse_stream.jl b/src/parse_stream.jl index b4ddbeaf..5df40e21 100644 --- a/src/parse_stream.jl +++ b/src/parse_stream.jl @@ -1051,7 +1051,7 @@ function validate_tokens(stream::ParseStream) elseif is_error(k) && k != K"error" # Emit messages for non-generic token errors tokstr = String(txtbuf[tokrange]) - msg = if k in KSet"ErrorInvisibleChar ErrorUnknownCharacter" + msg = if k in KSet"ErrorInvisibleChar ErrorUnknownCharacter ErrorIdentifierStart" "$(_token_error_descriptions[k]) $(repr(tokstr[1]))" elseif k in KSet"ErrorInvalidUTF8 ErrorBidiFormatting" "$(_token_error_descriptions[k]) $(repr(tokstr))" diff --git a/src/tokenize.jl b/src/tokenize.jl index 0f60309f..5a6b4d96 100644 --- a/src/tokenize.jl +++ b/src/tokenize.jl @@ -538,8 +538,9 @@ function _next_token(l::Lexer, c) return emit(l, k) else emit(l, - !isvalid(c) ? K"ErrorInvalidUTF8" : - is_invisible_char(c) ? K"ErrorInvisibleChar" : + !isvalid(c) ? K"ErrorInvalidUTF8" : + is_invisible_char(c) ? K"ErrorInvisibleChar" : + is_identifier_char(c) ? K"ErrorIdentifierStart" : K"ErrorUnknownCharacter") end end diff --git a/test/diagnostics.jl b/test/diagnostics.jl index 1d1f9e5d..ff9e76a2 100644 --- a/test/diagnostics.jl +++ b/test/diagnostics.jl @@ -19,6 +19,7 @@ end @test diagnostic("a$(c)b") == Diagnostic(2, 1+sizeof(string(c)), :error, "invisible character $(repr(c))") end + @test diagnostic("₁") == Diagnostic(1, 3, :error, "identifier cannot begin with character '₁'") @test diagnostic(":⥻") == Diagnostic(2, 4, :error, "unknown unicode character '⥻'") @test diagnostic("\"X \u202a X\"") == Diagnostic(2, 8, :error, "unbalanced bidirectional unicode formatting \"X \\u202a X\"") diff --git a/test/tokenize.jl b/test/tokenize.jl index 0837b9c8..eb30370f 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -1009,6 +1009,7 @@ end @testset "invalid UTF-8 characters" begin @test onlytok("\x00") == K"ErrorUnknownCharacter" + @test onlytok("₁") == K"ErrorIdentifierStart" bad_chars = [ first("\xe2") # malformed