Skip to content

Commit a6b8182

Browse files
clockflyhvanhovell
authored andcommitted
[SPARK-17364][SQL] Antlr lexer wrongly treats full qualified identifier as a decimal number token when parsing SQL string
## What changes were proposed in this pull request? The Antlr lexer we use to tokenize a SQL string may wrongly tokenize a fully qualified identifier as a decimal number token. For example, table identifier `default.123_table` is wrongly tokenized as ``` default // Matches lexer rule IDENTIFIER .123 // Matches lexer rule DECIMAL_VALUE _TABLE // Matches lexer rule IDENTIFIER ``` The correct tokenization for `default.123_table` should be: ``` default // Matches lexer rule IDENTIFIER, . // Matches a single dot 123_TABLE // Matches lexer rule IDENTIFIER ``` This PR fix the Antlr grammar so that it can tokenize fully qualified identifier correctly: 1. Fully qualified table name can be parsed correctly. For example, `select * from database.123_suffix`. 2. Fully qualified column name can be parsed correctly, for example `select a.123_suffix from a`. ### Before change #### Case 1: Failed to parse fully qualified column name ``` scala> spark.sql("select a.123_column from a").show org.apache.spark.sql.catalyst.parser.ParseException: extraneous input '.123' expecting {<EOF>, ... , IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 8) == SQL == select a.123_column from a --------^^^ ``` #### Case 2: Failed to parse fully qualified table name ``` scala> spark.sql("select * from default.123_table") org.apache.spark.sql.catalyst.parser.ParseException: extraneous input '.123' expecting {<EOF>, ... IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 21) == SQL == select * from default.123_table ---------------------^^^ ``` ### After Change #### Case 1: fully qualified column name, no ParseException thrown ``` scala> spark.sql("select a.123_column from a").show ``` #### Case 2: fully qualified table name, no ParseException thrown ``` scala> spark.sql("select * from default.123_table") ``` ## How was this patch tested? Unit test. Author: Sean Zhong <seanzhong@databricks.com> Closes #15006 from clockfly/SPARK-17364.
1 parent fe76739 commit a6b8182

3 files changed

Lines changed: 63 additions & 9 deletions

File tree

sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,30 @@
1616

1717
grammar SqlBase;
1818

19+
@members {
20+
/**
21+
* Verify whether current token is a valid decimal token (which contains dot).
22+
* Returns true if the character that follows the token is not a digit or letter or underscore.
23+
*
24+
* For example:
25+
* For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
26+
* For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
27+
* For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
28+
* For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is folllowed
29+
* by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
30+
* which is not a digit or letter or underscore.
31+
*/
32+
public boolean isValidDecimal() {
33+
int nextChar = _input.LA(1);
34+
if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' ||
35+
nextChar == '_') {
36+
return false;
37+
} else {
38+
return true;
39+
}
40+
}
41+
}
42+
1943
tokens {
2044
DELIMITER
2145
}
@@ -920,23 +944,22 @@ INTEGER_VALUE
920944
;
921945

922946
DECIMAL_VALUE
923-
: DIGIT+ '.' DIGIT*
924-
| '.' DIGIT+
947+
: DECIMAL_DIGITS {isValidDecimal()}?
925948
;
926949

927950
SCIENTIFIC_DECIMAL_VALUE
928-
: DIGIT+ ('.' DIGIT*)? EXPONENT
929-
| '.' DIGIT+ EXPONENT
951+
: DIGIT+ EXPONENT
952+
| DECIMAL_DIGITS EXPONENT {isValidDecimal()}?
930953
;
931954

932955
DOUBLE_LITERAL
933-
:
934-
(INTEGER_VALUE | DECIMAL_VALUE | SCIENTIFIC_DECIMAL_VALUE) 'D'
956+
: DIGIT+ EXPONENT? 'D'
957+
| DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
935958
;
936959

937960
BIGDECIMAL_LITERAL
938-
:
939-
(INTEGER_VALUE | DECIMAL_VALUE | SCIENTIFIC_DECIMAL_VALUE) 'BD'
961+
: DIGIT+ EXPONENT? 'BD'
962+
| DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
940963
;
941964

942965
IDENTIFIER
@@ -947,6 +970,11 @@ BACKQUOTED_IDENTIFIER
947970
: '`' ( ~'`' | '``' )* '`'
948971
;
949972

973+
fragment DECIMAL_DIGITS
974+
: DIGIT+ '.' DIGIT*
975+
| '.' DIGIT+
976+
;
977+
950978
fragment EXPONENT
951979
: 'E' [+-]? DIGIT+
952980
;

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.parser
1818

1919
import java.sql.{Date, Timestamp}
2020

21-
import org.apache.spark.sql.catalyst.FunctionIdentifier
21+
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
2222
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, _}
2323
import org.apache.spark.sql.catalyst.expressions._
2424
import org.apache.spark.sql.catalyst.plans.PlanTest
@@ -518,4 +518,17 @@ class ExpressionParserSuite extends PlanTest {
518518
assertEqual("current_date", CurrentDate())
519519
assertEqual("current_timestamp", CurrentTimestamp())
520520
}
521+
522+
test("SPARK-17364, fully qualified column name which starts with number") {
523+
assertEqual("123_", UnresolvedAttribute("123_"))
524+
assertEqual("1a.123_", UnresolvedAttribute("1a.123_"))
525+
// ".123" should not be treated as token of type DECIMAL_VALUE
526+
assertEqual("a.123A", UnresolvedAttribute("a.123A"))
527+
// ".123E3" should not be treated as token of type SCIENTIFIC_DECIMAL_VALUE
528+
assertEqual("a.123E3_column", UnresolvedAttribute("a.123E3_column"))
529+
// ".123D" should not be treated as token of type DOUBLE_LITERAL
530+
assertEqual("a.123D_column", UnresolvedAttribute("a.123D_column"))
531+
// ".123BD" should not be treated as token of type BIGDECIMAL_LITERAL
532+
assertEqual("a.123BD_column", UnresolvedAttribute("a.123BD_column"))
533+
}
521534
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,17 @@ class TableIdentifierParserSuite extends SparkFunSuite {
9191
assert(TableIdentifier(nonReserved) === parseTableIdentifier(nonReserved))
9292
}
9393
}
94+
95+
test("SPARK-17364 table identifier - contains number") {
96+
assert(parseTableIdentifier("123_") == TableIdentifier("123_"))
97+
assert(parseTableIdentifier("1a.123_") == TableIdentifier("123_", Some("1a")))
98+
// ".123" should not be treated as token of type DECIMAL_VALUE
99+
assert(parseTableIdentifier("a.123A") == TableIdentifier("123A", Some("a")))
100+
// ".123E3" should not be treated as token of type SCIENTIFIC_DECIMAL_VALUE
101+
assert(parseTableIdentifier("a.123E3_LIST") == TableIdentifier("123E3_LIST", Some("a")))
102+
// ".123D" should not be treated as token of type DOUBLE_LITERAL
103+
assert(parseTableIdentifier("a.123D_LIST") == TableIdentifier("123D_LIST", Some("a")))
104+
// ".123BD" should not be treated as token of type BIGDECIMAL_LITERAL
105+
assert(parseTableIdentifier("a.123BD_LIST") == TableIdentifier("123BD_LIST", Some("a")))
106+
}
94107
}

0 commit comments

Comments
 (0)