apache · MaxGekk · Jul 1, 2020 · Jul 1, 2020 · Jul 1, 2020 · Jul 1, 2020
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
@@ -31,7 +31,11 @@ license: |
   - In Spark 3.1, `from_unixtime`, `unix_timestamp`,`to_unix_timestamp`, `to_timestamp` and `to_date` will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, they result `NULL`.
 
   - In Spark 3.1, casting numeric to timestamp will be forbidden by default. It's strongly recommended to use dedicated functions: TIMESTAMP_SECONDS, TIMESTAMP_MILLIS and TIMESTAMP_MICROS. Or you can set `spark.sql.legacy.allowCastNumericToTimestamp` to true to work around it. See more details in SPARK-31710.
-
+
+## Upgrading from Spark SQL 3.0 to 3.0.1
+
+- In Spark 3.0, the JSON datasource and the JSON function `schema_of_json` infer `TimestampType` from string values if they match the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference.
+
 ## Upgrading from Spark SQL 2.4 to 3.0
 
 ### Dataset/DataFrame APIs

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -133,7 +133,7 @@ private[sql] class JSONOptions(
    * Enables inferring of TimestampType from strings matched to the timestamp pattern
    * defined by the timestampFormat option.
    */
-  val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(true)
+  val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false)
 
   /** Build a Jackson [[JsonFactory]] using JSON options. */
   def buildJsonFactory(): JsonFactory = {

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -35,22 +35,29 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
     assert(inferSchema.inferField(parser) === expectedType)
   }
 
-  def checkTimestampType(pattern: String, json: String): Unit = {
-    checkType(Map("timestampFormat" -> pattern), json, TimestampType)
+  def checkTimestampType(pattern: String, json: String, inferTimestamp: Boolean): Unit = {
+    checkType(
+      Map("timestampFormat" -> pattern, "inferTimestamp" -> inferTimestamp.toString),
+      json,
+      if (inferTimestamp) TimestampType else StringType)
   }
 
   test("inferring timestamp type") {
-    Seq("legacy", "corrected").foreach { legacyParserPolicy =>
-      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
-        checkTimestampType("yyyy", """{"a": "2018"}""")
-        checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
-        checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
-        checkTimestampType(
-          "yyyy-MM-dd'T'HH:mm:ss.SSS",
-          """{"a": "2018-12-02T21:04:00.123"}""")
-        checkTimestampType(
-          "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
-          """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+    Seq(true, false).foreach { inferTimestamp =>
+      Seq("legacy", "corrected").foreach { legacyParserPolicy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
+          checkTimestampType("yyyy", """{"a": "2018"}""", inferTimestamp)
+          checkTimestampType("yyyy=MM", """{"a": "2018=12"}""", inferTimestamp)
+          checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""", inferTimestamp)
+          checkTimestampType(
+            "yyyy-MM-dd'T'HH:mm:ss.SSS",
+            """{"a": "2018-12-02T21:04:00.123"}""",
+            inferTimestamp)
+          checkTimestampType(
+            "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+            """{"a": "2018-12-02T21:04:00.123567+01:00"}""",
+            inferTimestamp)
+        }
       }
     }
   }
@@ -71,16 +78,19 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
   }
 
   test("skip decimal type inferring") {
-    Seq("legacy", "corrected").foreach { legacyParserPolicy =>
-      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
-        checkType(
-          options = Map(
-            "prefersDecimal" -> "false",
-            "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
-          ),
-          json = """{"a": "20181202.210400123"}""",
-          dt = TimestampType
-        )
+    Seq(true, false).foreach { inferTimestamp =>
+      Seq("legacy", "corrected").foreach { legacyParserPolicy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
+          checkType(
+            options = Map(
+              "prefersDecimal" -> "false",
+              "timestampFormat" -> "yyyyMMdd.HHmmssSSS",
+              "inferTimestamp" -> inferTimestamp.toString
+            ),
+            json = """{"a": "20181202.210400123"}""",
+            dt = if (inferTimestamp) TimestampType else StringType
+          )
+        }
       }
     }
   }

diff --git a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
@@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-106
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 JSON schema inferring:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       68879          68993         116          1.5         688.8       1.0X
-UTF-8 is set                                     115270         115602         455          0.9        1152.7       0.6X
+No encoding                                       69219          69342         116          1.4         692.2       1.0X
+UTF-8 is set                                     143950         143986          55          0.7        1439.5       0.5X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 count a short column:                     Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       47452          47538         113          2.1         474.5       1.0X
-UTF-8 is set                                      77330          77354          30          1.3         773.3       0.6X
+No encoding                                       57828          57913         136          1.7         578.3       1.0X
+UTF-8 is set                                      83649          83711          60          1.2         836.5       0.7X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 count a wide column:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       60470          60900         534          0.2        6047.0       1.0X
-UTF-8 is set                                     104733         104931         189          0.1       10473.3       0.6X
+No encoding                                       64560          65193        1023          0.2        6456.0       1.0X
+UTF-8 is set                                     102925         103174         216          0.1       10292.5       0.6X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 select wide row:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                      130302         131072         976          0.0      260604.6       1.0X
-UTF-8 is set                                     150860         151284         377          0.0      301720.1       0.9X
+No encoding                                      131002         132316        1160          0.0      262003.1       1.0X
+UTF-8 is set                                     152128         152371         332          0.0      304256.5       0.9X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Select a subset of 10 columns:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Select 10 columns                                 18619          18684          99          0.5        1861.9       1.0X
-Select 1 column                                   24227          24270          38          0.4        2422.7       0.8X
+Select 10 columns                                 19376          19514         160          0.5        1937.6       1.0X
+Select 1 column                                   24089          24156          58          0.4        2408.9       0.8X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 creation of JSON parser per line:         Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Short column without encoding                      7947           7971          21          1.3         794.7       1.0X
-Short column with UTF-8                           12700          12753          58          0.8        1270.0       0.6X
-Wide column without encoding                      92632          92955         463          0.1        9263.2       0.1X
-Wide column with UTF-8                           147013         147170         188          0.1       14701.3       0.1X
+Short column without encoding                      8131           8219         103          1.2         813.1       1.0X
+Short column with UTF-8                           13464          13508          44          0.7        1346.4       0.6X
+Wide column without encoding                     108012         108598         914          0.1       10801.2       0.1X
+Wide column with UTF-8                           150988         151369         412          0.1       15098.8       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 JSON functions:                           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                           713            734          19         14.0          71.3       1.0X
-from_json                                         22019          22429         456          0.5        2201.9       0.0X
-json_tuple                                        27987          28047          74          0.4        2798.7       0.0X
-get_json_object                                   21468          21870         350          0.5        2146.8       0.0X
+Text read                                           753            765          18         13.3          75.3       1.0X
+from_json                                         23182          23446         230          0.4        2318.2       0.0X
+json_tuple                                        31129          31304         181          0.3        3112.9       0.0X
+get_json_object                                   22821          23073         225          0.4        2282.1       0.0X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Dataset of json strings:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          2887           2910          24         17.3          57.7       1.0X
-schema inferring                                  31793          31843          43          1.6         635.9       0.1X
-parsing                                           36791          37104         294          1.4         735.8       0.1X
+Text read                                          3078           3101          26         16.2          61.6       1.0X
+schema inferring                                  30225          30434         333          1.7         604.5       0.1X
+parsing                                           32237          32308          63          1.6         644.7       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Json files in the per-line mode:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                         10570          10611          45          4.7         211.4       1.0X
-Schema inferring                                  48729          48763          41          1.0         974.6       0.2X
-Parsing without charset                           35490          35648         141          1.4         709.8       0.3X
-Parsing with UTF-8                                63853          63994         163          0.8        1277.1       0.2X
+Text read                                         10835          10900          86          4.6         216.7       1.0X
+Schema inferring                                  37720          37805         110          1.3         754.4       0.3X
+Parsing without charset                           35464          35538         100          1.4         709.3       0.3X
+Parsing with UTF-8                                67311          67738         381          0.7        1346.2       0.2X
 
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                     2187           2190           5          4.6         218.7       1.0X
-to_json(timestamp)                                16262          16503         323          0.6        1626.2       0.1X
-write timestamps to files                         11679          11692          12          0.9        1167.9       0.2X
-Create a dataset of dates                          2297           2310          12          4.4         229.7       1.0X
-to_json(date)                                     10904          10956          46          0.9        1090.4       0.2X
-write dates to files                               6610           6645          35          1.5         661.0       0.3X
+Create a dataset of timestamps                     2208           2222          14          4.5         220.8       1.0X
+to_json(timestamp)                                14299          14570         285          0.7        1429.9       0.2X
+write timestamps to files                         12955          12969          13          0.8        1295.5       0.2X
+Create a dataset of dates                          2297           2323          30          4.4         229.7       1.0X
+to_json(date)                                      8509           8561          74          1.2         850.9       0.3X
+write dates to files                               6786           6827          45          1.5         678.6       0.3X
 
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Read dates and timestamps:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                     2524           2530           9          4.0         252.4       1.0X
-read timestamps from files                        41002          41052          59          0.2        4100.2       0.1X
-infer timestamps from files                       84621          84939         526          0.1        8462.1       0.0X
-read date text from files                          2292           2302           9          4.4         229.2       1.1X
-read date from files                              16954          16976          21          0.6        1695.4       0.1X
-timestamp strings                                  3067           3077          13          3.3         306.7       0.8X
-parse timestamps from Dataset[String]             48690          48971         243          0.2        4869.0       0.1X
-infer timestamps from Dataset[String]             97463          97786         338          0.1        9746.3       0.0X
-date strings                                       3952           3956           3          2.5         395.2       0.6X
-parse dates from Dataset[String]                  24210          24241          30          0.4        2421.0       0.1X
-from_json(timestamp)                              71710          72242         629          0.1        7171.0       0.0X
-from_json(date)                                   42465          42481          13          0.2        4246.5       0.1X
+read timestamp text from files                     2598           2613          18          3.8         259.8       1.0X
+read timestamps from files                        42007          42028          19          0.2        4200.7       0.1X
+infer timestamps from files                       18102          18120          28          0.6        1810.2       0.1X
+read date text from files                          2355           2360           5          4.2         235.5       1.1X
+read date from files                              17420          17458          33          0.6        1742.0       0.1X
+timestamp strings                                  3099           3101           3          3.2         309.9       0.8X
+parse timestamps from Dataset[String]             48188          48215          25          0.2        4818.8       0.1X
+infer timestamps from Dataset[String]             22929          22988         102          0.4        2292.9       0.1X
+date strings                                       4090           4103          11          2.4         409.0       0.6X
+parse dates from Dataset[String]                  24952          25068         139          0.4        2495.2       0.1X
+from_json(timestamp)                              66038          66352         413          0.2        6603.8       0.0X
+from_json(date)                                   43755          43782          27          0.2        4375.5       0.1X