From 6e703015111a934c66c36de978065c5ee746d314 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 08:02:45 -0700 Subject: [PATCH 01/19] [SPARK-31636][SQL][DOCS] Remove HTML syntax in SQL reference --- docs/_data/menu-sql.yaml | 20 +- docs/sql-ref-ansi-compliance.md | 8 +- docs/sql-ref-datatypes.md | 4 +- docs/sql-ref-functions-udf-aggregate.md | 101 +++---- docs/sql-ref-functions-udf-hive.md | 12 +- docs/sql-ref-functions-udf-scalar.md | 28 +- docs/sql-ref-identifier.md | 43 ++- docs/sql-ref-literals.md | 256 ++++++++--------- docs/sql-ref-null-semantics.md | 44 +-- docs/sql-ref-syntax-aux-analyze-table.md | 64 ++--- docs/sql-ref-syntax-aux-cache-cache-table.md | 98 +++---- docs/sql-ref-syntax-aux-cache-clear-cache.md | 16 +- docs/sql-ref-syntax-aux-cache-refresh.md | 24 +- .../sql-ref-syntax-aux-cache-uncache-table.md | 31 +-- docs/sql-ref-syntax-aux-conf-mgmt-reset.md | 10 +- docs/sql-ref-syntax-aux-conf-mgmt-set.md | 31 +-- docs/sql-ref-syntax-aux-describe-database.md | 21 +- docs/sql-ref-syntax-aux-describe-function.md | 30 +- docs/sql-ref-syntax-aux-describe-query.md | 44 ++- docs/sql-ref-syntax-aux-describe-table.md | 62 ++--- docs/sql-ref-syntax-aux-refresh-table.md | 31 +-- ...l-ref-syntax-aux-resource-mgmt-add-file.md | 21 +- ...ql-ref-syntax-aux-resource-mgmt-add-jar.md | 20 +- ...-ref-syntax-aux-resource-mgmt-list-file.md | 14 +- ...l-ref-syntax-aux-resource-mgmt-list-jar.md | 14 +- docs/sql-ref-syntax-aux-show-columns.md | 2 +- docs/sql-ref-syntax-aux-show-create-table.md | 27 +- docs/sql-ref-syntax-aux-show-databases.md | 32 +-- docs/sql-ref-syntax-aux-show-functions.md | 59 ++-- docs/sql-ref-syntax-aux-show-partitions.md | 47 ++-- docs/sql-ref-syntax-aux-show-table.md | 56 ++-- docs/sql-ref-syntax-aux-show-tables.md | 41 ++- docs/sql-ref-syntax-aux-show-tblproperties.md | 51 ++-- docs/sql-ref-syntax-aux-show-views.md | 45 ++- docs/sql-ref-syntax-ddl-alter-database.md | 17 +- docs/sql-ref-syntax-ddl-alter-table.md | 249 +++++++---------- docs/sql-ref-syntax-ddl-alter-view.md | 124 ++++----- docs/sql-ref-syntax-ddl-create-database.md | 39 +-- docs/sql-ref-syntax-ddl-create-function.md | 87 +++--- ...-ref-syntax-ddl-create-table-datasource.md | 100 +++---- ...-ref-syntax-ddl-create-table-hiveformat.md | 99 +++---- docs/sql-ref-syntax-ddl-create-table-like.md | 73 +++-- docs/sql-ref-syntax-ddl-create-table.md | 10 +- docs/sql-ref-syntax-ddl-create-view.md | 82 +++--- docs/sql-ref-syntax-ddl-drop-database.md | 42 ++- docs/sql-ref-syntax-ddl-drop-function.md | 49 ++-- docs/sql-ref-syntax-ddl-drop-table.md | 37 ++- docs/sql-ref-syntax-ddl-drop-view.md | 41 ++- docs/sql-ref-syntax-ddl-repair-table.md | 25 +- docs/sql-ref-syntax-ddl-truncate-table.md | 43 ++- docs/sql-ref-syntax-dml-insert-into.md | 90 +++--- ...tax-dml-insert-overwrite-directory-hive.md | 75 ++--- ...f-syntax-dml-insert-overwrite-directory.md | 74 +++-- ...l-ref-syntax-dml-insert-overwrite-table.md | 87 +++--- docs/sql-ref-syntax-dml-insert.md | 8 +- docs/sql-ref-syntax-dml-load.md | 67 ++--- docs/sql-ref-syntax-dml.md | 4 +- docs/sql-ref-syntax-qry-explain.md | 58 ++-- docs/sql-ref-syntax-qry-sampling.md | 18 +- docs/sql-ref-syntax-qry-select-clusterby.md | 33 +-- docs/sql-ref-syntax-qry-select-cte.md | 35 +-- ...sql-ref-syntax-qry-select-distribute-by.md | 33 +-- docs/sql-ref-syntax-qry-select-groupby.md | 261 +++++++++--------- docs/sql-ref-syntax-qry-select-having.md | 54 ++-- docs/sql-ref-syntax-qry-select-hints.md | 56 ++-- .../sql-ref-syntax-qry-select-inline-table.md | 32 +-- 
docs/sql-ref-syntax-qry-select-join.md | 185 ++++++------- docs/sql-ref-syntax-qry-select-like.md | 51 ++-- docs/sql-ref-syntax-qry-select-limit.md | 39 ++- docs/sql-ref-syntax-qry-select-orderby.md | 82 +++--- docs/sql-ref-syntax-qry-select-setops.md | 26 +- docs/sql-ref-syntax-qry-select-sortby.md | 84 +++--- docs/sql-ref-syntax-qry-select-tvf.md | 35 +-- docs/sql-ref-syntax-qry-select-usedb.md | 23 +- docs/sql-ref-syntax-qry-select-where.md | 37 ++- docs/sql-ref-syntax-qry-select.md | 209 +++++++------- docs/sql-ref-syntax-qry-window.md | 89 +++--- docs/sql-ref-syntax-qry.md | 35 +-- 78 files changed, 1935 insertions(+), 2469 deletions(-) diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index dfe4cfab2a2ab..57fc493dad2f2 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -156,22 +156,22 @@ url: sql-ref-syntax-qry-select-distribute-by.html - text: LIMIT Clause url: sql-ref-syntax-qry-select-limit.html + - text: Common Table Expression + url: sql-ref-syntax-qry-select-cte.html + - text: Inline Table + url: sql-ref-syntax-qry-select-inline-table.html - text: JOIN url: sql-ref-syntax-qry-select-join.html - text: Join Hints url: sql-ref-syntax-qry-select-hints.html + - text: LIKE Predicate + url: sql-ref-syntax-qry-select-like.html - text: Set Operators url: sql-ref-syntax-qry-select-setops.html - text: TABLESAMPLE url: sql-ref-syntax-qry-sampling.html - text: Table-valued Function url: sql-ref-syntax-qry-select-tvf.html - - text: Inline Table - url: sql-ref-syntax-qry-select-inline-table.html - - text: Common Table Expression - url: sql-ref-syntax-qry-select-cte.html - - text: LIKE Predicate - url: sql-ref-syntax-qry-select-like.html - text: Window Function url: sql-ref-syntax-qry-window.html - text: EXPLAIN @@ -213,20 +213,20 @@ subitems: - text: SHOW COLUMNS url: sql-ref-syntax-aux-show-columns.html + - text: SHOW CREATE TABLE + url: sql-ref-syntax-aux-show-create-table.html - text: SHOW DATABASES url: sql-ref-syntax-aux-show-databases.html - text: SHOW FUNCTIONS url: sql-ref-syntax-aux-show-functions.html + - text: SHOW PARTITIONS + url: sql-ref-syntax-aux-show-partitions.html - text: SHOW TABLE url: sql-ref-syntax-aux-show-table.html - text: SHOW TABLES url: sql-ref-syntax-aux-show-tables.html - text: SHOW TBLPROPERTIES url: sql-ref-syntax-aux-show-tblproperties.html - - text: SHOW PARTITIONS - url: sql-ref-syntax-aux-show-partitions.html - - text: SHOW CREATE TABLE - url: sql-ref-syntax-aux-show-create-table.html - text: SHOW VIEWS url: sql-ref-syntax-aux-show-views.html - text: CONFIGURATION MANAGEMENT diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 6cf16535eecf8..b4b307f4ac2a0 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -66,7 +66,7 @@ This means that in case an operation causes overflows, the result is the same wi On the other hand, Spark SQL returns null for decimal overflows. When `spark.sql.ansi.enabled` is set to `true` and an overflow occurs in numeric and interval arithmetic operations, it throws an arithmetic exception at runtime. 
-{% highlight sql %} +```sql -- `spark.sql.ansi.enabled=true` SELECT 2147483647 + 1; java.lang.ArithmeticException: integer overflow @@ -78,7 +78,7 @@ SELECT 2147483647 + 1; +----------------+ | -2147483648| +----------------+ -{% endhighlight %} +``` ### Type Conversion @@ -89,7 +89,7 @@ On the other hand, `INSERT INTO` syntax throws an analysis exception when the AN Currently, the ANSI mode affects explicit casting and assignment casting only. In future releases, the behaviour of type coercion might change along with the other two type conversion rules. -{% highlight sql %} +```sql -- Examples of explicit casting -- `spark.sql.ansi.enabled=true` @@ -130,7 +130,7 @@ SELECT * FROM t; +---+ | 1| +---+ -{% endhighlight %} +``` ### SQL Functions diff --git a/docs/sql-ref-datatypes.md b/docs/sql-ref-datatypes.md index 0d49f6f882228..64c0d0c118a0e 100644 --- a/docs/sql-ref-datatypes.md +++ b/docs/sql-ref-datatypes.md @@ -751,7 +751,7 @@ Specifically: #### Examples -{% highlight sql %} +```sql SELECT double('infinity') AS col; +--------+ | col| @@ -824,4 +824,4 @@ SELECT COUNT(*), c2 FROM test GROUP BY c2; | 2|-Infinity| | 3| Infinity| +---------+---------+ -{% endhighlight %} \ No newline at end of file +``` diff --git a/docs/sql-ref-functions-udf-aggregate.md b/docs/sql-ref-functions-udf-aggregate.md index 3fde94d6bc4bf..da3182149410b 100644 --- a/docs/sql-ref-functions-udf-aggregate.md +++ b/docs/sql-ref-functions-udf-aggregate.md @@ -27,46 +27,35 @@ User-Defined Aggregate Functions (UDAFs) are user-programmable routines that act A base class for user-defined aggregations, which can be used in Dataset operations to take all of the elements of a group and reduce them to a single value. - * IN - The input type for the aggregation. - * BUF - The type of the intermediate value of the reduction. - * OUT - The type of the final output result. + ***IN*** - The input type for the aggregation. + + ***BUF*** - The type of the intermediate value of the reduction. + + ***OUT*** - The type of the final output result. + +* **bufferEncoder: Encoder[BUF]** -

    Specifies the Encoder for the intermediate value type.

* **finish(reduction: BUF): OUT**

    Transform the output of the reduction.

* **merge(b1: BUF, b2: BUF): BUF**

    Merge two intermediate values.

* **outputEncoder: Encoder[OUT]**

    Specifies the Encoder for the final output value type.

* **reduce(b: BUF, a: IN): BUF**

    Aggregate input value `a` into the current intermediate value. For performance, the function may modify `b` and return it instead of constructing a new object for `b`.

* **zero: BUF**

    The initial value of the intermediate result for this aggregation.
### Examples @@ -95,16 +84,16 @@ For example, a user-defined average for untyped DataFrames can look like: {% include_example untyped_custom_aggregation java/org/apache/spark/examples/sql/JavaUserDefinedUntypedAggregation.java%}
```sql
-- Compile and place UDAF MyAverage in a JAR file called `MyAverage.jar` in /tmp.
CREATE FUNCTION myAverage AS 'MyAverage' USING JAR '/tmp/MyAverage.jar';

SHOW USER FUNCTIONS;
+------------------+
|          function|
+------------------+
| default.myAverage|
+------------------+

CREATE TEMPORARY VIEW employees
USING org.apache.spark.sql.json
OPTIONS (
);

SELECT * FROM employees;
+-------+------+
|   name|salary|
+-------+------+
|Michael|  3000|
|   Andy|  4500|
| Justin|  3500|
|  Berta|  4000|
+-------+------+

SELECT myAverage(salary) as average_salary FROM employees;
+--------------+
|average_salary|
+--------------+
|        3750.0|
+--------------+
```
### Related Statements - * [Scalar User Defined Functions (UDFs)](sql-ref-functions-udf-scalar.html) - * [Integration with Hive UDFs/UDAFs/UDTFs](sql-ref-functions-udf-hive.html) +* [Scalar User Defined Functions (UDFs)](sql-ref-functions-udf-scalar.html) +* [Integration with Hive UDFs/UDAFs/UDTFs](sql-ref-functions-udf-hive.html) diff --git a/docs/sql-ref-functions-udf-hive.md b/docs/sql-ref-functions-udf-hive.md index 97d72c44ef016..d3d2a221c94d8 100644 --- a/docs/sql-ref-functions-udf-hive.md +++ b/docs/sql-ref-functions-udf-hive.md @@ -28,7 +28,7 @@ Spark SQL supports integration of Hive UDFs, UDAFs and UDTFs. Similar to Spark U Hive has two UDF interfaces: [UDF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDF.java) and [GenericUDF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). An example below uses [GenericUDFAbs](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAbs.java) derived from `GenericUDF`. -{% highlight sql %} +```sql -- Register `GenericUDFAbs` and use it in Spark SQL. -- Note that, if you use your own programmed one, you need to add a JAR containig it -- into a classpath, @@ -52,12 +52,12 @@ SELECT testUDF(value) FROM t; | 2.0| | 3.0| +--------------+ -{% endhighlight %} +``` An example below uses [GenericUDTFExplode](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFExplode.java) derived from [GenericUDTF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). -{% highlight sql %} +```sql -- Register `GenericUDTFExplode` and use it in Spark SQL CREATE TEMPORARY FUNCTION hiveUDTF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDTFExplode'; @@ -79,12 +79,12 @@ SELECT hiveUDTF(value) FROM t; | 3| | 4| +---+ -{% endhighlight %} +``` Hive has two UDAF interfaces: [UDAF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDAF.java) and [GenericUDAFResolver](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java). An example below uses [GenericUDAFSum](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java) derived from `GenericUDAFResolver`. -{% highlight sql %} +```sql -- Register `GenericUDAFSum` and use it in Spark SQL CREATE TEMPORARY FUNCTION hiveUDAF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum'; @@ -105,4 +105,4 @@ SELECT key, hiveUDAF(value) FROM t GROUP BY key; | b| 3| | a| 3| +---+---------------+ -{% endhighlight %} \ No newline at end of file +``` \ No newline at end of file diff --git a/docs/sql-ref-functions-udf-scalar.md b/docs/sql-ref-functions-udf-scalar.md index 2cb25f275cb59..97f5a89d3fb19 100644 --- a/docs/sql-ref-functions-udf-scalar.md +++ b/docs/sql-ref-functions-udf-scalar.md @@ -26,24 +26,18 @@ User-Defined Functions (UDFs) are user-programmable routines that act on one row ### UserDefinedFunction To define the properties of a user-defined function, the user can use some of the methods defined in this class. -
* **asNonNullable(): UserDefinedFunction**

    Updates UserDefinedFunction to non-nullable.

* **asNondeterministic(): UserDefinedFunction**

    Updates UserDefinedFunction to nondeterministic.

* **withName(name: String): UserDefinedFunction**

    Updates UserDefinedFunction with a given name.
### Examples @@ -57,5 +51,5 @@ To define the properties of a user-defined function, the user can use some of th ### Related Statements - * [User Defined Aggregate Functions (UDAFs)](sql-ref-functions-udf-aggregate.html) - * [Integration with Hive UDFs/UDAFs/UDTFs](sql-ref-functions-udf-hive.html) +* [User Defined Aggregate Functions (UDAFs)](sql-ref-functions-udf-aggregate.html) +* [Integration with Hive UDFs/UDAFs/UDTFs](sql-ref-functions-udf-hive.html) diff --git a/docs/sql-ref-identifier.md b/docs/sql-ref-identifier.md index 89cde21e6fdb6..5b48ece19fb07 100644 --- a/docs/sql-ref-identifier.md +++ b/docs/sql-ref-identifier.md @@ -27,54 +27,47 @@ An identifier is a string used to identify a database object such as a table, vi #### Regular Identifier -{% highlight sql %} +```sql { letter | digit | '_' } [ , ... ] -{% endhighlight %} +``` Note: If `spark.sql.ansi.enabled` is set to true, ANSI SQL reserved keywords cannot be used as identifiers. For more details, please refer to [ANSI Compliance](sql-ref-ansi-compliance.html). #### Delimited Identifier -{% highlight sql %} +```sql `c [ ... ]` -{% endhighlight %} +``` ### Parameters -
* **letter**

    Any letter from A-Z or a-z.

* **digit**

    Any numeral from 0 to 9.

* **c**

    Any character from the character set. Use `` ` `` to escape special characters (e.g., `` ` ``).
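Delimited identifiers can appear anywhere a name is expected, not just in DDL. A minimal sketch, where the database `my-db` and table `my-table` are hypothetical:

```sql
-- Backquotes allow an otherwise-illegal character (here `-`) in names.
SELECT * FROM `my-db`.`my-table`;
```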
### Examples -{% highlight sql %} +```sql -- This CREATE TABLE fails with ParseException because of the illegal identifier name a.b CREATE TABLE test (a.b int); -org.apache.spark.sql.catalyst.parser.ParseException: -no viable alternative at input 'CREATE TABLE test (a.'(line 1, pos 20) + org.apache.spark.sql.catalyst.parser.ParseException: + no viable alternative at input 'CREATE TABLE test (a.'(line 1, pos 20) -- This CREATE TABLE works CREATE TABLE test (`a.b` int); -- This CREATE TABLE fails with ParseException because special character ` is not escaped CREATE TABLE test1 (`a`b` int); -org.apache.spark.sql.catalyst.parser.ParseException: -no viable alternative at input 'CREATE TABLE test (`a`b`'(line 1, pos 23) + org.apache.spark.sql.catalyst.parser.ParseException: + no viable alternative at input 'CREATE TABLE test (`a`b`'(line 1, pos 23) -- This CREATE TABLE works CREATE TABLE test (`a``b` int); -{% endhighlight %} +``` diff --git a/docs/sql-ref-literals.md b/docs/sql-ref-literals.md index 0088f79cb7007..d203f3503ef87 100644 --- a/docs/sql-ref-literals.md +++ b/docs/sql-ref-literals.md @@ -35,22 +35,19 @@ A string literal is used to specify a character string value. #### Syntax -{% highlight sql %} +```sql 'c [ ... ]' | "c [ ... ]" -{% endhighlight %} +``` -#### Parameters +#### Parameters + +* **c** -
+ One character from the character set. Use `\` to escape special characters (e.g., `'` or `\`). -#### Examples +#### Examples -{% highlight sql %} +```sql SELECT 'Hello, World!' AS col; +-------------+ | col| @@ -71,7 +68,7 @@ SELECT 'it\'s $10.' AS col; +---------+ |It's $10.| +---------+ -{% endhighlight %} +``` ### Binary Literal @@ -79,29 +76,26 @@ A binary literal is used to specify a byte sequence value. #### Syntax -{% highlight sql %} +```sql X { 'c [ ... ]' | "c [ ... ]" } -{% endhighlight %} +``` + +#### Parameters -#### Parameters +* **c** -

    One character from the character set.
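Either quote style from the syntax above can delimit the hex characters. A sketch using the double-quoted form (output omitted):

```sql
SELECT X"0abc" AS col;
```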
-#### Examples +#### Examples -{% highlight sql %} +```sql SELECT X'123456' AS col; +----------+ | col| +----------+ |[12 34 56]| +----------+ -{% endhighlight %} +``` ### Null Literal @@ -109,20 +103,20 @@ A null literal is used to specify a null value. #### Syntax -{% highlight sql %} +```sql NULL -{% endhighlight %} +``` #### Examples -{% highlight sql %} +```sql SELECT NULL AS col; +----+ | col| +----+ |NULL| +----+ -{% endhighlight %} +``` ### Boolean Literal @@ -130,20 +124,20 @@ A boolean literal is used to specify a boolean value. #### Syntax -{% highlight sql %} +```sql TRUE | FALSE -{% endhighlight %} +``` #### Examples -{% highlight sql %} +```sql SELECT TRUE AS col; +----+ | col| +----+ |true| +----+ -{% endhighlight %} +``` ### Numeric Literal @@ -153,46 +147,35 @@ A numeric literal is used to specify a fixed or floating-point number. #### Syntax -{% highlight sql %} +```sql [ + | - ] digit [ ... ] [ L | S | Y ] -{% endhighlight %} +``` #### Parameters -
* **digit**

    Any numeral from 0 to 9.

* **L**

    Case insensitive, indicates `BIGINT`, which is an 8-byte signed integer number.

* **S**

    Case insensitive, indicates `SMALLINT`, which is a 2-byte signed integer number.

* **Y**

    Case insensitive, indicates `TINYINT`, which is a 1-byte signed integer number.

* **default (no postfix)**

    Indicates a 4-byte signed integer number.
#### Examples -{% highlight sql %} +```sql SELECT -2147483648 AS col; +-----------+ | col| @@ -220,56 +203,49 @@ SELECT 482S AS col; +---+ |482| +---+ -{% endhighlight %} +``` #### Fractional Literals #### Syntax decimal literals: -{% highlight sql %} +```sql decimal_digits { [ BD ] | [ exponent BD ] } | digit [ ... ] [ exponent ] BD -{% endhighlight %} +``` double literals: -{% highlight sql %} +```sql decimal_digits { D | exponent [ D ] } | digit [ ... ] { exponent [ D ] | [ exponent ] D } -{% endhighlight %} +``` While decimal_digits is defined as -{% highlight sql %} +```sql [ + | - ] { digit [ ... ] . [ digit [ ... ] ] | . digit [ ... ] } -{% endhighlight %} +``` and exponent is defined as -{% highlight sql %} +```sql E [ + | - ] digit [ ... ] -{% endhighlight %} +``` #### Parameters -
* **digit**

    Any numeral from 0 to 9.

* **D**

    Case insensitive, indicates `DOUBLE`, which is an 8-byte double-precision floating point number.

* **BD**

    Case insensitive, indicates `DECIMAL`, with the total number of digits as precision and the number of digits to the right of the decimal point as scale.
+ +* **D** + + Case insensitive, indicates `DOUBLE`, which is a 8-byte double-precision floating point number. + +* **BD** + + Case insensitive, indicates `DECIMAL`, with the total number of digits as precision and the number of digits to right of decimal point as scale. #### Examples -{% highlight sql %} +```sql SELECT 12.578 AS col; +------+ | col| @@ -353,7 +329,7 @@ SELECT -3.E-3D AS col; +------+ |-0.003| +------+ -{% endhighlight %} +``` ### Datetime Literal @@ -363,17 +339,17 @@ A Datetime literal is used to specify a datetime value. #### Syntax -{% highlight sql %} +```sql DATE { 'yyyy' | 'yyyy-[m]m' | 'yyyy-[m]m-[d]d' | 'yyyy-[m]m-[d]d[T]' } -{% endhighlight %} -Note: defaults to 01 if month or day is not specified. +``` +Note: defaults to `01` if month or day is not specified. #### Examples -{% highlight sql %} +```sql SELECT DATE '1997' AS col; +----------+ | col| @@ -394,13 +370,13 @@ SELECT DATE '2011-11-11' AS col; +----------+ |2011-11-11| +----------+ -{% endhighlight %} +``` #### Timestamp Literal #### Syntax -{% highlight sql %} +```sql TIMESTAMP { 'yyyy' | 'yyyy-[m]m' | 'yyyy-[m]m-[d]d' | @@ -409,27 +385,23 @@ TIMESTAMP { 'yyyy' | 'yyyy-[m]m-[d]d[T][h]h:[m]m[:]' | 'yyyy-[m]m-[d]d[T][h]h:[m]m:[s]s[.]' | 'yyyy-[m]m-[d]d[T][h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]'} -{% endhighlight %} -Note: defaults to 00 if hour, minute or second is not specified.

+``` +Note: defaults to `00` if hour, minute or second is not specified. `zone_id` should have one of the forms: - -Note: defaults to the session local timezone (set via spark.sql.session.timeZone) if zone_id is not specified. +* Z - Zulu time zone UTC+0 +* `+|-[h]h:[m]m` +* An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, and a suffix in the formats: + * `+|-h[h]` + * `+|-hh[:]mm` + * `+|-hh:mm:ss` + * `+|-hhmmss` +* Region-based zone IDs in the form `area/city`, such as `Europe/Paris` + +Note: defaults to the session local timezone (set via `spark.sql.session.timeZone`) if `zone_id` is not specified. #### Examples -{% highlight sql %} +```sql SELECT TIMESTAMP '1997-01-31 09:26:56.123' AS col; +-----------------------+ | col| @@ -450,50 +422,40 @@ SELECT TIMESTAMP '1997-01' AS col; +-------------------+ |1997-01-01 00:00:00| +-------------------+ -{% endhighlight %} +``` ### Interval Literal An interval literal is used to specify a fixed period of time. -#### Syntax -{% highlight sql %} -{ INTERVAL interval_value interval_unit [ interval_value interval_unit ... ] | - INTERVAL 'interval_value interval_unit [ interval_value interval_unit ... ]' | - INTERVAL interval_string_value interval_unit TO interval_unit } -{% endhighlight %} +```sql +INTERVAL interval_value interval_unit [ interval_value interval_unit ... ] | +INTERVAL 'interval_value interval_unit [ interval_value interval_unit ... ]' | +INTERVAL interval_string_value interval_unit TO interval_unit +``` #### Parameters -
* **interval_value**

    **Syntax:**

      [ + | - ] number_value | '[ + | - ] number_value'

* **interval_string_value**

    year-month/day-time interval string.

* **interval_unit**

    **Syntax:**

      YEAR[S] | MONTH[S] | WEEK[S] | DAY[S] | HOUR[S] | MINUTE[S] | SECOND[S] |
      MILLISECOND[S] | MICROSECOND[S]
+* **interval_value** + + **Syntax:** + + [ + | - ] number_value | '[ + | - ] number_value' + +* **interval_string_value** + + year-month/day-time interval string. + +* **interval_unit** + + **Syntax:** + + YEAR[S] | MONTH[S] | WEEK[S] | DAY[S] | HOUR[S] | MINUTE[S] | SECOND[S] | + MILLISECOND[S] | MICROSECOND[S] #### Examples -{% highlight sql %} +```sql SELECT INTERVAL 3 YEAR AS col; +-------+ | col| @@ -536,4 +498,4 @@ SELECT INTERVAL '20 15:40:32.99899999' DAY TO SECOND AS col; +---------------------------------------------+ |20 days 15 hours 40 minutes 32.998999 seconds| +---------------------------------------------+ -{% endhighlight %} +``` diff --git a/docs/sql-ref-null-semantics.md b/docs/sql-ref-null-semantics.md index d1f4f17596bd4..b96a02c43bee7 100644 --- a/docs/sql-ref-null-semantics.md +++ b/docs/sql-ref-null-semantics.md @@ -116,7 +116,7 @@ one or both operands are `NULL`: ### Examples -{% highlight sql %} +```sql -- Normal comparison operators return `NULL` when one of the operand is `NULL`. SELECT 5 > null AS expression_output; +-----------------+ @@ -148,7 +148,7 @@ SELECT NULL <=> NULL; +-----------------+ | true| +-----------------+ -{% endhighlight %} +``` ### Logical Operators @@ -209,7 +209,7 @@ The following tables illustrate the behavior of logical operators when one or bo ### Examples -{% highlight sql %} +```sql -- Normal comparison operators return `NULL` when one of the operands is `NULL`. SELECT (true OR null) AS expression_output; +-----------------+ @@ -233,7 +233,7 @@ SELECT NOT(null) AS expression_output; +-----------------+ | null| +-----------------+ -{% endhighlight %} +``` ### Expressions @@ -252,7 +252,7 @@ expression are `NULL` and most of the expressions fall in this category. ##### Examples -{% highlight sql %} +```sql SELECT concat('John', null) AS expression_output; +-----------------+ |expression_output| @@ -273,7 +273,7 @@ SELECT to_date(null) AS expression_output; +-----------------+ | null| +-----------------+ -{% endhighlight %} +``` #### Expressions That Can Process Null Value Operands @@ -296,7 +296,7 @@ returns the first non `NULL` value in its list of operands. However, `coalesce` ##### Examples -{% highlight sql %} +```sql SELECT isnull(null) AS expression_output; +-----------------+ |expression_output| @@ -326,7 +326,7 @@ SELECT isnan(null) AS expression_output; +-----------------+ | false| +-----------------+ -{% endhighlight %} +``` #### Builtin Aggregate Expressions @@ -346,7 +346,7 @@ the rules of how `NULL` values are handled by aggregate functions. #### Examples -{% highlight sql %} +```sql -- `count(*)` does not skip `NULL` values. SELECT count(*) FROM person; +--------+ @@ -387,7 +387,7 @@ SELECT max(age) FROM person where 1 = 0; +--------+ | null| +--------+ -{% endhighlight %} +``` ### Condition Expressions in WHERE, HAVING and JOIN Clauses @@ -398,7 +398,7 @@ For all the three operators, a condition expression is a boolean expression and #### Examples -{% highlight sql %} +```sql -- Persons whose age is unknown (`NULL`) are filtered out from the result set. SELECT * FROM person WHERE age > 0; +--------+---+ @@ -466,7 +466,7 @@ SELECT * FROM person p1, person p2 | Marry|null| Marry|null| | Joe| 30| Joe| 30| +--------+----+--------+----+ -{% endhighlight %} +``` ### Aggregate Operator (GROUP BY, DISTINCT) @@ -477,7 +477,7 @@ standard and with other enterprise database management systems. #### Examples -{% highlight sql %} +```sql -- `NULL` values are put in one bucket in `GROUP BY` processing. 
SELECT age, count(*) FROM person GROUP BY age; +----+--------+ @@ -499,7 +499,7 @@ SELECT DISTINCT age FROM person; | 30| | 18| +----+ -{% endhighlight %} +``` ### Sort Operator (ORDER BY Clause) @@ -509,7 +509,7 @@ the `NULL` values are placed at first. #### Examples -{% highlight sql %} +```sql -- `NULL` values are shown at first and other values -- are sorted in ascending way. SELECT age, name FROM person ORDER BY age; @@ -554,7 +554,7 @@ SELECT age, name FROM person ORDER BY age DESC NULLS LAST; |null| Marry| |null| Albert| +----+--------+ -{% endhighlight %} +``` ### Set Operators (UNION, INTERSECT, EXCEPT) @@ -564,7 +564,7 @@ equal unlike the regular `EqualTo`(`=`) operator. #### Examples -{% highlight sql %} +```sql CREATE VIEW unknown_age SELECT * FROM person WHERE age IS NULL; -- Only common rows between two legs of `INTERSECT` are in the @@ -612,7 +612,7 @@ SELECT name, age FROM person | Mike| 18| | Dan| 50| +--------+----+ -{% endhighlight %} +``` ### EXISTS/NOT EXISTS Subquery @@ -629,7 +629,7 @@ semijoins / anti-semijoins without special provisions for null awareness. #### Examples -{% highlight sql %} +```sql -- Even if subquery produces rows with `NULL` values, the `EXISTS` expression -- evaluates to `TRUE` as the subquery produces 1 row. SELECT * FROM person WHERE EXISTS (SELECT null); @@ -666,7 +666,7 @@ SELECT * FROM person WHERE NOT EXISTS (SELECT 1 WHERE 1 = 0); | Marry|null| | Joe| 30| +--------+----+ -{% endhighlight %} +``` ### IN/NOT IN Subquery @@ -692,7 +692,7 @@ and because NOT UNKNOWN is again UNKNOWN. #### Examples -{% highlight sql %} +```sql -- The subquery has only `NULL` value in its result set. Therefore, -- the result of `IN` predicate is UNKNOWN. SELECT * FROM person WHERE age IN (SELECT null); @@ -721,4 +721,4 @@ SELECT * FROM person |name|age| +----+---+ +----+---+ -{% endhighlight %} +``` diff --git a/docs/sql-ref-syntax-aux-analyze-table.md b/docs/sql-ref-syntax-aux-analyze-table.md index f6a6c5f4bc555..a8e11303432ba 100644 --- a/docs/sql-ref-syntax-aux-analyze-table.md +++ b/docs/sql-ref-syntax-aux-analyze-table.md @@ -25,53 +25,39 @@ The `ANALYZE TABLE` statement collects statistics about the table to be used by ### Syntax -{% highlight sql %} +```sql ANALYZE TABLE table_identifier [ partition_spec ] COMPUTE STATISTICS [ NOSCAN | FOR COLUMNS col [ , ... ] | FOR ALL COLUMNS ] -{% endhighlight %} +``` ### Parameters -
* **table_identifier**

    Specifies a table name, which may be optionally qualified with a database name.

    **Syntax:** `[ database_name. ] table_name`

* **partition_spec**

    An optional parameter that specifies a comma separated list of key and value pairs
    for partitions. When specified, partition statistics is returned.

    **Syntax:** `PARTITION ( partition_col_name [ = partition_col_val ] [ , ... ] )`

* **[ NOSCAN `|` FOR COLUMNS col [ , ... ] `|` FOR ALL COLUMNS ]**

    * If no analyze option is specified, `ANALYZE TABLE` collects the table's number of rows and size in bytes.
    * **NOSCAN**

      Collects only the table's size in bytes (which does not require scanning the entire table).
    * **FOR COLUMNS col [ , ... ] `|` FOR ALL COLUMNS**

      Collects column statistics for each column specified, or alternatively for every column, as well as table statistics (see the sketch after this list).
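As a sketch of the last option, assuming the `students` table created in the examples below, a single statement collects table statistics plus statistics for every column:

```sql
-- Equivalent to listing every column under FOR COLUMNS.
ANALYZE TABLE students COMPUTE STATISTICS FOR ALL COLUMNS;
```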
+ for partitions. When specified, partition statistics is returned. + + **Syntax:** `PARTITION ( partition_col_name [ = partition_col_val ] [ , ... ] )` + +* **[ NOSCAN `|` FOR COLUMNS col [ , ... ] `|` FOR ALL COLUMNS ]** + + * If no analyze option is specified, `ANALYZE TABLE` collects the table's number of rows and size in bytes. + * **NOSCAN** + + Collects only the table's size in bytes ( which does not require scanning the entire table ). + * **FOR COLUMNS col [ , ... ] `|` FOR ALL COLUMNS** + + Collects column statistics for each column specified, or alternatively for every column, as well as table statistics. ### Examples -{% highlight sql %} +```sql CREATE TABLE students (name STRING, student_id INT) PARTITIONED BY (student_id); INSERT INTO students PARTITION (student_id = 111111) VALUES ('Mark'); INSERT INTO students PARTITION (student_id = 222222) VALUES ('John'); @@ -135,4 +121,4 @@ DESC EXTENDED students name; | max_col_len| 4| | histogram| NULL| +--------------+----------+ -{% endhighlight %} +``` diff --git a/docs/sql-ref-syntax-aux-cache-cache-table.md b/docs/sql-ref-syntax-aux-cache-cache-table.md index 11f682cc10891..193e209d792b3 100644 --- a/docs/sql-ref-syntax-aux-cache-cache-table.md +++ b/docs/sql-ref-syntax-aux-cache-cache-table.md @@ -26,71 +26,57 @@ This reduces scanning of the original files in future queries. ### Syntax -{% highlight sql %} +```sql CACHE [ LAZY ] TABLE table_identifier [ OPTIONS ( 'storageLevel' [ = ] value ) ] [ [ AS ] query ] -{% endhighlight %} +``` ### Parameters -
* **LAZY**

    Only cache the table when it is first used, instead of immediately (a sketch of lazy caching follows this list).

* **table_identifier**

    Specifies the table or view name to be cached. The table or view name may be optionally qualified with a database name.

    **Syntax:** `[ database_name. ] table_name`

* **OPTIONS ( 'storageLevel' [ = ] value )**

    `OPTIONS` clause with `storageLevel` key and value pair. A warning is issued when a key other than `storageLevel` is used. The valid options for `storageLevel` are:
    * `NONE`
    * `DISK_ONLY`
    * `DISK_ONLY_2`
    * `MEMORY_ONLY`
    * `MEMORY_ONLY_2`
    * `MEMORY_ONLY_SER`
    * `MEMORY_ONLY_SER_2`
    * `MEMORY_AND_DISK`
    * `MEMORY_AND_DISK_2`
    * `MEMORY_AND_DISK_SER`
    * `MEMORY_AND_DISK_SER_2`
    * `OFF_HEAP`

    An exception is thrown when an invalid value is set for `storageLevel`. If `storageLevel` is not explicitly set using the `OPTIONS` clause, the default `storageLevel` is `MEMORY_AND_DISK`.

* **query**

    A query that produces the rows to be cached. It can be in one of the following formats:
    * a `SELECT` statement
    * a `TABLE` statement
    * a `FROM` statement
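A minimal sketch of lazy caching, assuming an existing table or view named `testData` as in the example below; the default `MEMORY_AND_DISK` storage level applies since no `OPTIONS` clause is given:

```sql
-- Nothing is scanned yet; the cache is populated on first use.
CACHE LAZY TABLE testData;

-- This first query fills the cache as a side effect.
SELECT COUNT(*) FROM testData;
```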
+* **LAZY** + + Only cache the table when it is first used, instead of immediately. + +* **table_identifier** + + Specifies the table or view name to be cached. The table or view name may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **OPTIONS ( 'storageLevel' [ = ] value )** + + `OPTIONS` clause with `storageLevel` key and value pair. A Warning is issued when a key other than `storageLevel` is used. The valid options for `storageLevel` are: + * `NONE` + * `DISK_ONLY` + * `DISK_ONLY_2` + * `MEMORY_ONLY` + * `MEMORY_ONLY_2` + * `MEMORY_ONLY_SER` + * `MEMORY_ONLY_SER_2` + * `MEMORY_AND_DISK` + * `MEMORY_AND_DISK_2` + * `MEMORY_AND_DISK_SER` + * `MEMORY_AND_DISK_SER_2` + * `OFF_HEAP` + + An Exception is thrown when an invalid value is set for `storageLevel`. If `storageLevel` is not explicitly set using `OPTIONS` clause, the default `storageLevel` is set to `MEMORY_AND_DISK`. + +* **query** + + A query that produces the rows to be cached. It can be in one of following formats: + * a `SELECT` statement + * a `TABLE` statement + * a `FROM` statement ### Examples -{% highlight sql %} +```sql CACHE TABLE testCache OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM testData; -{% endhighlight %} +``` ### Related Statements - * [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) - * [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) - * [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) - * [REFRESH](sql-ref-syntax-aux-cache-refresh.html) +* [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) +* [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) +* [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) +* [REFRESH](sql-ref-syntax-aux-cache-refresh.html) diff --git a/docs/sql-ref-syntax-aux-cache-clear-cache.md b/docs/sql-ref-syntax-aux-cache-clear-cache.md index 47889691148b7..ee33e6a98296d 100644 --- a/docs/sql-ref-syntax-aux-cache-clear-cache.md +++ b/docs/sql-ref-syntax-aux-cache-clear-cache.md @@ -25,19 +25,19 @@ license: | ### Syntax -{% highlight sql %} +```sql CLEAR CACHE -{% endhighlight %} +``` ### Examples -{% highlight sql %} +```sql CLEAR CACHE; -{% endhighlight %} +``` ### Related Statements - * [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) - * [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) - * [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) - * [REFRESH](sql-ref-syntax-aux-cache-refresh.html) +* [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) +* [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) +* [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) +* [REFRESH](sql-ref-syntax-aux-cache-refresh.html) diff --git a/docs/sql-ref-syntax-aux-cache-refresh.md b/docs/sql-ref-syntax-aux-cache-refresh.md index 25f7ede1d324e..82bc12da5d1ac 100644 --- a/docs/sql-ref-syntax-aux-cache-refresh.md +++ b/docs/sql-ref-syntax-aux-cache-refresh.md @@ -27,32 +27,30 @@ invalidate everything that is cached. ### Syntax -{% highlight sql %} +```sql REFRESH resource_path -{% endhighlight %} +``` ### Parameters -
* **resource_path**

    The path of the resource that is to be refreshed.
+* **resource_path** + + The path of the resource that is to be refreshed. ### Examples -{% highlight sql %} +```sql -- The Path is resolved using the datasource's File Index. - CREATE TABLE test(ID INT) using parquet; INSERT INTO test SELECT 1000; CACHE TABLE test; INSERT INTO test SELECT 100; REFRESH "hdfs://path/to/table"; -{% endhighlight %} +``` ### Related Statements - * [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) - * [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) - * [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) - * [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) +* [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) +* [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) +* [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) +* [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) diff --git a/docs/sql-ref-syntax-aux-cache-uncache-table.md b/docs/sql-ref-syntax-aux-cache-uncache-table.md index 95fd91c3c4807..c5a8fbbe08281 100644 --- a/docs/sql-ref-syntax-aux-cache-uncache-table.md +++ b/docs/sql-ref-syntax-aux-cache-uncache-table.md @@ -26,32 +26,27 @@ underlying entries should already have been brought to cache by previous `CACHE ### Syntax -{% highlight sql %} +```sql UNCACHE TABLE [ IF EXISTS ] table_identifier -{% endhighlight %} +``` ### Parameters -
* **table_identifier**

    Specifies the table or view name to be uncached. The table or view name may be optionally qualified with a database name.

    **Syntax:** `[ database_name. ] table_name`
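With `IF EXISTS`, no error is reported when the entries do not exist. A sketch, reusing `t1` from the example below:

```sql
-- A no-op if `t1` was never cached.
UNCACHE TABLE IF EXISTS t1;
```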
+* **table_identifier** + + Specifies the table or view name to be uncached. The table or view name may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` ### Examples -{% highlight sql %} +```sql UNCACHE TABLE t1; -{% endhighlight %} +``` ### Related Statements - * [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) - * [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) - * [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) - * [REFRESH](sql-ref-syntax-aux-cache-refresh.html) +* [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) +* [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) +* [REFRESH TABLE](sql-ref-syntax-aux-refresh-table.html) +* [REFRESH](sql-ref-syntax-aux-cache-refresh.html) diff --git a/docs/sql-ref-syntax-aux-conf-mgmt-reset.md b/docs/sql-ref-syntax-aux-conf-mgmt-reset.md index e7e6dda4e25ee..4caf57a232f89 100644 --- a/docs/sql-ref-syntax-aux-conf-mgmt-reset.md +++ b/docs/sql-ref-syntax-aux-conf-mgmt-reset.md @@ -25,17 +25,17 @@ Reset any runtime configurations specific to the current session which were set ### Syntax -{% highlight sql %} +```sql RESET -{% endhighlight %} +``` ### Examples -{% highlight sql %} +```sql -- Reset any runtime configurations specific to the current session which were set via the SET command to their default values. RESET; -{% endhighlight %} +``` ### Related Statements - * [SET](sql-ref-syntax-aux-conf-mgmt-set.html) +* [SET](sql-ref-syntax-aux-conf-mgmt-set.html) diff --git a/docs/sql-ref-syntax-aux-conf-mgmt-set.md b/docs/sql-ref-syntax-aux-conf-mgmt-set.md index 330a1a6a399ff..f97b7f2a8efed 100644 --- a/docs/sql-ref-syntax-aux-conf-mgmt-set.md +++ b/docs/sql-ref-syntax-aux-conf-mgmt-set.md @@ -25,32 +25,29 @@ The SET command sets a property, returns the value of an existing property or re ### Syntax -{% highlight sql %} +```sql SET SET [ -v ] SET property_key[ = property_value ] -{% endhighlight %} +``` ### Parameters -
* **-v**

    Outputs the key, value and meaning of existing SQLConf properties.

* **property_key**

    Returns the value of the specified property key.

* **property_key=property_value**

    Sets the value for a given property key. If an old value exists for a given property key, then it gets overridden by the new value.
+* **property_key** + + Returns the value of specified property key. + +* **property_key=property_value** + + Sets the value for a given property key. If an old value exists for a given property key, then it gets overridden by the new value. ### Examples -{% highlight sql %} +```sql -- Set a property. SET spark.sql.variable.substitute=false; @@ -67,8 +64,8 @@ SET spark.sql.variable.substitute; +-----------------------------+-----+ |spark.sql.variable.substitute|false| +-----------------------------+-----+ -{% endhighlight %} +``` ### Related Statements - * [RESET](sql-ref-syntax-aux-conf-mgmt-reset.html) +* [RESET](sql-ref-syntax-aux-conf-mgmt-reset.html) diff --git a/docs/sql-ref-syntax-aux-describe-database.md b/docs/sql-ref-syntax-aux-describe-database.md index 39a40ddac800f..143fa78b205ca 100644 --- a/docs/sql-ref-syntax-aux-describe-database.md +++ b/docs/sql-ref-syntax-aux-describe-database.md @@ -28,23 +28,20 @@ interchangeable. ### Syntax -{% highlight sql %} +```sql { DESC | DESCRIBE } DATABASE [ EXTENDED ] db_name -{% endhighlight %} +``` ### Parameters -
* **db_name**

    Specifies a name of an existing database or an existing schema in the system. If the name does not exist, an exception is thrown.
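Because `SCHEMA` and `DATABASE` are interchangeable here, the following sketch, reusing the `employees` database from the examples below, is equivalent to `DESC DATABASE employees`:

```sql
DESC SCHEMA employees;
```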
### Examples -{% highlight sql %} +```sql -- Create employees DATABASE CREATE DATABASE employees COMMENT 'For software companies'; @@ -89,10 +86,10 @@ DESC DATABASE deployment; | Description| Deployment environment| | Location|file:/Users/Temp/deployment.db| +-------------------------+------------------------------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) - * [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) - * [DESCRIBE QUERY](sql-ref-syntax-aux-describe-query.html) +* [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) +* [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) +* [DESCRIBE QUERY](sql-ref-syntax-aux-describe-query.html) diff --git a/docs/sql-ref-syntax-aux-describe-function.md b/docs/sql-ref-syntax-aux-describe-function.md index 76c9efad2fa7d..a871fb5bfd406 100644 --- a/docs/sql-ref-syntax-aux-describe-function.md +++ b/docs/sql-ref-syntax-aux-describe-function.md @@ -28,29 +28,24 @@ metadata information is returned along with the extended usage information. ### Syntax -{% highlight sql %} +```sql { DESC | DESCRIBE } FUNCTION [ EXTENDED ] function_name -{% endhighlight %} +``` ### Parameters -
* **function_name**

    Specifies a name of an existing function in the system. The function name may be
    optionally qualified with a database name. If `function_name` is qualified with
    a database then the function is resolved from the user specified database, otherwise
    it is resolved from the current database.

    **Syntax:** `[ database_name. ] function_name`
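A qualified name resolves the function from the named database instead of the current one. A sketch, where `salesdb.custom_min` is a hypothetical user-defined function:

```sql
-- Resolved from `salesdb` regardless of the current database.
DESC FUNCTION salesdb.custom_min;
```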
+ it is resolved from the current database. + + **Syntax:** `[ database_name. ] function_name` ### Examples -{% highlight sql %} +```sql -- Describe a builtin scalar function. -- Returns function name, implementing class and usage DESC FUNCTION abs; @@ -107,11 +102,10 @@ DESC FUNCTION EXTENDED explode | 10 | | 20 | +---------------------------------------------------------------+ - -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) - * [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) - * [DESCRIBE QUERY](sql-ref-syntax-aux-describe-query.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) +* [DESCRIBE QUERY](sql-ref-syntax-aux-describe-query.html) diff --git a/docs/sql-ref-syntax-aux-describe-query.md b/docs/sql-ref-syntax-aux-describe-query.md index 07ac39b4431f4..65e101d3dbf13 100644 --- a/docs/sql-ref-syntax-aux-describe-query.md +++ b/docs/sql-ref-syntax-aux-describe-query.md @@ -27,38 +27,36 @@ describe the query output. ### Syntax -{% highlight sql %} +```sql { DESC | DESCRIBE } [ QUERY ] input_statement -{% endhighlight %} +``` ### Parameters -
* **QUERY**

    This clause is optional and may be omitted.

* **input_statement**

    Specifies a result set producing statement and may be one of the following:

    * a `SELECT` statement
    * a `CTE` (Common table expression) statement (see the sketch after this list)
    * an `INLINE TABLE` statement
    * a `TABLE` statement
    * a `FROM` statement

    Please refer to [select-statement](sql-ref-syntax-qry-select.html)
    for a detailed syntax of the query parameter.
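As a sketch of the CTE form, assuming the same `person` table created in the examples below (the CTE name `adults` is hypothetical):

```sql
-- Describes the output schema of the outer SELECT over the CTE.
DESCRIBE QUERY WITH adults AS (SELECT name, age FROM person WHERE age > 17)
    SELECT * FROM adults;
```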
### Examples -{% highlight sql %} +```sql -- Create table `person` CREATE TABLE person (name STRING , age INT COMMENT 'Age column', address STRING); -- Returns column metadata information for a simple select query -DESCRIBE QUERY select age, sum(age) FROM person GROUP BY age; +DESCRIBE QUERY SELECT age, sum(age) FROM person GROUP BY age; +--------+---------+----------+ |col_name|data_type| comment| +--------+---------+----------+ @@ -103,10 +101,10 @@ DESCRIBE FROM person SELECT age; +--------+---------+----------+ | age| int| Agecolumn| +--------+---------+----------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) - * [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) - * [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) +* [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) diff --git a/docs/sql-ref-syntax-aux-describe-table.md b/docs/sql-ref-syntax-aux-describe-table.md index 63bf056d785cc..4b6e1e8c3461e 100644 --- a/docs/sql-ref-syntax-aux-describe-table.md +++ b/docs/sql-ref-syntax-aux-describe-table.md @@ -28,53 +28,43 @@ to return the metadata pertaining to a partition or column respectively. ### Syntax -{% highlight sql %} +```sql { DESC | DESCRIBE } [ TABLE ] [ format ] table_identifier [ partition_spec ] [ col_name ] -{% endhighlight %} +``` ### Parameters -
* **format**

    Specifies the optional format of describe output. If `EXTENDED` is specified
    then additional metadata information (such as parent database, owner, and access time)
    is returned.

* **table_identifier**

    Specifies a table name, which may be optionally qualified with a database name.

    **Syntax:** `[ database_name. ] table_name`

* **partition_spec**

    An optional parameter that specifies a comma separated list of key and value pairs
    for partitions. When specified, additional partition metadata is returned.

    **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )`

* **col_name**

    An optional parameter that specifies the column name that needs to be described.
    The supplied column name may be optionally qualified. Parameters `partition_spec`
    and `col_name` are mutually exclusive and cannot be specified together. Currently
    nested columns are not allowed to be specified.

    **Syntax:** `[ database_name. ] [ table_name. ] column_name`
+ **Syntax:** `[ database_name. ] [ table_name. ] column_name` ### Examples -{% highlight sql %} +```sql -- Creates a table `customer`. Assumes current database is `salesdb`. CREATE TABLE customer( cust_id INT, @@ -183,10 +173,10 @@ DESCRIBE customer salesdb.customer.name; |data_type| string| | comment|Short name| +---------+----------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) - * [DESCRIBE QUERY](sql-ref-syntax-aux-describe-query.html) - * [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [DESCRIBE QUERY](sql-ref-syntax-aux-describe-query.html) +* [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) diff --git a/docs/sql-ref-syntax-aux-refresh-table.md b/docs/sql-ref-syntax-aux-refresh-table.md index 165ca68309f4a..8d4a804f88671 100644 --- a/docs/sql-ref-syntax-aux-refresh-table.md +++ b/docs/sql-ref-syntax-aux-refresh-table.md @@ -27,26 +27,21 @@ lazy manner when the cached table or the query associated with it is executed ag ### Syntax -{% highlight sql %} +```sql REFRESH [TABLE] table_identifier -{% endhighlight %} +``` ### Parameters -
* **table_identifier**

    Specifies a table name, which is either a qualified or unqualified name that designates a table/view. If no database identifier is provided, it refers to a temporary view or a table/view in the current database.

    **Syntax:** `[ database_name. ] table_name`
+* **table_identifier** + + Specifies a table name, which is either a qualified or unqualified name that designates a table/view. If no database identifier is provided, it refers to a temporary view or a table/view in the current database. + + **Syntax:** `[ database_name. ] table_name` ### Examples -{% highlight sql %} +```sql -- The cached entries of the table will be refreshed -- The table is resolved from the current database as the table name is unqualified. REFRESH TABLE tbl1; @@ -54,11 +49,11 @@ REFRESH TABLE tbl1; -- The cached entries of the view will be refreshed or invalidated -- The view is resolved from tempDB database, as the view name is qualified. REFRESH TABLE tempDB.view1; -{% endhighlight %} +``` ### Related Statements - * [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) - * [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) - * [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) - * [REFRESH](sql-ref-syntax-aux-cache-refresh.html) +* [CACHE TABLE](sql-ref-syntax-aux-cache-cache-table.html) +* [CLEAR CACHE](sql-ref-syntax-aux-cache-clear-cache.html) +* [UNCACHE TABLE](sql-ref-syntax-aux-cache-uncache-table.html) +* [REFRESH](sql-ref-syntax-aux-cache-refresh.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md index 0028884308890..9203293d0c981 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-file.md @@ -25,30 +25,29 @@ license: | ### Syntax -{% highlight sql %} +```sql ADD FILE resource_name -{% endhighlight %} +``` ### Parameters -
* **resource_name**

    The name of the file or directory to be added.
+* **resource_name** + + The name of the file or directory to be added. ### Examples -{% highlight sql %} +```sql ADD FILE /tmp/test; ADD FILE "/path/to/file/abc.txt"; ADD FILE '/another/test.txt'; ADD FILE "/path with space/abc.txt"; ADD FILE "/path/to/some/directory"; -{% endhighlight %} +``` ### Related Statements - * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) - * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) - * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) +* [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) +* [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) +* [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md index c4020347c1be0..264c50a87ea55 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-add-jar.md @@ -25,28 +25,26 @@ license: | ### Syntax -{% highlight sql %} +```sql ADD JAR file_name -{% endhighlight %} +``` ### Parameters -
* **file_name**

    The name of the JAR file to be added. It could be either on a local file system or a distributed file system (a sketch of the latter follows).
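A sketch of adding a JAR from a distributed file system; the HDFS host, port and path below are hypothetical:

```sql
ADD JAR "hdfs://namenode:9000/path/to/udfs.jar";
```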
+* **file_name** + The name of the JAR file to be added. It could be either on a local file system or a distributed file system. ### Examples -{% highlight sql %} +```sql ADD JAR /tmp/test.jar; ADD JAR "/path/to/some.jar"; ADD JAR '/some/other.jar'; ADD JAR "/path with space/abc.jar"; -{% endhighlight %} +``` ### Related Statements - * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) - * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) - * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) +* [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) +* [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) +* [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md b/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md index eec98e1fbffb5..9b9a7df7f612f 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-list-file.md @@ -25,13 +25,13 @@ license: | ### Syntax -{% highlight sql %} +```sql LIST FILE -{% endhighlight %} +``` ### Examples -{% highlight sql %} +```sql ADD FILE /tmp/test; ADD FILE /tmp/test_2; LIST FILE; @@ -42,11 +42,11 @@ file:/private/tmp/test_2 LIST FILE /tmp/test /some/random/file /another/random/file --output file:/private/tmp/test -{% endhighlight %} +``` ### Related Statements - * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) - * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) - * [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) +* [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) +* [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) +* [LIST JAR](sql-ref-syntax-aux-resource-mgmt-list-jar.html) diff --git a/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md b/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md index dca4252c90ef2..04aa52c2ad8af 100644 --- a/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md +++ b/docs/sql-ref-syntax-aux-resource-mgmt-list-jar.md @@ -25,13 +25,13 @@ license: | ### Syntax -{% highlight sql %} +```sql LIST JAR -{% endhighlight %} +``` ### Examples -{% highlight sql %} +```sql ADD JAR /tmp/test.jar; ADD JAR /tmp/test_2.jar; LIST JAR; @@ -42,11 +42,11 @@ spark://192.168.1.112:62859/jars/test_2.jar LIST JAR /tmp/test.jar /some/random.jar /another/random.jar; -- output spark://192.168.1.112:62859/jars/test.jar -{% endhighlight %} +``` ### Related Statements - * [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) - * [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) - * [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) +* [ADD JAR](sql-ref-syntax-aux-resource-mgmt-add-jar.html) +* [ADD FILE](sql-ref-syntax-aux-resource-mgmt-add-file.html) +* [LIST FILE](sql-ref-syntax-aux-resource-mgmt-list-file.html) diff --git a/docs/sql-ref-syntax-aux-show-columns.md b/docs/sql-ref-syntax-aux-show-columns.md index 7229bba23d2bf..b76db252f1a0f 100644 --- a/docs/sql-ref-syntax-aux-show-columns.md +++ b/docs/sql-ref-syntax-aux-show-columns.md @@ -21,7 +21,7 @@ license: | ### Description -Return the list of columns in a table. If the table does not exist, an exception is thrown. +Returns the list of columns in a table. If the table does not exist, an exception is thrown. 
### Syntax diff --git a/docs/sql-ref-syntax-aux-show-create-table.md b/docs/sql-ref-syntax-aux-show-create-table.md index 47a5290f1d022..ae8c10e2d0178 100644 --- a/docs/sql-ref-syntax-aux-show-create-table.md +++ b/docs/sql-ref-syntax-aux-show-create-table.md @@ -25,26 +25,21 @@ license: | ### Syntax -{% highlight sql %} +```sql SHOW CREATE TABLE table_identifier -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table or view name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
+* **table_identifier** + + Specifies a table or view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` ### Examples -{% highlight sql %} +```sql CREATE TABLE test (c INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE TBLPROPERTIES ('prop1' = 'value1', 'prop2' = 'value2'); @@ -60,9 +55,9 @@ SHOW CREATE TABLE test; 'prop1' = 'value1', 'prop2' = 'value2') +----------------------------------------------------+ -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) - * [CREATE VIEW](sql-ref-syntax-ddl-create-view.html) +* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) +* [CREATE VIEW](sql-ref-syntax-ddl-create-view.html) diff --git a/docs/sql-ref-syntax-aux-show-databases.md b/docs/sql-ref-syntax-aux-show-databases.md index c84898aa81459..44c0fbbef3929 100644 --- a/docs/sql-ref-syntax-aux-show-databases.md +++ b/docs/sql-ref-syntax-aux-show-databases.md @@ -21,35 +21,31 @@ license: | ### Description -Lists the databases that match an optionally supplied string pattern. If no +Lists the databases that match an optionally supplied regular expression pattern. If no pattern is supplied then the command lists all the databases in the system. Please note that the usage of `SCHEMAS` and `DATABASES` are interchangeable and mean the same thing. ### Syntax -{% highlight sql %} +```sql SHOW { DATABASES | SCHEMAS } [ LIKE regex_pattern ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>regex_pattern</em></code></dt>
-  <dd>
-    Specifies a regular expression pattern that is used to filter the results of the
-    statement.
-    <ul>
-      <li>Only <code>*</code> and <code>|</code> are allowed as wildcard pattern.</li>
-      <li>Excluding <code>*</code> and <code>|</code>, the remaining pattern follows the regular expression semantics.</li>
-      <li>The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive.</li>
-    </ul>
-  </dd>
-</dl>
+* **regex_pattern**
+
+    Specifies a regular expression pattern that is used to filter the results of the
+    statement.
+ * Except for `*` and `|` character, the pattern works like a regular expression. + * `*` alone matches 0 or more characters and `|` is used to separate multiple different regular expressions, + any of which can match. + * The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive. ### Examples -{% highlight sql %} +```sql -- Create database. Assumes a database named `default` already exists in -- the system. CREATE DATABASE payroll_db; @@ -83,10 +79,10 @@ SHOW SCHEMAS; | payments_db| | payroll_db| +------------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) - * [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) - * [ALTER DATABASE](sql-ref-syntax-ddl-alter-database.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) +* [ALTER DATABASE](sql-ref-syntax-ddl-alter-database.html) diff --git a/docs/sql-ref-syntax-aux-show-functions.md b/docs/sql-ref-syntax-aux-show-functions.md index 8a6de402c7f20..2cfca0f34bf77 100644 --- a/docs/sql-ref-syntax-aux-show-functions.md +++ b/docs/sql-ref-syntax-aux-show-functions.md @@ -9,7 +9,6 @@ license: | The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software @@ -29,48 +28,42 @@ clause is optional and supported only for compatibility with other systems. ### Syntax -{% highlight sql %} -SHOW [ function_kind ] FUNCTIONS ( [ LIKE ] function_name | regex_pattern ) -{% endhighlight %} +```sql +SHOW [ function_kind ] FUNCTIONS [ [ LIKE ] { function_name | regex_pattern } ] +``` ### Parameters -
-<dl>
-  <dt><code><em>function_kind</em></code></dt>
-  <dd>
-    Specifies the name space of the function to be searched upon. The valid name spaces are :
-    <ul>
-      <li><code>USER</code> - Looks up the function(s) among the user defined functions.</li>
-      <li><code>SYSTEM</code> - Looks up the function(s) among the system defined functions.</li>
-      <li><code>ALL</code> - Looks up the function(s) among both user and system defined functions.</li>
-    </ul>
-  </dd>
-  <dt><code><em>function_name</em></code></dt>
-  <dd>
-    Specifies a name of an existing function in the system. The function name may be
-    optionally qualified with a database name. If <code>function_name</code> is qualified with
-    a database then the function is resolved from the user specified database, otherwise
-    it is resolved from the current database.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [database_name.]function_name
-      </code>
-  </dd>
-  <dt><code><em>regex_pattern</em></code></dt>
-  <dd>
-    Specifies a regular expression pattern that is used to filter the results of the
-    statement.
-    <ul>
-      <li>Only <code>*</code> and <code>|</code> are allowed as wildcard pattern.</li>
-      <li>Excluding <code>*</code> and <code>|</code>, the remaining pattern follows the regular expression semantics.</li>
-      <li>The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive.</li>
-    </ul>
-  </dd>
-</dl>
+* **function_kind**
+
+    Specifies the name space of the function to be searched upon. The valid name spaces are :
+
+    * **USER** - Looks up the function(s) among the user defined functions.
+    * **SYSTEM** - Looks up the function(s) among the system defined functions.
+    * **ALL** - Looks up the function(s) among both user and system defined functions.
+
+* **function_name**
+
+    Specifies a name of an existing function in the system. The function name may be
+    optionally qualified with a database name. If `function_name` is qualified with
+    a database then the function is resolved from the user specified database, otherwise
+    it is resolved from the current database.
+
+    **Syntax:** `[database_name.]function_name`
+
+* **regex_pattern**
+
+    Specifies a regular expression pattern that is used to filter the results of the
+    statement.
+ + * Except for `*` and `|` character, the pattern works like a regular expression. + * `*` alone matches 0 or more characters and `|` is used to separate multiple different regular expressions, + any of which can match. + * The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive. ### Examples -{% highlight sql %} +```sql -- List a system function `trim` by searching both user defined and system -- defined functions. SHOW FUNCTIONS trim; @@ -138,8 +131,8 @@ SHOW FUNCTIONS LIKE 't[a-z][a-z][a-z]'; | tanh| | trim| +--------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) +* [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) diff --git a/docs/sql-ref-syntax-aux-show-partitions.md b/docs/sql-ref-syntax-aux-show-partitions.md index 592833b23eb09..f937f8f524342 100644 --- a/docs/sql-ref-syntax-aux-show-partitions.md +++ b/docs/sql-ref-syntax-aux-show-partitions.md @@ -27,37 +27,28 @@ partition spec. ### Syntax -{% highlight sql %} +```sql SHOW PARTITIONS table_identifier [ partition_spec ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
-    An optional parameter that specifies a comma separated list of key and value pairs
-    for partitions. When specified, the partitions that match the partition spec are returned.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name [ = partition_col_val ] [ , ... ] )
-      </code>
-  </dd>
-</dl>
+ for partitions. When specified, the partitions that match the partition spec are returned. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` ### Examples -{% highlight sql %} +```sql -- create a partitioned table and insert a few rows. USE salesdb; CREATE TABLE customer(id INT, name STRING) PARTITIONED BY (state STRING, city STRING); @@ -109,11 +100,11 @@ SHOW PARTITIONS customer PARTITION (city = 'San Jose'); +----------------------+ |state=CA/city=San Jose| +----------------------+ -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) - * [INSERT STATEMENT](sql-ref-syntax-dml-insert.html) - * [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) - * [SHOW TABLE](sql-ref-syntax-aux-show-table.html) +* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) +* [INSERT STATEMENT](sql-ref-syntax-dml-insert.html) +* [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) +* [SHOW TABLE](sql-ref-syntax-aux-show-table.html) diff --git a/docs/sql-ref-syntax-aux-show-table.md b/docs/sql-ref-syntax-aux-show-table.md index 3f588045790b2..6be6e7eff79ca 100644 --- a/docs/sql-ref-syntax-aux-show-table.md +++ b/docs/sql-ref-syntax-aux-show-table.md @@ -32,42 +32,36 @@ cannot be used with a partition specification. ### Syntax -{% highlight sql %} -SHOW TABLE EXTENDED [ IN | FROM database_name ] LIKE regex_pattern +```sql +SHOW TABLE EXTENDED [ { IN | FROM } database_name ] LIKE regex_pattern [ partition_spec ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>IN|FROM database_name</em></code></dt>
-  <dd>
-    Specifies database name. If not provided, will use the current database.
-  </dd>
-  <dt><code><em>regex_pattern</em></code></dt>
-  <dd>
-    Specifies the regular expression pattern that is used to filter out unwanted tables.
-    <ul>
-      <li>Except for <code>*</code> and <code>|</code> character, the pattern works like a regular expression.</li>
-      <li><code>*</code> alone matches 0 or more characters and <code>|</code> is used to separate multiple different regular expressions,
-      any of which can match.</li>
-      <li>The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive.</li>
-    </ul>
-  </dd>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
-    An optional parameter that specifies a comma separated list of key and value pairs
-    for partitions. Note that a table regex cannot be used with a partition specification.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name [ = partition_col_val ] [ , ... ] )
-      </code>
-  </dd>
-</dl>
+ for partitions. Note that a table regex cannot be used with a partition specification. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` ### Examples -{% highlight sql %} +```sql -- Assumes `employee` table created with partitioned by column `grade` CREATE TABLE employee(name STRING, grade INT) PARTITIONED BY (grade); INSERT INTO employee PARTITION (grade = 1) VALUES ('sam'); @@ -152,7 +146,7 @@ SHOW TABLE EXTENDED LIKE `employe*`; +--------+---------+----------+---------------------------------------------------------------+ -- show partition file system details -SHOW TABLE EXTENDED IN `default` LIKE `employee` PARTITION (`grade=1`); +SHOW TABLE EXTENDED IN default LIKE `employee` PARTITION (`grade=1`); +--------+---------+-----------+--------------------------------------------------------------+ |database|tableName|isTemporary| information | +--------+---------+-----------+--------------------------------------------------------------+ @@ -175,12 +169,12 @@ SHOW TABLE EXTENDED IN `default` LIKE `employee` PARTITION (`grade=1`); +--------+---------+-----------+--------------------------------------------------------------+ -- show partition file system details with regex fails as shown below -SHOW TABLE EXTENDED IN `default` LIKE `empl*` PARTITION (`grade=1`); +SHOW TABLE EXTENDED IN default LIKE `empl*` PARTITION (`grade=1`); Error: Error running query: org.apache.spark.sql.catalyst.analysis.NoSuchTableException: Table or view 'emplo*' not found in database 'default'; (state=,code=0) -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) - * [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) +* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) +* [DESCRIBE TABLE](sql-ref-syntax-aux-describe-table.html) diff --git a/docs/sql-ref-syntax-aux-show-tables.md b/docs/sql-ref-syntax-aux-show-tables.md index 62eb3ddb18b5c..fef9722a444f8 100644 --- a/docs/sql-ref-syntax-aux-show-tables.md +++ b/docs/sql-ref-syntax-aux-show-tables.md @@ -28,33 +28,28 @@ current database. ### Syntax -{% highlight sql %} +```sql SHOW TABLES [ { FROM | IN } database_name ] [ LIKE regex_pattern ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>{ FROM | IN } database_name</em></code></dt>
-  <dd>
-    Specifies the database name from which tables are listed.
-  </dd>
-  <dt><code><em>regex_pattern</em></code></dt>
-  <dd>
-    Specifies the regular expression pattern that is used to filter out unwanted tables.
-    <ul>
-      <li>Except for <code>*</code> and <code>|</code> character, the pattern works like a regular expression.</li>
-      <li><code>*</code> alone matches 0 or more characters and <code>|</code> is used to separate multiple different regular expressions,
-      any of which can match.</li>
-      <li>The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive.</li>
-    </ul>
-  </dd>
-</dl>
+ + * Except for `*` and `|` character, the pattern works like a regular expression. + * `*` alone matches 0 or more characters and `|` is used to separate multiple different regular expressions, + any of which can match. + * The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive. ### Examples -{% highlight sql %} +```sql -- List all tables in default database SHOW TABLES; +--------+---------+-----------+ @@ -101,11 +96,11 @@ SHOW TABLES LIKE 'sam*|suj'; | default| sam1| false| | default| suj| false| +--------+---------+-----------+ -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) - * [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) - * [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) - * [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) +* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) +* [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) +* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) +* [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) diff --git a/docs/sql-ref-syntax-aux-show-tblproperties.md b/docs/sql-ref-syntax-aux-show-tblproperties.md index 662aaad069dd9..5b7ddcbcd9534 100644 --- a/docs/sql-ref-syntax-aux-show-tblproperties.md +++ b/docs/sql-ref-syntax-aux-show-tblproperties.md @@ -26,37 +26,30 @@ a property key. If no key is specified then all the properties are returned. ### Syntax -{% highlight sql %} +```sql SHOW TBLPROPERTIES table_identifier [ ( unquoted_property_key | property_key_as_string_literal ) ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies the table name of an existing table. The table may be optionally qualified
-    with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-  <dt><code><em>unquoted_property_key</em></code></dt>
-  <dd>
-    Specifies the property key in unquoted form. The key may consists of multiple
-    parts separated by dot.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ key_part1 ] [ .key_part2 ] [ ... ]
-      </code>
-  </dd>
-  <dt><code><em>property_key_as_string_literal</em></code></dt>
-  <dd>
-    Specifies a property key value as a string literal.
-  </dd>
-</dl>
+* **table_identifier**
+
+    Specifies the table name of an existing table. The table may be optionally qualified
+    with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **unquoted_property_key**
+
+    Specifies the property key in unquoted form. The key may consist of multiple
+    parts separated by dot.
+
+    **Syntax:** `[ key_part1 ] [ .key_part2 ] [ ... ]`
+
+* **property_key_as_string_literal**
+
+    Specifies a property key value as a string literal.
**Note** - Property value returned by this statement excludes some properties @@ -68,7 +61,7 @@ SHOW TBLPROPERTIES table_identifier ### Examples -{% highlight sql %} +```sql -- create a table `customer` in database `salesdb` USE salesdb; CREATE TABLE customer(cust_code INT, name VARCHAR(100), cust_addr STRING) @@ -110,11 +103,11 @@ SHOW TBLPROPERTIES customer ('created.date'); +----------+ |01-01-2001| +----------+ -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) - * [ALTER TABLE SET TBLPROPERTIES](sql-ref-syntax-ddl-alter-table.html) - * [SHOW TABLES](sql-ref-syntax-aux-show-tables.html) - * [SHOW TABLE EXTENDED](sql-ref-syntax-aux-show-table.html) +* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) +* [ALTER TABLE SET TBLPROPERTIES](sql-ref-syntax-ddl-alter-table.html) +* [SHOW TABLES](sql-ref-syntax-aux-show-tables.html) +* [SHOW TABLE EXTENDED](sql-ref-syntax-aux-show-table.html) diff --git a/docs/sql-ref-syntax-aux-show-views.md b/docs/sql-ref-syntax-aux-show-views.md index 29ad6caf140f8..5003c092cabce 100644 --- a/docs/sql-ref-syntax-aux-show-views.md +++ b/docs/sql-ref-syntax-aux-show-views.md @@ -29,30 +29,26 @@ list global temporary views. Note that the command also lists local temporary vi regardless of a given database. ### Syntax -{% highlight sql %} +```sql SHOW VIEWS [ { FROM | IN } database_name ] [ LIKE regex_pattern ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>{ FROM | IN } database_name</em></code></dt>
-  <dd>
-    Specifies the database name from which views are listed.
-  </dd>
-  <dt><code><em>regex_pattern</em></code></dt>
-  <dd>
-    Specifies the regular expression pattern that is used to filter out unwanted views.
-    <ul>
-      <li>Except for <code>*</code> and <code>|</code> character, the pattern works like a regular expression.</li>
-      <li><code>*</code> alone matches 0 or more characters and <code>|</code> is used to separate multiple different regular expressions,
-      any of which can match.</li>
-      <li>The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive.</li>
-    </ul>
-  </dd>
-</dl>
+ + * Except for `*` and `|` character, the pattern works like a regular expression. + * `*` alone matches 0 or more characters and `|` is used to separate multiple different regular expressions, + any of which can match. + * The leading and trailing blanks are trimmed in the input pattern before processing. The pattern match is case-insensitive. ### Examples -{% highlight sql %} +```sql -- Create views in different databases, also create global/local temp views. CREATE VIEW sam AS SELECT id, salary FROM employee WHERE name = 'sam'; CREATE VIEW sam1 AS SELECT id, salary FROM employee WHERE name = 'sam1'; @@ -61,8 +57,8 @@ USE userdb; CREATE VIEW user1 AS SELECT id, salary FROM default.employee WHERE name = 'user1'; CREATE VIEW user2 AS SELECT id, salary FROM default.employee WHERE name = 'user2'; USE default; -CREATE GLOBAL TEMP VIEW temp1 AS SELECT 1 as col1; -CREATE TEMP VIEW temp2 AS SELECT 1 as col1; +CREATE GLOBAL TEMP VIEW temp1 AS SELECT 1 AS col1; +CREATE TEMP VIEW temp2 AS SELECT 1 AS col1; -- List all views in default database SHOW VIEWS; @@ -112,11 +108,10 @@ SHOW VIEWS LIKE 'sam|suj|temp*'; | default | suj | false | | | temp2 | true | +-------------+------------+--------------+ - -{% endhighlight %} +``` ### Related statements -- [CREATE VIEW](sql-ref-syntax-ddl-create-view.html) -- [DROP VIEW](sql-ref-syntax-ddl-drop-view.html) -- [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) -- [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) +* [CREATE VIEW](sql-ref-syntax-ddl-create-view.html) +* [DROP VIEW](sql-ref-syntax-ddl-drop-view.html) +* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) +* [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) diff --git a/docs/sql-ref-syntax-ddl-alter-database.md b/docs/sql-ref-syntax-ddl-alter-database.md index 2d5860c2ea920..fbc454e25fb0c 100644 --- a/docs/sql-ref-syntax-ddl-alter-database.md +++ b/docs/sql-ref-syntax-ddl-alter-database.md @@ -29,21 +29,20 @@ for a database and may be used for auditing purposes. ### Syntax -{% highlight sql %} +```sql ALTER { DATABASE | SCHEMA } database_name SET DBPROPERTIES ( property_name = property_value [ , ... ] ) -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>database_name</em></code></dt>
-  <dd>
-    Specifies the name of the database to be altered.
-  </dd>
-</dl>
+* **database_name** + + Specifies the name of the database to be altered. ### Examples -{% highlight sql %} +```sql -- Creates a database named `inventory`. CREATE DATABASE inventory; @@ -60,8 +59,8 @@ DESCRIBE DATABASE EXTENDED inventory; | Location| file:/temp/spark-warehouse/inventory.db| | Properties|((Edit-date,01/01/2001), (Edited-by,John))| +-------------------------+------------------------------------------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) diff --git a/docs/sql-ref-syntax-ddl-alter-table.md b/docs/sql-ref-syntax-ddl-alter-table.md index f81585fef3aae..dc3f52344c43a 100644 --- a/docs/sql-ref-syntax-ddl-alter-table.md +++ b/docs/sql-ref-syntax-ddl-alter-table.md @@ -29,35 +29,25 @@ license: | #### Syntax -{% highlight sql %} +```sql ALTER TABLE table_identifier RENAME TO table_identifier ALTER TABLE table_identifier partition_spec RENAME TO partition_spec -{% endhighlight %} +``` #### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
-    Partition to be renamed.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
+* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **partition_spec** + + Partition to be renamed. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` ### ADD COLUMNS @@ -65,27 +55,23 @@ ALTER TABLE table_identifier partition_spec RENAME TO partition_spec #### Syntax -{% highlight sql %} +```sql ALTER TABLE table_identifier ADD COLUMNS ( col_spec [ , ... ] ) -{% endhighlight %} +``` #### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>COLUMNS ( col_spec )</em></code></dt>
-  <dd>
-    Specifies the columns to be added.
-  </dd>
-</dl>
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **COLUMNS ( col_spec )**
+
+    Specifies the columns to be added.

### ALTER OR CHANGE COLUMN

#### Syntax

-{% highlight sql %}
+```sql
ALTER TABLE table_identifier { ALTER | CHANGE } [ COLUMN ] col_spec alterColumnAction
-{% endhighlight %}
+```

#### Parameters
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>COLUMN col_spec</em></code></dt>
-  <dd>
-    Specifies the column to be altered or be changed.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>alterColumnAction</em></code></dt>
-  <dd>
-    Change the comment string.<br><br>
-    <b>Syntax:</b>
-      <code>
-        COMMENT STRING
-      </code>
-  </dd>
-</dl>
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **COLUMN col_spec**
+
+    Specifies the column to be altered or be changed.
+
+* **alterColumnAction**
+
+    Change the comment string.
+
+    **Syntax:** `COMMENT STRING`

### ADD AND DROP PARTITION

#### ADD PARTITION

##### Syntax

-{% highlight sql %}
+```sql
ALTER TABLE table_identifier ADD [IF NOT EXISTS]
    ( partition_spec [ partition_spec ... ] )
-{% endhighlight %}
+```

##### Parameters
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
-    Partition to be added.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **partition_spec**
+
+    Partition to be added.
+
+    **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )`

#### DROP PARTITION

##### Syntax

-{% highlight sql %}
+```sql
ALTER TABLE table_identifier DROP [ IF EXISTS ] partition_spec [PURGE]
-{% endhighlight %}
+```

##### Parameters
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
-    Partition to be dropped.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
+* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **partition_spec** + + Partition to be dropped. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` ### SET AND UNSET @@ -208,30 +165,28 @@ this overrides the old value with the new one. ##### Syntax -{% highlight sql %} +```sql -- Set Table Properties ALTER TABLE table_identifier SET TBLPROPERTIES ( key1 = val1, key2 = val2, ... ) -- Unset Table Properties ALTER TABLE table_identifier UNSET TBLPROPERTIES [ IF EXISTS ] ( key1, key2, ... ) -{% endhighlight %} +``` #### SET SERDE -`ALTER TABLE SET` command is used for setting the SERDE or SERDE properties in Hive tables. If a particular property was already set, -this overrides the old value with the new one. +`ALTER TABLE SET` command is used for setting the SERDE or SERDE properties in Hive tables. If a particular property was already set, this overrides the old value with the new one. ##### Syntax -{% highlight sql %} +```sql -- Set SERDE Properties ALTER TABLE table_identifier [ partition_spec ] SET SERDEPROPERTIES ( key1 = val1, key2 = val2, ... ) ALTER TABLE table_identifier [ partition_spec ] SET SERDE serde_class_name [ WITH SERDEPROPERTIES ( key1 = val1, key2 = val2, ... ) ] - -{% endhighlight %} +``` #### SET LOCATION And SET FILE FORMAT @@ -240,46 +195,34 @@ existing tables. ##### Syntax -{% highlight sql %} +```sql -- Changing File Format ALTER TABLE table_identifier [ partition_spec ] SET FILEFORMAT file_format -- Changing File Location ALTER TABLE table_identifier [ partition_spec ] SET LOCATION 'new_location' -{% endhighlight %} +``` #### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
-    Specifies the partition on which the property has to be set.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>SERDEPROPERTIES ( key1 = val1, key2 = val2, ... )</em></code></dt>
-  <dd>
-    Specifies the SERDE properties to be set.
-  </dd>
-</dl>
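A minimal sketch of the `SET SERDE` and `SET LOCATION` forms described above, assuming a Hive table named `test_tab`; the SerDe class shown is a standard Hive one, but the table name and path are illustrative:

```sql
-- Switch the table to the LazySimpleSerDe and set a SerDe property.
ALTER TABLE test_tab
    SET SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
    WITH SERDEPROPERTIES ('field.delim' = ',');

-- Repoint the same table at a new storage location.
ALTER TABLE test_tab SET LOCATION '/tmp/new/location';
```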
+* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + **Syntax:** `[ database_name. ] table_name` + +* **partition_spec** + + Specifies the partition on which the property has to be set. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` + +* **SERDEPROPERTIES ( key1 = val1, key2 = val2, ... )** + + Specifies the SERDE properties to be set. ### Examples -{% highlight sql %} +```sql -- RENAME table DESC student; +-----------------------+---------+-------+ @@ -481,9 +424,9 @@ ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('winner' = 'loser') -- DROP TABLE PROPERTIES ALTER TABLE dbx.tab1 UNSET TBLPROPERTIES ('winner') -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) - * [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) +* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html) +* [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) diff --git a/docs/sql-ref-syntax-ddl-alter-view.md b/docs/sql-ref-syntax-ddl-alter-view.md index c2887692949ea..a34e77decf593 100644 --- a/docs/sql-ref-syntax-ddl-alter-view.md +++ b/docs/sql-ref-syntax-ddl-alter-view.md @@ -29,21 +29,16 @@ Renames the existing view. If the new view name already exists in the source dat does not support moving the views across databases. #### Syntax -{% highlight sql %} +```sql ALTER VIEW view_identifier RENAME TO view_identifier -{% endhighlight %} +``` #### Parameters -
-<dl>
-  <dt><code><em>view_identifier</em></code></dt>
-  <dd>
-    Specifies a view name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] view_name
-      </code>
-  </dd>
-</dl>
+* **view_identifier** + + Specifies a view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] view_name` #### SET View Properties Set one or more properties of an existing view. The properties are the key value pairs. If the properties' keys exist, @@ -51,89 +46,70 @@ the values are replaced with the new values. If the properties' keys do not exis the properties. #### Syntax -{% highlight sql %} +```sql ALTER VIEW view_identifier SET TBLPROPERTIES ( property_key = property_val [ , ... ] ) -{% endhighlight %} +``` #### Parameters -
-<dl>
-  <dt><code><em>view_identifier</em></code></dt>
-  <dd>
-    Specifies a view name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] view_name
-      </code>
-  </dd>
-  <dt><code><em>property_key</em></code></dt>
-  <dd>
-    Specifies the property key. The key may consists of multiple parts separated by dot.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ key_part1 ] [ .key_part2 ] [ ... ]
-      </code>
-  </dd>
-</dl>
+* **view_identifier** + + Specifies a view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] view_name` + +* **property_key** + + Specifies the property key. The key may consists of multiple parts separated by dot. + + **Syntax:** `[ key_part1 ] [ .key_part2 ] [ ... ]` #### UNSET View Properties Drop one or more properties of an existing view. If the specified keys do not exist, an exception is thrown. Use `IF EXISTS` to avoid the exception. #### Syntax -{% highlight sql %} +```sql ALTER VIEW view_identifier UNSET TBLPROPERTIES [ IF EXISTS ] ( property_key [ , ... ] ) -{% endhighlight %} +``` #### Parameters -
-<dl>
-  <dt><code><em>view_identifier</em></code></dt>
-  <dd>
-    Specifies a view name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] view_name
-      </code>
-  </dd>
-  <dt><code><em>property_key</em></code></dt>
-  <dd>
-    Specifies the property key. The key may consists of multiple parts separated by dot.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ key_part1 ] [ .key_part2 ] [ ... ]
-      </code>
-  </dd>
-</dl>
+* **view_identifier** + + Specifies a view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] view_name` + +* **property_key** + + Specifies the property key. The key may consists of multiple parts separated by dot. + + **Syntax:** `[ key_part1 ] [ .key_part2 ] [ ... ]` #### ALTER View AS SELECT `ALTER VIEW view_identifier AS SELECT` statement changes the definition of a view. The `SELECT` statement must be valid, and the `view_identifier` must exist. #### Syntax -{% highlight sql %} +```sql ALTER VIEW view_identifier AS select_statement -{% endhighlight %} +``` Note that `ALTER VIEW` statement does not support `SET SERDE` or `SET SERDEPROPERTIES` properties. #### Parameters -
-<dl>
-  <dt><code><em>view_identifier</em></code></dt>
-  <dd>
-    Specifies a view name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] view_name
-      </code>
-  </dd>
-  <dt><code><em>select_statement</em></code></dt>
-  <dd>
-    Specifies the definition of the view. Check <a href="sql-ref-syntax-qry-select.html">select_statement</a> for details.
-  </dd>
-</dl>
+* **view_identifier** + + Specifies a view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] view_name` + +* **select_statement** + + Specifies the definition of the view. Check [select_statement](sql-ref-syntax-qry-select.html) for details. ### Examples -{% highlight sql %} +```sql -- Rename only changes the view name. -- The source and target databases of the view have to be the same. -- Use qualified or unqualified name for the source and target view. @@ -218,11 +194,11 @@ DESC TABLE EXTENDED tempdb1.v2; | View Text| select * from tempdb1.v1| | | View Original Text| select * from tempdb1.v1| | +----------------------------+---------------------------+-------+ -{% endhighlight %} +``` ### Related Statements - * [describe-table](sql-ref-syntax-aux-describe-table.html) - * [create-view](sql-ref-syntax-ddl-create-view.html) - * [drop-view](sql-ref-syntax-ddl-drop-view.html) - * [show-views](sql-ref-syntax-aux-show-views.html) +* [describe-table](sql-ref-syntax-aux-describe-table.html) +* [create-view](sql-ref-syntax-ddl-create-view.html) +* [drop-view](sql-ref-syntax-ddl-drop-view.html) +* [show-views](sql-ref-syntax-aux-show-views.html) diff --git a/docs/sql-ref-syntax-ddl-create-database.md b/docs/sql-ref-syntax-ddl-create-database.md index 0ef0dfbdaed2b..8c0951253bf37 100644 --- a/docs/sql-ref-syntax-ddl-create-database.md +++ b/docs/sql-ref-syntax-ddl-create-database.md @@ -25,35 +25,38 @@ Creates a database with the specified name. If database with the same name alrea ### Syntax -{% highlight sql %} +```sql CREATE { DATABASE | SCHEMA } [ IF NOT EXISTS ] database_name [ COMMENT database_comment ] [ LOCATION database_directory ] [ WITH DBPROPERTIES ( property_name = property_value [ , ... ] ) ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>database_name</em></code></dt>
-  <dd>Specifies the name of the database to be created.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>IF NOT EXISTS</em></code></dt>
-  <dd>Creates a database with the given name if it doesn't exists. If a database with the same name already exists, nothing will happen.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>database_directory</em></code></dt>
-  <dd>Path of the file system in which the specified database is to be created. If the specified path does not exist in the underlying file system, this command creates a directory with the path. If the location is not specified, the database will be created in the default warehouse directory, whose path is configured by the static configuration <code>spark.sql.warehouse.dir</code>.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>database_comment</em></code></dt>
-  <dd>Specifies the description for the database.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>WITH DBPROPERTIES ( property_name=property_value [ , ... ] )</em></code></dt>
-  <dd>Specifies the properties for the database in key-value pairs.</dd>
-</dl>
+* **database_directory** + + Path of the file system in which the specified database is to be created. If the specified path does not exist in the underlying file system, this command creates a directory with the path. If the location is not specified, the database will be created in the default warehouse directory, whose path is configured by the static configuration spark.sql.warehouse.dir. + +* **database_comment** + + Specifies the description for the database. + +* **WITH DBPROPERTIES ( property_name=property_value [ , ... ] )** + + Specifies the properties for the database in key-value pairs. ### Examples -{% highlight sql %} +```sql -- Create database `customer_db`. This throws exception if database with name customer_db -- already exists. CREATE DATABASE customer_db; @@ -76,9 +79,9 @@ DESCRIBE DATABASE EXTENDED customer_db; | Location| hdfs://hacluster/user| | Properties| ((ID,001), (Name,John))| +-------------------------+--------------------------+ -{% endhighlight %} +``` ### Related Statements - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) - * [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) diff --git a/docs/sql-ref-syntax-ddl-create-function.md b/docs/sql-ref-syntax-ddl-create-function.md index e3f21f70f7c18..e66df5352b1b5 100644 --- a/docs/sql-ref-syntax-ddl-create-function.md +++ b/docs/sql-ref-syntax-ddl-create-function.md @@ -33,68 +33,57 @@ aggregate functions using Scala, Python and Java APIs. Please refer to ### Syntax -{% highlight sql %} +```sql CREATE [ OR REPLACE ] [ TEMPORARY ] FUNCTION [ IF NOT EXISTS ] function_name AS class_name [ resource_locations ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>OR REPLACE</em></code></dt>
-  <dd>
-    If specified, the resources for the function are reloaded. This is mainly useful
-    to pick up any changes made to the implementation of the function. This
-    parameter is mutually exclusive to <code>IF NOT EXISTS</code> and can not
-    be specified together.
-  </dd>
-  <dt><code><em>TEMPORARY</em></code></dt>
-  <dd>
-    Indicates the scope of function being created. When TEMPORARY is specified, the
-    created function is valid and visible in the current session. No persistent
-    entry is made in the catalog for these kind of functions.
-  </dd>
-  <dt><code><em>IF NOT EXISTS</em></code></dt>
-  <dd>
-    If specified, creates the function only when it does not exist. The creation
-    of function succeeds (no error is thrown) if the specified function already
-    exists in the system. This parameter is mutually exclusive to <code>OR REPLACE</code>
-    and can not be specified together.
-  </dd>
-  <dt><code><em>function_name</em></code></dt>
-  <dd>
-    Specifies a name of function to be created. The function name may be
-    optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] function_name
-      </code>
-  </dd>
-  <dt><code><em>class_name</em></code></dt>
-  <dd>
-    Specifies the name of the class that provides the implementation for function
-    to be created. The implementing class should extend one of the base classes as
-    follows:
-    <ul>
-      <li>Should extend <code>UDF</code> or <code>UDAF</code> in <code>org.apache.hadoop.hive.ql.exec</code> package.</li>
-      <li>Should extend <code>AbstractGenericUDAFResolver</code>, <code>GenericUDF</code>, or
-      <code>GenericUDTF</code> in <code>org.apache.hadoop.hive.ql.udf.generic</code> package.</li>
-      <li>Should extend <code>UserDefinedAggregateFunction</code> in <code>org.apache.spark.sql.expressions</code> package.</li>
-    </ul>
-  </dd>
-  <dt><code><em>resource_locations</em></code></dt>
-  <dd>
-    Specifies the list of resources that contain the implementation of the function
-    along with its dependencies.<br><br>
-    <b>Syntax:</b>
-      <code>
-        USING { { (JAR | FILE ) resource_uri } , ... }
-      </code>
-  </dd>
-</dl>
+* **OR REPLACE**
+
+    If specified, the resources for the function are reloaded. This is mainly useful
+    to pick up any changes made to the implementation of the function. This
+    parameter is mutually exclusive to `IF NOT EXISTS` and can not
+    be specified together.
+
+* **TEMPORARY**
+
+    Indicates the scope of function being created. When `TEMPORARY` is specified, the
+    created function is valid and visible in the current session. No persistent
+    entry is made in the catalog for these kind of functions.
+
+* **IF NOT EXISTS**
+
+    If specified, creates the function only when it does not exist. The creation
+    of function succeeds (no error is thrown) if the specified function already
+    exists in the system. This parameter is mutually exclusive to `OR REPLACE`
+    and can not be specified together.
+
+* **function_name**
+
+    Specifies a name of function to be created. The function name may be
+    optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] function_name`
+
+* **class_name**
+
+    Specifies the name of the class that provides the implementation for function
+    to be created. The implementing class should extend one of the base classes as
+    follows:
+
+    * Should extend `UDF` or `UDAF` in `org.apache.hadoop.hive.ql.exec` package.
+    * Should extend `AbstractGenericUDAFResolver`, `GenericUDF`, or
+      `GenericUDTF` in `org.apache.hadoop.hive.ql.udf.generic` package.
+    * Should extend `UserDefinedAggregateFunction` in `org.apache.spark.sql.expressions` package.
+
+* **resource_locations**
+
+    Specifies the list of resources that contain the implementation of the function
+ along with its dependencies. + + **Syntax:** `USING { { (JAR | FILE ) resource_uri } , ... }` ### Examples -{% highlight sql %} +```sql -- 1. Create a simple UDF `SimpleUdf` that increments the supplied integral value by 10. -- import org.apache.hadoop.hive.ql.exec.UDF; -- public class SimpleUdf extends UDF { @@ -166,10 +155,10 @@ SELECT simple_udf(c1) AS function_return_value FROM t1; | 21| | 22| +---------------------+ -{% endhighlight %} +``` ### Related Statements - * [SHOW FUNCTIONS](sql-ref-syntax-aux-show-functions.html) - * [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) - * [DROP FUNCTION](sql-ref-syntax-ddl-drop-function.html) +* [SHOW FUNCTIONS](sql-ref-syntax-aux-show-functions.html) +* [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) +* [DROP FUNCTION](sql-ref-syntax-ddl-drop-function.html) diff --git a/docs/sql-ref-syntax-ddl-create-table-datasource.md b/docs/sql-ref-syntax-ddl-create-table-datasource.md index 54827fd63568d..b592116c2a9e4 100644 --- a/docs/sql-ref-syntax-ddl-create-table-datasource.md +++ b/docs/sql-ref-syntax-ddl-create-table-datasource.md @@ -25,7 +25,7 @@ The `CREATE TABLE` statement defines a new table using a Data Source. ### Syntax -{% highlight sql %} +```sql CREATE TABLE [ IF NOT EXISTS ] table_identifier [ ( col_name1 col_type1 [ COMMENT col_comment1 ], ... ) ] [ USING data_source ] @@ -38,62 +38,52 @@ CREATE TABLE [ IF NOT EXISTS ] table_identifier [ COMMENT table_comment ] [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ] [ AS select_statement ] -{% endhighlight %} +``` Note that, the clauses between the USING clause and the AS SELECT clause can come in as any order. For example, you can write COMMENT table_comment after TBLPROPERTIES. ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>USING data_source</em></code></dt>
-  <dd>Data Source is the input format used to create the table. Data source can be CSV, TXT, ORC, JDBC, PARQUET, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>PARTITIONED BY</em></code></dt>
-  <dd>Partitions are created on the table, based on the columns specified.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>CLUSTERED BY</em></code></dt>
-  <dd>
-    Partitions created on the table will be bucketed into fixed buckets based on the column specified for bucketing.<br><br>
-    <b>NOTE:</b> Bucketing is an optimization technique that uses buckets (and bucketing columns) to determine data partitioning and avoid data shuffle.
-  </dd>
-  <dt><code><em>SORTED BY</em></code></dt>
-  <dd>Determines the order in which the data is stored in buckets. Default is Ascending order.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>LOCATION</em></code></dt>
-  <dd>Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>COMMENT</em></code></dt>
-  <dd>A string literal to describe the table.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>TBLPROPERTIES</em></code></dt>
-  <dd>A list of key-value pairs that is used to tag the table definition.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>AS select_statement</em></code></dt>
-  <dd>The table is populated using the data from the select statement.</dd>
-</dl>
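The examples further down exercise `CLUSTERED BY` without `SORTED BY`, so here is a hedged sketch of the two clauses combined; the table and column names are illustrative:

```sql
-- Bucket rows by id into 4 buckets; rows within each bucket are kept sorted by name.
CREATE TABLE student (id INT, name STRING, age INT)
    USING PARQUET
    CLUSTERED BY (id) SORTED BY (name) INTO 4 BUCKETS;
```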
+* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **USING data_source** + + Data Source is the input format used to create the table. Data source can be CSV, TXT, ORC, JDBC, PARQUET, etc. + +* **PARTITIONED BY** + + Partitions are created on the table, based on the columns specified. + +* **CLUSTERED BY** + + Partitions created on the table will be bucketed into fixed buckets based on the column specified for bucketing. + + **NOTE:** Bucketing is an optimization technique that uses buckets (and bucketing columns) to determine data partitioning and avoid data shuffle. + +* **SORTED BY** + + Determines the order in which the data is stored in buckets. Default is Ascending order. + +* **LOCATION** + + Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. + +* **COMMENT** + + A string literal to describe the table. + +* **TBLPROPERTIES** + + A list of key-value pairs that is used to tag the table definition. + +* **AS select_statement** + + The table is populated using the data from the select statement. ### Data Source Interaction @@ -110,7 +100,7 @@ input query, to make sure the table gets created contains exactly the same data ### Examples -{% highlight sql %} +```sql --Use data source CREATE TABLE student (id INT, name STRING, age INT) USING CSV; @@ -137,9 +127,9 @@ CREATE TABLE student (id INT, name STRING, age INT) USING CSV PARTITIONED BY (age) CLUSTERED BY (Id) INTO 4 buckets; -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE USING HIVE FORMAT](sql-ref-syntax-ddl-create-table-hiveformat.html) - * [CREATE TABLE LIKE](sql-ref-syntax-ddl-create-table-like.html) +* [CREATE TABLE USING HIVE FORMAT](sql-ref-syntax-ddl-create-table-hiveformat.html) +* [CREATE TABLE LIKE](sql-ref-syntax-ddl-create-table-like.html) diff --git a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md index 06f353ad2f103..576d9190f2716 100644 --- a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md +++ b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md @@ -25,7 +25,7 @@ The `CREATE TABLE` statement defines a new table using Hive format. ### Syntax -{% highlight sql %} +```sql CREATE [ EXTERNAL ] TABLE [ IF NOT EXISTS ] table_identifier [ ( col_name1[:] col_type1 [ COMMENT col_comment1 ], ... ) ] [ COMMENT table_comment ] @@ -36,67 +36,54 @@ CREATE [ EXTERNAL ] TABLE [ IF NOT EXISTS ] table_identifier [ LOCATION path ] [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ] [ AS select_statement ] -{% endhighlight %} +``` Note that, the clauses between the columns definition clause and the AS SELECT clause can come in as any order. For example, you can write COMMENT table_comment after TBLPROPERTIES. ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>EXTERNAL</em></code></dt>
-  <dd>Table is defined using the path provided as LOCATION, does not use default location for this table.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>PARTITIONED BY</em></code></dt>
-  <dd>Partitions are created on the table, based on the columns specified.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>ROW FORMAT</em></code></dt>
-  <dd>SERDE is used to specify a custom SerDe or the DELIMITED clause in order to use the native SerDe.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>STORED AS</em></code></dt>
-  <dd>File format for table storage, could be TEXTFILE, ORC, PARQUET, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>LOCATION</em></code></dt>
-  <dd>Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>COMMENT</em></code></dt>
-  <dd>A string literal to describe the table.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>TBLPROPERTIES</em></code></dt>
-  <dd>A list of key-value pairs that is used to tag the table definition.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>AS select_statement</em></code></dt>
-  <dd>The table is populated using the data from the select statement.</dd>
-</dl>
+* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **EXTERNAL** + + Table is defined using the path provided as LOCATION, does not use default location for this table. + +* **PARTITIONED BY** + + Partitions are created on the table, based on the columns specified. + +* **ROW FORMAT** + + SERDE is used to specify a custom SerDe or the DELIMITED clause in order to use the native SerDe. + +* **STORED AS** + + File format for table storage, could be TEXTFILE, ORC, PARQUET,etc. + +* **LOCATION** + + Path to the directory where table data is stored, Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. + +* **COMMENT** + + A string literal to describe the table. + +* **TBLPROPERTIES** + + A list of key-value pairs that is used to tag the table definition. + +* **AS select_statement** + + The table is populated using the data from the select statement. ### Examples -{% highlight sql %} +```sql --Use hive format CREATE TABLE student (id INT, name STRING, age INT) STORED AS ORC; @@ -130,9 +117,9 @@ CREATE TABLE student (id INT, name STRING) CREATE TABLE student (id INT,name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE; -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE USING DATASOURCE](sql-ref-syntax-ddl-create-table-datasource.html) - * [CREATE TABLE LIKE](sql-ref-syntax-ddl-create-table-like.html) +* [CREATE TABLE USING DATASOURCE](sql-ref-syntax-ddl-create-table-datasource.html) +* [CREATE TABLE LIKE](sql-ref-syntax-ddl-create-table-like.html) diff --git a/docs/sql-ref-syntax-ddl-create-table-like.md b/docs/sql-ref-syntax-ddl-create-table-like.md index fe1dc4b1ef258..a374c554bd179 100644 --- a/docs/sql-ref-syntax-ddl-create-table-like.md +++ b/docs/sql-ref-syntax-ddl-create-table-like.md @@ -25,57 +25,46 @@ The `CREATE TABLE` statement defines a new table using the definition/metadata o ### Syntax -{% highlight sql %} +```sql CREATE TABLE [IF NOT EXISTS] table_identifier LIKE source_table_identifier USING data_source [ ROW FORMAT row_format ] [ STORED AS file_format ] [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ] [ LOCATION path ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>USING data_source</em></code></dt>
-  <dd>Data Source is the input format used to create the table. Data source can be CSV, TXT, ORC, JDBC, PARQUET, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>ROW FORMAT</em></code></dt>
-  <dd>SERDE is used to specify a custom SerDe or the DELIMITED clause in order to use the native SerDe.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>STORED AS</em></code></dt>
-  <dd>File format for table storage, could be TEXTFILE, ORC, PARQUET, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>TBLPROPERTIES</em></code></dt>
-  <dd>Table properties that have to be set are specified, such as <code>created.by.user</code>, <code>owner</code>, etc.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>LOCATION</em></code></dt>
-  <dd>Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. Location to create an external table.</dd>
-</dl>
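The `LOCATION` clause above can be sketched as follows, assuming the `Student` table from the examples and an illustrative path:

```sql
-- Copies Student's definition; the copy is an external table whose data lives at the given path.
CREATE TABLE Student_Dupli LIKE Student LOCATION '/tmp/data/student_dupli';
```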
+* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **USING data_source** + + Data Source is the input format used to create the table. Data source can be CSV, TXT, ORC, JDBC, PARQUET, etc. + +* **ROW FORMAT** + + SERDE is used to specify a custom SerDe or the DELIMITED clause in order to use the native SerDe. + +* **STORED AS** + + File format for table storage, could be TEXTFILE, ORC, PARQUET,etc. + +* **TBLPROPERTIES** + + Table properties that have to be set are specified, such as `created.by.user`, `owner`, etc. + +* **LOCATION** + + Path to the directory where table data is stored,Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. Location to create an external table. ### Examples -{% highlight sql %} +```sql -- Create table using an existing table CREATE TABLE Student_Dupli like Student; @@ -90,10 +79,10 @@ CREATE TABLE Student_Dupli like Student ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE TBLPROPERTIES ('owner'='xxxx'); -{% endhighlight %} +``` ### Related Statements - * [CREATE TABLE USING DATASOURCE](sql-ref-syntax-ddl-create-table-datasource.html) - * [CREATE TABLE USING HIVE FORMAT](sql-ref-syntax-ddl-create-table-hiveformat.html) +* [CREATE TABLE USING DATASOURCE](sql-ref-syntax-ddl-create-table-datasource.html) +* [CREATE TABLE USING HIVE FORMAT](sql-ref-syntax-ddl-create-table-hiveformat.html) diff --git a/docs/sql-ref-syntax-ddl-create-table.md b/docs/sql-ref-syntax-ddl-create-table.md index b0388adbc9a38..85dc2020e6585 100644 --- a/docs/sql-ref-syntax-ddl-create-table.md +++ b/docs/sql-ref-syntax-ddl-create-table.md @@ -25,11 +25,11 @@ license: | The CREATE statements: - * [CREATE TABLE USING DATA_SOURCE](sql-ref-syntax-ddl-create-table-datasource.html) - * [CREATE TABLE USING HIVE FORMAT](sql-ref-syntax-ddl-create-table-hiveformat.html) - * [CREATE TABLE LIKE](sql-ref-syntax-ddl-create-table-like.html) +* [CREATE TABLE USING DATA_SOURCE](sql-ref-syntax-ddl-create-table-datasource.html) +* [CREATE TABLE USING HIVE FORMAT](sql-ref-syntax-ddl-create-table-hiveformat.html) +* [CREATE TABLE LIKE](sql-ref-syntax-ddl-create-table-like.html) ### Related Statements - * [ALTER TABLE](sql-ref-syntax-ddl-alter-table.html) - * [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) +* [ALTER TABLE](sql-ref-syntax-ddl-alter-table.html) +* [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) diff --git a/docs/sql-ref-syntax-ddl-create-view.md b/docs/sql-ref-syntax-ddl-create-view.md index ba8c1df1223a3..032bcbcf19ad3 100644 --- a/docs/sql-ref-syntax-ddl-create-view.md +++ b/docs/sql-ref-syntax-ddl-create-view.md @@ -27,55 +27,47 @@ a virtual table that has no physical data therefore other operations like ### Syntax -{% highlight sql %} +```sql CREATE [ OR REPLACE ] [ [ GLOBAL ] TEMPORARY ] VIEW [ IF NOT EXISTS ] view_identifier create_view_clauses AS query -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>OR REPLACE</em></code></dt>
-  <dd>If a view of same name already exists, it will be replaced.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>[ GLOBAL ] TEMPORARY</em></code></dt>
-  <dd>TEMPORARY views are session-scoped and will be dropped when session ends
-  because it skips persisting the definition in the underlying metastore, if any.
-  GLOBAL TEMPORARY views are tied to a system preserved temporary database <code>global_temp</code>.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>IF NOT EXISTS</em></code></dt>
-  <dd>Creates a view if it does not exists.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>view_identifier</em></code></dt>
-  <dd>
-    Specifies a view name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] view_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>create_view_clauses</em></code></dt>
-  <dd>These clauses are optional and order insensitive. It can be of following formats.
-    <ul>
-      <li><code>[ ( column_name [ COMMENT column_comment ], ... ) ]</code> to specify column-level comments.</li>
-      <li><code>[ COMMENT view_comment ]</code> to specify view-level comments.</li>
-      <li><code>[ TBLPROPERTIES ( property_name = property_value [ , ... ] ) ]</code> to add metadata key-value pairs.</li>
-    </ul>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>query</em></code></dt>
-  <dd>A SELECT statement that constructs the view from base tables or other views.</dd>
-</dl>
+* **OR REPLACE** + + If a view of same name already exists, it will be replaced. + +* **[ GLOBAL ] TEMPORARY** + + TEMPORARY views are session-scoped and will be dropped when session ends + because it skips persisting the definition in the underlying metastore, if any. + GLOBAL TEMPORARY views are tied to a system preserved temporary database `global_temp`. + +* **IF NOT EXISTS** + + Creates a view if it does not exists. + +* **view_identifier** + + Specifies a view name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] view_name` + +* **create_view_clauses** + + These clauses are optional and order insensitive. It can be of following formats. + + * `[ ( column_name [ COMMENT column_comment ], ... ) ]` to specify column-level comments. + * `[ COMMENT view_comment ]` to specify view-level comments. + * `[ TBLPROPERTIES ( property_name = property_value [ , ... ] ) ]` to add metadata key-value pairs. + +* **query** + A [SELECT](sql-ref-syntax-qry-select.html) statement that constructs the view from base tables or other views. ### Examples -{% highlight sql %} +```sql -- Create or replace view for `experienced_employee` with comments. CREATE OR REPLACE VIEW experienced_employee (ID COMMENT 'Unique identification number', Name) @@ -88,10 +80,10 @@ CREATE GLOBAL TEMPORARY VIEW IF NOT EXISTS subscribed_movies AS SELECT mo.member_id, mb.full_name, mo.movie_title FROM movies AS mo INNER JOIN members AS mb ON mo.member_id = mb.id; -{% endhighlight %} +``` ### Related Statements - * [ALTER VIEW](sql-ref-syntax-ddl-alter-view.html) - * [DROP VIEW](sql-ref-syntax-ddl-drop-view.html) - * [SHOW VIEWS](sql-ref-syntax-aux-show-views.html) +* [ALTER VIEW](sql-ref-syntax-ddl-alter-view.html) +* [DROP VIEW](sql-ref-syntax-ddl-drop-view.html) +* [SHOW VIEWS](sql-ref-syntax-aux-show-views.html) diff --git a/docs/sql-ref-syntax-ddl-drop-database.md b/docs/sql-ref-syntax-ddl-drop-database.md index 7467e7a4ad6e7..4a3bc0c68b6d4 100644 --- a/docs/sql-ref-syntax-ddl-drop-database.md +++ b/docs/sql-ref-syntax-ddl-drop-database.md @@ -26,35 +26,31 @@ exception will be thrown if the database does not exist in the system. ### Syntax -{% highlight sql %} +```sql DROP { DATABASE | SCHEMA } [ IF EXISTS ] dbname [ RESTRICT | CASCADE ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>DATABASE | SCHEMA</em></code></dt>
-  <dd>DATABASE and SCHEMA mean the same thing, either of them can be used.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>IF EXISTS</em></code></dt>
-  <dd>If specified, no exception is thrown when the database does not exist.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>RESTRICT</em></code></dt>
-  <dd>If specified, will restrict dropping a non-empty database and is enabled by default.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>CASCADE</em></code></dt>
-  <dd>If specified, will drop all the associated tables and functions.</dd>
-</dl>
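A minimal sketch of the default `RESTRICT` behavior described above, reusing the `inventory_db` database from the examples:

```sql
-- RESTRICT is the default: this fails with an exception if inventory_db still contains tables.
DROP DATABASE inventory_db RESTRICT;
```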
+ If specified, no exception is thrown when the database does not exist. + +* **RESTRICT** + + If specified, will restrict dropping a non-empty database and is enabled by default. + +* **CASCADE** + + If specified, will drop all the associated tables and functions. ### Examples -{% highlight sql %} +```sql -- Create `inventory_db` Database CREATE DATABASE inventory_db COMMENT 'This database is used to maintain Inventory'; @@ -63,10 +59,10 @@ DROP DATABASE inventory_db CASCADE; -- Drop the database using IF EXISTS DROP DATABASE IF EXISTS inventory_db CASCADE; -{% endhighlight %} +``` ### Related Statements - * [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) - * [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) - * [SHOW DATABASES](sql-ref-syntax-aux-show-databases.html) +* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) +* [DESCRIBE DATABASE](sql-ref-syntax-aux-describe-database.html) +* [SHOW DATABASES](sql-ref-syntax-aux-show-databases.html) diff --git a/docs/sql-ref-syntax-ddl-drop-function.md b/docs/sql-ref-syntax-ddl-drop-function.md index 66a405c24e413..b1b2ff9b1bb21 100644 --- a/docs/sql-ref-syntax-ddl-drop-function.md +++ b/docs/sql-ref-syntax-ddl-drop-function.md @@ -26,39 +26,32 @@ be thrown if the function does not exist. ### Syntax -{% highlight sql %} +```sql DROP [ TEMPORARY ] FUNCTION [ IF EXISTS ] function_name -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>function_name</em></code></dt>
-  <dd>
-    Specifies the name of an existing function. The function name may be
-    optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] function_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>TEMPORARY</em></code></dt>
-  <dd>Should be used to delete the TEMPORARY function.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>IF EXISTS</em></code></dt>
-  <dd>If specified, no exception is thrown when the function does not exist.</dd>
-</dl>
+ optionally qualified with a database name. + + **Syntax:** `[ database_name. ] function_name` + +* **TEMPORARY** + + Should be used to delete the `TEMPORARY` function. + +* **IF EXISTS** + + If specified, no exception is thrown when the function does not exist. ### Examples -{% highlight sql %} +```sql -- Create a permanent function `test_avg` -CREATE FUNCTION test_avg as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFAverage'; +CREATE FUNCTION test_avg AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFAverage'; -- List user functions SHOW USER FUNCTIONS; @@ -100,10 +93,10 @@ SHOW USER FUNCTIONS; -- Drop Temporary function DROP TEMPORARY FUNCTION IF EXISTS test_avg; -{% endhighlight %} +``` ### Related Statements - * [CREATE FUNCTION](sql-ref-syntax-ddl-create-function.html) - * [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) - * [SHOW FUNCTION](sql-ref-syntax-aux-show-functions.html) +* [CREATE FUNCTION](sql-ref-syntax-ddl-create-function.html) +* [DESCRIBE FUNCTION](sql-ref-syntax-aux-describe-function.html) +* [SHOW FUNCTION](sql-ref-syntax-aux-show-functions.html) diff --git a/docs/sql-ref-syntax-ddl-drop-table.md b/docs/sql-ref-syntax-ddl-drop-table.md index c943b922ae812..f2ff89993f2c3 100644 --- a/docs/sql-ref-syntax-ddl-drop-table.md +++ b/docs/sql-ref-syntax-ddl-drop-table.md @@ -28,30 +28,25 @@ In case of an external table, only the associated metadata information is remove ### Syntax -{% highlight sql %} +```sql DROP TABLE [ IF EXISTS ] table_identifier -{% endhighlight %} +``` ### Parameter -
-<dl>
-  <dt><code><em>IF EXISTS</em></code></dt>
-  <dd>
-    If specified, no exception is thrown when the table does not exists.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies the table name to be dropped. The table name may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
+* **IF EXISTS**
+
+    If specified, no exception is thrown when the table does not exist.
+
+* **table_identifier**
+
+    Specifies the table name to be dropped. The table name may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`

### Examples

-{% highlight sql %}
+```sql
-- Assumes a table named `employeetable` exists.
DROP TABLE employeetable;

@@ -67,10 +62,10 @@ DROP TABLE employeetable;
-- Assumes a table named `employeetable` does not exist. Try with IF EXISTS
-- this time it will not throw exception
DROP TABLE IF EXISTS employeetable;
-{% endhighlight %}
+```

### Related Statements

- * [CREATE TABLE](sql-ref-syntax-ddl-create-table.html)
- * [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html)
- * [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html)
+* [CREATE TABLE](sql-ref-syntax-ddl-create-table.html)
+* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html)
+* [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html)

diff --git a/docs/sql-ref-syntax-ddl-drop-view.md b/docs/sql-ref-syntax-ddl-drop-view.md
index ad018b5e6fd5c..0f4a7ca6c9463 100644
--- a/docs/sql-ref-syntax-ddl-drop-view.md
+++ b/docs/sql-ref-syntax-ddl-drop-view.md
@@ -25,30 +25,25 @@ license: |

### Syntax

-{% highlight sql %}
+```sql
DROP VIEW [ IF EXISTS ] view_identifier
-{% endhighlight %}
+```

### Parameter
-<dl>
-  <dt><code><em>IF EXISTS</em></code></dt>
-  <dd>
-    If specified, no exception is thrown when the view does not exists.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>view_identifier</em></code></dt>
-  <dd>
-    Specifies the view name to be dropped. The view name may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] view_name
-      </code>
-  </dd>
-</dl>
+* **IF EXISTS**
+
+    If specified, no exception is thrown when the view does not exist.
+
+* **view_identifier**
+
+    Specifies the view name to be dropped. The view name may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] view_name`

### Examples

-{% highlight sql %}
+```sql
-- Assumes a view named `employeeView` exists.
DROP VIEW employeeView;

@@ -64,12 +59,12 @@ DROP VIEW employeeView;
-- Assumes a view named `employeeView` does not exist. Try with IF EXISTS
-- this time it will not throw exception
DROP VIEW IF EXISTS employeeView;
-{% endhighlight %}
+```

### Related Statements

- * [CREATE VIEW](sql-ref-syntax-ddl-create-view.html)
- * [ALTER VIEW](sql-ref-syntax-ddl-alter-view.html)
- * [SHOW VIEWS](sql-ref-syntax-aux-show-views.html)
- * [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html)
- * [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html)
+* [CREATE VIEW](sql-ref-syntax-ddl-create-view.html)
+* [ALTER VIEW](sql-ref-syntax-ddl-alter-view.html)
+* [SHOW VIEWS](sql-ref-syntax-aux-show-views.html)
+* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html)
+* [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html)

diff --git a/docs/sql-ref-syntax-ddl-repair-table.md b/docs/sql-ref-syntax-ddl-repair-table.md
index c48b731512ad3..c2ef0a7b7fbe9 100644
--- a/docs/sql-ref-syntax-ddl-repair-table.md
+++ b/docs/sql-ref-syntax-ddl-repair-table.md
@@ -25,26 +25,21 @@ license: |

### Syntax

-{% highlight sql %}
+```sql
MSCK REPAIR TABLE table_identifier
-{% endhighlight %}
+```

### Parameters
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies the name of the table to be repaired. The table name may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
+* **table_identifier** + + Specifies the name of the table to be repaired. The table name may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` ### Examples -{% highlight sql %} +```sql -- create a partitioned table from existing data /tmp/namesAndAges.parquet CREATE TABLE t1 (name STRING, age INT) USING parquet PARTITIONED BY (age) LOCATION "/tmp/namesAndAges.parquet"; @@ -66,8 +61,8 @@ SELECT * FROM t1; +-------+---+ | Andy| 30| +-------+---+ -{% endhighlight %} +``` ### Related Statements - * [ALTER TABLE](sql-ref-syntax-ddl-alter-table.html) +* [ALTER TABLE](sql-ref-syntax-ddl-alter-table.html) diff --git a/docs/sql-ref-syntax-ddl-truncate-table.md b/docs/sql-ref-syntax-ddl-truncate-table.md index 820f439f97a4b..6139814a3259a 100644 --- a/docs/sql-ref-syntax-ddl-truncate-table.md +++ b/docs/sql-ref-syntax-ddl-truncate-table.md @@ -27,37 +27,28 @@ in `partition_spec`. If no `partition_spec` is specified it will remove all part ### Syntax -{% highlight sql %} +```sql TRUNCATE TABLE table_identifier [ partition_spec ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **partition_spec**
+
    An optional parameter that specifies a comma separated list of key and value pairs
-    for partitions.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
+ for partitions. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` ### Examples -{% highlight sql %} +```sql -- Create table Student with partition CREATE TABLE Student (name STRING, rollno INT) PARTITIONED BY (age INT); @@ -89,9 +80,9 @@ SELECT * FROM Student; |name|rollno|age| +----+------+---+ +----+------+---+ -{% endhighlight %} +``` ### Related Statements - * [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) - * [ALTER TABLE](sql-ref-syntax-ddl-alter-table.html) +* [DROP TABLE](sql-ref-syntax-ddl-drop-table.html) +* [ALTER TABLE](sql-ref-syntax-ddl-alter-table.html) diff --git a/docs/sql-ref-syntax-dml-insert-into.md b/docs/sql-ref-syntax-dml-insert-into.md index 924831f7feedd..ed5da2b2d28df 100644 --- a/docs/sql-ref-syntax-dml-insert-into.md +++ b/docs/sql-ref-syntax-dml-insert-into.md @@ -25,57 +25,43 @@ The `INSERT INTO` statement inserts new rows into a table. The inserted rows can ### Syntax -{% highlight sql %} +```sql INSERT INTO [ TABLE ] table_identifier [ partition_spec ] { VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ] | query } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **partition_spec**
+
    An optional parameter that specifies a comma separated list of key and value pairs
-    for partitions.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ]</em></code></dt>
-  <dd>Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>query</em></code></dt>
-  <dd>A query that produces the rows to be inserted. It can be in one of following formats:
-    <ul>
-      <li>a <code>SELECT</code> statement</li>
-      <li>a <code>TABLE</code> statement</li>
-      <li>a <code>FROM</code> statement</li>
-    </ul>
-  </dd>
-</dl>
+ for partitions. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` + +* **VALUES ( { value `|` NULL } [ , ... ] ) [ , ( ... ) ]** + + Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. + A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows. + +* **query** + + A query that produces the rows to be inserted. It can be in one of following formats: + * a `SELECT` statement + * a `TABLE` statement + * a `FROM` statement ### Examples #### Single Row Insert Using a VALUES Clause -{% highlight sql %} +```sql CREATE TABLE students (name VARCHAR(64), address VARCHAR(64), student_id INT) USING PARQUET PARTITIONED BY (student_id); @@ -88,11 +74,11 @@ SELECT * FROM students; +---------+---------------------+----------+ |Amy Smith|123 Park Ave,San Jose| 111111| +---------+---------------------+----------+ -{% endhighlight %} +``` #### Multi-Row Insert Using a VALUES Clause -{% highlight sql %} +```sql INSERT INTO students VALUES ('Bob Brown', '456 Taylor St, Cupertino', 222222), ('Cathy Johnson', '789 Race Ave, Palo Alto', 333333); @@ -107,11 +93,11 @@ SELECT * FROM students; +-------------+------------------------+----------+ |Cathy Johnson| 789 Race Ave, Palo Alto| 333333| +--------------+-----------------------+----------+ -{% endhighlight %} +``` #### Insert Using a SELECT Statement -{% highlight sql %} +```sql -- Assuming the persons table has already been created and populated. SELECT * FROM persons; +-------------+-------------------------+---------+ @@ -137,11 +123,11 @@ SELECT * FROM students; +-------------+-------------------------+----------+ |Dora Williams|134 Forest Ave, Melo Park| 444444| +-------------+-------------------------+----------+ -{% endhighlight %} +``` #### Insert Using a TABLE Statement -{% highlight sql %} +```sql -- Assuming the visiting_students table has already been created and populated. SELECT * FROM visiting_students; +-------------+---------------------+----------+ @@ -170,11 +156,11 @@ SELECT * FROM students; +-------------+-------------------------+----------+ |Gordon Martin| 779 Lake Ave, Oxford| 888888| +-------------+-------------------------+----------+ -{% endhighlight %} +``` #### Insert Using a FROM Statement -{% highlight sql %} +```sql -- Assuming the applicants table has already been created and populated. 
SELECT * FROM applicants; +-----------+--------------------------+----------+---------+ @@ -210,10 +196,10 @@ SELECT * FROM students; +-------------+-------------------------+----------+ | Jason Wang| 908 Bird St, Saratoga| 121212| +-------------+-------------------------+----------+ -{% endhighlight %} +``` ### Related Statements - * [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) - * [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) - * [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) +* [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) +* [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) +* [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) diff --git a/docs/sql-ref-syntax-dml-insert-overwrite-directory-hive.md b/docs/sql-ref-syntax-dml-insert-overwrite-directory-hive.md index 3cd2107668fbe..8ed6a3cd1be09 100644 --- a/docs/sql-ref-syntax-dml-insert-overwrite-directory-hive.md +++ b/docs/sql-ref-syntax-dml-insert-overwrite-directory-hive.md @@ -26,56 +26,41 @@ Hive support must be enabled to use this command. The inserted rows can be speci ### Syntax -{% highlight sql %} +```sql INSERT OVERWRITE [ LOCAL ] DIRECTORY directory_path [ ROW FORMAT row_format ] [ STORED AS file_format ] { VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ] | query } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>directory_path</em></code></dt>
-  <dd>
-    Specifies the destination directory. The <code>LOCAL</code> keyword is used to specify that the directory is on the local file system.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>row_format</em></code></dt>
-  <dd>
-    Specifies the row format for this insert. Valid options are <code>SERDE</code> clause and <code>DELIMITED</code> clause. <code>SERDE</code> clause can be used to specify a custom <code>SerDe</code> for this insert. Alternatively, <code>DELIMITED</code> clause can be used to specify the native <code>SerDe</code> and state the delimiter, escape character, null character, and so on.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>file_format</em></code></dt>
-  <dd>
-    Specifies the file format for this insert. Valid options are <code>TEXTFILE</code>, <code>SEQUENCEFILE</code>, <code>RCFILE</code>, <code>ORC</code>, <code>PARQUET</code>, and <code>AVRO</code>. You can also specify your own input and output format using <code>INPUTFORMAT</code> and <code>OUTPUTFORMAT</code>. <code>ROW FORMAT SERDE</code> can only be used with <code>TEXTFILE</code>, <code>SEQUENCEFILE</code>, or <code>RCFILE</code>, while <code>ROW FORMAT DELIMITED</code> can only be used with <code>TEXTFILE</code>.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ]</em></code></dt>
-  <dd>
-    Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>query</em></code></dt>
-  <dd>A query that produces the rows to be inserted. It can be in one of following formats:
-    <ul>
-      <li>a <code>SELECT</code> statement</li>
-      <li>a <code>TABLE</code> statement</li>
-      <li>a <code>FROM</code> statement</li>
-    </ul>
-  </dd>
-</dl>
+* **directory_path** + + Specifies the destination directory. The `LOCAL` keyword is used to specify that the directory is on the local file system. + +* **row_format** + + Specifies the row format for this insert. Valid options are `SERDE` clause and `DELIMITED` clause. `SERDE` clause can be used to specify a custom `SerDe` for this insert. Alternatively, `DELIMITED` clause can be used to specify the native `SerDe` and state the delimiter, escape character, null character, and so on. + +* **file_format** + + Specifies the file format for this insert. Valid options are `TEXTFILE`, `SEQUENCEFILE`, `RCFILE`, `ORC`, `PARQUET`, and `AVRO`. You can also specify your own input and output format using `INPUTFORMAT` and `OUTPUTFORMAT`. `ROW FORMAT SERDE` can only be used with `TEXTFILE`, `SEQUENCEFILE`, or `RCFILE`, while `ROW FORMAT DELIMITED` can only be used with `TEXTFILE`. + +* **VALUES ( { value `|` NULL } [ , ... ] ) [ , ( ... ) ]** + + Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. + A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows. + +* **query** + + A query that produces the rows to be inserted. It can be in one of following formats: + * a `SELECT` statement + * a `TABLE` statement + * a `FROM` statement ### Examples -{% highlight sql %} +```sql INSERT OVERWRITE LOCAL DIRECTORY '/tmp/destination' STORED AS orc SELECT * FROM test_table; @@ -83,10 +68,10 @@ INSERT OVERWRITE LOCAL DIRECTORY '/tmp/destination' INSERT OVERWRITE LOCAL DIRECTORY '/tmp/destination' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' SELECT * FROM test_table; -{% endhighlight %} +``` ### Related Statements - * [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) - * [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) - * [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) +* [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) +* [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) +* [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) diff --git a/docs/sql-ref-syntax-dml-insert-overwrite-directory.md b/docs/sql-ref-syntax-dml-insert-overwrite-directory.md index 6ce7f50588e32..fd7437d37c909 100644 --- a/docs/sql-ref-syntax-dml-insert-overwrite-directory.md +++ b/docs/sql-ref-syntax-dml-insert-overwrite-directory.md @@ -25,54 +25,42 @@ The `INSERT OVERWRITE DIRECTORY` statement overwrites the existing data in the d ### Syntax -{% highlight sql %} +```sql INSERT OVERWRITE [ LOCAL ] DIRECTORY [ directory_path ] USING file_format [ OPTIONS ( key = val [ , ... ] ) ] { VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ] | query } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>directory_path</em></code></dt>
-  <dd>
-    Specifies the destination directory. It can also be specified in <code>OPTIONS</code> using <code>path</code>. The <code>LOCAL</code> keyword is used to specify that the directory is on the local file system.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>file_format</em></code></dt>
-  <dd>
-    Specifies the file format to use for the insert. Valid options are <code>TEXT</code>, <code>CSV</code>, <code>JSON</code>, <code>JDBC</code>, <code>PARQUET</code>, <code>ORC</code>, <code>HIVE</code>, <code>LIBSVM</code>, or a fully qualified class name of a custom implementation of <code>org.apache.spark.sql.execution.datasources.FileFormat</code>.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>OPTIONS ( key = val [ , ... ] )</em></code></dt>
-  <dd>Specifies one or more options for the writing of the file format.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ]</em></code></dt>
-  <dd>
-    Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>query</em></code></dt>
-  <dd>A query that produces the rows to be inserted. It can be in one of following formats:
-    <ul>
-      <li>a <code>SELECT</code> statement</li>
-      <li>a <code>TABLE</code> statement</li>
-      <li>a <code>FROM</code> statement</li>
-    </ul>
-  </dd>
-</dl>
+* **directory_path** + + Specifies the destination directory. It can also be specified in `OPTIONS` using `path`. + The `LOCAL` keyword is used to specify that the directory is on the local file system. + +* **file_format** + + Specifies the file format to use for the insert. Valid options are `TEXT`, `CSV`, `JSON`, `JDBC`, `PARQUET`, `ORC`, `HIVE`, `LIBSVM`, or a fully qualified class name of a custom implementation of `org.apache.spark.sql.execution.datasources.FileFormat`. + +* **OPTIONS ( key = val [ , ... ] )** + + Specifies one or more options for the writing of the file format. + +* **VALUES ( { value `|` NULL } [ , ... ] ) [ , ( ... ) ]** + + Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. + A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows. + +* **query** + + A query that produces the rows to be inserted. It can be in one of following formats: + * a `SELECT` statement + * a `TABLE` statement + * a `FROM` statement ### Examples -{% highlight sql %} +```sql INSERT OVERWRITE DIRECTORY '/tmp/destination' USING parquet OPTIONS (col1 1, col2 2, col3 'test') @@ -82,10 +70,10 @@ INSERT OVERWRITE DIRECTORY USING parquet OPTIONS ('path' '/tmp/destination', col1 1, col2 2, col3 'test') SELECT * FROM test_table; -{% endhighlight %} +``` ### Related Statements - * [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) - * [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) - * [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) +* [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) +* [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) +* [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) diff --git a/docs/sql-ref-syntax-dml-insert-overwrite-table.md b/docs/sql-ref-syntax-dml-insert-overwrite-table.md index 5c760f00ed0c4..ecfd060dfd5ee 100644 --- a/docs/sql-ref-syntax-dml-insert-overwrite-table.md +++ b/docs/sql-ref-syntax-dml-insert-overwrite-table.md @@ -25,57 +25,43 @@ The `INSERT OVERWRITE` statement overwrites the existing data in the table using ### Syntax -{% highlight sql %} +```sql INSERT OVERWRITE [ TABLE ] table_identifier [ partition_spec [ IF NOT EXISTS ] ] { VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ] | query } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **partition_spec**
+
    An optional parameter that specifies a comma separated list of key and value pairs
-    for partitions.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name [ = partition_col_val ] [ , ... ] )
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>VALUES ( { value | NULL } [ , ... ] ) [ , ( ... ) ]</em></code></dt>
-  <dd>Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>query</em></code></dt>
-  <dd>A query that produces the rows to be inserted. It can be in one of following formats:
-    <ul>
-      <li>a <code>SELECT</code> statement</li>
-      <li>a <code>TABLE</code> statement</li>
-      <li>a <code>FROM</code> statement</li>
-    </ul>
-  </dd>
-</dl>
+ for partitions. + + **Syntax:** `PARTITION ( partition_col_name [ = partition_col_val ] [ , ... ] )` + +* **VALUES ( { value `|` NULL } [ , ... ] ) [ , ( ... ) ]** + + Specifies the values to be inserted. Either an explicitly specified value or a NULL can be inserted. + A comma must be used to separate each value in the clause. More than one set of values can be specified to insert multiple rows. + +* **query** + + A query that produces the rows to be inserted. It can be in one of following formats: + * a `SELECT` statement + * a `TABLE` statement + * a `FROM` statement ### Examples #### Insert Using a VALUES Clause -{% highlight sql %} +```sql -- Assuming the students table has already been created and populated. SELECT * FROM students; +-------------+-------------------------+----------+ @@ -102,12 +88,11 @@ SELECT * FROM students; |Ashua Hill|456 Erica Ct, Cupertino| 111111| |Brian Reed|723 Kern Ave, Palo Alto| 222222| +----------+-----------------------+----------+ - -{% endhighlight %} +``` #### Insert Using a SELECT Statement -{% highlight sql %} +```sql -- Assuming the persons table has already been created and populated. SELECT * FROM persons; +-------------+-------------------------+---------+ @@ -129,11 +114,11 @@ SELECT * FROM students; +-------------+-------------------------+----------+ |Dora Williams|134 Forest Ave, Melo Park| 222222| +-------------+-------------------------+----------+ -{% endhighlight %} +``` #### Insert Using a TABLE Statement -{% highlight sql %} +```sql -- Assuming the visiting_students table has already been created and populated. SELECT * FROM visiting_students; +-------------+---------------------+----------+ @@ -154,11 +139,11 @@ SELECT * FROM students; +-------------+---------------------+----------+ |Gordon Martin| 779 Lake Ave, Oxford| 888888| +-------------+---------------------+----------+ -{% endhighlight %} +``` #### Insert Using a FROM Statement -{% highlight sql %} +```sql -- Assuming the applicants table has already been created and populated. 
SELECT * FROM applicants; +-----------+--------------------------+----------+---------+ @@ -182,10 +167,10 @@ SELECT * FROM students; +-----------+-------------------------+----------+ | Jason Wang| 908 Bird St, Saratoga| 121212| +-----------+-------------------------+----------+ -{% endhighlight %} +``` ### Related Statements - * [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) - * [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) - * [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) +* [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) +* [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) +* [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) diff --git a/docs/sql-ref-syntax-dml-insert.md b/docs/sql-ref-syntax-dml-insert.md index 2345add2460c8..62f6dee876450 100644 --- a/docs/sql-ref-syntax-dml-insert.md +++ b/docs/sql-ref-syntax-dml-insert.md @@ -21,7 +21,7 @@ license: | The INSERT statements: - * [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) - * [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) - * [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) - * [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) +* [INSERT INTO statement](sql-ref-syntax-dml-insert-into.html) +* [INSERT OVERWRITE statement](sql-ref-syntax-dml-insert-overwrite-table.html) +* [INSERT OVERWRITE DIRECTORY statement](sql-ref-syntax-dml-insert-overwrite-directory.html) +* [INSERT OVERWRITE DIRECTORY with Hive format statement](sql-ref-syntax-dml-insert-overwrite-directory-hive.html) diff --git a/docs/sql-ref-syntax-dml-load.md b/docs/sql-ref-syntax-dml-load.md index 01ece31bd17fa..9381b4267fb24 100644 --- a/docs/sql-ref-syntax-dml-load.md +++ b/docs/sql-ref-syntax-dml-load.md @@ -25,53 +25,40 @@ license: | ### Syntax -{% highlight sql %} +```sql LOAD DATA [ LOCAL ] INPATH path [ OVERWRITE ] INTO TABLE table_identifier [ partition_spec ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>path</em></code></dt>
-  <dd>Path of the file system. It can be either an absolute or a relative path.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>table_identifier</em></code></dt>
-  <dd>
-    Specifies a table name, which may be optionally qualified with a database name.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ database_name. ] table_name
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>partition_spec</em></code></dt>
-  <dd>
+* **path**
+
+    Path of the file system. It can be either an absolute or a relative path.
+
+* **table_identifier**
+
+    Specifies a table name, which may be optionally qualified with a database name.
+
+    **Syntax:** `[ database_name. ] table_name`
+
+* **partition_spec**
+
    An optional parameter that specifies a comma separated list of key and value pairs
-    for partitions.<br><br>
-    <b>Syntax:</b>
-      <code>
-        PARTITION ( partition_col_name = partition_col_val [ , ... ] )
-      </code>
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>LOCAL</em></code></dt>
-  <dd>If specified, it causes the <code>INPATH</code> to be resolved against the local file system, instead of the default file system, which is typically a distributed storage.</dd>
-</dl>
-
-<dl>
-  <dt><code><em>OVERWRITE</em></code></dt>
-  <dd>By default, new data is appended to the table. If <code>OVERWRITE</code> is used, the table is instead overwritten with new data.</dd>
-</dl>
+ for partitions. + + **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` + +* **LOCAL** + + If specified, it causes the `INPATH` to be resolved against the local file system, instead of the default file system, which is typically a distributed storage. + +* **OVERWRITE** + + By default, new data is appended to the table. If `OVERWRITE` is used, the table is instead overwritten with new data. ### Examples -{% highlight sql %} +```sql -- Example without partition specification. -- Assuming the students table has already been created and populated. SELECT * FROM students; @@ -123,4 +110,4 @@ SELECT * FROM test_load_partition; +---+---+---+ | 1| 2| 3| +---+---+---+ -{% endhighlight %} +``` diff --git a/docs/sql-ref-syntax-dml.md b/docs/sql-ref-syntax-dml.md index 9f75990555f64..fc408e1d38d26 100644 --- a/docs/sql-ref-syntax-dml.md +++ b/docs/sql-ref-syntax-dml.md @@ -21,5 +21,5 @@ license: | Data Manipulation Statements are used to add, change, or delete data. Spark SQL supports the following Data Manipulation Statements: - * [INSERT](sql-ref-syntax-dml-insert.html) - * [LOAD](sql-ref-syntax-dml-load.html) +* [INSERT](sql-ref-syntax-dml-insert.html) +* [LOAD](sql-ref-syntax-dml-load.html) diff --git a/docs/sql-ref-syntax-qry-explain.md b/docs/sql-ref-syntax-qry-explain.md index 298a2edaea1f2..7b84264a28cca 100644 --- a/docs/sql-ref-syntax-qry-explain.md +++ b/docs/sql-ref-syntax-qry-explain.md @@ -26,46 +26,38 @@ By default, this clause provides information about a physical plan only. ### Syntax -{% highlight sql %} +```sql EXPLAIN [ EXTENDED | CODEGEN | COST | FORMATTED ] statement -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>EXTENDED</em></code></dt>
-  <dd>
-    Generates parsed logical plan, analyzed logical plan, optimized logical plan and physical plan.
-    Parsed Logical plan is a unresolved plan that extracted from the query.
-    Analyzed logical plans transforms which translates unresolvedAttribute and unresolvedRelation into fully typed objects.
-    The optimized logical plan transforms through a set of optimization rules, resulting in the physical plan.
-  </dd>
-</dl>
+* **EXTENDED**
+
+    Generates a parsed logical plan, an analyzed logical plan, an optimized logical plan, and a physical plan.
+    The parsed logical plan is an unresolved plan extracted from the query.
+    The analyzed logical plan resolves it, translating `unresolvedAttribute` and `unresolvedRelation` into fully typed objects.
+    The optimized logical plan is the result of applying a set of optimization rules to the analyzed plan, and the physical plan is generated from it.

-<dl>
-  <dt><code><em>CODEGEN</em></code></dt>
-  <dd>Generates code for the statement, if any and a physical plan.</dd>
-</dl>
+* **CODEGEN**
+
+    Generates the code for the statement, if any, and a physical plan.

-<dl>
-  <dt><code><em>COST</em></code></dt>
-  <dd>If plan node statistics are available, generates a logical plan and the statistics.</dd>
-</dl>
+* **COST**
+
+    If plan node statistics are available, generates a logical plan and the statistics.

-<dl>
-  <dt><code><em>FORMATTED</em></code></dt>
-  <dd>Generates two sections: a physical plan outline and node details.</dd>
-</dl>
+* **FORMATTED**
+
+    Generates two sections: a physical plan outline and node details.

-<dl>
-  <dt><code><em>statement</em></code></dt>
-  <dd>
-    Specifies a SQL statement to be explained.
-  </dd>
-</dl>
+* **statement**
+
+    Specifies a SQL statement to be explained.
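All four modes accept the same statements, so they can be compared side by side; a minimal sketch (plan output omitted, since the exact text varies with the Spark version and the query):

```sql
-- Compare the level of detail across modes for the same aggregation query.
EXPLAIN EXTENDED SELECT k, sum(v) FROM VALUES (1, 2), (1, 3) t(k, v) GROUP BY k;
EXPLAIN COST     SELECT k, sum(v) FROM VALUES (1, 2), (1, 3) t(k, v) GROUP BY k;
EXPLAIN CODEGEN  SELECT k, sum(v) FROM VALUES (1, 2), (1, 3) t(k, v) GROUP BY k;
```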
### Examples -{% highlight sql %} +```sql -- Default Output EXPLAIN select k, sum(v) from values (1, 2), (1, 3) t(k, v) group by k; +----------------------------------------------------+ @@ -132,4 +124,4 @@ EXPLAIN FORMATTED select k, sum(v) from values (1, 2), (1, 3) t(k, v) group by k Input: [k#19, sum#24L] | +----------------------------------------------------+ -{% endhighlight %} +``` diff --git a/docs/sql-ref-syntax-qry-sampling.md b/docs/sql-ref-syntax-qry-sampling.md index 82f6588e6c504..28e21e802fe25 100644 --- a/docs/sql-ref-syntax-qry-sampling.md +++ b/docs/sql-ref-syntax-qry-sampling.md @@ -30,15 +30,15 @@ Note: `TABLESAMPLE` returns the approximate number of rows or fraction requested ### Syntax -{% highlight sql %} -TABLESAMPLE ((integer_expression | decimal_expression) PERCENT) - | TABLESAMPLE (integer_expression ROWS) - | TABLESAMPLE (BUCKET integer_expression OUT OF integer_expression) -{% endhighlight %} +```sql +TABLESAMPLE ({ integer_expression | decimal_expression } PERCENT) + | TABLESAMPLE ( integer_expression ROWS ) + | TABLESAMPLE ( BUCKET integer_expression OUT OF integer_expression ) +``` ### Examples -{% highlight sql %} +```sql SELECT * FROM test; +--+----+ |id|name| @@ -87,8 +87,8 @@ SELECT * FROM test TABLESAMPLE (BUCKET 4 OUT OF 10); | 9|Eric| | 6|Mark| +--+----+ -{% endhighlight %} +``` -### Related Statement +### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) \ No newline at end of file +* [SELECT](sql-ref-syntax-qry-select.html) \ No newline at end of file diff --git a/docs/sql-ref-syntax-qry-select-clusterby.md b/docs/sql-ref-syntax-qry-select-clusterby.md index ac1e1ccb00ac9..e3bd2ed926ecc 100644 --- a/docs/sql-ref-syntax-qry-select-clusterby.md +++ b/docs/sql-ref-syntax-qry-select-clusterby.md @@ -21,7 +21,7 @@ license: | ### Description -The CLUSTER BY clause is used to first repartition the data based +The `CLUSTER BY` clause is used to first repartition the data based on the input expressions and then sort the data within each partition. This is semantically equivalent to performing a [DISTRIBUTE BY](sql-ref-syntax-qry-select-distribute-by.html) followed by a @@ -30,22 +30,19 @@ resultant rows are sorted within each partition and does not guarantee a total o ### Syntax -{% highlight sql %} +```sql CLUSTER BY { expression [ , ... ] } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>expression</em></code></dt>
-  <dd>
-    Specifies combination of one or more values, operators and SQL functions that results in a value.
-  </dd>
-</dl>
+* **expression**
+
+    Specifies a combination of one or more values, operators and SQL functions that results in a value.
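As a quick illustration of the equivalence noted above, the two queries below partition and sort the data identically (a sketch reusing the `person` table created in the examples that follow):

```sql
-- CLUSTER BY is shorthand for DISTRIBUTE BY followed by SORT BY on the same expressions.
SELECT age, name FROM person CLUSTER BY age;
SELECT age, name FROM person DISTRIBUTE BY age SORT BY age;
```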
### Examples -{% highlight sql %} +```sql CREATE TABLE person (name STRING, age INT); INSERT INTO person VALUES ('Zen Hui', 25), @@ -90,15 +87,15 @@ SELECT age, name FROM person CLUSTER BY age; | 16|Shone S| | 16| Jack N| +---+-------+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select-cte.md b/docs/sql-ref-syntax-qry-select-cte.md index 2408c884c64b5..351de64a2d026 100644 --- a/docs/sql-ref-syntax-qry-select-cte.md +++ b/docs/sql-ref-syntax-qry-select-cte.md @@ -25,33 +25,28 @@ A common table expression (CTE) defines a temporary result set that a user can r ### Syntax -{% highlight sql %} +```sql WITH common_table_expression [ , ... ] -{% endhighlight %} +``` While `common_table_expression` is defined as -{% highlight sql %} -expression_name [ ( column_name [ , ... ] ) ] [ AS ] ( [ common_table_expression ] query ) -{% endhighlight %} +```sql +expression_name [ ( column_name [ , ... ] ) ] [ AS ] ( query ) +``` ### Parameters -
-<dl>
-  <dt><code><em>expression_name</em></code></dt>
-  <dd>
-    Specifies a name for the common table expression.
-  </dd>
-</dl>
+* **expression_name**
+
+    Specifies a name for the common table expression.

-<dl>
-  <dt><code><em>query</em></code></dt>
-  <dd>
-    A <code>SELECT</code> statement.
-  </dd>
-</dl>
+* **query**
+
+ +* **query** + + A [SELECT statement](sql-ref-syntax-qry-select.html). ### Examples -{% highlight sql %} +```sql -- CTE with multiple column aliases WITH t(x, y) AS (SELECT 1, 2) SELECT * FROM t WHERE x = 1 AND y = 2; @@ -62,7 +57,7 @@ SELECT * FROM t WHERE x = 1 AND y = 2; +---+---+ -- CTE in CTE definition -WITH t as ( +WITH t AS ( WITH t2 AS (SELECT 1) SELECT * FROM t2 ) @@ -122,8 +117,8 @@ SELECT * FROM t2; +---+ | 2| +---+ -{% endhighlight %} +``` ### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) +* [SELECT](sql-ref-syntax-qry-select.html) diff --git a/docs/sql-ref-syntax-qry-select-distribute-by.md b/docs/sql-ref-syntax-qry-select-distribute-by.md index 9e2db27ae7161..1fdfb91dad286 100644 --- a/docs/sql-ref-syntax-qry-select-distribute-by.md +++ b/docs/sql-ref-syntax-qry-select-distribute-by.md @@ -21,28 +21,25 @@ license: | ### Description -The DISTRIBUTE BY clause is used to repartition the data based +The `DISTRIBUTE BY` clause is used to repartition the data based on the input expressions. Unlike the [CLUSTER BY](sql-ref-syntax-qry-select-clusterby.html) clause, this does not sort the data within each partition. ### Syntax -{% highlight sql %} +```sql DISTRIBUTE BY { expression [ , ... ] } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>expression</em></code></dt>
-  <dd>
-    Specifies combination of one or more values, operators and SQL functions that results in a value.
-  </dd>
-</dl>
+* **expression**
+
+    Specifies a combination of one or more values, operators and SQL functions that results in a value.
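Because `DISTRIBUTE BY` does not sort within partitions, pair it with [SORT BY](sql-ref-syntax-qry-select-sortby.html) when ordered output per partition is needed; a sketch reusing the `person` table created in the examples that follow:

```sql
-- Repartition the rows by age, then order the rows within each partition by name.
SELECT age, name FROM person DISTRIBUTE BY age SORT BY name;
```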
### Examples -{% highlight sql %} +```sql CREATE TABLE person (name STRING, age INT); INSERT INTO person VALUES ('Zen Hui', 25), @@ -85,15 +82,15 @@ SELECT age, name FROM person DISTRIBUTE BY age; | 16|Shone S| | 16| Jack N| +---+-------+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select-groupby.md b/docs/sql-ref-syntax-qry-select-groupby.md index 22fe782f9eaa7..bd9377ef78df6 100644 --- a/docs/sql-ref-syntax-qry-select-groupby.md +++ b/docs/sql-ref-syntax-qry-select-groupby.md @@ -21,84 +21,79 @@ license: | ### Description -The GROUP BY clause is used to group the rows based on a set of specified grouping expressions and compute aggregations on +The `GROUP BY` clause is used to group the rows based on a set of specified grouping expressions and compute aggregations on the group of rows based on one or more specified aggregate functions. Spark also supports advanced aggregations to do multiple aggregations for the same input record set via `GROUPING SETS`, `CUBE`, `ROLLUP` clauses. When a FILTER clause is attached to an aggregate function, only the matching rows are passed to that function. ### Syntax -{% highlight sql %} +```sql GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] GROUP BY GROUPING SETS (grouping_set [ , ...]) -{% endhighlight %} +``` While aggregate functions are defined as -{% highlight sql %} +```sql aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_expression ) ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>GROUPING SETS</em></code></dt>
-  <dd>
-    Groups the rows for each subset of the expressions specified in the grouping sets. For example,
-    <code>GROUP BY GROUPING SETS (warehouse, product)</code> is semantically equivalent
-    to union of results of <code>GROUP BY warehouse</code> and <code>GROUP BY product</code>. This clause
-    is a shorthand for a <code>UNION ALL</code> where each leg of the <code>UNION ALL</code>
-    operator performs aggregation of subset of the columns specified in the <code>GROUPING SETS</code> clause.
-  </dd>
-</dl>
+* **GROUPING SETS**
+
+    Groups the rows for each subset of the expressions specified in the grouping sets. For example,
+    `GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent
+    to the union of the results of `GROUP BY warehouse` and `GROUP BY product`. This clause
+    is a shorthand for a `UNION ALL` where each leg of the `UNION ALL`
+    operator performs aggregation of a subset of the columns specified in the `GROUPING SETS` clause.

-<dl>
-  <dt><code><em>grouping_set</em></code></dt>
-  <dd>
-    A grouping set is specified by zero or more comma-separated expressions in parentheses.<br><br>
-    <b>Syntax:</b>
-      <code>
-        ([expression [, ...]])
-      </code>
-  </dd>
-</dl>
+* **grouping_set**
+
+    A grouping set is specified by zero or more comma-separated expressions in parentheses.
+
+    **Syntax:** `( [ expression [ , ... ] ] )`

-<dl>
-  <dt><code><em>grouping_expression</em></code></dt>
-  <dd>
-    Specifies the critieria based on which the rows are grouped together. The grouping of rows is performed based on
-    result values of the grouping expressions. A grouping expression may be a column alias, a column position
-    or an expression.
-  </dd>
-</dl>
+* **grouping_expression**
+
+    Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on
+    the result values of the grouping expressions. A grouping expression may be a column alias, a column position
+    or an expression.

-<dl>
-  <dt><code><em>ROLLUP</em></code></dt>
-  <dd>
-    Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations
-    based on multiple grouping sets. <code>ROLLUP</code> is a shorthand for <code>GROUPING SETS</code>. For example,
-    <code>GROUP BY warehouse, product WITH ROLLUP</code> is equivalent to <code>GROUP BY GROUPING SETS
-    ((warehouse, product), (warehouse), ())</code>.
-    The N elements of a <code>ROLLUP</code> specification results in N+1 <code>GROUPING SETS</code>.
-  </dd>
-</dl>
+* **ROLLUP**
+
+    Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations
+    based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example,
+    `GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS
+    ((warehouse, product), (warehouse), ())`.
+    The N elements of a `ROLLUP` specification result in N+1 `GROUPING SETS`.

-<dl>
-  <dt><code><em>CUBE</em></code></dt>
-  <dd>
-    <code>CUBE</code> clause is used to perform aggregations based on combination of grouping columns specified in the
-    <code>GROUP BY</code> clause. <code>CUBE</code> is a shorthand for <code>GROUPING SETS</code>. For example,
-    <code>GROUP BY warehouse, product WITH CUBE</code> is equivalent to <code>GROUP BY GROUPING SETS
-    ((warehouse, product), (warehouse), (product), ())</code>.
-    The N elements of a <code>CUBE</code> specification results in 2^N <code>GROUPING SETS</code>.
-  </dd>
-</dl>
+* **CUBE**
+
+    The `CUBE` clause is used to perform aggregations based on combinations of the grouping columns specified in the
+    `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example,
+    `GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS
+    ((warehouse, product), (warehouse), (product), ())`.
+    The N elements of a `CUBE` specification result in 2^N `GROUPING SETS`.

-<dl>
-  <dt><code><em>aggregate_name</em></code></dt>
-  <dd>
-    Specifies an aggregate function name (MIN, MAX, COUNT, SUM, AVG, etc.).
-  </dd>
-</dl>
+* **aggregate_name**
+
+    Specifies an aggregate function name (MIN, MAX, COUNT, SUM, AVG, etc.).

-<dl>
-  <dt><code><em>DISTINCT</em></code></dt>
-  <dd>
-    Removes duplicates in input rows before they are passed to aggregate functions.
-  </dd>
-</dl>
+* **DISTINCT**
+
+    Removes duplicates in input rows before they are passed to aggregate functions.

-<dl>
-  <dt><code><em>FILTER</em></code></dt>
-  <dd>
-    Filters the input rows for which the <code>boolean_expression</code> in the <code>WHERE</code> clause evaluates
-    to true are passed to the aggregate function; other rows are discarded.
-  </dd>
-</dl>
+* **FILTER**
+
+    The input rows for which the `boolean_expression` in the `WHERE` clause evaluates
+    to true are passed to the aggregate function; other rows are discarded.
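The `UNION ALL` shorthand described above can also be written out by hand; a sketch using the `dealer` table created in the examples that follow:

```sql
-- GROUP BY GROUPING SETS (city, car_model) computes the same result as:
SELECT city, NULL AS car_model, sum(quantity) AS sum FROM dealer GROUP BY city
UNION ALL
SELECT NULL AS city, car_model, sum(quantity) AS sum FROM dealer GROUP BY car_model;
```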
### Examples -{% highlight sql %} +```sql CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT); INSERT INTO dealer VALUES (100, 'Fremont', 'Honda Civic', 10), @@ -174,106 +169,106 @@ SELECT id, sum(quantity) FILTER ( SELECT city, car_model, sum(quantity) AS sum FROM dealer GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ()) ORDER BY city; -+--------+------------+---+ -| city| car_model|sum| -+--------+------------+---+ -| null| null| 78| -| null| HondaAccord| 33| -| null| HondaCRV| 10| -| null| HondaCivic| 35| -| Dublin| null| 33| -| Dublin| HondaAccord| 10| -| Dublin| HondaCRV| 3| -| Dublin| HondaCivic| 20| -| Fremont| null| 32| -| Fremont| HondaAccord| 15| -| Fremont| HondaCRV| 7| -| Fremont| HondaCivic| 10| -| SanJose| null| 13| -| SanJose| HondaAccord| 8| -| SanJose| HondaCivic| 5| -+--------+------------+---+ ++---------+------------+---+ +| city| car_model|sum| ++---------+------------+---+ +| null| null| 78| +| null| HondaAccord| 33| +| null| HondaCRV| 10| +| null| HondaCivic| 35| +| Dublin| null| 33| +| Dublin| HondaAccord| 10| +| Dublin| HondaCRV| 3| +| Dublin| HondaCivic| 20| +| Fremont| null| 32| +| Fremont| HondaAccord| 15| +| Fremont| HondaCRV| 7| +| Fremont| HondaCivic| 10| +| San Jose| null| 13| +| San Jose| HondaAccord| 8| +| San Jose| HondaCivic| 5| ++---------+------------+---+ -- Alternate syntax for `GROUPING SETS` in which both `GROUP BY` and `GROUPING SETS` -- specifications are present. SELECT city, car_model, sum(quantity) AS sum FROM dealer GROUP BY city, car_model GROUPING SETS ((city, car_model), (city), (car_model), ()) ORDER BY city, car_model; -+--------+------------+---+ -| city| car_model|sum| -+--------+------------+---+ -| null| null| 78| -| null| HondaAccord| 33| -| null| HondaCRV| 10| -| null| HondaCivic| 35| -| Dublin| null| 33| -| Dublin| HondaAccord| 10| -| Dublin| HondaCRV| 3| -| Dublin| HondaCivic| 20| -| Fremont| null| 32| -| Fremont| HondaAccord| 15| -| Fremont| HondaCRV| 7| -| Fremont| HondaCivic| 10| -| SanJose| null| 13| -| SanJose| HondaAccord| 8| -| SanJose| HondaCivic| 5| -+--------+------------+---+ ++---------+------------+---+ +| city| car_model|sum| ++---------+------------+---+ +| null| null| 78| +| null| HondaAccord| 33| +| null| HondaCRV| 10| +| null| HondaCivic| 35| +| Dublin| null| 33| +| Dublin| HondaAccord| 10| +| Dublin| HondaCRV| 3| +| Dublin| HondaCivic| 20| +| Fremont| null| 32| +| Fremont| HondaAccord| 15| +| Fremont| HondaCRV| 7| +| Fremont| HondaCivic| 10| +| San Jose| null| 13| +| San Jose| HondaAccord| 8| +| San Jose| HondaCivic| 5| ++---------+------------+---+ -- Group by processing with `ROLLUP` clause. 
-- Equivalent GROUP BY GROUPING SETS ((city, car_model), (city), ()) SELECT city, car_model, sum(quantity) AS sum FROM dealer GROUP BY city, car_model WITH ROLLUP ORDER BY city, car_model; -+--------+------------+---+ -| city| car_model|sum| -+--------+------------+---+ -| null| null| 78| -| Dublin| null| 33| -| Dublin| HondaAccord| 10| -| Dublin| HondaCRV| 3| -| Dublin| HondaCivic| 20| -| Fremont| null| 32| -| Fremont| HondaAccord| 15| -| Fremont| HondaCRV| 7| -| Fremont| HondaCivic| 10| -| SanJose| null| 13| -| SanJose| HondaAccord| 8| -| SanJose| HondaCivic| 5| -+--------+------------+---+ ++---------+------------+---+ +| city| car_model|sum| ++---------+------------+---+ +| null| null| 78| +| Dublin| null| 33| +| Dublin| HondaAccord| 10| +| Dublin| HondaCRV| 3| +| Dublin| HondaCivic| 20| +| Fremont| null| 32| +| Fremont| HondaAccord| 15| +| Fremont| HondaCRV| 7| +| Fremont| HondaCivic| 10| +| San Jose| null| 13| +| San Jose| HondaAccord| 8| +| San Jose| HondaCivic| 5| ++---------+------------+---+ -- Group by processing with `CUBE` clause. -- Equivalent GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ()) SELECT city, car_model, sum(quantity) AS sum FROM dealer GROUP BY city, car_model WITH CUBE ORDER BY city, car_model; -+--------+------------+---+ -| city| car_model|sum| -+--------+------------+---+ -| null| null| 78| -| null| HondaAccord| 33| -| null| HondaCRV| 10| -| null| HondaCivic| 35| -| Dublin| null| 33| -| Dublin| HondaAccord| 10| -| Dublin| HondaCRV| 3| -| Dublin| HondaCivic| 20| -| Fremont| null| 32| -| Fremont| HondaAccord| 15| -| Fremont| HondaCRV| 7| -| Fremont| HondaCivic| 10| -| SanJose| null| 13| -| SanJose| HondaAccord| 8| -| SanJose| HondaCivic| 5| -+--------+------------+---+ -{% endhighlight %} ++---------+------------+---+ +| city| car_model|sum| ++---------+------------+---+ +| null| null| 78| +| null| HondaAccord| 33| +| null| HondaCRV| 10| +| null| HondaCivic| 35| +| Dublin| null| 33| +| Dublin| HondaAccord| 10| +| Dublin| HondaCRV| 3| +| Dublin| HondaCivic| 20| +| Fremont| null| 32| +| Fremont| HondaAccord| 15| +| Fremont| HondaCRV| 7| +| Fremont| HondaCivic| 10| +| San Jose| null| 13| +| San Jose| HondaAccord| 8| +| San Jose| HondaCivic| 5| ++---------+------------+---+ +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select-having.md b/docs/sql-ref-syntax-qry-select-having.md index c8c4f2c38104c..935782c551e1f 100644 --- a/docs/sql-ref-syntax-qry-select-having.md +++ b/docs/sql-ref-syntax-qry-select-having.md @@ -21,39 +21,35 @@ license: | ### Description -The HAVING clause is used to 
filter the results produced by -GROUP BY based on the specified condition. It is often used +The `HAVING` clause is used to filter the results produced by +`GROUP BY` based on the specified condition. It is often used in conjunction with a [GROUP BY](sql-ref-syntax-qry-select-groupby.html) clause. ### Syntax -{% highlight sql %} +```sql HAVING boolean_expression -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>boolean_expression</em></code></dt>
-  <dd>
-    Specifies any expression that evaluates to a result type <code>boolean</code>. Two or
-    more expressions may be combined together using the logical
-    operators ( <code>AND</code>, <code>OR</code> ).<br><br>
-    <b>Note</b><br>
-    The expressions specified in the <code>HAVING</code> clause can only refer to:
-    <ol>
-      <li>Constants</li>
-      <li>Expressions that appear in GROUP BY</li>
-      <li>Aggregate functions</li>
-    </ol>
-  </dd>
-</dl>
+* **boolean_expression**
+
+    Specifies any expression that evaluates to a result type `boolean`. Two or
+    more expressions may be combined together using the logical
+ operators ( `AND`, `OR` ). + + **Note** + + The expressions specified in the `HAVING` clause can only refer to: + 1. Constants + 2. Expressions that appear in GROUP BY + 3. Aggregate functions ### Examples -{% highlight sql %} +```sql CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT); INSERT INTO dealer VALUES (100, 'Fremont', 'Honda Civic', 10), @@ -117,15 +113,15 @@ SELECT sum(quantity) AS sum FROM dealer HAVING sum(quantity) > 10; +---+ | 78| +---+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select-hints.md b/docs/sql-ref-syntax-qry-select-hints.md index 16f4f95f90ea1..4bb48b08d5e3b 100644 --- a/docs/sql-ref-syntax-qry-select-hints.md +++ b/docs/sql-ref-syntax-qry-select-hints.md @@ -23,39 +23,33 @@ license: | Join Hints allow users to suggest the join strategy that Spark should use. Prior to Spark 3.0, only the `BROADCAST` Join Hint was supported. `MERGE`, `SHUFFLE_HASH` and `SHUFFLE_REPLICATE_NL` Joint Hints support was added in 3.0. When different join strategy hints are specified on both sides of a join, Spark prioritizes hints in the following order: `BROADCAST` over `MERGE` over `SHUFFLE_HASH` over `SHUFFLE_REPLICATE_NL`. When both sides are specified with the `BROADCAST` hint or the `SHUFFLE_HASH` hint, Spark will pick the build side based on the join type and the sizes of the relations. Since a given strategy may not support all join types, Spark is not guaranteed to use the join strategy suggested by the hint. +### Syntax + +```sql +/*+ join_hint [ , ... ] */ +``` + ### Join Hints Types -
-<dl>
-  <dt><code><em>BROADCAST</em></code></dt>
-  <dd>
-    Suggests that Spark use broadcast join. The join side with the hint will be broadcast regardless of <code>autoBroadcastJoinThreshold</code>. If both sides of the join have the broadcast hints, the one with the smaller size (based on stats) will be broadcast. The aliases for <code>BROADCAST</code> are <code>BROADCASTJOIN</code> and <code>MAPJOIN</code>.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>MERGE</em></code></dt>
-  <dd>
-    Suggests that Spark use shuffle sort merge join. The aliases for <code>MERGE</code> are <code>SHUFFLE_MERGE</code> and <code>MERGEJOIN</code>.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>SHUFFLE_HASH</em></code></dt>
-  <dd>
-    Suggests that Spark use shuffle hash join. If both sides have the shuffle hash hints, Spark chooses the smaller side (based on stats) as the build side.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>SHUFFLE_REPLICATE_NL</em></code></dt>
-  <dd>
-    Suggests that Spark use shuffle-and-replicate nested loop join.
-  </dd>
-</dl>
+* **BROADCAST**
+
+    Suggests that Spark use broadcast join. The join side with the hint will be broadcast regardless of `autoBroadcastJoinThreshold`. If both sides of the join have the broadcast hints, the one with the smaller size (based on stats) will be broadcast. The aliases for `BROADCAST` are `BROADCASTJOIN` and `MAPJOIN`.
+
+* **MERGE**
+
+    Suggests that Spark use shuffle sort merge join. The aliases for `MERGE` are `SHUFFLE_MERGE` and `MERGEJOIN`.
+
+* **SHUFFLE_HASH**
+
+    Suggests that Spark use shuffle hash join. If both sides have the shuffle hash hints, Spark chooses the smaller side (based on stats) as the build side.
+
+* **SHUFFLE_REPLICATE_NL**
+
+    Suggests that Spark use shuffle-and-replicate nested loop join.
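Whether a hint was honored can be checked by inspecting the physical plan; a sketch, assuming tables `t1` and `t2` as in the examples below:

```sql
-- The selected strategy (e.g. BroadcastHashJoin vs. SortMergeJoin) appears in the plan.
EXPLAIN SELECT /*+ SHUFFLE_HASH(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
```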
### Examples -{% highlight sql %} +```sql -- Join Hints for broadcast join SELECT /*+ BROADCAST(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; SELECT /*+ BROADCASTJOIN (t1) */ * FROM t1 left JOIN t2 ON t1.key = t2.key; @@ -78,10 +72,10 @@ SELECT /*+ SHUFFLE_REPLICATE_NL(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.ke -- Spark will issue Warning in the following example -- org.apache.spark.sql.catalyst.analysis.HintErrorLogger: Hint (strategy=merge) -- is overridden by another hint and will not take effect. -SELECT /*+ BROADCAST(t1) */ /*+ MERGE(t1, t2) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; -{% endhighlight %} +SELECT /*+ BROADCAST(t1), MERGE(t1, t2) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key; +``` ### Related Statements - * [JOIN](sql-ref-syntax-qry-select-join.html) - * [SELECT](sql-ref-syntax-qry-select.html) +* [JOIN](sql-ref-syntax-qry-select-join.html) +* [SELECT](sql-ref-syntax-qry-select.html) diff --git a/docs/sql-ref-syntax-qry-select-inline-table.md b/docs/sql-ref-syntax-qry-select-inline-table.md index 9c33cbc679f06..38ecc3da5e14e 100644 --- a/docs/sql-ref-syntax-qry-select-inline-table.md +++ b/docs/sql-ref-syntax-qry-select-inline-table.md @@ -25,32 +25,24 @@ An inline table is a temporary table created using a VALUES clause. ### Syntax -{% highlight sql %} +```sql VALUES ( expression [ , ... ] ) [ table_alias ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>expression</em></code></dt>
-  <dd>
-    Specifies a combination of one or more values, operators and SQL functions that results in a value.
-  </dd>
-</dl>
-
-<dl>
-  <dt><code><em>table_alias</em></code></dt>
-  <dd>
-    Specifies a temporary name with an optional column name list.<br><br>
-    <b>Syntax:</b>
-      <code>
-        [ AS ] table_name [ ( column_name [ , ... ] ) ]
-      </code>
-  </dd>
-</dl>
+* **expression**
+
+    Specifies a combination of one or more values, operators and SQL functions that results in a value.
+
+* **table_alias**
+
+    Specifies a temporary name with an optional column name list.
+
+ **Syntax:** `[ AS ] table_name [ ( column_name [ , ... ] ) ]` ### Examples -{% highlight sql %} +```sql -- single row, without a table alias SELECT * FROM VALUES ("one", 1); +----+----+ @@ -77,8 +69,8 @@ SELECT * FROM VALUES ("one", array(0, 1)), ("two", array(2, 3)) AS data(a, b); |one|[0, 1]| |two|[2, 3]| +---+------+ -{% endhighlight %} +``` -### Related Statement +### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) +* [SELECT](sql-ref-syntax-qry-select.html) diff --git a/docs/sql-ref-syntax-qry-select-join.md b/docs/sql-ref-syntax-qry-select-join.md index 0b1bb1eb8fd61..28b21f5e3f0ff 100644 --- a/docs/sql-ref-syntax-qry-select-join.md +++ b/docs/sql-ref-syntax-qry-select-join.md @@ -25,118 +25,95 @@ A SQL join is used to combine rows from two relations based on join criteria. Th ### Syntax -{% highlight sql %} +```sql relation { [ join_type ] JOIN relation [ join_criteria ] | NATURAL join_type JOIN relation } -{% endhighlight %} +``` ### Parameters -
-
relation
-
+* **relation** + Specifies the relation to be joined. -
-
join_type
-
- Specifies the join type.

- Syntax:
- - [ INNER ] - | CROSS - | LEFT [ OUTER ] - | [ LEFT ] SEMI - | RIGHT [ OUTER ] - | FULL [ OUTER ] - | [ LEFT ] ANTI - -
-
join_criteria
-
- Specifies how the rows from one relation will be combined with the rows of another relation.

- Syntax: - - ON boolean_expression | USING ( column_name [ , column_name ... ] ) -

- boolean_expression
- Specifies an expression with a return type of boolean. -
-
+ +* **join_type** + + Specifies the join type. + + **Syntax:** + + `[ INNER ] | CROSS | LEFT [ OUTER ] | [ LEFT ] SEMI | RIGHT [ OUTER ] | FULL [ OUTER ] | [ LEFT ] ANTI` + +* **join_criteria** + + Specifies how the rows from one relation will be combined with the rows of another relation. + + **Syntax:** `ON boolean_expression | USING ( column_name [ , ... ] )` + + `boolean_expression` + + Specifies an expression with a return type of boolean. ### Join Types -#### Inner Join - -
### Join Types

-#### Inner Join
-
-The inner join is the default join in Spark SQL. It selects rows that have matching values in both relations.
-
-<em>Syntax:</em>
-<code>relation [ INNER ] JOIN relation [ join_criteria ]</code>
-
-#### Left Join
-
-A left join returns all values from the left relation and the matched values from the right relation, or appends NULL if there is no match. It is also referred to as a left outer join.
-
-<em>Syntax:</em>
-<code>relation LEFT [ OUTER ] JOIN relation [ join_criteria ]</code>
-
-#### Right Join
-
-A right join returns all values from the right relation and the matched values from the left relation, or appends NULL if there is no match. It is also referred to as a right outer join.
-
-<em>Syntax:</em>
-<code>relation RIGHT [ OUTER ] JOIN relation [ join_criteria ]</code>
-
-#### Full Join
-
-A full join returns all values from both relations, appending NULL values on the side that does not have a match. It is also referred to as a full outer join.
-
-<em>Syntax:</em>
-<code>relation FULL [ OUTER ] JOIN relation [ join_criteria ]</code>
-
-#### Cross Join
-
-A cross join returns the Cartesian product of two relations.
-
-<em>Syntax:</em>
-<code>relation CROSS JOIN relation [ join_criteria ]</code>
-
-#### Semi Join
-
-A semi join returns values from the left side of the relation that has a match with the right. It is also referred to as a left semi join.
-
-<em>Syntax:</em>
-<code>relation [ LEFT ] SEMI JOIN relation [ join_criteria ]</code>
-
-#### Anti Join
-
-An anti join returns values from the left relation that has no match with the right. It is also referred to as a left anti join.
-
-<em>Syntax:</em>
-<code>relation [ LEFT ] ANTI JOIN relation [ join_criteria ]</code>
+#### **Inner Join** + +The inner join is the default join in Spark SQL. It selects rows that have matching values in both relations. + +**Syntax:** + +`relation [ INNER ] JOIN relation [ join_criteria ]` + +#### **Left Join** + +A left join returns all values from the left relation and the matched values from the right relation, or appends NULL if there is no match. It is also referred to as a left outer join. + +**Syntax:** + +`relation LEFT [ OUTER ] JOIN relation [ join_criteria ]` + +#### **Right Join** + +A right join returns all values from the right relation and the matched values from the left relation, or appends NULL if there is no match. It is also referred to as a right outer join. + +**Syntax:** + +`relation RIGHT [ OUTER ] JOIN relation [ join_criteria ]` + +#### **Full Join** + +A full join returns all values from both relations, appending NULL values on the side that does not have a match. It is also referred to as a full outer join. + +**Syntax:** + +`relation FULL [ OUTER ] JOIN relation [ join_criteria ]` + +#### **Cross Join** + +A cross join returns the Cartesian product of two relations. + +**Syntax:** + +`relation CROSS JOIN relation [ join_criteria ]` + +#### **Semi Join** + +A semi join returns values from the left side of the relation that has a match with the right. It is also referred to as a left semi join. + +**Syntax:** + +`relation [ LEFT ] SEMI JOIN relation [ join_criteria ]` + +#### **Anti Join** + +An anti join returns values from the left relation that has no match with the right. It is also referred to as a left anti join. + +**Syntax:** + +`relation [ LEFT ] ANTI JOIN relation [ join_criteria ]` ### Examples -{% highlight sql %} +```sql -- Use employee and department tables to demonstrate different type of joins. SELECT * FROM employee; +---+-----+------+ @@ -253,9 +230,9 @@ SELECT * FROM employee ANTI JOIN department ON employee.deptno = department.dept |104| Evan| 4| |106| Amy| 6| +---+-----+------+ -{% endhighlight %} +``` ### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) - * [Join Hints](sql-ref-syntax-qry-select-hints.html) +* [SELECT](sql-ref-syntax-qry-select.html) +* [Join Hints](sql-ref-syntax-qry-select-hints.html) diff --git a/docs/sql-ref-syntax-qry-select-like.md b/docs/sql-ref-syntax-qry-select-like.md index 408673c532ddd..feb5eb7b3c80d 100644 --- a/docs/sql-ref-syntax-qry-select-like.md +++ b/docs/sql-ref-syntax-qry-select-like.md @@ -25,38 +25,30 @@ A LIKE predicate is used to search for a specific pattern. ### Syntax -{% highlight sql %} +```sql [ NOT ] { LIKE search_pattern [ ESCAPE esc_char ] | RLIKE regex_pattern } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>search_pattern</em></code></dt>
-  <dd>
-    Specifies a string pattern to be searched by the LIKE clause. It can contain special pattern-matching characters:
-    <ul>
-      <li><code>%</code> matches zero or more characters.</li>
-      <li><code>_</code> matches exactly one character.</li>
-    </ul>
-  </dd>
-  <dt><code><em>esc_char</em></code></dt>
-  <dd>
-    Specifies the escape character. The default escape character is <code>\</code>.
-  </dd>
-  <dt><code><em>regex_pattern</em></code></dt>
-  <dd>
-    Specifies a regular expression search pattern to be searched by the RLIKE clause.
-  </dd>
-</dl>
+* **search_pattern** + + Specifies a string pattern to be searched by the `LIKE` clause. It can contain special pattern-matching characters: + + * `%` matches zero or more characters. + * `_` matches exactly one character. + +* **esc_char** + + Specifies the escape character. The default escape character is `\`. + +* **regex_pattern** + + Specifies a regular expression search pattern to be searched by the `RLIKE` clause. ### Examples -{% highlight sql %} +```sql CREATE TABLE person (id INT, name STRING, age INT); INSERT INTO person VALUES (100, 'John', 30), @@ -90,12 +82,11 @@ SELECT * FROM person WHERE name NOT LIKE 'M_ry'; |400| Dan| 50| +---+------+---+ -SELECT * FROM person WHERE name RLIKE '[MD]'; +SELECT * FROM person WHERE name RLIKE 'M+'; +---+----+----+ | id|name| age| +---+----+----+ |300|Mike| 80| -|400| Dan| 50| |200|Mary|null| +---+----+----+ @@ -112,9 +103,9 @@ SELECT * FROM person WHERE name LIKE '%$_%' ESCAPE '$'; +---+------+---+ |500|Evan_W| 16| +---+------+---+ -{% endhighlight %} +``` ### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [SELECT](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) diff --git a/docs/sql-ref-syntax-qry-select-limit.md b/docs/sql-ref-syntax-qry-select-limit.md index eaeaed068102f..ec3532214084b 100644 --- a/docs/sql-ref-syntax-qry-select-limit.md +++ b/docs/sql-ref-syntax-qry-select-limit.md @@ -21,34 +21,31 @@ license: | ### Description -The LIMIT clause is used to constrain the number of rows returned by +The `LIMIT` clause is used to constrain the number of rows returned by the [SELECT](sql-ref-syntax-qry-select.html) statement. In general, this clause is used in conjunction with [ORDER BY](sql-ref-syntax-qry-select-orderby.html) to ensure that the results are deterministic. ### Syntax -{% highlight sql %} +```sql LIMIT { ALL | integer_expression } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>ALL</em></code></dt>
-  <dd>
-    If specified, the query returns all the rows. In other words, no limit is applied if this option is specified.
-  </dd>
-  <dt><code><em>integer_expression</em></code></dt>
-  <dd>
-    Specifies a foldable expression that returns an integer.
-  </dd>
-</dl>
+* **ALL**
+
+    If specified, the query returns all the rows. In other words, no limit is applied if this option is specified.
+
+* **integer_expression**
+
+    Specifies a foldable expression that returns an integer.
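For illustration, a sketch of both parameter forms, assuming the `person` table created in the Examples section:

```sql
-- A constant (foldable) limit: returns at most two rows.
SELECT name, age FROM person ORDER BY name LIMIT 2;

-- LIMIT ALL applies no limit and returns every row.
SELECT name, age FROM person ORDER BY name LIMIT ALL;
```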
### Examples -{% highlight sql %} +```sql CREATE TABLE person (name STRING, age INT); INSERT INTO person VALUES ('Zen Hui', 25), @@ -95,15 +92,15 @@ SELECT name, age FROM person ORDER BY name LIMIT length('SPARK'); -- A non-foldable expression as an input to LIMIT is not allowed. SELECT name, age FROM person ORDER BY name LIMIT length(name); org.apache.spark.sql.AnalysisException: The limit expression must evaluate to a constant value ... -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) diff --git a/docs/sql-ref-syntax-qry-select-orderby.md b/docs/sql-ref-syntax-qry-select-orderby.md index d927177398f7f..85bbe514cdc95 100644 --- a/docs/sql-ref-syntax-qry-select-orderby.md +++ b/docs/sql-ref-syntax-qry-select-orderby.md @@ -21,56 +21,48 @@ license: | ### Description -The ORDER BY clause is used to return the result rows in a sorted manner +The `ORDER BY` clause is used to return the result rows in a sorted manner in the user specified order. Unlike the [SORT BY](sql-ref-syntax-qry-select-sortby.html) clause, this clause guarantees a total order in the output. ### Syntax -{% highlight sql %} +```sql ORDER BY { expression [ sort_direction | nulls_sort_oder ] [ , ... ] } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>ORDER BY</em></code></dt>
-  <dd>
-    Specifies a comma-separated list of expressions along with optional parameters sort_direction
-    and nulls_sort_order which are used to sort the rows.
-  </dd>
-  <dt><code><em>sort_direction</em></code></dt>
-  <dd>
-    Optionally specifies whether to sort the rows in ascending or descending
-    order. The valid values for the sort direction are ASC for ascending
-    and DESC for descending. If sort direction is not explicitly specified, then by default
-    rows are sorted ascending.
-    <br><br><em>Syntax:</em>
-    <code>[ ASC | DESC ]</code>
-  </dd>
-  <dt><code><em>nulls_sort_order</em></code></dt>
-  <dd>
-    Optionally specifies whether NULL values are returned before/after non-NULL values. If
-    null_sort_order is not specified, then NULLs sort first if sort order is
-    ASC and NULLS sort last if sort order is DESC.
-    <ol>
-      <li>If NULLS FIRST is specified, then NULL values are returned first
-      regardless of the sort order.</li>
-      <li>If NULLS LAST is specified, then NULL values are returned last regardless of
-      the sort order.</li>
-    </ol>
-    <br><em>Syntax:</em>
-    <code>[ NULLS { FIRST | LAST } ]</code>
-  </dd>
-</dl>
+* **ORDER BY**
+
+    Specifies a comma-separated list of expressions along with optional parameters `sort_direction`
+    and `nulls_sort_order` which are used to sort the rows.
+
+* **sort_direction**
+
+    Optionally specifies whether to sort the rows in ascending or descending
+    order. The valid values for the sort direction are `ASC` for ascending
+    and `DESC` for descending. If sort direction is not explicitly specified, then by default
+    rows are sorted ascending.
+
+    **Syntax:** `[ ASC | DESC ]`
+
+* **nulls_sort_order**
+
+    Optionally specifies whether NULL values are returned before/after non-NULL values. If
+ `null_sort_order` is not specified, then NULLs sort first if sort order is + `ASC` and NULLS sort last if sort order is `DESC`. + + 1. If `NULLS FIRST` is specified, then NULL values are returned first + regardless of the sort order. + 2. If `NULLS LAST` is specified, then NULL values are returned last regardless of + the sort order. + + **Syntax:** `[ NULLS { FIRST | LAST } ]` ### Examples -{% highlight sql %} +```sql CREATE TABLE person (id INT, name STRING, age INT); INSERT INTO person VALUES (100, 'John', 30), @@ -139,15 +131,15 @@ SELECT * FROM person ORDER BY name ASC, age DESC; |200| Mary|null| |300| Mike| 80| +---+-----+----+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select-setops.md b/docs/sql-ref-syntax-qry-select-setops.md index 98c20941d16bf..8cd12c37fa603 100644 --- a/docs/sql-ref-syntax-qry-select-setops.md +++ b/docs/sql-ref-syntax-qry-select-setops.md @@ -35,13 +35,13 @@ Note that input relations must have the same number of columns and compatible da #### Syntax -{% highlight sql %} +```sql [ ( ] relation [ ) ] EXCEPT | MINUS [ ALL | DISTINCT ] [ ( ] relation [ ) ] -{% endhighlight %} +``` #### Examples -{% highlight sql %} +```sql -- Use number1 and number2 tables to demonstrate set operators in this page. 
SELECT * FROM number1; +---+ @@ -98,7 +98,7 @@ SELECT c FROM number1 MINUS ALL (SELECT c FROM number2); | 3| | 4| +---+ -{% endhighlight %} +``` ### INTERSECT @@ -106,13 +106,13 @@ SELECT c FROM number1 MINUS ALL (SELECT c FROM number2); #### Syntax -{% highlight sql %} +```sql [ ( ] relation [ ) ] INTERSECT [ ALL | DISTINCT ] [ ( ] relation [ ) ] -{% endhighlight %} +``` #### Examples -{% highlight sql %} +```sql (SELECT c FROM number1) INTERSECT (SELECT c FROM number2); +---+ | c| @@ -137,7 +137,7 @@ SELECT c FROM number1 MINUS ALL (SELECT c FROM number2); | 2| | 2| +---+ -{% endhighlight %} +``` ### UNION @@ -145,13 +145,13 @@ SELECT c FROM number1 MINUS ALL (SELECT c FROM number2); #### Syntax -{% highlight sql %} +```sql [ ( ] relation [ ) ] UNION [ ALL | DISTINCT ] [ ( ] relation [ ) ] -{% endhighlight %} +``` ### Examples -{% highlight sql %} +```sql (SELECT c FROM number1) UNION (SELECT c FROM number2); +---+ | c| @@ -189,8 +189,8 @@ SELECT c FROM number1 UNION ALL (SELECT c FROM number2); | 2| | 2| +---+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Statement](sql-ref-syntax-qry-select.html) +* [SELECT Statement](sql-ref-syntax-qry-select.html) diff --git a/docs/sql-ref-syntax-qry-select-sortby.md b/docs/sql-ref-syntax-qry-select-sortby.md index 1dfa10429709e..554bdb569d005 100644 --- a/docs/sql-ref-syntax-qry-select-sortby.md +++ b/docs/sql-ref-syntax-qry-select-sortby.md @@ -21,58 +21,50 @@ license: | ### Description -The SORT BY clause is used to return the result rows sorted +The `SORT BY` clause is used to return the result rows sorted within each partition in the user specified order. When there is more than one partition -SORT BY may return result that is partially ordered. This is different +`SORT BY` may return result that is partially ordered. This is different than [ORDER BY](sql-ref-syntax-qry-select-orderby.html) clause which guarantees a total order of the output. ### Syntax -{% highlight sql %} +```sql SORT BY { expression [ sort_direction | nulls_sort_order ] [ , ... ] } -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>SORT BY</em></code></dt>
-  <dd>
-    Specifies a comma-separated list of expressions along with optional parameters sort_direction
-    and nulls_sort_order which are used to sort the rows within each partition.
-  </dd>
-  <dt><code><em>sort_direction</em></code></dt>
-  <dd>
-    Optionally specifies whether to sort the rows in ascending or descending
-    order. The valid values for the sort direction are ASC for ascending
-    and DESC for descending. If sort direction is not explicitly specified, then by default
-    rows are sorted ascending.
-    <br><br><em>Syntax:</em>
-    <code>[ ASC | DESC ]</code>
-  </dd>
-  <dt><code><em>nulls_sort_order</em></code></dt>
-  <dd>
-    Optionally specifies whether NULL values are returned before/after non-NULL values. If
-    null_sort_order is not specified, then NULLs sort first if sort order is
-    ASC and NULLS sort last if sort order is DESC.
-    <ol>
-      <li>If NULLS FIRST is specified, then NULL values are returned first
-      regardless of the sort order.</li>
-      <li>If NULLS LAST is specified, then NULL values are returned last regardless of
-      the sort order.</li>
-    </ol>
-    <br><em>Syntax:</em>
-    <code>[ NULLS { FIRST | LAST } ]</code>
-  </dd>
-</dl>
+* **SORT BY**
+
+    Specifies a comma-separated list of expressions along with optional parameters `sort_direction`
+    and `nulls_sort_order` which are used to sort the rows within each partition.
+
+* **sort_direction**
+
+    Optionally specifies whether to sort the rows in ascending or descending
+    order. The valid values for the sort direction are `ASC` for ascending
+    and `DESC` for descending. If sort direction is not explicitly specified, then by default
+    rows are sorted ascending.
+
+    **Syntax:** `[ ASC | DESC ]`
+
+* **nulls_sort_order**
+
+    Optionally specifies whether NULL values are returned before/after non-NULL values. If
+ `null_sort_order` is not specified, then NULLs sort first if sort order is + `ASC` and NULLS sort last if sort order is `DESC`. + + 1. If `NULLS FIRST` is specified, then NULL values are returned first + regardless of the sort order. + 2. If `NULLS LAST` is specified, then NULL values are returned last regardless of + the sort order. + + **Syntax:** `[ NULLS { FIRST | LAST } ]` ### Examples -{% highlight sql %} +```sql CREATE TABLE person (zip_code INT, name STRING, age INT); INSERT INTO person VALUES (94588, 'Zen Hui', 50), @@ -172,15 +164,15 @@ SELECT /*+ REPARTITION(zip_code) */ name, age, zip_code FROM person | David K| 42| 94511| |Lalit B.|null| 94511| +--------+----+--------+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select-tvf.md b/docs/sql-ref-syntax-qry-select-tvf.md index ac8fecaae6413..89ad01aff2167 100644 --- a/docs/sql-ref-syntax-qry-select-tvf.md +++ b/docs/sql-ref-syntax-qry-select-tvf.md @@ -25,28 +25,21 @@ A table-valued function (TVF) is a function that returns a relation or a set of ### Syntax -{% highlight sql %} +```sql function_name ( expression [ , ... ] ) [ table_alias ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>expression</em></code></dt>
-  <dd>
-    Specifies a combination of one or more values, operators and SQL functions that results in a value.
-  </dd>
-  <dt><code><em>table_alias</em></code></dt>
-  <dd>
-    Specifies a temporary name with an optional column name list.
-    <br><br><em>Syntax:</em><br>
-    <code>[ AS ] table_name [ ( column_name [ , ... ] ) ]</code>
-  </dd>
-</dl>
+ +* **table_alias** + + Specifies a temporary name with an optional column name list. + + **Syntax:** `[ AS ] table_name [ ( column_name [ , ... ] ) ]` ### Supported Table-valued Functions @@ -78,7 +71,7 @@ function_name ( expression [ , ... ] ) [ table_alias ] ### Examples -{% highlight sql %} +```sql -- range call with end SELECT * FROM range(6 + cos(3)); +---+ @@ -124,8 +117,8 @@ SELECT * FROM range(5, 8) AS test; | 6| | 7| +---+ -{% endhighlight %} +``` -### Related Statement +### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) +* [SELECT](sql-ref-syntax-qry-select.html) diff --git a/docs/sql-ref-syntax-qry-select-usedb.md b/docs/sql-ref-syntax-qry-select-usedb.md index bb95a8e4ddf30..4119a4927bc9f 100644 --- a/docs/sql-ref-syntax-qry-select-usedb.md +++ b/docs/sql-ref-syntax-qry-select-usedb.md @@ -28,32 +28,29 @@ The default database name is 'default'. ### Syntax -{% highlight sql %} +```sql USE database_name -{% endhighlight %} +``` ### Parameter -
-<dl>
-  <dt><code><em>database_name</em></code></dt>
-  <dd>
-    Name of the database will be used. If the database does not exist, an exception will be thrown.
-  </dd>
-</dl>
+* **database_name** + + Name of the database will be used. If the database does not exist, an exception will be thrown. ### Examples -{% highlight sql %} +```sql -- Use the 'userdb' which exists. USE userdb; -- Use the 'userdb1' which doesn't exist USE userdb1; Error: org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: Database 'userdb1' not found;(state=,code=0) -{% endhighlight %} +``` ### Related Statements - * [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) - * [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) - * [CREATE TABLE ](sql-ref-syntax-ddl-create-table.html) +* [CREATE DATABASE](sql-ref-syntax-ddl-create-database.html) +* [DROP DATABASE](sql-ref-syntax-ddl-drop-database.html) +* [CREATE TABLE ](sql-ref-syntax-ddl-create-table.html) diff --git a/docs/sql-ref-syntax-qry-select-where.md b/docs/sql-ref-syntax-qry-select-where.md index 360313fcfff1c..ca3f5ec7866c6 100644 --- a/docs/sql-ref-syntax-qry-select-where.md +++ b/docs/sql-ref-syntax-qry-select-where.md @@ -21,29 +21,26 @@ license: | ### Description -The WHERE clause is used to limit the results of the FROM +The `WHERE` clause is used to limit the results of the `FROM` clause of a query or a subquery based on the specified condition. ### Syntax -{% highlight sql %} +```sql WHERE boolean_expression -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>boolean_expression</em></code></dt>
-  <dd>
-    Specifies any expression that evaluates to a result type boolean. Two or
-    more expressions may be combined together using the logical
-    operators ( AND, OR ).
-  </dd>
-</dl>
+ operators ( `AND`, `OR` ). ### Examples -{% highlight sql %} +```sql CREATE TABLE person (id INT, name STRING, age INT); INSERT INTO person VALUES (100, 'John', 30), @@ -116,15 +113,15 @@ SELECT * FROM person AS parent +---+----+----+ |200|Mary|null| +---+----+----+ -{% endhighlight %} +``` ### Related Statements - * [SELECT Main](sql-ref-syntax-qry-select.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index bc2cc0269124e..1aeecdb982c4c 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -28,7 +28,7 @@ of a query along with examples. ### Syntax -{% highlight sql %} +```sql [ WITH with_query [ , ... ] ] select_statement [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_statement, ... ] [ ORDER BY { expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [ , ...] } ] @@ -37,126 +37,125 @@ select_statement [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_stat [ DISTRIBUTE BY { expression [, ...] } ] [ WINDOW { named_window [ , WINDOW named_window, ... ] } ] [ LIMIT { ALL | expression } ] -{% endhighlight %} +``` While `select_statement` is defined as -{% highlight sql %} +```sql SELECT [ hints , ... ] [ ALL | DISTINCT ] { named_expression [ , ... ] } FROM { from_item [ , ...] } [ WHERE boolean_expression ] [ GROUP BY expression [ , ...] ] [ HAVING boolean_expression ] -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>with_query</em></code></dt>
-  <dd>
-    Specifies the common table expressions (CTEs) before the main query block.
-    These table expressions are allowed to be referenced later in the FROM clause. This is useful to abstract
-    out repeated subquery blocks in the FROM clause and improves readability of the query.
-  </dd>
-  <dt><code><em>hints</em></code></dt>
-  <dd>
-    Hints can be specified to help spark optimizer make better planning decisions. Currently spark supports hints
-    that influence selection of join strategies and repartitioning of the data.
-  </dd>
-  <dt><code><em>ALL</em></code></dt>
-  <dd>Select all matching rows from the relation and is enabled by default.</dd>
-  <dt><code><em>DISTINCT</em></code></dt>
-  <dd>Select all matching rows from the relation after removing duplicates in results.</dd>
-  <dt><code><em>named_expression</em></code></dt>
-  <dd>
-    An expression with an assigned name. In general, it denotes a column expression.
-    <br><br><em>Syntax:</em>
-    <code>expression [AS] [alias]</code>
-  </dd>
-  <dt><code><em>from_item</em></code></dt>
-  <dd>
-    Specifies a source of input for the query. It can be one of the following:
-    <ol>
-      <li>Table relation</li>
-      <li>Join relation</li>
-      <li>Table-value function</li>
-      <li>Inline table</li>
-      <li>Subquery</li>
-    </ol>
-  </dd>
-  <dt><code><em>WHERE</em></code></dt>
-  <dd>Filters the result of the FROM clause based on the supplied predicates.</dd>
-  <dt><code><em>GROUP BY</em></code></dt>
-  <dd>
-    Specifies the expressions that are used to group the rows. This is used in conjunction with aggregate functions
-    (MIN, MAX, COUNT, SUM, AVG, etc.) to group rows based on the grouping expressions and aggregate values in each group.
-    When a FILTER clause is attached to an aggregate function, only the matching rows are passed to that function.
-  </dd>
-  <dt><code><em>HAVING</em></code></dt>
-  <dd>
-    Specifies the predicates by which the rows produced by GROUP BY are filtered. The HAVING clause is used to
-    filter rows after the grouping is performed. If HAVING is specified without GROUP BY, it indicates a GROUP BY
-    without grouping expressions (global aggregate).
-  </dd>
-  <dt><code><em>ORDER BY</em></code></dt>
-  <dd>
-    Specifies an ordering of the rows of the complete result set of the query. The output rows are ordered
-    across the partitions. This parameter is mutually exclusive with SORT BY,
-    CLUSTER BY and DISTRIBUTE BY and can not be specified together.
-  </dd>
-  <dt><code><em>SORT BY</em></code></dt>
-  <dd>
-    Specifies an ordering by which the rows are ordered within each partition. This parameter is mutually
-    exclusive with ORDER BY and CLUSTER BY and can not be specified together.
-  </dd>
-  <dt><code><em>CLUSTER BY</em></code></dt>
-  <dd>
-    Specifies a set of expressions that is used to repartition and sort the rows. Using this clause has
-    the same effect of using DISTRIBUTE BY and SORT BY together.
-  </dd>
-  <dt><code><em>DISTRIBUTE BY</em></code></dt>
-  <dd>
-    Specifies a set of expressions by which the result rows are repartitioned. This parameter is mutually
-    exclusive with ORDER BY and CLUSTER BY and can not be specified together.
-  </dd>
-  <dt><code><em>LIMIT</em></code></dt>
-  <dd>
-    Specifies the maximum number of rows that can be returned by a statement or subquery. This clause
-    is mostly used in the conjunction with ORDER BY to produce a deterministic result.
-  </dd>
-  <dt><code><em>boolean_expression</em></code></dt>
-  <dd>Specifies an expression with a return type of boolean.</dd>
-  <dt><code><em>expression</em></code></dt>
-  <dd>Specifies a combination of one or more values, operators, and SQL functions that evaluates to a value.</dd>
-  <dt><code><em>named_window</em></code></dt>
-  <dd>
-    Specifies aliases for one or more source window specifications. The source window specifications can
-    be referenced in the widow definitions in the query.
-  </dd>
-</dl>
+* **with_query**
+
+    Specifies the [common table expressions (CTEs)](sql-ref-syntax-qry-select-cte.html) before the main query block.
+    These table expressions are allowed to be referenced later in the FROM clause. This is useful to abstract
+    out repeated subquery blocks in the FROM clause and improves readability of the query.
+
+* **hints**
+
+    Hints can be specified to help the Spark optimizer make better planning decisions. Currently Spark supports hints
+    that influence selection of join strategies and repartitioning of the data.
+
+* **ALL**
+
+    Select all matching rows from the relation and is enabled by default.
+
+* **DISTINCT**
+
+    Select all matching rows from the relation after removing duplicates in results.
+
+* **named_expression**
+
+    An expression with an assigned name. In general, it denotes a column expression.
+
+    **Syntax:** `expression [AS] [alias]`
+
+* **from_item**
+
+    Specifies a source of input for the query. It can be one of the following:
+    * Table relation
+    * [Join relation](sql-ref-syntax-qry-select-join.html)
+    * [Table-valued function](sql-ref-syntax-qry-select-tvf.html)
+    * [Inline table](sql-ref-syntax-qry-select-inline-table.html)
+    * Subquery
+
+* **WHERE**
+
+    Filters the result of the FROM clause based on the supplied predicates.
+
+* **GROUP BY**
+
+    Specifies the expressions that are used to group the rows. This is used in conjunction with aggregate functions
+    (MIN, MAX, COUNT, SUM, AVG, etc.) to group rows based on the grouping expressions and aggregate values in each group.
+    When a FILTER clause is attached to an aggregate function, only the matching rows are passed to that function.
+
+* **HAVING**
+
+    Specifies the predicates by which the rows produced by GROUP BY are filtered. The HAVING clause is used to
+    filter rows after the grouping is performed. If HAVING is specified without GROUP BY, it indicates a GROUP BY
+    without grouping expressions (global aggregate).
+
+* **ORDER BY**
+
+    Specifies an ordering of the rows of the complete result set of the query. The output rows are ordered
+    across the partitions. This parameter is mutually exclusive with `SORT BY`,
+    `CLUSTER BY` and `DISTRIBUTE BY` and cannot be specified together.
+
+* **SORT BY**
+
+    Specifies an ordering by which the rows are ordered within each partition. This parameter is mutually
+    exclusive with `ORDER BY` and `CLUSTER BY` and cannot be specified together.
+
+* **CLUSTER BY**
+
+    Specifies a set of expressions that is used to repartition and sort the rows. Using this clause has
+    the same effect as using `DISTRIBUTE BY` and `SORT BY` together.
+
+* **DISTRIBUTE BY**
+
+    Specifies a set of expressions by which the result rows are repartitioned. This parameter is mutually
+    exclusive with `ORDER BY` and `CLUSTER BY` and cannot be specified together.
+
+* **LIMIT**
+
+    Specifies the maximum number of rows that can be returned by a statement or subquery. This clause
+    is mostly used in conjunction with `ORDER BY` to produce a deterministic result.
+
+* **boolean_expression**
+
+    Specifies an expression with a return type of boolean.
+
+* **expression**
+
+    Specifies a combination of one or more values, operators, and SQL functions that evaluates to a value.
+
+* **named_window**
+
+    Specifies aliases for one or more source window specifications. The source window specifications can
+    be referenced in the window definitions in the query.
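For illustration, a minimal sketch combining several of these clauses, assuming a hypothetical `dealer` table with `city` and `amount` columns:

```sql
-- Filter rows, group them by city, keep only groups whose total exceeds 100,
-- and return the three largest totals in descending order.
SELECT city, sum(amount) AS total
FROM dealer
WHERE amount > 0
GROUP BY city
HAVING sum(amount) > 100
ORDER BY total DESC
LIMIT 3;
```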
### Related Statements - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) - * [TABLESAMPLE](sql-ref-syntax-qry-sampling.html) - * [JOIN](sql-ref-syntax-qry-select-join.html) - * [SET Operators](sql-ref-syntax-qry-select-setops.html) - * [Common Table Expression](sql-ref-syntax-qry-select-cte.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [Common Table Expression](sql-ref-syntax-qry-select-cte.html) +* [Inline Table](sql-ref-syntax-qry-select-inline-table.html) +* [JOIN](sql-ref-syntax-qry-select-join.html) +* [Join Hints](sql-ref-syntax-qry-select-hints.html) +* [LIKE Predicate](sql-ref-syntax-qry-select-like.html) +* [Set Operators](sql-ref-syntax-qry-select-setops.html) +* [TABLESAMPLE](sql-ref-syntax-qry-sampling.html) +* [Table-valued Function](sql-ref-syntax-qry-select-tvf.html) +* [Window Function](sql-ref-syntax-qry-window.html) diff --git a/docs/sql-ref-syntax-qry-window.md b/docs/sql-ref-syntax-qry-window.md index e3762925760e2..9c03b65fec3eb 100644 --- a/docs/sql-ref-syntax-qry-window.md +++ b/docs/sql-ref-syntax-qry-window.md @@ -25,67 +25,52 @@ Window functions operate on a group of rows, referred to as a window, and calcul ### Syntax -{% highlight sql %} +```sql window_function OVER ( [ { PARTITION | DISTRIBUTE } BY partition_col_name = partition_col_val ( [ , ... ] ) ] { ORDER | SORT } BY expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [ , ... ] [ window_frame ] ) -{% endhighlight %} +``` ### Parameters -
-<dl>
-  <dt><code><em>window_function</em></code></dt>
-  <dd>
-    <ul>
-      <li>Ranking Functions<br><br>
-        <em>Syntax:</em>
-        <code>RANK | DENSE_RANK | PERCENT_RANK | NTILE | ROW_NUMBER</code>
-      </li>
-      <li>Analytic Functions<br><br>
-        <em>Syntax:</em>
-        <code>CUME_DIST | LAG | LEAD</code>
-      </li>
-      <li>Aggregate Functions<br><br>
-        <em>Syntax:</em>
-        <code>MAX | MIN | COUNT | SUM | AVG | ...</code><br><br>
-        Please refer to the Built-in Functions document for a complete list of Spark aggregate functions.
-      </li>
-    </ul>
-  </dd>
-  <dt><code><em>window_frame</em></code></dt>
-  <dd>
-    Specifies which row to start the window on and where to end it.<br>
-    <em>Syntax:</em><br>
-    <code>{ RANGE | ROWS } { frame_start | BETWEEN frame_start AND frame_end }</code><br>
-    If frame_end is omitted it defaults to CURRENT ROW.<br><br>
-    frame_start and frame_end have the following syntax:<br>
-    <em>Syntax:</em>
-    <code>UNBOUNDED PRECEDING | offset PRECEDING | CURRENT ROW | offset FOLLOWING | UNBOUNDED FOLLOWING</code><br>
-    offset: specifies the offset from the position of the current row.
-  </dd>
-</dl>
+* **window_function**
+
+    * Ranking Functions
+
+      **Syntax:** `RANK | DENSE_RANK | PERCENT_RANK | NTILE | ROW_NUMBER`
+
+    * Analytic Functions
+
+      **Syntax:** `CUME_DIST | LAG | LEAD`
+
+    * Aggregate Functions
+
+      **Syntax:** `MAX | MIN | COUNT | SUM | AVG | ...`
+
+      Please refer to the [Built-in Aggregation Functions](sql-ref-functions-builtin.html#aggregate-functions) document for a complete list of Spark aggregate functions.
+
+* **window_frame**
+
+    Specifies which row to start the window on and where to end it.
+ + **Syntax:** + + `{ RANGE | ROWS } { frame_start | BETWEEN frame_start AND frame_end }` + + If frame_end is omitted it defaults to CURRENT ROW. + + `frame_start` and `frame_end` have the following syntax + + **Syntax:** + + `UNBOUNDED PRECEDING | offset PRECEDING | CURRENT ROW | offset FOLLOWING | UNBOUNDED FOLLOWING` + + `offset:` specifies the offset from the position of the current row. ### Examples -{% highlight sql %} +```sql CREATE TABLE employees (name STRING, dept STRING, salary INT, age INT); INSERT INTO employees VALUES ("Lisa", "Sales", 10000, 35); @@ -199,8 +184,8 @@ SELECT name, salary, | Jane| Marketing| 29000|29000|35000| | Jeff| Marketing| 35000|29000| 0| +-----+-----------+------+-----+-----+ -{% endhighlight %} +``` ### Related Statements - * [SELECT](sql-ref-syntax-qry-select.html) +* [SELECT](sql-ref-syntax-qry-select.html) diff --git a/docs/sql-ref-syntax-qry.md b/docs/sql-ref-syntax-qry.md index 325c9b69f12f9..1171fead55e30 100644 --- a/docs/sql-ref-syntax-qry.md +++ b/docs/sql-ref-syntax-qry.md @@ -27,20 +27,21 @@ to SELECT are also included in this section. Spark also provides the ability to generate logical and physical plan for a given query using [EXPLAIN](sql-ref-syntax-qry-explain.html) statement. - * [WHERE Clause](sql-ref-syntax-qry-select-where.html) - * [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) - * [HAVING Clause](sql-ref-syntax-qry-select-having.html) - * [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) - * [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) - * [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) - * [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) - * [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) - * [JOIN](sql-ref-syntax-qry-select-join.html) - * [Join Hints](sql-ref-syntax-qry-select-hints.html) - * [Set Operators](sql-ref-syntax-qry-select-setops.html) - * [TABLESAMPLE](sql-ref-syntax-qry-sampling.html) - * [Table-valued Function](sql-ref-syntax-qry-select-tvf.html) - * [Inline Table](sql-ref-syntax-qry-select-inline-table.html) - * [Common Table Expression](sql-ref-syntax-qry-select-cte.html) - * [Window Function](sql-ref-syntax-qry-window.html) - * [EXPLAIN Statement](sql-ref-syntax-qry-explain.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [CLUSTER BY Clause](sql-ref-syntax-qry-select-clusterby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [Common Table Expression](sql-ref-syntax-qry-select-cte.html) +* [Inline Table](sql-ref-syntax-qry-select-inline-table.html) +* [JOIN](sql-ref-syntax-qry-select-join.html) +* [Join Hints](sql-ref-syntax-qry-select-hints.html) +* [LIKE Predicate](sql-ref-syntax-qry-select-like.html) +* [Set Operators](sql-ref-syntax-qry-select-setops.html) +* [TABLESAMPLE](sql-ref-syntax-qry-sampling.html) +* [Table-valued Function](sql-ref-syntax-qry-select-tvf.html) +* [Window Function](sql-ref-syntax-qry-window.html) +* [EXPLAIN Statement](sql-ref-syntax-qry-explain.html) From fad1bd029c6023db3287c90363b6fbd46b17d113 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 09:13:37 -0700 Subject: [PATCH 02/19] fix --- docs/sql-ref-syntax-aux-analyze-table.md | 2 +- 
docs/sql-ref-syntax-aux-show-functions.md | 2 +- docs/sql-ref-syntax-aux-show.md | 4 ++-- docs/sql-ref-syntax-ddl-alter-table.md | 4 ---- docs/sql-ref-syntax-ddl-create-function.md | 6 ++++-- docs/sql-ref-syntax-ddl-create-table-datasource.md | 2 +- 6 files changed, 9 insertions(+), 11 deletions(-) diff --git a/docs/sql-ref-syntax-aux-analyze-table.md b/docs/sql-ref-syntax-aux-analyze-table.md index a8e11303432ba..8f43d7388d7db 100644 --- a/docs/sql-ref-syntax-aux-analyze-table.md +++ b/docs/sql-ref-syntax-aux-analyze-table.md @@ -47,7 +47,7 @@ ANALYZE TABLE table_identifier [ partition_spec ] * **[ NOSCAN `|` FOR COLUMNS col [ , ... ] `|` FOR ALL COLUMNS ]** - * If no analyze option is specified, `ANALYZE TABLE` collects the table's number of rows and size in bytes. + * If no analyze option is specified, `ANALYZE TABLE` collects the table's number of rows and size in bytes. * **NOSCAN** Collects only the table's size in bytes ( which does not require scanning the entire table ). diff --git a/docs/sql-ref-syntax-aux-show-functions.md b/docs/sql-ref-syntax-aux-show-functions.md index 2cfca0f34bf77..942d6a5409ca4 100644 --- a/docs/sql-ref-syntax-aux-show-functions.md +++ b/docs/sql-ref-syntax-aux-show-functions.md @@ -49,7 +49,7 @@ SHOW [ function_kind ] FUNCTIONS [ [ LIKE ] { function_name | regex_pattern } ] a database then the function is resolved from the user specified database, otherwise it is resolved from the current database. - **Syntax:** `[database_name.]function_name` + **Syntax:** `[ database_name. ] function_name` * **regex_pattern** diff --git a/docs/sql-ref-syntax-aux-show.md b/docs/sql-ref-syntax-aux-show.md index 424fe71370897..9f64ea2d50ae1 100644 --- a/docs/sql-ref-syntax-aux-show.md +++ b/docs/sql-ref-syntax-aux-show.md @@ -20,11 +20,11 @@ license: | --- * [SHOW COLUMNS](sql-ref-syntax-aux-show-columns.html) + * [SHOW CREATE TABLE](sql-ref-syntax-aux-show-create-table.html) * [SHOW DATABASES](sql-ref-syntax-aux-show-databases.html) * [SHOW FUNCTIONS](sql-ref-syntax-aux-show-functions.html) + * [SHOW PARTITIONS](sql-ref-syntax-aux-show-partitions.html) * [SHOW TABLE EXTENDED](sql-ref-syntax-aux-show-table.html) * [SHOW TABLES](sql-ref-syntax-aux-show-tables.html) * [SHOW TBLPROPERTIES](sql-ref-syntax-aux-show-tblproperties.html) - * [SHOW PARTITIONS](sql-ref-syntax-aux-show-partitions.html) - * [SHOW CREATE TABLE](sql-ref-syntax-aux-show-create-table.html) * [SHOW VIEWS](sql-ref-syntax-aux-show-views.html) diff --git a/docs/sql-ref-syntax-ddl-alter-table.md b/docs/sql-ref-syntax-ddl-alter-table.md index dc3f52344c43a..7a109d91d16bf 100644 --- a/docs/sql-ref-syntax-ddl-alter-table.md +++ b/docs/sql-ref-syntax-ddl-alter-table.md @@ -71,8 +71,6 @@ ALTER TABLE table_identifier ADD COLUMNS ( col_spec [ , ... ] ) Specifies the columns to be added. - **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` - ### ALTER OR CHANGE COLUMN `ALTER TABLE ALTER COLUMN` or `ALTER TABLE CHANGE COLUMN` statement changes column's comment. @@ -95,8 +93,6 @@ ALTER TABLE table_identifier { ALTER | CHANGE } [ COLUMN ] col_spec alterColumnA Specifies the columns to be added. - **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` - * **alterColumnAction** Change the comment string. 
diff --git a/docs/sql-ref-syntax-ddl-create-function.md b/docs/sql-ref-syntax-ddl-create-function.md index e66df5352b1b5..aa6c1fad7b56b 100644 --- a/docs/sql-ref-syntax-ddl-create-function.md +++ b/docs/sql-ref-syntax-ddl-create-function.md @@ -55,8 +55,10 @@ CREATE [ OR REPLACE ] [ TEMPORARY ] FUNCTION [ IF NOT EXISTS ] * **IF NOT EXISTS** - Specifies a name of function to be created. The function name may be - optionally qualified with a database name. + If specified, creates the function only when it does not exist. The creation + of function succeeds (no error is thrown) if the specified function already + exists in the system. This parameter is mutually exclusive to `OR REPLACE` + and can not be specified together. * **function_name** diff --git a/docs/sql-ref-syntax-ddl-create-table-datasource.md b/docs/sql-ref-syntax-ddl-create-table-datasource.md index b592116c2a9e4..bb7215ab06876 100644 --- a/docs/sql-ref-syntax-ddl-create-table-datasource.md +++ b/docs/sql-ref-syntax-ddl-create-table-datasource.md @@ -65,7 +65,7 @@ as any order. For example, you can write COMMENT table_comment after TBLPROPERTI **NOTE:** Bucketing is an optimization technique that uses buckets (and bucketing columns) to determine data partitioning and avoid data shuffle. -* **SORTED BY** + **SORTED BY** Determines the order in which the data is stored in buckets. Default is Ascending order. From d3033f126b0bee326b4d2f1f0d3bb426272572eb Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 10:34:02 -0700 Subject: [PATCH 03/19] fix --- docs/sql-ref-syntax-ddl-create-table-datasource.md | 2 +- docs/sql-ref-syntax-ddl-create-table-hiveformat.md | 2 +- docs/sql-ref-syntax-ddl-create-table-like.md | 2 +- docs/sql-ref-syntax-qry-window.md | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/sql-ref-syntax-ddl-create-table-datasource.md b/docs/sql-ref-syntax-ddl-create-table-datasource.md index bb7215ab06876..b592116c2a9e4 100644 --- a/docs/sql-ref-syntax-ddl-create-table-datasource.md +++ b/docs/sql-ref-syntax-ddl-create-table-datasource.md @@ -65,7 +65,7 @@ as any order. For example, you can write COMMENT table_comment after TBLPROPERTI **NOTE:** Bucketing is an optimization technique that uses buckets (and bucketing columns) to determine data partitioning and avoid data shuffle. - **SORTED BY** +* **SORTED BY** Determines the order in which the data is stored in buckets. Default is Ascending order. diff --git a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md index 576d9190f2716..7f7033fcaebaf 100644 --- a/docs/sql-ref-syntax-ddl-create-table-hiveformat.md +++ b/docs/sql-ref-syntax-ddl-create-table-hiveformat.md @@ -67,7 +67,7 @@ as any order. For example, you can write COMMENT table_comment after TBLPROPERTI * **LOCATION** - Path to the directory where table data is stored, Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. + Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. 
* **COMMENT** diff --git a/docs/sql-ref-syntax-ddl-create-table-like.md b/docs/sql-ref-syntax-ddl-create-table-like.md index a374c554bd179..23d8e4f9712a6 100644 --- a/docs/sql-ref-syntax-ddl-create-table-like.md +++ b/docs/sql-ref-syntax-ddl-create-table-like.md @@ -60,7 +60,7 @@ CREATE TABLE [IF NOT EXISTS] table_identifier LIKE source_table_identifier * **LOCATION** - Path to the directory where table data is stored,Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. Location to create an external table. + Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc. Location to create an external table. ### Examples diff --git a/docs/sql-ref-syntax-qry-window.md b/docs/sql-ref-syntax-qry-window.md index 9c03b65fec3eb..823ffa2677624 100644 --- a/docs/sql-ref-syntax-qry-window.md +++ b/docs/sql-ref-syntax-qry-window.md @@ -58,15 +58,15 @@ window_function OVER `{ RANGE | ROWS } { frame_start | BETWEEN frame_start AND frame_end }` - If frame_end is omitted it defaults to CURRENT ROW. + * `frame_start` and `frame_end` have the following syntax: - `frame_start` and `frame_end` have the following syntax + **Syntax:** - **Syntax:** + `UNBOUNDED PRECEDING | offset PRECEDING | CURRENT ROW | offset FOLLOWING | UNBOUNDED FOLLOWING` - `UNBOUNDED PRECEDING | offset PRECEDING | CURRENT ROW | offset FOLLOWING | UNBOUNDED FOLLOWING` + `offset:` specifies the `offset` from the position of the current row. - `offset:` specifies the offset from the position of the current row. + **Note:** If `frame_end` is omitted it defaults to `CURRENT ROW`. ### Examples From ca9f9ebaa38b6309de79d2101711929b20032e02 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sat, 2 May 2020 17:54:36 +0900 Subject: [PATCH 04/19] [MINOR][SQL][TESTS] Disable UI in SQL benchmarks by default ### What changes were proposed in this pull request? Set `spark.ui.enabled` to `false` in `SqlBasedBenchmark.getSparkSession`. This disables UI in all SQL benchmarks by default. ### Why are the changes needed? UI overhead lowers numbers in the `Relative` column and impacts on `Stdev` in benchmark results. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Checked by running `DateTimeRebaseBenchmark`. Closes #28432 from MaxGekk/ui-off-in-benchmarks. 
Authored-by: Max Gekk Signed-off-by: Takeshi Yamamuro --- .../spark/sql/execution/benchmark/DataSourceReadBenchmark.scala | 2 -- .../spark/sql/execution/benchmark/FilterPushdownBenchmark.scala | 2 -- .../spark/sql/execution/benchmark/SqlBasedBenchmark.scala | 2 ++ 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index d29c5e3f88010..0fc43c7052d06 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -23,7 +23,6 @@ import scala.util.Random import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.internal.config.UI._ import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.parquet.{SpecificParquetRecordReaderBase, VectorizedParquetRecordReader} @@ -52,7 +51,6 @@ object DataSourceReadBenchmark extends SqlBasedBenchmark { .set("spark.master", "local[1]") .setIfMissing("spark.driver.memory", "3g") .setIfMissing("spark.executor.memory", "3g") - .setIfMissing(UI_ENABLED, false) val sparkSession = SparkSession.builder.config(conf).getOrCreate() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala index 444ffa4f99697..b3f65d40ad95b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala @@ -23,7 +23,6 @@ import scala.util.Random import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.internal.config.UI._ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.functions.monotonically_increasing_id import org.apache.spark.sql.internal.SQLConf @@ -49,7 +48,6 @@ object FilterPushdownBenchmark extends SqlBasedBenchmark { .set("spark.master", "local[1]") .setIfMissing("spark.driver.memory", "3g") .setIfMissing("spark.executor.memory", "3g") - .setIfMissing(UI_ENABLED, false) .setIfMissing("orc.compression", "snappy") .setIfMissing("spark.sql.parquet.compression.codec", "snappy") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala index ee7a03e5e0542..28387dcef125b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.SaveMode.Overwrite import org.apache.spark.sql.catalyst.plans.SQLHelper @@ -37,6 +38,7 @@ trait SqlBasedBenchmark extends BenchmarkBase with SQLHelper { .appName(this.getClass.getCanonicalName) .config(SQLConf.SHUFFLE_PARTITIONS.key, 1) 
.config(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, 1) + .config(UI_ENABLED.key, false) .getOrCreate() } From 0c8146a062e14042a8f865282636d157c0e4e029 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 3 May 2020 12:40:20 +0900 Subject: [PATCH 05/19] [SPARK-31571][R] Overhaul stop/message/warning calls to be more canonical ### What changes were proposed in this pull request? Internal usages like `{stop,warning,message}({paste,paste0,sprintf}` and `{stop,warning,message}(some_literal_string_as_variable` have been removed and replaced as appropriate. ### Why are the changes needed? CRAN policy recommends against using such constructions to build error messages, in particular because it makes the process of creating portable error messages for the package more onerous. ### Does this PR introduce any user-facing change? There may be some small grammatical changes visible in error messaging. ### How was this patch tested? Not done Closes #28365 from MichaelChirico/r-stop-paste. Authored-by: Michael Chirico Signed-off-by: HyukjinKwon --- R/pkg/R/DataFrame.R | 40 ++++++++--------- R/pkg/R/RDD.R | 2 +- R/pkg/R/SQLContext.R | 17 ++++---- R/pkg/R/client.R | 7 ++- R/pkg/R/context.R | 8 ++-- R/pkg/R/deserialize.R | 2 +- R/pkg/R/group.R | 4 +- R/pkg/R/install.R | 62 +++++++++++---------------- R/pkg/R/mllib_classification.R | 4 +- R/pkg/R/mllib_stat.R | 3 +- R/pkg/R/pairRDD.R | 2 +- R/pkg/R/schema.R | 2 +- R/pkg/R/serialize.R | 4 +- R/pkg/R/sparkR.R | 9 ++-- R/pkg/R/utils.R | 53 +++++++++++------------ R/pkg/tests/fulltests/test_sparkSQL.R | 2 +- 16 files changed, 103 insertions(+), 118 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 09e831814b893..15b3ce2935427 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -431,7 +431,7 @@ setMethod("coltypes", if (is.null(type)) { specialtype <- specialtypeshandle(x) if (is.null(specialtype)) { - stop(paste("Unsupported data type: ", x)) + stop("Unsupported data type: ", x) } type <- PRIMITIVE_TYPES[[specialtype]] } @@ -829,8 +829,8 @@ setMethod("repartitionByRange", jcol <- lapply(cols, function(c) { c@jc }) sdf <- callJMethod(x@sdf, "repartitionByRange", numToInt(numPartitions), jcol) } else { - stop(paste("numPartitions and col must be numeric and Column; however, got", - class(numPartitions), "and", class(col))) + stop("numPartitions and col must be numeric and Column; however, got ", + class(numPartitions), " and ", class(col)) } } else if (!is.null(col)) { # only columns are specified @@ -839,7 +839,7 @@ setMethod("repartitionByRange", jcol <- lapply(cols, function(c) { c@jc }) sdf <- callJMethod(x@sdf, "repartitionByRange", jcol) } else { - stop(paste("col must be Column; however, got", class(col))) + stop("col must be Column; however, got ", class(col)) } } else if (!is.null(numPartitions)) { # only numPartitions is specified @@ -1068,10 +1068,10 @@ setMethod("sample", signature(x = "SparkDataFrame"), function(x, withReplacement = FALSE, fraction, seed) { if (!is.numeric(fraction)) { - stop(paste("fraction must be numeric; however, got", class(fraction))) + stop("fraction must be numeric; however, got ", class(fraction)) } if (!is.logical(withReplacement)) { - stop(paste("withReplacement must be logical; however, got", class(withReplacement))) + stop("withReplacement must be logical; however, got ", class(withReplacement)) } if (!missing(seed)) { @@ -1211,11 +1211,10 @@ setMethod("collect", checkSchemaInArrow(schema(x)) TRUE }, error = function(e) { - warning(paste0("The conversion from Spark DataFrame to R 
DataFrame was attempted ", - "with Arrow optimization because ", - "'spark.sql.execution.arrow.sparkr.enabled' is set to true; ", - "however, failed, attempting non-optimization. Reason: ", - e)) + warning("The conversion from Spark DataFrame to R DataFrame was attempted ", + "with Arrow optimization because ", + "'spark.sql.execution.arrow.sparkr.enabled' is set to true; ", + "however, failed, attempting non-optimization. Reason: ", e) FALSE }) } @@ -1508,8 +1507,8 @@ dapplyInternal <- function(x, func, schema) { if (inherits(schema, "structType")) { checkSchemaInArrow(schema) } else if (is.null(schema)) { - stop(paste0("Arrow optimization does not support 'dapplyCollect' yet. Please disable ", - "Arrow optimization or use 'collect' and 'dapply' APIs instead.")) + stop("Arrow optimization does not support 'dapplyCollect' yet. Please disable ", + "Arrow optimization or use 'collect' and 'dapply' APIs instead.") } else { stop("'schema' should be DDL-formatted string or structType.") } @@ -2012,8 +2011,8 @@ setMethod("[", signature(x = "SparkDataFrame"), x } else { if (class(i) != "Column") { - stop(paste0("Expressions other than filtering predicates are not supported ", - "in the first parameter of extract operator [ or subset() method.")) + stop("Expressions other than filtering predicates are not supported ", + "in the first parameter of extract operator [ or subset() method.") } filter(x, i) } @@ -2604,18 +2603,17 @@ setMethod("join", if (is.null(joinType)) { sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc) } else { - if (joinType %in% c("inner", "cross", + validJoinTypes <- c("inner", "cross", "outer", "full", "fullouter", "full_outer", "left", "leftouter", "left_outer", "right", "rightouter", "right_outer", - "semi", "left_semi", "leftsemi", "anti", "left_anti", "leftanti")) { + "semi", "leftsemi", "left_semi", "anti", "leftanti", "left_anti") + if (joinType %in% validJoinTypes) { joinType <- gsub("_", "", joinType, fixed = TRUE) sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType) } else { - stop(paste("joinType must be one of the following types:", - "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',", - "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", - "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")) + stop("joinType must be one of the following types: ", + "'", paste(validJoinTypes, collapse = "', '"), "'") } } } diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7ee725d90d550..7a1d157bb8a36 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -947,7 +947,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", MAXINT <- .Machine$integer.max if (num < 0) - stop(paste("Negative number of elements requested")) + stop("Negative number of elements requested") if (initialCount > MAXINT - 1) { maxSelected <- MAXINT - 1 diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 1ef2641742704..c0ac68332ec41 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -34,7 +34,7 @@ getInternalType <- function(x) { Date = "date", POSIXlt = "timestamp", POSIXct = "timestamp", - stop(paste("Unsupported type for SparkDataFrame:", class(x)))) + stop("Unsupported type for SparkDataFrame: ", class(x))) } #' return the SparkSession @@ -112,9 +112,9 @@ sparkR.conf <- function(key, defaultValue) { error = function(e) { estr <- as.character(e) if (any(grepl("java.util.NoSuchElementException", estr, fixed = TRUE))) { - stop(paste0("Config '", key, "' is not set")) + stop("Config '", key, "' is not 
set") } else { - stop(paste0("Unknown error: ", estr)) + stop("Unknown error: ", estr) } }) } else { @@ -208,7 +208,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { names <- lapply(names, function(n) { nn <- gsub(".", "_", n, fixed = TRUE) if (nn != n) { - warning(paste("Use", nn, "instead of", n, "as column name")) + warning("Use ", nn, " instead of ", n, " as column name") } nn }) @@ -290,10 +290,9 @@ createDataFrame <- function(data, schema = NULL, samplingRatio = 1.0, TRUE }, error = function(e) { - warning(paste0("createDataFrame attempted Arrow optimization because ", - "'spark.sql.execution.arrow.sparkr.enabled' is set to true; however, ", - "failed, attempting non-optimization. Reason: ", - e)) + warning("createDataFrame attempted Arrow optimization because ", + "'spark.sql.execution.arrow.sparkr.enabled' is set to true; however, ", + "failed, attempting non-optimization. Reason: ", e) FALSE }) } @@ -326,7 +325,7 @@ createDataFrame <- function(data, schema = NULL, samplingRatio = 1.0, } else if (inherits(data, "RDD")) { rdd <- data } else { - stop(paste("unexpected type:", class(data))) + stop("unexpected type: ", class(data)) } schema <- getSchema(schema, firstRow, rdd) diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 872b21443eaad..797a5c7da1549 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -102,10 +102,9 @@ checkJavaVersion <- function() { javaVersionNum <- as.integer(versions[1]) } if (javaVersionNum < minJavaVersion || javaVersionNum >= maxJavaVersion) { - stop(paste0("Java version, greater than or equal to ", minJavaVersion, - " and less than ", maxJavaVersion, - ", is required for this package; found version: ", - javaVersionStr)) + stop("Java version, greater than or equal to ", minJavaVersion, + " and less than ", maxJavaVersion, ", is required for this ", + "package; found version: ", javaVersionStr) } return(javaVersionNum) } diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index d96a287f818a2..e3c9d9f8793d6 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -144,13 +144,13 @@ parallelize <- function(sc, coll, numSlices = 1) { if ((!is.list(coll) && !is.vector(coll)) || is.data.frame(coll)) { # nolint end if (is.data.frame(coll)) { - message(paste("context.R: A data frame is parallelized by columns.")) + message("context.R: A data frame is parallelized by columns.") } else { if (is.matrix(coll)) { - message(paste("context.R: A matrix is parallelized by elements.")) + message("context.R: A matrix is parallelized by elements.") } else { - message(paste("context.R: parallelize() currently only supports lists and vectors.", - "Calling as.list() to coerce coll into a list.")) + message("context.R: parallelize() currently only supports lists and vectors. 
", + "Calling as.list() to coerce coll into a list.") } } coll <- as.list(coll) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index ca4a6e342d772..3e7c456bd548d 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -57,7 +57,7 @@ readTypedObject <- function(con, type) { "s" = readStruct(con), "n" = NULL, "j" = getJobj(readString(con)), - stop(paste("Unsupported type for deserialization", type))) + stop("Unsupported type for deserialization ", type)) } readStringData <- function(con, len) { diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 2b7995e1e37f6..99d62240a3b2a 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -234,8 +234,8 @@ gapplyInternal <- function(x, func, schema) { if (inherits(schema, "structType")) { checkSchemaInArrow(schema) } else if (is.null(schema)) { - stop(paste0("Arrow optimization does not support 'gapplyCollect' yet. Please disable ", - "Arrow optimization or use 'collect' and 'gapply' APIs instead.")) + stop("Arrow optimization does not support 'gapplyCollect' yet. Please disable ", + "Arrow optimization or use 'collect' and 'gapply' APIs instead.") } else { stop("'schema' should be DDL-formatted string or structType.") } diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 8c5355a8324f9..ea2c0b4c0f42f 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -89,8 +89,8 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } if (overwrite) { - message(paste0("Overwrite = TRUE: download and overwrite the tar file", - "and Spark package directory if they exist.")) + message("Overwrite = TRUE: download and overwrite the tar file", + "and Spark package directory if they exist.") } releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL") @@ -103,12 +103,11 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { if (releaseUrl != "") { - message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir)) + message(packageName, " found, setting SPARK_HOME to ", packageLocalDir) } else { - fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s" - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageLocalDir) - message(msg) + message(version, " for Hadoop ", + if (hadoopVersion == "without") "Free build" else hadoopVersion, + " found, setting SPARK_HOME to ", packageLocalDir) } Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) @@ -127,26 +126,23 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, success <- downloadUrl(releaseUrl, packageLocalPath) if (!success) { unlink(packageLocalPath) - stop(paste0("Fetch failed from ", releaseUrl)) + stop("Fetch failed from ", releaseUrl) } } else { robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } } - message(sprintf("Installing to %s", localDir)) + message("Installing to ", localDir) # There are two ways untar can fail - untar could stop() on errors like incomplete block on file # or, tar command can return failure code success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0, error = function(e) { - message(e) - message() + message(e, "\n") FALSE }, warning = function(w) { - # Treat warning as error, add an empty line with message() - message(w) - message() + message(w, "\n") FALSE }) if (!tarExists || overwrite || !success) { @@ -160,7 +156,7 @@ install.spark <- function(hadoopVersion = 
"2.7", mirrorUrl = NULL, if (!success) stop("Extract archive failed.") message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) - message(paste("SPARK_HOME set to", packageLocalDir)) + message("SPARK_HOME set to ", packageLocalDir) invisible(packageLocalDir) } @@ -173,7 +169,7 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa if (success) { return() } else { - message(paste0("Unable to download from mirrorUrl: ", mirrorUrl)) + message("Unable to download from mirrorUrl: ", mirrorUrl) } } else { message("MirrorUrl not provided.") @@ -201,11 +197,9 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa # remove any partially downloaded file unlink(packageLocalPath) message("Unable to download from default mirror site: ", mirrorUrl) - msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", - "Please check network connection, Hadoop version,", - "or provide other mirror sites."), - version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) - stop(msg) + stop("Unable to download Spark ", version, + " for Hadoop ", if (hadoopVersion == "without") "Free build" else hadoopVersion, + ". Please check network connection, Hadoop version, or provide other mirror sites.") } } @@ -222,7 +216,7 @@ getPreferredMirror <- function(version, packageName) { endPos <- matchInfo + attr(matchInfo, "match.length") - 2 mirrorPreferred <- base::substr(linePreferred, startPos, endPos) mirrorPreferred <- paste0(mirrorPreferred, "spark") - message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) + message("Preferred mirror site found: ", mirrorPreferred) } else { mirrorPreferred <- NULL } @@ -231,24 +225,20 @@ getPreferredMirror <- function(version, packageName) { directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") - fmt <- "Downloading %s for Hadoop %s from:\n- %s" - msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), - packageRemotePath) - message(msg) + message("Downloading ", version, " for Hadoop ", + if (hadoopVersion == "without") "Free build" else hadoopVersion, + " from:\n- ", packageRemotePath) downloadUrl(packageRemotePath, packageLocalPath) } downloadUrl <- function(remotePath, localPath) { isFail <- tryCatch(download.file(remotePath, localPath), error = function(e) { - message(e) - message() + message(e, "\n") TRUE }, warning = function(w) { - # Treat warning as error, add an empty line with message() - message(w) - message() + message(w, "\n") TRUE }) !isFail @@ -279,9 +269,9 @@ sparkCachePath <- function() { winAppPath <- Sys.getenv("USERPROFILE", unset = NA) } if (is.na(winAppPath)) { - stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.", - "Please define the environment variable", - "or restart and enter an installation path in localDir.")) + stop("%LOCALAPPDATA% and %USERPROFILE% not found. 
", + "Please define the environment variable ", + "or restart and enter an installation path in localDir.") } else { path <- file.path(winAppPath, "Apache", "Spark", "Cache") } @@ -293,7 +283,7 @@ sparkCachePath <- function() { Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), "spark") } } else { - stop(sprintf("Unknown OS: %s", .Platform$OS.type)) + stop("Unknown OS: ", .Platform$OS.type) } normalizePath(path, mustWork = FALSE) } @@ -322,7 +312,7 @@ installInstruction <- function(mode) { "If you need further help, ", "contact the administrators of the cluster.") } else { - stop(paste0("No instruction found for ", mode, " mode.")) + stop("No instruction found for mode ", mode) } } diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 5cc97ea723afc..ec83b6bd406a7 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -337,8 +337,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients) || col != ncol(upperBoundsOnCoefficients))) { - stop(paste0("dimension of upperBoundsOnCoefficients ", - "is not the same as lowerBoundsOnCoefficients")) + stop("dimension of upperBoundsOnCoefficients ", + "is not the same as lowerBoundsOnCoefficients") } if (is.null(lowerBoundsOnCoefficients)) { diff --git a/R/pkg/R/mllib_stat.R b/R/pkg/R/mllib_stat.R index f8c3329359961..6db4d5d4831dd 100644 --- a/R/pkg/R/mllib_stat.R +++ b/R/pkg/R/mllib_stat.R @@ -69,8 +69,7 @@ setMethod("spark.kstest", signature(data = "SparkDataFrame"), function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) { tryCatch(match.arg(nullHypothesis), error = function(e) { - msg <- paste("Distribution", nullHypothesis, "is not supported.") - stop(msg) + stop("Distribution ", nullHypothesis, " is not supported.") }) if (nullHypothesis == "norm") { distParams <- as.numeric(distParams) diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index 9c2e57d3067db..b29381bb900fb 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -906,7 +906,7 @@ setMethod("sampleByKey", for (elem in fractions) { if (elem < 0.0) { - stop(paste("Negative fraction value ", fractions[which(fractions == elem)])) + stop("Negative fraction value ", fractions[which(fractions == elem)]) } } diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 89d5c2cd1a5e2..7044ede0cc58b 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -199,7 +199,7 @@ checkType <- function(type) { }) } - stop(paste("Unsupported type for SparkDataframe:", type)) + stop("Unsupported type for SparkDataframe: ", type) } #' @param type The data type of the field diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index cb3c1c59d12ed..7760d9be16f0b 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -84,7 +84,7 @@ writeObject <- function(con, object, writeType = TRUE) { Date = writeDate(con, object), POSIXlt = writeTime(con, object), POSIXct = writeTime(con, object), - stop(paste("Unsupported type for serialization", type))) + stop("Unsupported type for serialization ", type)) } writeVoid <- function(con) { @@ -158,7 +158,7 @@ writeType <- function(con, class) { Date = "D", POSIXlt = "t", POSIXct = "t", - stop(paste("Unsupported type for serialization", class))) + stop("Unsupported type for serialization ", class)) writeBin(charToRaw(type), con) } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 9ba36ad46740a..e4a11a5f78a71 100644 --- a/R/pkg/R/sparkR.R +++ 
b/R/pkg/R/sparkR.R @@ -154,8 +154,8 @@ sparkR.sparkContext <- function( connectionTimeout <- as.numeric(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) if (existingPort != "") { if (length(packages) != 0) { - warning(paste("sparkPackages has no effect when using spark-submit or sparkR shell", - " please use the --packages commandline instead", sep = ",")) + warning("sparkPackages has no effect when using spark-submit or sparkR shell, ", + "please use the --packages commandline instead") } backendPort <- existingPort authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET") @@ -439,8 +439,9 @@ sparkR.session <- function( rPackageVersion <- paste0(packageVersion("SparkR")) if (jvmVersionStrip != rPackageVersion) { - warning(paste("Version mismatch between Spark JVM and SparkR package. JVM version was", - jvmVersion, ", while R package version was", rPackageVersion)) + warning("Version mismatch between Spark JVM and SparkR package. ", + "JVM version was ", jvmVersion, + ", while R package version was ", rPackageVersion) } sparkSession diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index c60e4db1496d0..65db9c21d9dbb 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -46,9 +46,9 @@ convertJListToRList <- function(jList, flatten, logicalUpperBound = NULL, res <- list(unserialize(keyBytes), unserialize(valBytes)) } else { - stop(paste("utils.R: convertJListToRList only supports", - "RDD[Array[Byte]] and", - "JavaPairRDD[Array[Byte], Array[Byte]] for now")) + stop("utils.R: convertJListToRList only supports ", + "RDD[Array[Byte]] and ", + "JavaPairRDD[Array[Byte], Array[Byte]] for now") } } else { if (inherits(obj, "raw")) { @@ -354,8 +354,8 @@ varargsToStrEnv <- function(...) { } else { value <- pairs[[name]] if (!(is.logical(value) || is.numeric(value) || is.character(value) || is.null(value))) { - stop(paste0("Unsupported type for ", name, " : ", class(value), - ". Supported types are logical, numeric, character and NULL."), call. = FALSE) + stop("Unsupported type for ", name, " : ", toString(class(value)), ". ", + "Supported types are logical, numeric, character and NULL.", call. = FALSE) } if (is.logical(value)) { env[[name]] <- tolower(as.character(value)) @@ -369,8 +369,7 @@ varargsToStrEnv <- function(...) { } if (length(ignoredNames) != 0) { - warning(paste0("Unnamed arguments ignored: ", paste(ignoredNames, collapse = ", "), "."), - call. = FALSE) + warning("Unnamed arguments ignored: ", toString(ignoredNames), ".", call. = FALSE) } env } @@ -449,7 +448,7 @@ storageLevelToString <- function(levelObj) { # the user to type (for example) `5` instead of `5L` to avoid a confusing error message. numToInt <- function(num) { if (as.integer(num) != num) { - warning(paste("Coercing", as.list(sys.call())[[2]], "to integer.")) + warning("Coercing ", as.list(sys.call())[[2L]], " to integer.") } as.integer(num) } @@ -650,8 +649,8 @@ mergePartitions <- function(rdd, zip) { # For zip operation, check if corresponding partitions # of both RDDs have the same number of elements. if (zip && lengthOfKeys != lengthOfValues) { - stop(paste("Can only zip RDDs with same number of elements", - "in each pair of corresponding partitions.")) + stop("Can only zip RDDs with same number of elements ", + "in each pair of corresponding partitions.") } if (lengthOfKeys > 1) { @@ -804,7 +803,7 @@ handledCallJMethod <- function(obj, method, ...) 
{ captureJVMException <- function(e, method) { rawmsg <- as.character(e) - if (any(grep("^Error in .*?: ", rawmsg))) { + if (any(grepl("^Error in .*?: ", rawmsg))) { # If the exception message starts with "Error in ...", this is possibly # "Error in invokeJava(...)". Here, it replaces the characters to # `paste("Error in", method, ":")` in order to identify which function @@ -818,58 +817,58 @@ captureJVMException <- function(e, method) { } # StreamingQueryException could wrap an IllegalArgumentException, so look for that first - if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ", - stacktrace, fixed = TRUE))) { + if (any(grepl("org.apache.spark.sql.streaming.StreamingQueryException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.streaming.StreamingQueryException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "streaming query error - ", first), call. = FALSE) - } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace, fixed = TRUE))) { + stop(rmsg, "streaming query error - ", first, call. = FALSE) + } else if (any(grepl("java.lang.IllegalArgumentException: ", stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "java.lang.IllegalArgumentException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "illegal argument - ", first), call. = FALSE) - } else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))) { + stop(rmsg, "illegal argument - ", first, call. = FALSE) + } else if (any(grepl("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.AnalysisException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "analysis error - ", first), call. = FALSE) + stop(rmsg, "analysis error - ", first, call. = FALSE) } else - if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", - stacktrace, fixed = TRUE))) { + if (any(grepl("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "no such database - ", first), call. = FALSE) + stop(rmsg, "no such database - ", first, call. = FALSE) } else - if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", - stacktrace, fixed = TRUE))) { + if (any(grepl("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "no such table - ", first), call. = FALSE) - } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ", - stacktrace, fixed = TRUE))) { + stop(rmsg, "no such table - ", first, call. 
= FALSE) + } else if (any(grepl("org.apache.spark.sql.catalyst.parser.ParseException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.parser.ParseException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(paste0(rmsg, "parse error - ", first), call. = FALSE) + stop(rmsg, "parse error - ", first, call. = FALSE) } else { stop(stacktrace, call. = FALSE) } diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index b7172b2ae0774..611d9057c0f13 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2558,7 +2558,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { error_msg <- paste("joinType must be one of the following types:", "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',", "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", - "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.") + "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti', 'left_anti'") expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg) merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE) From d8a2fa0e5b45d99ff0cbd4c869242fedb8541b55 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Mon, 4 May 2020 09:39:50 +0900 Subject: [PATCH 06/19] [SPARK-31527][SQL][TESTS][FOLLOWUP] Fix the number of rows in `DateTimeBenchmark` ### What changes were proposed in this pull request? - Changed the number of rows in benchmark cases from 3 to the actual number `N`. - Regenerated benchmark results in the environment: | Item | Description | | ---- | ---- | | Region | us-west-2 (Oregon) | | Instance | r3.xlarge | | AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) | | Java | OpenJDK 64-Bit Server VM 1.8.0_242 and OpenJDK 64-Bit Server VM 11.0.6+10 | ### Why are the changes needed? The changes are needed to have: - Correct benchmark results - A baseline for other perf improvements that can be checked in the same environment. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running the benchmark and checking its output. Closes #28440 from MaxGekk/SPARK-31527-DateTimeBenchmark-followup.
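For illustration, a minimal sketch of the kind of one-line fix this followup describes, assuming Spark's `org.apache.spark.benchmark.Benchmark` helper, whose second constructor argument is the number of values processed per iteration and from which the `Rate(M/s)` and `Per Row(ns)` columns are derived. This is not the actual hunk; the real change is the single line in `DateTimeBenchmark.scala` listed in the diffstat below.

```scala
// Illustrative sketch only -- not the actual DateTimeBenchmark hunk.
// Benchmark derives Rate(M/s) and Per Row(ns) from its second constructor
// argument (values processed per iteration), so it must match the row count.
import org.apache.spark.benchmark.Benchmark

object RowCountSketch {
  def main(args: Array[String]): Unit = {
    val N = 10000000L // the actual number of rows a benchmark case iterates over

    // Before: metrics were computed as if only 3 values were processed,
    // inflating Per Row(ns) by a factor of N / 3:
    //   val benchmark = new Benchmark("datetime +/- interval", 3)

    // After: report metrics against the actual row count.
    val benchmark = new Benchmark("datetime +/- interval", N)
    benchmark.addCase("date + interval(m)") { _ =>
      var i = 0L                 // stand-in for the real query over N rows
      while (i < N) { i += 1 }
    }
    benchmark.run()
  }
}
```

Reporting the actual `N` instead of the hardcoded 3 is why `Per Row(ns)` drops from hundreds of milliseconds to hundreds of nanoseconds in the regenerated results below, even though `Best Time(ms)` grows on the slower benchmark machine.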
Authored-by: Max Gekk Signed-off-by: Takeshi Yamamuro --- .../DateTimeBenchmark-jdk11-results.txt | 474 +++++++++--------- .../benchmarks/DateTimeBenchmark-results.txt | 474 +++++++++--------- .../benchmark/DateTimeBenchmark.scala | 2 +- 3 files changed, 475 insertions(+), 475 deletions(-) diff --git a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt index 1004bcf1aa286..61b4c762a752e 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt @@ -2,456 +2,456 @@ datetime +/- interval ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 919 933 22 0.0 306237514.3 1.0X -date + interval(m, d) 910 916 9 0.0 303338619.0 1.0X -date + interval(m, d, ms) 3912 3923 16 0.0 1303942791.7 0.2X -date - interval(m) 883 887 6 0.0 294268789.3 1.0X -date - interval(m, d) 898 911 18 0.0 299453403.0 1.0X -date - interval(m, d, ms) 3937 3944 11 0.0 1312269472.0 0.2X -timestamp + interval(m) 2226 2236 14 0.0 741972014.3 0.4X -timestamp + interval(m, d) 2264 2274 13 0.0 754709121.0 0.4X -timestamp + interval(m, d, ms) 2202 2223 30 0.0 734001075.0 0.4X -timestamp - interval(m) 1992 2005 17 0.0 664152744.7 0.5X -timestamp - interval(m, d) 2069 2075 9 0.0 689631159.0 0.4X -timestamp - interval(m, d, ms) 2240 2244 6 0.0 746538728.0 0.4X +date + interval(m) 1485 1567 116 6.7 148.5 1.0X +date + interval(m, d) 1504 1510 9 6.6 150.4 1.0X +date + interval(m, d, ms) 7000 7013 18 1.4 700.0 0.2X +date - interval(m) 1466 1478 17 6.8 146.6 1.0X +date - interval(m, d) 1533 1534 1 6.5 153.3 1.0X +date - interval(m, d, ms) 7014 7019 7 1.4 701.4 0.2X +timestamp + interval(m) 3062 3064 3 3.3 306.2 0.5X +timestamp + interval(m, d) 3133 3136 5 3.2 313.3 0.5X +timestamp + interval(m, d, ms) 3401 3402 3 2.9 340.1 0.4X +timestamp - interval(m) 3025 3037 17 3.3 302.5 0.5X +timestamp - interval(m, d) 3083 3120 51 3.2 308.3 0.5X +timestamp - interval(m, d, ms) 3371 3379 11 3.0 337.1 0.4X ================================================================================================ Extract components ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 178 180 3 56.1 17.8 1.0X -cast to timestamp wholestage on 189 192 4 53.0 18.9 0.9X +cast to timestamp wholestage off 339 339 1 29.5 33.9 1.0X +cast to timestamp wholestage on 330 337 9 30.3 33.0 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit 
Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 760 761 1 13.2 76.0 1.0X -year of timestamp wholestage on 731 741 10 13.7 73.1 1.0X +year of timestamp wholestage off 1226 1237 15 8.2 122.6 1.0X +year of timestamp wholestage on 1230 1242 9 8.1 123.0 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 1005 1013 10 9.9 100.5 1.0X -quarter of timestamp wholestage on 981 986 3 10.2 98.1 1.0X +quarter of timestamp wholestage off 1602 1606 7 6.2 160.2 1.0X +quarter of timestamp wholestage on 1511 1514 3 6.6 151.1 1.1X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 754 758 6 13.3 75.4 1.0X -month of timestamp wholestage on 719 729 11 13.9 71.9 1.0X +month of timestamp wholestage off 1227 1233 8 8.1 122.7 1.0X +month of timestamp wholestage on 1226 1242 28 8.2 122.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1085 1088 4 9.2 108.5 1.0X -weekofyear of timestamp wholestage on 1075 1091 13 9.3 107.5 1.0X +weekofyear of timestamp wholestage off 1965 1980 20 5.1 196.5 1.0X +weekofyear of timestamp wholestage on 1816 1833 17 5.5 181.6 1.1X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 751 770 27 13.3 75.1 1.0X -day of timestamp wholestage on 735 741 7 13.6 73.5 1.0X +day of timestamp wholestage off 1229 1231 3 8.1 122.9 1.0X +day of timestamp wholestage on 1222 1230 10 8.2 122.2 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on 
Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 765 769 5 13.1 76.5 1.0X -dayofyear of timestamp wholestage on 762 770 7 13.1 76.2 1.0X +dayofyear of timestamp wholestage off 1294 1297 4 7.7 129.4 1.0X +dayofyear of timestamp wholestage on 1257 1264 6 8.0 125.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 780 782 2 12.8 78.0 1.0X -dayofmonth of timestamp wholestage on 720 736 12 13.9 72.0 1.1X +dayofmonth of timestamp wholestage off 1247 1253 8 8.0 124.7 1.0X +dayofmonth of timestamp wholestage on 1225 1229 4 8.2 122.5 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 887 899 17 11.3 88.7 1.0X -dayofweek of timestamp wholestage on 820 847 20 12.2 82.0 1.1X +dayofweek of timestamp wholestage off 1416 1416 0 7.1 141.6 1.0X +dayofweek of timestamp wholestage on 1376 1382 8 7.3 137.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 821 825 5 12.2 82.1 1.0X -weekday of timestamp wholestage on 802 814 9 12.5 80.2 1.0X +weekday of timestamp wholestage off 1350 1351 1 7.4 135.0 1.0X +weekday of timestamp wholestage on 1308 1318 13 7.6 130.8 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 611 622 14 16.4 61.1 1.0X -hour of timestamp wholestage on 571 577 8 17.5 57.1 1.1X +hour of timestamp wholestage off 1004 1007 3 10.0 100.4 1.0X +hour of timestamp wholestage on 928 938 7 10.8 92.8 1.1X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 
4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 607 615 12 16.5 60.7 1.0X -minute of timestamp wholestage on 573 580 6 17.5 57.3 1.1X +minute of timestamp wholestage off 1009 1020 15 9.9 100.9 1.0X +minute of timestamp wholestage on 933 935 2 10.7 93.3 1.1X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 615 616 2 16.3 61.5 1.0X -second of timestamp wholestage on 564 575 8 17.7 56.4 1.1X +second of timestamp wholestage off 995 995 0 10.0 99.5 1.0X +second of timestamp wholestage on 932 937 8 10.7 93.2 1.1X ================================================================================================ Current date and time ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 166 169 4 60.4 16.6 1.0X -current_date wholestage on 150 153 3 66.7 15.0 1.1X +current_date wholestage off 292 316 34 34.2 29.2 1.0X +current_date wholestage on 270 276 6 37.0 27.0 1.1X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 179 181 2 55.8 17.9 1.0X -current_timestamp wholestage on 162 324 138 61.9 16.2 1.1X +current_timestamp wholestage off 313 328 20 31.9 31.3 1.0X +current_timestamp wholestage on 270 331 95 37.0 27.0 1.2X ================================================================================================ Date arithmetic ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 658 661 5 15.2 65.8 1.0X -cast to date wholestage on 644 654 10 15.5 64.4 1.0X +cast to date 
wholestage off 1078 1081 3 9.3 107.8 1.0X +cast to date wholestage on 1035 1040 7 9.7 103.5 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 768 772 5 13.0 76.8 1.0X -last_day wholestage on 737 750 12 13.6 73.7 1.0X +last_day wholestage off 1265 1266 3 7.9 126.5 1.0X +last_day wholestage on 1236 1246 10 8.1 123.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 691 704 17 14.5 69.1 1.0X -next_day wholestage on 664 676 10 15.1 66.4 1.0X +next_day wholestage off 1118 1118 1 8.9 111.8 1.0X +next_day wholestage on 1085 1090 8 9.2 108.5 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 646 646 0 15.5 64.6 1.0X -date_add wholestage on 623 640 13 16.1 62.3 1.0X +date_add wholestage off 1052 1054 4 9.5 105.2 1.0X +date_add wholestage on 1046 1051 6 9.6 104.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 638 645 9 15.7 63.8 1.0X -date_sub wholestage on 618 629 8 16.2 61.8 1.0X +date_sub wholestage off 1075 1075 0 9.3 107.5 1.0X +date_sub wholestage on 1043 1046 3 9.6 104.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 892 896 5 11.2 89.2 1.0X -add_months wholestage on 926 938 7 10.8 92.6 1.0X +add_months wholestage off 1409 1409 0 7.1 140.9 1.0X +add_months wholestage on 1448 1453 4 6.9 144.8 1.0X ================================================================================================ Formatting dates ================================================================================================ -Java 
HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 3395 3439 62 2.9 339.5 1.0X -format date wholestage on 3418 3438 14 2.9 341.8 1.0X +format date wholestage off 5373 5390 24 1.9 537.3 1.0X +format date wholestage on 5337 5346 12 1.9 533.7 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 4565 4592 38 2.2 456.5 1.0X -from_unixtime wholestage on 4608 4635 32 2.2 460.8 1.0X +from_unixtime wholestage off 7302 7308 9 1.4 730.2 1.0X +from_unixtime wholestage on 7298 7319 16 1.4 729.8 1.0X ================================================================================================ Convert timestamps ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 801 807 9 12.5 80.1 1.0X -from_utc_timestamp wholestage on 819 830 7 12.2 81.9 1.0X +from_utc_timestamp wholestage off 1322 1355 48 7.6 132.2 1.0X +from_utc_timestamp wholestage on 1290 1294 5 7.8 129.0 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 1108 1114 8 9.0 110.8 1.0X -to_utc_timestamp wholestage on 1067 1078 13 9.4 106.7 1.0X +to_utc_timestamp wholestage off 1692 1705 18 5.9 169.2 1.0X +to_utc_timestamp wholestage on 1653 1657 4 6.1 165.3 1.0X ================================================================================================ Intervals ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws 
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 211 213 2 47.4 21.1 1.0X -cast interval wholestage on 185 188 3 54.1 18.5 1.1X +cast interval wholestage off 340 356 22 29.4 34.0 1.0X +cast interval wholestage on 293 296 2 34.1 29.3 1.2X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1120 1120 1 8.9 112.0 1.0X -datediff wholestage on 1174 1205 19 8.5 117.4 1.0X +datediff wholestage off 1843 1862 28 5.4 184.3 1.0X +datediff wholestage on 1766 1780 16 5.7 176.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 3669 3688 26 2.7 366.9 1.0X -months_between wholestage on 3687 3819 181 2.7 368.7 1.0X +months_between wholestage off 5856 5858 2 1.7 585.6 1.0X +months_between wholestage on 5799 5815 14 1.7 579.9 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 1147 1148 1 0.9 1146.6 1.0X -window wholestage on 16997 17207 226 0.1 16996.7 0.1X +window wholestage off 2017 2147 183 0.5 2017.4 1.0X +window wholestage on 47789 47910 91 0.0 47788.6 0.0X ================================================================================================ Truncation ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 1824 1859 50 5.5 182.4 1.0X -date_trunc YEAR wholestage on 1844 1942 71 5.4 184.4 1.0X +date_trunc YEAR wholestage off 2689 2689 1 3.7 268.9 1.0X +date_trunc YEAR wholestage on 2655 2670 17 3.8 265.5 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) 
Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 1808 1815 11 5.5 180.8 1.0X -date_trunc YYYY wholestage on 1833 1864 49 5.5 183.3 1.0X +date_trunc YYYY wholestage off 2698 2700 3 3.7 269.8 1.0X +date_trunc YYYY wholestage on 2654 2660 6 3.8 265.4 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 1867 1883 23 5.4 186.7 1.0X -date_trunc YY wholestage on 1843 1861 15 5.4 184.3 1.0X +date_trunc YY wholestage off 2692 2697 7 3.7 269.2 1.0X +date_trunc YY wholestage on 2653 2662 7 3.8 265.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 1845 1858 18 5.4 184.5 1.0X -date_trunc MON wholestage on 1830 1893 42 5.5 183.0 1.0X +date_trunc MON wholestage off 2752 2756 6 3.6 275.2 1.0X +date_trunc MON wholestage on 2666 2675 15 3.8 266.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 1822 1855 47 5.5 182.2 1.0X -date_trunc MONTH wholestage on 1832 1863 20 5.5 183.2 1.0X +date_trunc MONTH wholestage off 2743 2746 4 3.6 274.3 1.0X +date_trunc MONTH wholestage on 2667 2673 8 3.7 266.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 1843 1848 7 5.4 184.3 1.0X -date_trunc MM wholestage on 1886 1905 14 5.3 188.6 1.0X +date_trunc MM wholestage off 2741 2741 1 3.6 274.1 1.0X +date_trunc MM wholestage on 2670 2678 7 3.7 267.0 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 1542 1545 4 6.5 154.2 1.0X -date_trunc DAY wholestage on 1610 1616 5 6.2 161.0 1.0X +date_trunc DAY wholestage off 2338 2342 6 4.3 233.8 1.0X +date_trunc DAY wholestage on 2269 2277 7 4.4 226.9 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 1521 1529 11 6.6 152.1 1.0X -date_trunc DD wholestage on 1595 1611 21 6.3 159.5 1.0X +date_trunc DD wholestage off 2324 2325 1 4.3 232.4 1.0X +date_trunc DD wholestage on 2270 2273 2 4.4 227.0 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 1496 1543 67 6.7 149.6 1.0X -date_trunc HOUR wholestage on 1567 1594 18 6.4 156.7 1.0X +date_trunc HOUR wholestage off 2325 2326 1 4.3 232.5 1.0X +date_trunc HOUR wholestage on 2284 2295 8 4.4 228.4 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 230 230 1 43.5 23.0 1.0X -date_trunc MINUTE wholestage on 288 295 7 34.7 28.8 0.8X +date_trunc MINUTE wholestage off 407 408 0 24.5 40.7 1.0X +date_trunc MINUTE wholestage on 382 386 3 26.1 38.2 1.1X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 247 249 4 40.5 24.7 1.0X -date_trunc SECOND wholestage on 297 314 12 33.6 29.7 0.8X +date_trunc SECOND wholestage off 404 404 1 24.8 40.4 1.0X +date_trunc SECOND wholestage on 386 390 4 25.9 38.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage 
off 1786 1788 3 5.6 178.6 1.0X -date_trunc WEEK wholestage on 1786 1832 46 5.6 178.6 1.0X +date_trunc WEEK wholestage off 2693 2694 2 3.7 269.3 1.0X +date_trunc WEEK wholestage on 2619 2629 10 3.8 261.9 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 2319 2365 66 4.3 231.9 1.0X -date_trunc QUARTER wholestage on 2424 2551 182 4.1 242.4 1.0X +date_trunc QUARTER wholestage off 3454 3466 17 2.9 345.4 1.0X +date_trunc QUARTER wholestage on 3384 3404 24 3.0 338.4 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 180 189 12 55.5 18.0 1.0X -trunc year wholestage on 271 277 5 36.9 27.1 0.7X +trunc year wholestage off 339 340 1 29.5 33.9 1.0X +trunc year wholestage on 337 347 9 29.7 33.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 189 191 4 52.9 18.9 1.0X -trunc yyyy wholestage on 276 284 6 36.2 27.6 0.7X +trunc yyyy wholestage off 347 348 2 28.8 34.7 1.0X +trunc yyyy wholestage on 334 335 2 29.9 33.4 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 189 190 1 52.9 18.9 1.0X -trunc yy wholestage on 279 294 11 35.9 27.9 0.7X +trunc yy wholestage off 339 346 11 29.5 33.9 1.0X +trunc yy wholestage on 333 338 5 30.0 33.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 185 186 1 54.1 18.5 1.0X -trunc mon wholestage on 272 285 13 36.8 27.2 0.7X +trunc mon wholestage off 339 347 11 29.5 33.9 1.0X +trunc mon wholestage on 331 336 4 30.2 33.1 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS 
X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 190 190 1 52.6 19.0 1.0X -trunc month wholestage on 293 300 4 34.1 29.3 0.6X +trunc month wholestage off 341 344 3 29.3 34.1 1.0X +trunc month wholestage on 332 338 9 30.1 33.2 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 178 182 6 56.3 17.8 1.0X -trunc mm wholestage on 306 312 5 32.7 30.6 0.6X +trunc mm wholestage off 337 338 1 29.6 33.7 1.0X +trunc mm wholestage on 332 336 5 30.1 33.2 1.0X ================================================================================================ Parsing ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 111 117 8 9.0 110.9 1.0X -to timestamp str wholestage on 101 109 6 9.9 100.6 1.1X +to timestamp str wholestage off 184 187 4 5.4 183.9 1.0X +to timestamp str wholestage on 159 162 2 6.3 159.4 1.2X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 735 746 15 1.4 734.9 1.0X -to_timestamp wholestage on 708 725 11 1.4 708.2 1.0X +to_timestamp wholestage off 1683 1689 8 0.6 1683.3 1.0X +to_timestamp wholestage on 1722 1725 4 0.6 1721.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 718 727 12 1.4 717.9 1.0X -to_unix_timestamp wholestage on 739 755 12 1.4 739.1 1.0X +to_unix_timestamp wholestage off 1733 1736 4 0.6 1733.1 1.0X +to_unix_timestamp wholestage on 1687 1690 4 0.6 1686.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) 
Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 124 125 1 8.0 124.4 1.0X -to date str wholestage on 134 138 3 7.5 133.9 0.9X +to date str wholestage off 218 220 4 4.6 217.6 1.0X +to date str wholestage on 213 215 2 4.7 212.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 1510 1544 48 0.7 1510.4 1.0X -to_date wholestage on 1544 1557 15 0.6 1544.2 1.0X +to_date wholestage off 3697 3699 2 0.3 3697.2 1.0X +to_date wholestage on 3603 3624 15 0.3 3602.7 1.0X ================================================================================================ Conversion from/to external types ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 269 278 9 18.6 53.7 1.0X -From java.time.LocalDate 227 234 7 22.0 45.4 1.2X -Collect java.sql.Date 1164 1272 141 4.3 232.8 0.2X -Collect java.time.LocalDate 1070 1130 59 4.7 214.1 0.3X -From java.sql.Timestamp 246 248 2 20.3 49.2 1.1X -From java.time.Instant 214 216 2 23.4 42.8 1.3X -Collect longs 814 831 15 6.1 162.7 0.3X -Collect java.sql.Timestamp 1016 1096 78 4.9 203.2 0.3X -Collect java.time.Instant 1012 1093 86 4.9 202.4 0.3X +From java.sql.Date 432 436 7 11.6 86.4 1.0X +From java.time.LocalDate 343 347 6 14.6 68.6 1.3X +Collect java.sql.Date 1888 2453 971 2.6 377.6 0.2X +Collect java.time.LocalDate 1779 1820 42 2.8 355.7 0.2X +From java.sql.Timestamp 375 384 9 13.3 75.0 1.2X +From java.time.Instant 317 326 8 15.8 63.5 1.4X +Collect longs 1338 1428 115 3.7 267.6 0.3X +Collect java.sql.Timestamp 1716 2014 281 2.9 343.1 0.3X +Collect java.time.Instant 1832 1970 122 2.7 366.5 0.2X diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index dba6c909be637..3ef2f922f95bf 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -2,456 +2,456 @@ datetime +/- interval ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 1003 1012 13 0.0 334353721.7 1.0X -date + interval(m, d) 1154 1174 29 0.0 384575202.7 0.9X -date + interval(m, d, ms) 4338 4366 40 0.0 1446002701.3 0.2X -date - interval(m) 850 858 8 0.0 283424914.7 1.2X -date - interval(m, d) 1017 1031 19 0.0 339034354.7 1.0X -date - interval(m, d, ms) 4699 4717 25 0.0 1566218686.3 0.2X -timestamp + interval(m) 2044 2046 3 0.0 681382301.0 0.5X -timestamp + interval(m, d) 2215 2249 48 0.0 738464286.7 0.5X -timestamp + interval(m, d, ms) 2053 2063 13 0.0 684393366.0 0.5X -timestamp - interval(m) 1668 1677 12 0.0 556138256.7 0.6X -timestamp - interval(m, d) 1865 1882 25 0.0 621574795.3 0.5X -timestamp - interval(m, d, ms) 2075 2077 3 0.0 691569937.3 0.5X +date + interval(m) 1613 1622 13 6.2 161.3 1.0X +date + interval(m, d) 1729 1752 32 5.8 172.9 0.9X +date + interval(m, d, ms) 6421 6424 5 1.6 642.1 0.3X +date - interval(m) 1441 1443 2 6.9 144.1 1.1X +date - interval(m, d) 1687 1689 2 5.9 168.7 1.0X +date - interval(m, d, ms) 6617 6625 11 1.5 661.7 0.2X +timestamp + interval(m) 2713 2733 28 3.7 271.3 0.6X +timestamp + interval(m, d) 3027 3032 8 3.3 302.7 0.5X +timestamp + interval(m, d, ms) 3501 3509 12 2.9 350.1 0.5X +timestamp - interval(m) 2892 2895 4 3.5 289.2 0.6X +timestamp - interval(m, d) 3190 3196 9 3.1 319.0 0.5X +timestamp - interval(m, d, ms) 3497 3500 5 2.9 349.7 0.5X ================================================================================================ Extract components ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 192 192 0 52.2 19.2 1.0X -cast to timestamp wholestage on 163 166 3 61.3 16.3 1.2X +cast to timestamp wholestage off 321 323 2 31.1 32.1 1.0X +cast to timestamp wholestage on 294 306 10 34.0 29.4 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 743 745 4 13.5 74.3 1.0X -year of timestamp wholestage on 708 715 5 14.1 70.8 1.0X +year of timestamp wholestage off 1235 1242 9 8.1 123.5 1.0X +year of timestamp wholestage on 1208 1216 8 8.3 120.8 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 848 
857 12 11.8 84.8 1.0X -quarter of timestamp wholestage on 803 813 6 12.5 80.3 1.1X +quarter of timestamp wholestage off 1415 1424 12 7.1 141.5 1.0X +quarter of timestamp wholestage on 1338 1341 4 7.5 133.8 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 740 745 7 13.5 74.0 1.0X -month of timestamp wholestage on 703 710 5 14.2 70.3 1.1X +month of timestamp wholestage off 1224 1225 1 8.2 122.4 1.0X +month of timestamp wholestage on 1193 1202 8 8.4 119.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1162 1182 28 8.6 116.2 1.0X -weekofyear of timestamp wholestage on 1093 1102 9 9.2 109.3 1.1X +weekofyear of timestamp wholestage off 1864 1866 3 5.4 186.4 1.0X +weekofyear of timestamp wholestage on 1827 1840 7 5.5 182.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 742 748 9 13.5 74.2 1.0X -day of timestamp wholestage on 703 713 7 14.2 70.3 1.1X +day of timestamp wholestage off 1209 1211 2 8.3 120.9 1.0X +day of timestamp wholestage on 1191 1194 6 8.4 119.1 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 791 799 11 12.6 79.1 1.0X -dayofyear of timestamp wholestage on 732 744 9 13.7 73.2 1.1X +dayofyear of timestamp wholestage off 1270 1271 2 7.9 127.0 1.0X +dayofyear of timestamp wholestage on 1241 1250 12 8.1 124.1 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 738 752 20 13.6 73.8 1.0X -dayofmonth of timestamp 
wholestage on 695 712 9 14.4 69.5 1.1X +dayofmonth of timestamp wholestage off 1236 1250 20 8.1 123.6 1.0X +dayofmonth of timestamp wholestage on 1193 1195 3 8.4 119.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 854 856 3 11.7 85.4 1.0X -dayofweek of timestamp wholestage on 819 839 16 12.2 81.9 1.0X +dayofweek of timestamp wholestage off 1402 1405 4 7.1 140.2 1.0X +dayofweek of timestamp wholestage on 1352 1359 7 7.4 135.2 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 816 821 7 12.3 81.6 1.0X -weekday of timestamp wholestage on 788 800 8 12.7 78.8 1.0X +weekday of timestamp wholestage off 1346 1347 2 7.4 134.6 1.0X +weekday of timestamp wholestage on 1294 1299 7 7.7 129.4 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 595 595 1 16.8 59.5 1.0X -hour of timestamp wholestage on 533 541 10 18.8 53.3 1.1X +hour of timestamp wholestage off 1000 1008 11 10.0 100.0 1.0X +hour of timestamp wholestage on 936 941 6 10.7 93.6 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 585 588 4 17.1 58.5 1.0X -minute of timestamp wholestage on 532 545 11 18.8 53.2 1.1X +minute of timestamp wholestage off 969 976 10 10.3 96.9 1.0X +minute of timestamp wholestage on 933 936 4 10.7 93.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 579 589 13 17.3 57.9 1.0X -second of timestamp wholestage on 529 537 6 18.9 52.9 1.1X +second of timestamp 
wholestage off 1002 1005 3 10.0 100.2 1.0X +second of timestamp wholestage on 935 938 2 10.7 93.5 1.1X ================================================================================================ Current date and time ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 171 174 4 58.4 17.1 1.0X -current_date wholestage on 152 155 3 65.6 15.2 1.1X +current_date wholestage off 308 316 11 32.5 30.8 1.0X +current_date wholestage on 265 275 12 37.8 26.5 1.2X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 178 181 4 56.2 17.8 1.0X -current_timestamp wholestage on 138 149 7 72.6 13.8 1.3X +current_timestamp wholestage off 307 312 7 32.6 30.7 1.0X +current_timestamp wholestage on 263 268 5 38.1 26.3 1.2X ================================================================================================ Date arithmetic ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 630 640 14 15.9 63.0 1.0X -cast to date wholestage on 591 594 5 16.9 59.1 1.1X +cast to date wholestage off 1061 1065 5 9.4 106.1 1.0X +cast to date wholestage on 985 991 11 10.2 98.5 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 759 778 26 13.2 75.9 1.0X -last_day wholestage on 727 736 9 13.8 72.7 1.0X +last_day wholestage off 1261 1262 1 7.9 126.1 1.0X +last_day wholestage on 1223 1235 12 8.2 122.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 649 659 15 15.4 64.9 1.0X -next_day wholestage on 628 629 1 15.9 62.8 1.0X +next_day wholestage off 1114 1119 7 9.0 111.4 1.0X +next_day wholestage on 1034 1038 3 9.7 103.4 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 621 622 1 16.1 62.1 1.0X -date_add wholestage on 600 606 6 16.7 60.0 1.0X +date_add wholestage off 1059 1076 25 9.4 105.9 1.0X +date_add wholestage on 1012 1021 9 9.9 101.2 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 611 626 21 16.4 61.1 1.0X -date_sub wholestage on 588 600 7 17.0 58.8 1.0X +date_sub wholestage off 1046 1046 0 9.6 104.6 1.0X +date_sub wholestage on 1019 1023 3 9.8 101.9 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 843 845 2 11.9 84.3 1.0X -add_months wholestage on 818 831 11 12.2 81.8 1.0X +add_months wholestage off 1392 1393 1 7.2 139.2 1.0X +add_months wholestage on 1335 1346 14 7.5 133.5 1.0X ================================================================================================ Formatting dates ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 3557 3569 18 2.8 355.7 1.0X -format date wholestage on 3564 3588 17 2.8 356.4 1.0X +format date wholestage off 5959 5994 50 1.7 595.9 1.0X +format date wholestage on 5991 6008 28 1.7 599.1 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) 
Xeon(R) CPU E5-2670 v2 @ 2.50GHz from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 4875 4887 17 2.1 487.5 1.0X -from_unixtime wholestage on 4845 4870 16 2.1 484.5 1.0X +from_unixtime wholestage off 8851 8872 29 1.1 885.1 1.0X +from_unixtime wholestage on 8855 8872 10 1.1 885.5 1.0X ================================================================================================ Convert timestamps ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 665 671 8 15.0 66.5 1.0X -from_utc_timestamp wholestage on 654 672 14 15.3 65.4 1.0X +from_utc_timestamp wholestage off 1105 1107 2 9.0 110.5 1.0X +from_utc_timestamp wholestage on 1072 1084 11 9.3 107.2 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 982 983 1 10.2 98.2 1.0X -to_utc_timestamp wholestage on 877 889 9 11.4 87.7 1.1X +to_utc_timestamp wholestage off 1531 1534 3 6.5 153.1 1.0X +to_utc_timestamp wholestage on 1451 1463 14 6.9 145.1 1.1X ================================================================================================ Intervals ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 200 206 9 50.1 20.0 1.0X -cast interval wholestage on 157 163 5 63.6 15.7 1.3X +cast interval wholestage off 360 366 8 27.8 36.0 1.0X +cast interval wholestage on 286 292 7 35.0 28.6 1.3X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1065 1068 4 9.4 106.5 1.0X -datediff wholestage on 1028 1047 15 9.7 102.8 1.0X +datediff wholestage off 1809 1814 8 5.5 180.9 1.0X +datediff wholestage on 1742 1751 8 5.7 174.2 
1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 3102 3111 13 3.2 310.2 1.0X -months_between wholestage on 2970 3028 46 3.4 297.0 1.0X +months_between wholestage off 5007 5007 1 2.0 500.7 1.0X +months_between wholestage on 4957 4980 35 2.0 495.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 1142 1154 16 0.9 1142.2 1.0X -window wholestage on 14817 15049 257 0.1 14816.5 0.1X +window wholestage off 1945 2027 116 0.5 1945.3 1.0X +window wholestage on 45637 45648 8 0.0 45637.2 0.0X ================================================================================================ Truncation ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 1516 1518 2 6.6 151.6 1.0X -date_trunc YEAR wholestage on 1458 1468 9 6.9 145.8 1.0X +date_trunc YEAR wholestage off 2463 2465 2 4.1 246.3 1.0X +date_trunc YEAR wholestage on 2406 2409 3 4.2 240.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 1535 1535 1 6.5 153.5 1.0X -date_trunc YYYY wholestage on 1453 1461 7 6.9 145.3 1.1X +date_trunc YYYY wholestage off 2462 2463 1 4.1 246.2 1.0X +date_trunc YYYY wholestage on 2407 2411 6 4.2 240.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 1561 1567 9 6.4 156.1 1.0X -date_trunc YY wholestage on 1452 1467 16 6.9 145.2 1.1X +date_trunc YY wholestage off 2462 2466 6 4.1 246.2 1.0X +date_trunc YY wholestage on 2401 2406 4 4.2 240.1 
1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 1522 1531 13 6.6 152.2 1.0X -date_trunc MON wholestage on 1458 1467 7 6.9 145.8 1.0X +date_trunc MON wholestage off 2437 2437 0 4.1 243.7 1.0X +date_trunc MON wholestage on 2416 2421 6 4.1 241.6 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 1518 1519 0 6.6 151.8 1.0X -date_trunc MONTH wholestage on 1452 1465 16 6.9 145.2 1.0X +date_trunc MONTH wholestage off 2430 2437 9 4.1 243.0 1.0X +date_trunc MONTH wholestage on 2417 2423 5 4.1 241.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 1531 1532 1 6.5 153.1 1.0X -date_trunc MM wholestage on 1453 1463 8 6.9 145.3 1.1X +date_trunc MM wholestage off 2429 2431 3 4.1 242.9 1.0X +date_trunc MM wholestage on 2417 2421 4 4.1 241.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 1287 1309 31 7.8 128.7 1.0X -date_trunc DAY wholestage on 1310 1337 16 7.6 131.0 1.0X +date_trunc DAY wholestage off 2074 2075 2 4.8 207.4 1.0X +date_trunc DAY wholestage on 2001 2010 16 5.0 200.1 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 1322 1328 9 7.6 132.2 1.0X -date_trunc DD wholestage on 1282 1324 28 7.8 128.2 1.0X +date_trunc DD wholestage off 2067 2067 0 4.8 206.7 1.0X +date_trunc DD wholestage on 2000 2003 3 5.0 200.0 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 
4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 1379 1393 20 7.3 137.9 1.0X -date_trunc HOUR wholestage on 1288 1302 11 7.8 128.8 1.1X +date_trunc HOUR wholestage off 2074 2084 14 4.8 207.4 1.0X +date_trunc HOUR wholestage on 2057 2067 10 4.9 205.7 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 243 245 2 41.2 24.3 1.0X -date_trunc MINUTE wholestage on 213 219 8 47.0 21.3 1.1X +date_trunc MINUTE wholestage off 362 364 3 27.6 36.2 1.0X +date_trunc MINUTE wholestage on 319 333 14 31.3 31.9 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 238 245 11 42.1 23.8 1.0X -date_trunc SECOND wholestage on 201 210 9 49.7 20.1 1.2X +date_trunc SECOND wholestage off 361 366 7 27.7 36.1 1.0X +date_trunc SECOND wholestage on 324 341 23 30.9 32.4 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 1443 1477 49 6.9 144.3 1.0X -date_trunc WEEK wholestage on 1491 1516 17 6.7 149.1 1.0X +date_trunc WEEK wholestage off 2385 2393 11 4.2 238.5 1.0X +date_trunc WEEK wholestage on 2313 2322 6 4.3 231.3 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 2017 2039 32 5.0 201.7 1.0X -date_trunc QUARTER wholestage on 1966 2005 36 5.1 196.6 1.0X +date_trunc QUARTER wholestage off 3278 3280 2 3.1 327.8 1.0X +date_trunc QUARTER wholestage on 3228 3234 8 3.1 322.8 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 206 206 1 48.6 20.6 1.0X -trunc year wholestage on 175 178 2 57.2 17.5 1.2X +trunc year wholestage off 328 331 4 30.5 32.8 1.0X +trunc year wholestage on 286 295 9 35.0 28.6 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 188 189 2 53.2 18.8 1.0X -trunc yyyy wholestage on 176 180 4 56.9 17.6 1.1X +trunc yyyy wholestage off 317 319 3 31.5 31.7 1.0X +trunc yyyy wholestage on 283 287 6 35.3 28.3 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 191 191 0 52.4 19.1 1.0X -trunc yy wholestage on 175 180 4 57.0 17.5 1.1X +trunc yy wholestage off 321 321 0 31.1 32.1 1.0X +trunc yy wholestage on 284 293 11 35.2 28.4 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 203 205 3 49.3 20.3 1.0X -trunc mon wholestage on 183 186 2 54.8 18.3 1.1X +trunc mon wholestage off 318 319 1 31.4 31.8 1.0X +trunc mon wholestage on 283 287 4 35.4 28.3 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 199 199 0 50.3 19.9 1.0X -trunc month wholestage on 177 179 2 56.4 17.7 1.1X +trunc month wholestage off 319 321 3 31.3 31.9 1.0X +trunc month wholestage on 286 293 7 35.0 28.6 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 198 198 1 50.5 19.8 1.0X -trunc mm wholestage on 180 183 3 55.7 18.0 1.1X +trunc mm wholestage off 317 319 2 31.5 31.7 1.0X +trunc mm wholestage on 282 285 3 35.4 28.2 1.1X 
================================================================================================ Parsing ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 138 139 2 7.2 138.2 1.0X -to timestamp str wholestage on 129 138 7 7.8 128.9 1.1X +to timestamp str wholestage off 219 220 0 4.6 219.4 1.0X +to timestamp str wholestage on 214 218 6 4.7 214.1 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 885 889 5 1.1 885.3 1.0X -to_timestamp wholestage on 854 866 10 1.2 854.0 1.0X +to_timestamp wholestage off 1912 1913 2 0.5 1912.0 1.0X +to_timestamp wholestage on 1671 1675 7 0.6 1670.8 1.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 848 856 12 1.2 848.1 1.0X -to_unix_timestamp wholestage on 826 850 18 1.2 826.4 1.0X +to_unix_timestamp wholestage off 1761 1763 3 0.6 1761.1 1.0X +to_unix_timestamp wholestage on 1695 1697 2 0.6 1695.4 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 167 171 5 6.0 167.2 1.0X -to date str wholestage on 165 173 4 6.1 165.0 1.0X +to date str wholestage off 267 272 7 3.7 266.9 1.0X +to date str wholestage on 266 267 2 3.8 265.8 1.0X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 1612 1633 31 0.6 1611.7 1.0X -to_date wholestage on 1588 1605 19 0.6 1588.2 1.0X +to_date wholestage off 3705 3743 55 0.3 3704.6 1.0X +to_date wholestage on 3736 3746 11 0.3 3736.4 1.0X 
================================================================================================ Conversion from/to external types ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 -Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 245 247 1 20.4 49.0 1.0X -From java.time.LocalDate 228 233 4 21.9 45.6 1.1X -Collect java.sql.Date 1239 1361 209 4.0 247.9 0.2X -Collect java.time.LocalDate 1049 1107 54 4.8 209.8 0.2X -From java.sql.Timestamp 247 252 4 20.2 49.5 1.0X -From java.time.Instant 156 158 3 32.1 31.2 1.6X -Collect longs 854 910 59 5.9 170.8 0.3X -Collect java.sql.Timestamp 1133 1140 12 4.4 226.6 0.2X -Collect java.time.Instant 1108 1159 74 4.5 221.7 0.2X +From java.sql.Date 400 406 6 12.5 80.1 1.0X +From java.time.LocalDate 343 349 7 14.6 68.6 1.2X +Collect java.sql.Date 1904 2739 1170 2.6 380.9 0.2X +Collect java.time.LocalDate 1477 1495 19 3.4 295.3 0.3X +From java.sql.Timestamp 376 388 10 13.3 75.2 1.1X +From java.time.Instant 237 239 3 21.1 47.4 1.7X +Collect longs 1258 1356 111 4.0 251.7 0.3X +Collect java.sql.Timestamp 1878 1937 64 2.7 375.6 0.2X +Collect java.time.Instant 1667 1904 238 3.0 333.4 0.2X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala index 0034819b58893..f56efa3bba600 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala @@ -61,7 +61,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> LA.getId) { val N = 10000000 runBenchmark("datetime +/- interval") { - val benchmark = new Benchmark("datetime +/- interval", 3, output = output) + val benchmark = new Benchmark("datetime +/- interval", N, output = output) val ts = "cast(id as timestamp)" val dt = s"cast($ts as date)" benchmark.addCase("date + interval(m)") { _ => From da32137d37d032a3fd2507b229e3f494cc612246 Mon Sep 17 00:00:00 2001 From: Tianshi Zhu Date: Mon, 4 May 2020 14:50:38 +0900 Subject: [PATCH 07/19] [SPARK-31267][SQL] Flaky test: WholeStageCodegenSparkSubmitSuite.Generated code on driver should not embed platform-specific constant ### What changes were proposed in this pull request? Allow customized timeouts for `runSparkSubmit`, which will make flaky tests more likely to pass by using a larger timeout value. I was able to reproduce the test failure on my laptop, where the test took 1.5-2 minutes to finish. After increasing the timeout, the test now passes locally. ### Why are the changes needed? This allows slow tests to use a larger timeout, so they are more likely to succeed. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? The test was able to pass on my local env after the change. Closes #28438 from tianshizz/SPARK-31267.
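For reference, the change boils down to threading a ScalaTest `Span` through to `failAfter` instead of hard-coding one minute. A minimal sketch of the pattern (the `runProcess` helper below is a hypothetical stand-in, not Spark's actual code):

```scala
import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits}
import org.scalatest.time.Span
import org.scalatest.time.SpanSugar._

// Sketch: make the timeout a parameter with a sensible default
// instead of hard-coding it at the call site.
object TimeoutSketch extends TimeLimits {
  // failAfter needs a Signaler to interrupt the stalled thread.
  implicit val signaler: Signaler = ThreadSignaler

  // Hypothetical helper mirroring the new runSparkSubmit signature.
  def runProcess(cmd: Seq[String], timeout: Span = 1.minute): Int = {
    val process = new ProcessBuilder(cmd: _*).inheritIO().start()
    // Fails the test if the process is still running when `timeout` elapses.
    failAfter(timeout) { process.waitFor() }
  }
}

// A known-slow test can then opt in to a larger timeout:
// TimeoutSketch.runProcess(Seq("./bin/spark-submit", "--version"), 3.minutes)
```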
Authored-by: Tianshi Zhu Signed-off-by: HyukjinKwon --- .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 5 +++-- .../sql/execution/WholeStageCodegenSparkSubmitSuite.scala | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 1f3243400a918..fd2d1f56ed9b6 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -31,6 +31,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FSDataInputStream, Path} import org.scalatest.{BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} +import org.scalatest.time.Span import org.scalatest.time.SpanSugar._ import org.apache.spark._ @@ -1419,7 +1420,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { implicit val defaultSignaler: Signaler = ThreadSignaler // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. - def runSparkSubmit(args: Seq[String], root: String = ".."): Unit = { + def runSparkSubmit(args: Seq[String], root: String = "..", timeout: Span = 1.minute): Unit = { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val sparkSubmitFile = if (Utils.isWindows) { new File(s"$root\\bin\\spark-submit.cmd") @@ -1432,7 +1433,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) try { - val exitCode = failAfter(1.minute) { process.waitFor() } + val exitCode = failAfter(timeout) { process.waitFor() } if (exitCode != 0) { fail(s"Process returned with exit code $exitCode. See the log4j logs for more detail.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala index f6814d8ff8a3d..c5a01de911962 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.TimeLimits +import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkFunSuite, TestUtils} import org.apache.spark.deploy.SparkSubmitSuite @@ -50,7 +51,7 @@ class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", "--conf", "spark.sql.adaptive.enabled=false", unusedJar.toString) - SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") + SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..", 3.minutes) } } From 065871c30f8d505125f5197aa8ce6691ee8af92b Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 4 May 2020 14:59:33 +0900 Subject: [PATCH 08/19] [SPARK-31626][SQL] Port HIVE-10415: hive.start.cleanup.scratchdir configuration is not taking effect ### What changes were proposed in this pull request? This PR ports [HIVE-10415](https://issues.apache.org/jira/browse/HIVE-10415): the `hive.start.cleanup.scratchdir` configuration is not taking effect. ### Why are the changes needed?
I encountered this issue: ![image](https://user-images.githubusercontent.com/5399861/80869375-aeafd080-8cd2-11ea-8573-93ec4b422be1.png) I'd like to make `hive.start.cleanup.scratchdir` take effect so this issue occurs less often. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test Closes #28436 from wangyum/SPARK-31626. Authored-by: Yuming Wang Signed-off-by: HyukjinKwon --- .../hive/thriftserver/HiveThriftServer2.scala | 3 ++ .../HiveThriftServer2Suites.scala | 33 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index f15193b0dc3cc..f9f2ceeed8a75 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.thriftserver import java.util.Locale import java.util.concurrent.atomic.AtomicBoolean +import org.apache.hadoop.hive.common.ServerUtils import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} @@ -101,6 +102,8 @@ object HiveThriftServer2 extends Logging { SparkSQLEnv.sqlContext.sessionState.newHadoopConf()) try { + // Clean up the scratch dir before starting + ServerUtils.cleanUpScratchDir(executionHive.conf) val server = new HiveThriftServer2(SparkSQLEnv.sqlContext) server.init(executionHive.conf) server.start() diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 639dc4d13f673..0cec63460814c 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -924,6 +924,39 @@ class SingleSessionSuite extends HiveThriftJdbcTest { } } +class HiveThriftCleanUpScratchDirSuite extends HiveThriftJdbcTest { + var tempScratchDir: File = _ + + override protected def beforeAll(): Unit = { + tempScratchDir = Utils.createTempDir() + tempScratchDir.setWritable(true, false) + assert(tempScratchDir.list().isEmpty) + new File(tempScratchDir.getAbsolutePath + File.separator + "SPARK-31626").createNewFile() + assert(tempScratchDir.list().nonEmpty) + super.beforeAll() + } + + override def mode: ServerMode.Value = ServerMode.binary + + override protected def extraConf: Seq[String] = + s" --hiveconf ${ConfVars.HIVE_START_CLEANUP_SCRATCHDIR}=true " :: + s"--hiveconf ${ConfVars.SCRATCHDIR}=${tempScratchDir.getAbsolutePath}" :: Nil + + test("Cleanup the Hive scratchdir when starting the Hive Server") { + assert(!tempScratchDir.exists()) + withJdbcStatement() { statement => + val rs = statement.executeQuery("SELECT id FROM range(1)") + assert(rs.next()) + assert(rs.getLong(1) === 0L) + } + } + + override protected def afterAll(): Unit = { + Utils.deleteRecursively(tempScratchDir) + super.afterAll() + } +} + class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { override def mode: ServerMode.Value = ServerMode.http From 931c0bcd95087716c633eb9fd065d580a1d47353 Mon Sep 17 00:00:00 2001
From: Wenchen Fan Date: Mon, 4 May 2020 15:30:10 +0900 Subject: [PATCH 09/19] [SPARK-31606][SQL] Reduce the perf regression of vectorized parquet reader caused by datetime rebase ### What changes were proposed in this pull request? Push the rebase logic to the lower level of the parquet vectorized reader, to make the final code more vectorization-friendly. ### Why are the changes needed? The Parquet vectorized reader is carefully implemented to make it more likely to be vectorized by the JVM. However, the newly added datetime rebase degrades performance a lot, as it breaks vectorization even if the datetime values don't need rebasing (which is very likely, as dates before 1582 are rare). ### Does this PR introduce any user-facing change? No ### How was this patch tested? Run part of the `DateTimeRebaseBenchmark` locally. The results: Before this patch
```
[info] Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] after 1582, vec on, rebase off 2677 2838 142 37.4 26.8 1.0X
[info] after 1582, vec on, rebase on 3828 4331 805 26.1 38.3 0.7X
[info] before 1582, vec on, rebase off 2903 2926 34 34.4 29.0 0.9X
[info] before 1582, vec on, rebase on 4163 4197 38 24.0 41.6 0.6X
[info] Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] after 1900, vec on, rebase off 3537 3627 104 28.3 35.4 1.0X
[info] after 1900, vec on, rebase on 6891 7010 105 14.5 68.9 0.5X
[info] before 1900, vec on, rebase off 3692 3770 72 27.1 36.9 1.0X
[info] before 1900, vec on, rebase on 7588 7610 30 13.2 75.9 0.5X
```
After this patch
```
[info] Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] after 1582, vec on, rebase off 2758 2944 197 36.3 27.6 1.0X
[info] after 1582, vec on, rebase on 2908 2966 51 34.4 29.1 0.9X
[info] before 1582, vec on, rebase off 2840 2878 37 35.2 28.4 1.0X
[info] before 1582, vec on, rebase on 3407 3433 24 29.4 34.1 0.8X
[info] Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------------------------------
[info] after 1900, vec on, rebase off 3861 4003 139 25.9 38.6 1.0X
[info] after 1900, vec on, rebase on 4194 4283 77 23.8 41.9 0.9X
[info] before 1900, vec on, rebase off 3849 3937 79 26.0 38.5 1.0X
[info] before 1900, vec on, rebase on 7512 7546 55 13.3 75.1 0.5X
```
Date type is 30% faster if the values don't need rebasing and 20% faster if they do. Timestamp type is 60% faster if the values don't need rebasing, with no difference if they do. Closes #28406 from cloud-fan/perf.
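The shape of the optimization, as a hedged Scala sketch (the constant mirrors the `lastSwitchJulianDay` value this commit adds to `RebaseDateTime`; the rebase function body is a placeholder, and the real change lives in the Java vectorized readers):

```scala
// Sketch only: push the rebase decision below the per-batch loop so the
// common no-rebase path stays a tight loop the JIT can auto-vectorize.
object RebaseSketch {
  // Stand-in for RebaseDateTime.lastSwitchJulianDay: days at or after the
  // Julian-to-Gregorian switch are identical in both calendars.
  val lastSwitchJulianDay: Int = -141427

  // Placeholder for the real (expensive) per-value rebase.
  def rebaseJulianToGregorianDays(days: Int): Int = days

  def readDates(src: Array[Int], dst: Array[Int], rebase: Boolean): Unit = {
    var i = 0
    if (!rebase) {
      // Common case: a plain copy loop, vectorization-friendly.
      while (i < src.length) { dst(i) = src(i); i += 1 }
    } else {
      while (i < src.length) {
        val days = src(i)
        // Only dates before the calendar switch actually differ.
        dst(i) = if (days < lastSwitchJulianDay) rebaseJulianToGregorianDays(days) else days
        i += 1
      }
    }
  }
}
```

Even when `rebase` is on, the per-value branch is highly predictable for typical data, since almost all days fall after the switch; this matches the benchmark numbers above, where the rebase-on date path loses little over rebase-off.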
Lead-authored-by: Wenchen Fan Co-authored-by: Maxim Gekk Signed-off-by: HyukjinKwon --- .../sql/catalyst/util/RebaseDateTime.scala | 4 + .../DateTimeRebaseBenchmark-jdk11-results.txt | 104 +++++++++--------- .../DateTimeRebaseBenchmark-results.txt | 104 +++++++++--------- .../parquet/VectorizedColumnReader.java | 22 +--- .../parquet/VectorizedPlainValuesReader.java | 55 +++++++++ .../parquet/VectorizedRleValuesReader.java | 85 ++++++++++++++ .../parquet/VectorizedValuesReader.java | 2 + .../benchmark/DateTimeRebaseBenchmark.scala | 79 ++++++------- 8 files changed, 295 insertions(+), 160 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala index 6848b0fa39c7c..040a97a14d451 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala @@ -71,6 +71,8 @@ object RebaseDateTime { -719164, -682945, -646420, -609895, -536845, -500320, -463795, -390745, -354220, -317695, -244645, -208120, -171595, -141427) + final val lastSwitchJulianDay: Int = julianGregDiffSwitchDay.last + // The first days of Common Era (CE) which is mapped to the '0001-01-01' date in Julian calendar. private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0) @@ -416,6 +418,8 @@ object RebaseDateTime { // in the interval: [julianGregDiffSwitchMicros(i), julianGregDiffSwitchMicros(i+1)) private val julianGregRebaseMap = loadRebaseRecords("julian-gregorian-rebase-micros.json") + final val lastSwitchJulianTs: Long = julianGregRebaseMap.values.map(_.switches.last).max + /** * An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]]. This method leverages * the pre-calculated rebasing maps to save calculation. 
If the rebasing map doesn't contain diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index 2a9322a4b462a..03e0d7b8bc575 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -2,93 +2,93 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 21171 21171 0 4.7 211.7 1.0X -before 1582, noop 11036 11036 0 9.1 110.4 1.9X -after 1582, rebase off 34321 34321 0 2.9 343.2 0.6X -after 1582, rebase on 33269 33269 0 3.0 332.7 0.6X -before 1582, rebase off 22016 22016 0 4.5 220.2 1.0X -before 1582, rebase on 23338 23338 0 4.3 233.4 0.9X +after 1582, noop 20073 20073 0 5.0 200.7 1.0X +before 1582, noop 10985 10985 0 9.1 109.9 1.8X +after 1582, rebase off 32245 32245 0 3.1 322.4 0.6X +after 1582, rebase on 31434 31434 0 3.2 314.3 0.6X +before 1582, rebase off 21590 21590 0 4.6 215.9 0.9X +before 1582, rebase on 22963 22963 0 4.4 229.6 0.9X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12791 13089 287 7.8 127.9 1.0X -after 1582, vec off, rebase on 13203 13271 81 7.6 132.0 1.0X -after 1582, vec on, rebase off 3709 3764 49 27.0 37.1 3.4X -after 1582, vec on, rebase on 5082 5114 29 19.7 50.8 2.5X -before 1582, vec off, rebase off 13059 13153 87 7.7 130.6 1.0X -before 1582, vec off, rebase on 14211 14236 27 7.0 142.1 0.9X -before 1582, vec on, rebase off 3687 3749 72 27.1 36.9 3.5X -before 1582, vec on, rebase on 5449 5497 56 18.4 54.5 2.3X +after 1582, vec off, rebase off 12815 12858 40 7.8 128.1 1.0X +after 1582, vec off, rebase on 13030 13167 148 7.7 130.3 1.0X +after 1582, vec on, rebase off 3705 3712 6 27.0 37.1 3.5X +after 1582, vec on, rebase on 3788 3791 3 26.4 37.9 3.4X +before 1582, vec off, rebase off 12873 12943 61 7.8 128.7 1.0X +before 1582, vec off, rebase on 14072 14165 80 7.1 140.7 0.9X +before 1582, vec on, rebase off 3694 3708 15 27.1 36.9 3.5X +before 1582, vec on, rebase on 4403 4484 81 22.7 44.0 2.9X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2831 2831 0 35.3 28.3 1.0X -before 1582, noop 2816 2816 0 35.5 28.2 1.0X -after 1582, rebase off 15543 15543 0 6.4 155.4 0.2X 
-after 1582, rebase on 18391 18391 0 5.4 183.9 0.2X -before 1582, rebase off 15747 15747 0 6.4 157.5 0.2X -before 1582, rebase on 18846 18846 0 5.3 188.5 0.2X +after 1900, noop 3032 3032 0 33.0 30.3 1.0X +before 1900, noop 3043 3043 0 32.9 30.4 1.0X +after 1900, rebase off 15634 15634 0 6.4 156.3 0.2X +after 1900, rebase on 18233 18233 0 5.5 182.3 0.2X +before 1900, rebase off 15820 15820 0 6.3 158.2 0.2X +before 1900, rebase on 19921 19921 0 5.0 199.2 0.2X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 16126 16216 78 6.2 161.3 1.0X -after 1582, vec off, rebase on 18277 18453 165 5.5 182.8 0.9X -after 1582, vec on, rebase off 5030 5067 42 19.9 50.3 3.2X -after 1582, vec on, rebase on 8553 8583 43 11.7 85.5 1.9X -before 1582, vec off, rebase off 15828 15872 39 6.3 158.3 1.0X -before 1582, vec off, rebase on 18899 18959 103 5.3 189.0 0.9X -before 1582, vec on, rebase off 4961 5009 43 20.2 49.6 3.3X -before 1582, vec on, rebase on 9099 9140 40 11.0 91.0 1.8X +after 1900, vec off, rebase off 14987 15008 18 6.7 149.9 1.0X +after 1900, vec off, rebase on 17500 17628 210 5.7 175.0 0.9X +after 1900, vec on, rebase off 5030 5036 7 19.9 50.3 3.0X +after 1900, vec on, rebase on 5066 5109 44 19.7 50.7 3.0X +before 1900, vec off, rebase off 15094 15213 121 6.6 150.9 1.0X +before 1900, vec off, rebase on 18098 18175 101 5.5 181.0 0.8X +before 1900, vec on, rebase off 5008 5012 4 20.0 50.1 3.0X +before 1900, vec on, rebase on 8803 8848 55 11.4 88.0 1.7X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 21026 21026 0 4.8 210.3 1.0X -before 1582, noop 11040 11040 0 9.1 110.4 1.9X -after 1582 28171 28171 0 3.5 281.7 0.7X -before 1582 18955 18955 0 5.3 189.5 1.1X +after 1582, noop 19593 19593 0 5.1 195.9 1.0X +before 1582, noop 10581 10581 0 9.5 105.8 1.9X +after 1582 27843 27843 0 3.6 278.4 0.7X +before 1582 19435 19435 0 5.1 194.4 1.0X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 10876 10931 49 9.2 108.8 1.0X -after 1582, vec on 3900 3913 20 25.6 39.0 2.8X -before 1582, vec off 11165 11174 12 9.0 111.6 1.0X -before 1582, vec on 4208 4214 7 23.8 42.1 2.6X +after 1582, vec off 
10395 10507 119 9.6 103.9 1.0X +after 1582, vec on 3921 3945 22 25.5 39.2 2.7X +before 1582, vec off 10762 10860 127 9.3 107.6 1.0X +before 1582, vec on 4194 4226 41 23.8 41.9 2.5X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2924 2924 0 34.2 29.2 1.0X -before 1582, noop 2820 2820 0 35.5 28.2 1.0X -after 1582 22228 22228 0 4.5 222.3 0.1X -before 1582 22590 22590 0 4.4 225.9 0.1X +after 1900, noop 3003 3003 0 33.3 30.0 1.0X +before 1900, noop 3016 3016 0 33.2 30.2 1.0X +after 1900 21804 21804 0 4.6 218.0 0.1X +before 1900 23920 23920 0 4.2 239.2 0.1X -OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 13591 13658 59 7.4 135.9 1.0X -after 1582, vec on 7399 7488 126 13.5 74.0 1.8X -before 1582, vec off 14065 14096 30 7.1 140.7 1.0X -before 1582, vec on 7950 8127 249 12.6 79.5 1.7X +after 1900, vec off 14112 14128 17 7.1 141.1 1.0X +after 1900, vec on 7347 7459 134 13.6 73.5 1.9X +before 1900, vec off 15170 15192 27 6.6 151.7 0.9X +before 1900, vec on 8280 8312 52 12.1 82.8 1.7X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index 050950571511d..a32a1ad8af89e 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,93 +2,93 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 24114 24114 0 4.1 241.1 1.0X -before 1582, noop 10250 10250 0 9.8 102.5 2.4X -after 1582, rebase off 36672 36672 0 2.7 366.7 0.7X -after 1582, rebase on 37123 37123 0 2.7 371.2 0.6X -before 1582, rebase off 21925 21925 0 4.6 219.2 1.1X -before 1582, rebase on 22341 22341 0 4.5 223.4 1.1X +after 1582, noop 23088 23088 0 4.3 230.9 1.0X +before 1582, noop 10782 10782 0 9.3 107.8 2.1X +after 1582, rebase off 34821 34821 0 2.9 348.2 0.7X +after 1582, rebase on 35040 35040 0 2.9 350.4 0.7X +before 1582, rebase off 22151 22151 0 4.5 221.5 1.0X +before 1582, rebase on 24677 24677 0 4.1 246.8 0.9X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12456 12601 126 8.0 124.6 1.0X -after 1582, vec off, rebase on 13299 13336 32 7.5 133.0 0.9X -after 1582, vec on, rebase off 3623 3660 40 27.6 36.2 3.4X -after 1582, vec on, rebase on 5160 5177 15 19.4 51.6 2.4X -before 1582, vec off, rebase off 13177 13264 76 7.6 131.8 0.9X -before 1582, vec off, rebase on 14102 14149 46 7.1 141.0 0.9X -before 1582, vec on, rebase off 3649 3670 34 27.4 36.5 3.4X -before 1582, vec on, rebase on 5652 5667 15 17.7 56.5 2.2X +after 1582, vec off, rebase off 13559 13650 79 7.4 135.6 1.0X +after 1582, vec off, rebase on 12942 12973 28 7.7 129.4 1.0X +after 1582, vec on, rebase off 3657 3689 29 27.3 36.6 3.7X +after 1582, vec on, rebase on 3859 3902 53 25.9 38.6 3.5X +before 1582, vec off, rebase off 12588 12607 17 7.9 125.9 1.1X +before 1582, vec off, rebase on 13396 13420 25 7.5 134.0 1.0X +before 1582, vec on, rebase off 3631 3650 19 27.5 36.3 3.7X +before 1582, vec on, rebase on 4706 4755 77 21.3 47.1 2.9X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2871 2871 0 34.8 28.7 1.0X -before 1582, noop 2753 2753 0 36.3 27.5 1.0X -after 1582, rebase off 15927 15927 0 6.3 159.3 0.2X -after 1582, rebase on 19138 19138 0 5.2 191.4 0.1X -before 1582, rebase off 16137 16137 0 6.2 161.4 0.2X -before 1582, rebase on 19584 19584 0 5.1 195.8 0.1X +after 1900, noop 2681 2681 0 37.3 26.8 1.0X +before 1900, noop 3051 3051 0 32.8 30.5 0.9X +after 1900, rebase off 16901 16901 0 5.9 169.0 0.2X +after 1900, rebase on 19725 19725 0 5.1 197.3 0.1X +before 1900, rebase off 16900 16900 0 5.9 169.0 0.2X +before 1900, rebase on 20381 20381 0 4.9 203.8 0.1X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 14995 15047 47 6.7 150.0 1.0X -after 1582, vec off, rebase on 18111 18146 37 5.5 181.1 0.8X -after 1582, vec on, rebase off 4837 4873 44 20.7 48.4 3.1X -after 1582, vec on, rebase on 9542 9669 111 10.5 95.4 1.6X -before 1582, vec off, rebase off 14993 15090 94 6.7 149.9 1.0X -before 1582, vec off, rebase on 18675 18712 64 5.4 186.7 0.8X -before 1582, vec on, rebase off 4908 4923 15 20.4 49.1 3.1X -before 1582, vec on, rebase on 10128 10148 19 9.9 101.3 1.5X +after 1900, vec off, rebase off 15236 15291 62 6.6 152.4 1.0X +after 1900, vec off, rebase on 17832 18047 187 5.6 178.3 0.9X +after 1900, vec on, rebase off 4875 4901 31 20.5 48.7 3.1X +after 1900, vec on, rebase on 5354 5386 37 18.7 53.5 2.8X +before 1900, vec off, rebase off 15229 15338 108 6.6 152.3 1.0X +before 1900, vec off, rebase on 18626 18668 44 5.4 186.3 0.8X +before 1900, vec on, rebase off 4968 4975 6 20.1 49.7 
3.1X +before 1900, vec on, rebase on 9913 9932 16 10.1 99.1 1.5X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 23977 23977 0 4.2 239.8 1.0X -before 1582, noop 10094 10094 0 9.9 100.9 2.4X -after 1582 33115 33115 0 3.0 331.2 0.7X -before 1582 19430 19430 0 5.1 194.3 1.2X +after 1582, noop 22942 22942 0 4.4 229.4 1.0X +before 1582, noop 11035 11035 0 9.1 110.4 2.1X +after 1582 31341 31341 0 3.2 313.4 0.7X +before 1582 20376 20376 0 4.9 203.8 1.1X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 10217 10241 21 9.8 102.2 1.0X -after 1582, vec on 3671 3691 31 27.2 36.7 2.8X -before 1582, vec off 10800 10874 114 9.3 108.0 0.9X -before 1582, vec on 4118 4165 74 24.3 41.2 2.5X +after 1582, vec off 10361 10378 29 9.7 103.6 1.0X +after 1582, vec on 3820 3828 11 26.2 38.2 2.7X +before 1582, vec off 10709 10720 13 9.3 107.1 1.0X +before 1582, vec on 4136 4153 15 24.2 41.4 2.5X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 2691 2691 0 37.2 26.9 1.0X -before 1582, noop 2743 2743 0 36.5 27.4 1.0X -after 1582 21409 21409 0 4.7 214.1 0.1X -before 1582 22554 22554 0 4.4 225.5 0.1X +after 1900, noop 2888 2888 0 34.6 28.9 1.0X +before 1900, noop 2823 2823 0 35.4 28.2 1.0X +after 1900 19790 19790 0 5.1 197.9 0.1X +before 1900 20774 20774 0 4.8 207.7 0.1X -OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 14752 14855 103 6.8 147.5 1.0X -after 1582, vec on 8146 8185 34 12.3 81.5 1.8X -before 1582, vec off 15247 15294 46 6.6 152.5 1.0X -before 1582, vec on 8414 8466 52 11.9 84.1 1.8X +after 1900, vec off 14649 14687 38 6.8 146.5 1.0X +after 1900, vec on 7850 7937 130 12.7 78.5 1.9X +before 1900, vec off 15354 15417 108 6.5 153.5 1.0X +before 1900, vec on 8382 8408 22 11.9 83.8 1.7X diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index cfb873ff37379..7ae60f22aa790 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -423,15 +423,8 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) throw num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else if (column.dataType() == DataTypes.DateType ) { if (rebaseDateTime) { - for (int i = 0; i < num; i++) { - if (defColumn.readInteger() == maxDefLevel) { - column.putInt( - rowId + i, - RebaseDateTime.rebaseJulianToGregorianDays(dataColumn.readInteger())); - } else { - column.putNull(rowId + i); - } - } + defColumn.readIntegersWithRebase( + num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else { defColumn.readIntegers( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); @@ -449,15 +442,8 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) thro num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else if (originalType == OriginalType.TIMESTAMP_MICROS) { if (rebaseDateTime) { - for (int i = 0; i < num; i++) { - if (defColumn.readInteger() == maxDefLevel) { - column.putLong( - rowId + i, - RebaseDateTime.rebaseJulianToGregorianMicros(dataColumn.readLong())); - } else { - column.putNull(rowId + i); - } - } + defColumn.readLongsWithRebase( + num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else { defColumn.readLongs( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java index c62dc3d86386e..2ed2e11b60c03 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java @@ -22,6 +22,7 @@ import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.spark.sql.catalyst.util.RebaseDateTime; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.parquet.column.values.ValuesReader; @@ -81,6 +82,33 @@ public final void readIntegers(int total, WritableColumnVector c, int rowId) { } } + // A fork of `readIntegers` to rebase the date values. For performance reasons, this method + // iterates the values twice: check if we need to rebase first, then go to the optimized branch + // if rebase is not needed. 
+ @Override + public final void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) { + int requiredBytes = total * 4; + ByteBuffer buffer = getBuffer(requiredBytes); + boolean rebase = false; + for (int i = 0; i < total; i += 1) { + rebase |= buffer.getInt(buffer.position() + i * 4) < RebaseDateTime.lastSwitchJulianDay(); + } + if (rebase) { + for (int i = 0; i < total; i += 1) { + c.putInt(rowId + i, RebaseDateTime.rebaseJulianToGregorianDays(buffer.getInt())); + } + } else { + if (buffer.hasArray()) { + int offset = buffer.arrayOffset() + buffer.position(); + c.putIntsLittleEndian(rowId, total, buffer.array(), offset); + } else { + for (int i = 0; i < total; i += 1) { + c.putInt(rowId + i, buffer.getInt()); + } + } + } + } + @Override public final void readLongs(int total, WritableColumnVector c, int rowId) { int requiredBytes = total * 8; @@ -96,6 +124,33 @@ public final void readLongs(int total, WritableColumnVector c, int rowId) { } } + // A fork of `readLongs` to rebase the timestamp values. For performance reasons, this method + // iterates the values twice: check if we need to rebase first, then go to the optimized branch + // if rebase is not needed. + @Override + public final void readLongsWithRebase(int total, WritableColumnVector c, int rowId) { + int requiredBytes = total * 8; + ByteBuffer buffer = getBuffer(requiredBytes); + boolean rebase = false; + for (int i = 0; i < total; i += 1) { + rebase |= buffer.getLong(buffer.position() + i * 8) < RebaseDateTime.lastSwitchJulianTs(); + } + if (rebase) { + for (int i = 0; i < total; i += 1) { + c.putLong(rowId + i, RebaseDateTime.rebaseJulianToGregorianMicros(buffer.getLong())); + } + } else { + if (buffer.hasArray()) { + int offset = buffer.arrayOffset() + buffer.position(); + c.putLongsLittleEndian(rowId, total, buffer.array(), offset); + } else { + for (int i = 0; i < total; i += 1) { + c.putLong(rowId + i, buffer.getLong()); + } + } + } + } + @Override public final void readFloats(int total, WritableColumnVector c, int rowId) { int requiredBytes = total * 4; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java index fe3d31ae8e746..4d72a33fcf774 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java @@ -26,6 +26,7 @@ import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.api.Binary; +import org.apache.spark.sql.catalyst.util.RebaseDateTime; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import java.io.IOException; @@ -203,6 +204,43 @@ public void readIntegers( } } + // A fork of `readIntegers`, which rebases the date int value (days) before filling + // the Spark column vector. 
+ public void readIntegersWithRebase( + int total, + WritableColumnVector c, + int rowId, + int level, + VectorizedValuesReader data) throws IOException { + int left = total; + while (left > 0) { + if (this.currentCount == 0) this.readNextGroup(); + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + if (currentValue == level) { + data.readIntegersWithRebase(n, c, rowId); + } else { + c.putNulls(rowId, n); + } + break; + case PACKED: + for (int i = 0; i < n; ++i) { + if (currentBuffer[currentBufferIdx++] == level) { + c.putInt(rowId + i, + RebaseDateTime.rebaseJulianToGregorianDays(data.readInteger())); + } else { + c.putNull(rowId + i); + } + } + break; + } + rowId += n; + left -= n; + currentCount -= n; + } + } + // TODO: can this code duplication be removed without a perf penalty? public void readBooleans( int total, @@ -342,6 +380,43 @@ public void readLongs( } } + // A fork of `readLongs`, which rebases the timestamp long value (microseconds) before filling + // the Spark column vector. + public void readLongsWithRebase( + int total, + WritableColumnVector c, + int rowId, + int level, + VectorizedValuesReader data) throws IOException { + int left = total; + while (left > 0) { + if (this.currentCount == 0) this.readNextGroup(); + int n = Math.min(left, this.currentCount); + switch (mode) { + case RLE: + if (currentValue == level) { + data.readLongsWithRebase(n, c, rowId); + } else { + c.putNulls(rowId, n); + } + break; + case PACKED: + for (int i = 0; i < n; ++i) { + if (currentBuffer[currentBufferIdx++] == level) { + c.putLong(rowId + i, + RebaseDateTime.rebaseJulianToGregorianMicros(data.readLong())); + } else { + c.putNull(rowId + i); + } + } + break; + } + rowId += n; + left -= n; + currentCount -= n; + } + } + public void readFloats( int total, WritableColumnVector c, @@ -508,6 +583,11 @@ public void readIntegers(int total, WritableColumnVector c, int rowId) { } } + @Override + public void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) { + throw new UnsupportedOperationException("only readInts is valid."); + } + @Override public byte readByte() { throw new UnsupportedOperationException("only readInts is valid."); @@ -523,6 +603,11 @@ public void readLongs(int total, WritableColumnVector c, int rowId) { throw new UnsupportedOperationException("only readInts is valid."); } + @Override + public void readLongsWithRebase(int total, WritableColumnVector c, int rowId) { + throw new UnsupportedOperationException("only readInts is valid."); + } + @Override public void readBinary(int total, WritableColumnVector c, int rowId) { throw new UnsupportedOperationException("only readInts is valid."); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index 57d92ae27ece8..809ac44cc8272 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -40,7 +40,9 @@ public interface VectorizedValuesReader { void readBooleans(int total, WritableColumnVector c, int rowId); void readBytes(int total, WritableColumnVector c, int rowId); void readIntegers(int total, WritableColumnVector c, int rowId); + void readIntegersWithRebase(int total, WritableColumnVector c, int rowId); void readLongs(int total, WritableColumnVector c, int rowId); + 
void readLongsWithRebase(int total, WritableColumnVector c, int rowId); void readFloats(int total, WritableColumnVector c, int rowId); void readDoubles(int total, WritableColumnVector c, int rowId); void readBinary(int total, WritableColumnVector c, int rowId); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index 077ac28c149ee..7968836a00d0f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -49,15 +49,15 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { .select($"seconds".cast("timestamp").as("ts")) } - private def genTsAfter1582(cardinality: Int): DataFrame = { - val start = LocalDateTime.of(1582, 10, 15, 0, 0, 0) + private def genTsAfter1900(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(1900, 1, 31, 0, 0, 0) val end = LocalDateTime.of(3000, 1, 1, 0, 0, 0) genTs(cardinality, start, end) } - private def genTsBefore1582(cardinality: Int): DataFrame = { + private def genTsBefore1900(cardinality: Int): DataFrame = { val start = LocalDateTime.of(10, 1, 1, 0, 0, 0) - val end = LocalDateTime.of(1580, 1, 1, 0, 0, 0) + val end = LocalDateTime.of(1900, 1, 1, 0, 0, 0) genTs(cardinality, start, end) } @@ -71,34 +71,35 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { } private def genDateAfter1582(cardinality: Int): DataFrame = { - val start = LocalDate.of(1582, 10, 15) + val start = LocalDate.of(1582, 10, 31) val end = LocalDate.of(3000, 1, 1) genDate(cardinality, start, end) } private def genDateBefore1582(cardinality: Int): DataFrame = { val start = LocalDate.of(10, 1, 1) - val end = LocalDate.of(1580, 1, 1) + val end = LocalDate.of(1580, 10, 1) genDate(cardinality, start, end) } - private def genDF(cardinality: Int, dateTime: String, after1582: Boolean): DataFrame = { - (dateTime, after1582) match { + private def genDF(cardinality: Int, dateTime: String, modernDates: Boolean): DataFrame = { + (dateTime, modernDates) match { case ("date", true) => genDateAfter1582(cardinality) case ("date", false) => genDateBefore1582(cardinality) - case ("timestamp", true) => genTsAfter1582(cardinality) - case ("timestamp", false) => genTsBefore1582(cardinality) + case ("timestamp", true) => genTsAfter1900(cardinality) + case ("timestamp", false) => genTsBefore1900(cardinality) case _ => throw new IllegalArgumentException( - s"cardinality = $cardinality dateTime = $dateTime after1582 = $after1582") + s"cardinality = $cardinality dateTime = $dateTime modernDates = $modernDates") } } private def benchmarkInputs(benchmark: Benchmark, rowsNum: Int, dateTime: String): Unit = { - benchmark.addCase("after 1582, noop", 1) { _ => - genDF(rowsNum, dateTime, after1582 = true).noop() + val year = if (dateTime == "date") 1582 else 1900 + benchmark.addCase(s"after $year, noop", 1) { _ => + genDF(rowsNum, dateTime, modernDates = true).noop() } - benchmark.addCase("before 1582, noop", 1) { _ => - genDF(rowsNum, dateTime, after1582 = false).noop() + benchmark.addCase(s"before $year, noop", 1) { _ => + genDF(rowsNum, dateTime, modernDates = false).noop() } } @@ -107,23 +108,26 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { } private def caseName( - after1582: Boolean, + modernDates: Boolean, + dateTime: String, rebase: Option[Boolean] = None, vec: 
Option[Boolean] = None): String = { - val period = if (after1582) "after" else "before" + val period = if (modernDates) "after" else "before" + val year = if (dateTime == "date") 1582 else 1900 val vecFlag = vec.map(flagToStr).map(flag => s", vec $flag").getOrElse("") val rebaseFlag = rebase.map(flagToStr).map(flag => s", rebase $flag").getOrElse("") - s"$period 1582$vecFlag$rebaseFlag" + s"$period $year$vecFlag$rebaseFlag" } private def getPath( basePath: File, dateTime: String, - after1582: Boolean, + modernDates: Boolean, rebase: Option[Boolean] = None): String = { - val period = if (after1582) "after" else "before" + val period = if (modernDates) "after" else "before" + val year = if (dateTime == "date") 1582 else 1900 val rebaseFlag = rebase.map(flagToStr).map(flag => s"_$flag").getOrElse("") - basePath.getAbsolutePath + s"/${dateTime}_${period}_1582$rebaseFlag" + basePath.getAbsolutePath + s"/${dateTime}_${period}_$year$rebaseFlag" } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { @@ -139,16 +143,16 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { rowsNum, output = output) benchmarkInputs(benchmark, rowsNum, dateTime) - Seq(true, false).foreach { after1582 => + Seq(true, false).foreach { modernDates => Seq(false, true).foreach { rebase => - benchmark.addCase(caseName(after1582, Some(rebase)), 1) { _ => + benchmark.addCase(caseName(modernDates, dateTime, Some(rebase)), 1) { _ => withSQLConf( SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_WRITE.key -> rebase.toString) { - genDF(rowsNum, dateTime, after1582) + genDF(rowsNum, dateTime, modernDates) .write .mode("overwrite") .format("parquet") - .save(getPath(path, dateTime, after1582, Some(rebase))) + .save(getPath(path, dateTime, modernDates, Some(rebase))) } } } @@ -157,16 +161,15 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { val benchmark2 = new Benchmark( s"Load ${dateTime}s from parquet", rowsNum, output = output) - Seq(true, false).foreach { after1582 => + Seq(true, false).foreach { modernDates => Seq(false, true).foreach { vec => Seq(false, true).foreach { rebase => - benchmark2.addCase(caseName(after1582, Some(rebase), Some(vec)), 3) { _ => - withSQLConf( - SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString, - SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) { + val name = caseName(modernDates, dateTime, Some(rebase), Some(vec)) + benchmark2.addCase(name, 3) { _ => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString) { spark.read .format("parquet") - .load(getPath(path, dateTime, after1582, Some(rebase))) + .load(getPath(path, dateTime, modernDates, Some(rebase))) .noop() } } @@ -183,13 +186,13 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { Seq("date", "timestamp").foreach { dateTime => val benchmark = new Benchmark(s"Save ${dateTime}s to ORC", rowsNum, output = output) benchmarkInputs(benchmark, rowsNum, dateTime) - Seq(true, false).foreach { after1582 => - benchmark.addCase(caseName(after1582), 1) { _ => - genDF(rowsNum, dateTime, after1582) + Seq(true, false).foreach { modernDates => + benchmark.addCase(caseName(modernDates, dateTime), 1) { _ => + genDF(rowsNum, dateTime, modernDates) .write .mode("overwrite") .format("orc") - .save(getPath(path, dateTime, after1582)) + .save(getPath(path, dateTime, modernDates)) } } benchmark.run() @@ -198,14 +201,14 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { s"Load ${dateTime}s from ORC", rowsNum, output = output) - Seq(true, false).foreach { after1582 => + 
Seq(true, false).foreach { modernDates => Seq(false, true).foreach { vec => - benchmark2.addCase(caseName(after1582, vec = Some(vec)), 3) { _ => + benchmark2.addCase(caseName(modernDates, dateTime, vec = Some(vec)), 3) { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vec.toString) { spark .read .format("orc") - .load(getPath(path, dateTime, after1582)) + .load(getPath(path, dateTime, modernDates)) .noop() } } From ffd69c69772172311ad6969eaeac956cc72d30c6 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 4 May 2020 16:53:50 +0900 Subject: [PATCH 10/19] [MINOR][DOCS] Fix typo in documents ### What changes were proposed in this pull request? Fixed typo in `docs` directory and in `project/MimaExcludes.scala` ### Why are the changes needed? Better readability of documents ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? No test needed Closes #28447 from kiszk/typo_20200504. Authored-by: Kazuaki Ishizaki Signed-off-by: HyukjinKwon --- docs/ml-classification-regression.md | 4 ++-- docs/spark-standalone.md | 2 +- docs/sql-migration-guide.md | 6 +++--- docs/sql-ref-functions-udf-hive.md | 14 +++++++------- docs/sql-ref-functions.md | 2 +- docs/sql-ref-syntax-aux-describe-query.md | 2 +- docs/web-ui.md | 4 ++-- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index ce0831d3fb1a2..247989d16bcd7 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -567,7 +567,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat Refer to the [R API docs](api/R/spark.fmClassifier.html) for more details. -Note: At the moment SparkR doesn't suport feature scaling. +Note: At the moment SparkR doesn't support feature scaling. {% include_example r/ml/fmClassifier.R %} @@ -1105,7 +1105,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression. Refer to the [R API documentation](api/R/spark.fmRegressor.html) for more details. -Note: At the moment SparkR doesn't suport feature scaling. +Note: At the moment SparkR doesn't support feature scaling. {% include_example r/ml/fmRegressor.R %} diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 2c2ed53b478c3..1e6f8c586d546 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -335,7 +335,7 @@ SPARK_WORKER_OPTS supports the following system properties: overlap with `spark.worker.cleanup.enabled`, as this enables cleanup of non-shuffle files in local directories of a dead executor, while `spark.worker.cleanup.enabled` enables cleanup of all files/subdirectories of a stopped and timeout application. - This only affects Standalone mode, support of other cluster manangers can be added in the future. + This only affects Standalone mode, support of other cluster managers can be added in the future. 2.4.0 diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 0d4075ab93001..9c113d658f090 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -42,7 +42,7 @@ license: | - In Spark 3.0, `CREATE TABLE` without a specific provider uses the value of `spark.sql.sources.default` as its provider. In Spark version 2.4 and below, it was Hive. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createHiveTableByDefault.enabled` to `true`. 
- - In Spark 3.0, when inserting a value into a table column with a different data type, the type coercion is performed as per ANSI SQL standard. Certain unreasonable type conversions such as converting `string` to `int` and `double` to `boolean` are disallowed. A runtime exception is thrown if the value is out-of-range for the data type of the column. In Spark version 2.4 and below, type conversions during table insertion are allowed as long as they are valid `Cast`. When inserting an out-of-range value to a integral field, the low-order bits of the value is inserted(the same as Java/Scala numeric type casting). For example, if 257 is inserted to a field of byte type, the result is 1. The behavior is controlled by the option `spark.sql.storeAssignmentPolicy`, with a default value as "ANSI". Setting the option as "Legacy" restores the previous behavior. + - In Spark 3.0, when inserting a value into a table column with a different data type, the type coercion is performed as per ANSI SQL standard. Certain unreasonable type conversions such as converting `string` to `int` and `double` to `boolean` are disallowed. A runtime exception is thrown if the value is out-of-range for the data type of the column. In Spark version 2.4 and below, type conversions during table insertion are allowed as long as they are valid `Cast`. When inserting an out-of-range value to an integral field, the low-order bits of the value is inserted(the same as Java/Scala numeric type casting). For example, if 257 is inserted to a field of byte type, the result is 1. The behavior is controlled by the option `spark.sql.storeAssignmentPolicy`, with a default value as "ANSI". Setting the option as "Legacy" restores the previous behavior. - The `ADD JAR` command previously returned a result set with the single value 0. It now returns an empty result set. @@ -50,7 +50,7 @@ license: | - Refreshing a cached table would trigger a table uncache operation and then a table cache (lazily) operation. In Spark version 2.4 and below, the cache name and storage level are not preserved before the uncache operation. Therefore, the cache name and storage level could be changed unexpectedly. In Spark 3.0, cache name and storage level are first preserved for cache recreation. It helps to maintain a consistent cache behavior upon table refreshing. - - In Spark 3.0, the properties listing below become reserved; commands fail if you specify reserved properties in places like `CREATE DATABASE ... WITH DBPROPERTIES` and `ALTER TABLE ... SET TBLPROPERTIES`. You need their specific clauses to specify them, for example, `CREATE DATABASE test COMMENT 'any comment' LOCATION 'some path'`. You can set `spark.sql.legacy.notReserveProperties` to `true` to ignore the `ParseException`, in this case, these properties will be silently removed, for example: `SET DBPROTERTIES('location'='/tmp')` will have no effect. In Spark version 2.4 and below, these properties are neither reserved nor have side effects, for example, `SET DBPROTERTIES('location'='/tmp')` do not change the location of the database but only create a headless property just like `'a'='b'`. + - In Spark 3.0, the properties listing below become reserved; commands fail if you specify reserved properties in places like `CREATE DATABASE ... WITH DBPROPERTIES` and `ALTER TABLE ... SET TBLPROPERTIES`. You need their specific clauses to specify them, for example, `CREATE DATABASE test COMMENT 'any comment' LOCATION 'some path'`. 
You can set `spark.sql.legacy.notReserveProperties` to `true` to ignore the `ParseException`, in this case, these properties will be silently removed, for example: `SET DBPROPERTIES('location'='/tmp')` will have no effect. In Spark version 2.4 and below, these properties are neither reserved nor have side effects, for example, `SET DBPROPERTIES('location'='/tmp')` do not change the location of the database but only create a headless property just like `'a'='b'`. | Property (case sensitive) | Database Reserved | Table Reserved | Remarks | | ------------------------- | ----------------- | -------------- | ------- | @@ -130,7 +130,7 @@ license: | - In Spark 3.0, negative scale of decimal is not allowed by default, for example, data type of literal like `1E10BD` is `DecimalType(11, 0)`. In Spark version 2.4 and below, it was `DecimalType(2, -9)`. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.allowNegativeScaleOfDecimal` to `true`. - - In Spark 3.0, the unary arithmetic operator plus(`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with a integral string representation is coerced to a double value, for example, `+'1'` returns `1.0`. In Spark version 2.4 and below, this operator is ignored. There is no type checking for it, thus, all type values with a `+` prefix are valid, for example, `+ array(1, 2)` is valid and results `[1, 2]`. Besides, there is no type coercion for it at all, for example, in Spark 2.4, the result of `+'1'` is string `1`. + - In Spark 3.0, the unary arithmetic operator plus(`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with an integral string representation is coerced to a double value, for example, `+'1'` returns `1.0`. In Spark version 2.4 and below, this operator is ignored. There is no type checking for it, thus, all type values with a `+` prefix are valid, for example, `+ array(1, 2)` is valid and results `[1, 2]`. Besides, there is no type coercion for it at all, for example, in Spark 2.4, the result of `+'1'` is string `1`. - In Spark 3.0, Dataset query fails if it contains ambiguous column reference that is caused by self join. A typical example: `val df1 = ...; val df2 = df1.filter(...);`, then `df1.join(df2, df1("a") > df2("a"))` returns an empty result which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and `df1("a")` is exactly the same as `df2("a")` in Spark. To restore the behavior before Spark 3.0, you can set `spark.sql.analyzer.failAmbiguousSelfJoin` to `false`. diff --git a/docs/sql-ref-functions-udf-hive.md b/docs/sql-ref-functions-udf-hive.md index d3d2a221c94d8..7a7129de23836 100644 --- a/docs/sql-ref-functions-udf-hive.md +++ b/docs/sql-ref-functions-udf-hive.md @@ -28,9 +28,9 @@ Spark SQL supports integration of Hive UDFs, UDAFs and UDTFs. Similar to Spark U Hive has two UDF interfaces: [UDF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDF.java) and [GenericUDF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). An example below uses [GenericUDFAbs](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAbs.java) derived from `GenericUDF`. -```sql +{% highlight sql %} -- Register `GenericUDFAbs` and use it in Spark SQL. 
--- Note that, if you use your own programmed one, you need to add a JAR containig it +-- Note that, if you use your own programmed one, you need to add a JAR containing it -- into a classpath, -- e.g., ADD JAR yourHiveUDF.jar; CREATE TEMPORARY FUNCTION testUDF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs'; @@ -52,12 +52,12 @@ SELECT testUDF(value) FROM t; | 2.0| | 3.0| +--------------+ -``` +{% endhighlight %} An example below uses [GenericUDTFExplode](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFExplode.java) derived from [GenericUDTF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). -```sql +{% highlight sql %} -- Register `GenericUDTFExplode` and use it in Spark SQL CREATE TEMPORARY FUNCTION hiveUDTF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDTFExplode'; @@ -79,12 +79,12 @@ SELECT hiveUDTF(value) FROM t; | 3| | 4| +---+ -``` +{% endhighlight %} Hive has two UDAF interfaces: [UDAF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDAF.java) and [GenericUDAFResolver](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java). An example below uses [GenericUDAFSum](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java) derived from `GenericUDAFResolver`. -```sql +{% highlight sql %} -- Register `GenericUDAFSum` and use it in Spark SQL CREATE TEMPORARY FUNCTION hiveUDAF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum'; @@ -105,4 +105,4 @@ SELECT key, hiveUDAF(value) FROM t GROUP BY key; | b| 3| | a| 3| +---+---------------+ -``` \ No newline at end of file +{% endhighlight %} diff --git a/docs/sql-ref-functions.md b/docs/sql-ref-functions.md index 7493b8bbb7cdf..67951a9695f5e 100644 --- a/docs/sql-ref-functions.md +++ b/docs/sql-ref-functions.md @@ -24,7 +24,7 @@ Built-in functions are commonly used routines that Spark SQL predefines and a co ### Built-in Functions -Spark SQL has some categories of frequently-used built-in functions for aggregtion, arrays/maps, date/timestamp, and JSON data. +Spark SQL has some categories of frequently-used built-in functions for aggregation, arrays/maps, date/timestamp, and JSON data. This subsection presents the usages and descriptions of these functions. #### Scalar Functions diff --git a/docs/sql-ref-syntax-aux-describe-query.md b/docs/sql-ref-syntax-aux-describe-query.md index 65e101d3dbf13..b2a74cbd06078 100644 --- a/docs/sql-ref-syntax-aux-describe-query.md +++ b/docs/sql-ref-syntax-aux-describe-query.md @@ -73,7 +73,7 @@ DESCRIBE QUERY WITH all_names_cte | name| string| null| +--------+---------+-------+ --- Returns column metadata information for a inline table. +-- Returns column metadata information for an inline table. DESC QUERY VALUES(100, 'John', 10000.20D) AS employee(id, name, salary); +--------+---------+-------+ |col_name|data_type|comment| diff --git a/docs/web-ui.md b/docs/web-ui.md index c53af804d8d59..3c35dbeec86a2 100644 --- a/docs/web-ui.md +++ b/docs/web-ui.md @@ -99,7 +99,7 @@ This page displays the details of a specific job identified by its job ID. The Stages tab displays a summary page that shows the current state of all stages of all jobs in the Spark application. 
-At the beginning of the page is the summary with the count of all stages by status (active, pending, completed, sikipped, and failed) +At the beginning of the page is the summary with the count of all stages by status (active, pending, completed, skipped, and failed)

Stages header @@ -136,7 +136,7 @@ Summary metrics for all task are represented in a table and in a timeline. * **[Tasks deserialization time](configuration.html#compression-and-serialization)** * **Duration of tasks**. * **GC time** is the total JVM garbage collection time. -* **Result serialization time** is the time spent serializing the task result on a executor before sending it back to the driver. +* **Result serialization time** is the time spent serializing the task result on an executor before sending it back to the driver. * **Getting result time** is the time that the driver spends fetching task results from workers. * **Scheduler delay** is the time the task waits to be scheduled for execution. * **Peak execution memory** is the maximum memory used by the internal data structures created during shuffles, aggregations and joins. From 78758b6759c34fa2e80e580209557342b4997557 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Mon, 4 May 2020 12:22:29 +0000 Subject: [PATCH 11/19] [SPARK-31624] Fix SHOW TBLPROPERTIES for V2 tables that leverage the session catalog ## What changes were proposed in this pull request? SHOW TBLPROPERTIES does not get the correct table properties for tables using the Session Catalog. This PR fixes that by explicitly falling back to the V1 implementation if the table is in fact a V1 table. We also hide the reserved table properties for V2 tables, as users do not have control over setting these table properties. Hence, if they cannot be set or controlled by the user, they shouldn't be displayed. ### Why are the changes needed? Without this fix, SHOW TBLPROPERTIES shows incorrect table properties, i.e. only what exists in the Hive MetaStore, for V2 tables that may have table properties outside of the MetaStore. ### Does this PR introduce _any_ user-facing change? Fixes a bug ### How was this patch tested? Regression test Closes #28434 from brkyvz/ddlCommands.
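As a rough illustration of the filtering described above, here is a minimal Scala sketch that uses a plain `Map` in place of the V2 `Table` property map; `reserved` is an assumed stand-in for `CatalogV2Util.TABLE_RESERVED_PROPERTIES`, whose actual contents differ.

```scala
object ShowTablePropertiesSketch {
  // Assumed reserved keys; the real list lives in CatalogV2Util.
  val reserved: Set[String] = Set("owner", "provider", "location", "comment")

  // Reserved properties are settable only through dedicated clauses
  // (LOCATION, COMMENT, ...), so SHOW TBLPROPERTIES hides them while
  // DESCRIBE still exposes them.
  def visibleProperties(all: Map[String, String]): Map[String, String] =
    all.filter { case (k, _) => !reserved.contains(k) }

  def main(args: Array[String]): Unit = {
    val props = Map("owner" -> "alice", "key" -> "v", "key2" -> "v2")
    println(visibleProperties(props)) // Map(key -> v, key2 -> v2)
  }
}
```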
Authored-by: Burak Yavuz Signed-off-by: Wenchen Fan --- .../analysis/ResolveSessionCatalog.scala | 3 ++- .../v2/ShowTablePropertiesExec.scala | 8 ++++++-- .../DataSourceV2SQLSessionCatalogSuite.scala | 18 +++++++++++++++++- .../sql/connector/DataSourceV2SQLSuite.scala | 2 -- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index 58a7251f4ebd5..bf90875e511f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -559,7 +559,8 @@ class ResolveSessionCatalog( "SHOW VIEWS, only SessionCatalog supports this command.") } - case ShowTableProperties(r: ResolvedTable, propertyKey) if isSessionCatalog(r.catalog) => + case ShowTableProperties( + r @ ResolvedTable(_, _, _: V1Table), propertyKey) if isSessionCatalog(r.catalog) => ShowTablePropertiesCommand(r.identifier.asTableIdentifier, propertyKey) case ShowTableProperties(r: ResolvedView, propertyKey) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala index 0bcd7ea541045..fef63cb8253ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} -import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, GenericRowWithSchema} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table, TableCatalog} /** * Physical plan node for showing table properties. 
@@ -30,11 +30,15 @@ case class ShowTablePropertiesExec( catalogTable: Table, propertyKey: Option[String]) extends V2CommandExec { + override def producedAttributes: AttributeSet = AttributeSet(output) + override protected def run(): Seq[InternalRow] = { import scala.collection.JavaConverters._ val toRow = RowEncoder(schema).resolveAndBind().createSerializer() + // The reserved properties are accessible through DESCRIBE val properties = catalogTable.properties.asScala + .filter { case (k, v) => !CatalogV2Util.TABLE_RESERVED_PROPERTIES.contains(k) } propertyKey match { case Some(p) => val propValue = properties diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala index 249b27c28b072..cf00b3b5e4410 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector -import org.apache.spark.sql.{DataFrame, SaveMode} +import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog} class DataSourceV2SQLSessionCatalogSuite @@ -63,4 +63,20 @@ class DataSourceV2SQLSessionCatalogSuite } } } + + test("SPARK-31624: SHOW TBLPROPERTIES working with V2 tables and the session catalog") { + val t1 = "tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format TBLPROPERTIES " + + "(key='v', key2='v2')") + + checkAnswer(sql(s"SHOW TBLPROPERTIES $t1"), Seq(Row("key", "v"), Row("key2", "v2"))) + + checkAnswer(sql(s"SHOW TBLPROPERTIES $t1('key')"), Row("key", "v")) + + checkAnswer( + sql(s"SHOW TBLPROPERTIES $t1('keyX')"), + Row("keyX", s"Table default.$t1 does not have property: keyX")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 3244684c33965..e947e15a179e8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2122,8 +2122,6 @@ class DataSourceV2SQLSuite .add("value", StringType, nullable = false) val expected = Seq( - Row(TableCatalog.PROP_OWNER, defaultUser), - Row("provider", provider), Row("status", status), Row("user", user)) From 45f939f1afc70d782e67d87acd8be9945c16e7a0 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 4 May 2020 08:14:12 -0700 Subject: [PATCH 12/19] [SPARK-31633][BUILD] Upgrade SLF4J from 1.7.16 to 1.7.30 ### What changes were proposed in this pull request? This PR aims to upgrade SLF4J from 1.7.16 to 1.7.30. ### Why are the changes needed? SLF4J 1.7.23+ is required to enable `slf4j-log4j12` with the MDC feature to run under Java 9. Also, this will bring all the latest bug fixes. - http://www.slf4j.org/news.html > When running under Java 9, log4j version 1.2.x is unable to correctly parse the "java.version" system property. Assuming an incorrect Java version, it proceeded to disable its MDC functionality. The slf4j-log4j12 module shipping in this release fixes the issue by tweaking MDC internals by reflection, allowing log4j to run under Java 9. See also SLF4J-393. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested?
Pass the Jenkins with the existing tests. Closes #28446 from dongjoon-hyun/SPARK-31633. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-2.7-hive-1.2 | 8 ++++---- dev/deps/spark-deps-hadoop-2.7-hive-2.3 | 8 ++++---- dev/deps/spark-deps-hadoop-3.2-hive-2.3 | 8 ++++---- pom.xml | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 b/dev/deps/spark-deps-hadoop-2.7-hive-1.2 index beaa7d42fbd0f..0f8da141249ca 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-1.2 @@ -112,7 +112,7 @@ javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.2//jaxb-api-2.2.2.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar -jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar jersey-common/2.30//jersey-common-2.30.jar @@ -135,7 +135,7 @@ json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar kubernetes-client/4.7.1//kubernetes-client-4.7.1.jar kubernetes-model-common/4.7.1//kubernetes-model-common-4.7.1.jar @@ -184,8 +184,8 @@ scala-reflect/2.12.10//scala-reflect-2.12.10.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.7.45//shims-0.7.45.jar -slf4j-api/1.7.16//slf4j-api-1.7.16.jar -slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +slf4j-api/1.7.30//slf4j-api-1.7.30.jar +slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar snakeyaml/1.24//snakeyaml-1.24.jar snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar snappy/0.2//snappy-0.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index 81c68e0bec983..6abdd7409eb14 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -126,7 +126,7 @@ javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.2//jaxb-api-2.2.2.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar -jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar jersey-common/2.30//jersey-common-2.30.jar @@ -150,7 +150,7 @@ json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar kubernetes-client/4.7.1//kubernetes-client-4.7.1.jar kubernetes-model-common/4.7.1//kubernetes-model-common-4.7.1.jar @@ -198,8 +198,8 @@ scala-reflect/2.12.10//scala-reflect-2.12.10.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.7.45//shims-0.7.45.jar -slf4j-api/1.7.16//slf4j-api-1.7.16.jar -slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +slf4j-api/1.7.30//slf4j-api-1.7.30.jar +slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar snakeyaml/1.24//snakeyaml-1.24.jar snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index 
6966de94e46da..3553734b35fe6 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -126,7 +126,7 @@ javolution/5.5.1//javolution-5.5.1.jar jaxb-api/2.2.11//jaxb-api-2.2.11.jar jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar jcip-annotations/1.0-1//jcip-annotations-1.0-1.jar -jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jcl-over-slf4j/1.7.30//jcl-over-slf4j-1.7.30.jar jdo-api/3.0.1//jdo-api-3.0.1.jar jersey-client/2.30//jersey-client-2.30.jar jersey-common/2.30//jersey-common-2.30.jar @@ -148,7 +148,7 @@ json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar jsp-api/2.1//jsp-api-2.1.jar jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar -jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kerb-admin/1.0.1//kerb-admin-1.0.1.jar kerb-client/1.0.1//kerb-client-1.0.1.jar kerb-common/1.0.1//kerb-common-1.0.1.jar @@ -214,8 +214,8 @@ scala-reflect/2.12.10//scala-reflect-2.12.10.jar scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar shims/0.7.45//shims-0.7.45.jar -slf4j-api/1.7.16//slf4j-api-1.7.16.jar -slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +slf4j-api/1.7.30//slf4j-api-1.7.30.jar +slf4j-log4j12/1.7.30//slf4j-log4j12-1.7.30.jar snakeyaml/1.24//snakeyaml-1.24.jar snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar diff --git a/pom.xml b/pom.xml index 8dbff24c5d8c8..9c55630786ce7 100644 --- a/pom.xml +++ b/pom.xml @@ -117,7 +117,7 @@ ${java.version} 3.6.3 spark - 1.7.16 + 1.7.30 1.2.17 2.7.4 2.5.0 From 6e340c8d77a659a507aa75051a748983568a7bf4 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 11:23:07 -0700 Subject: [PATCH 13/19] Revert "[SPARK-31624] Fix SHOW TBLPROPERTIES for V2 tables that leverage the session catalog" This reverts commit 78758b6759c34fa2e80e580209557342b4997557. 
--- .../analysis/ResolveSessionCatalog.scala | 3 +-- .../v2/ShowTablePropertiesExec.scala | 8 ++------ .../DataSourceV2SQLSessionCatalogSuite.scala | 18 +----------------- .../sql/connector/DataSourceV2SQLSuite.scala | 2 ++ 4 files changed, 6 insertions(+), 25 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index bf90875e511f8..58a7251f4ebd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -559,8 +559,7 @@ class ResolveSessionCatalog( "SHOW VIEWS, only SessionCatalog supports this command.") } - case ShowTableProperties( - r @ ResolvedTable(_, _, _: V1Table), propertyKey) if isSessionCatalog(r.catalog) => + case ShowTableProperties(r: ResolvedTable, propertyKey) if isSessionCatalog(r.catalog) => ShowTablePropertiesCommand(r.identifier.asTableIdentifier, propertyKey) case ShowTableProperties(r: ResolvedView, propertyKey) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala index fef63cb8253ca..0bcd7ea541045 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, GenericRowWithSchema} -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table, TableCatalog} +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} +import org.apache.spark.sql.connector.catalog.Table /** * Physical plan node for showing table properties. 
@@ -30,15 +30,11 @@ case class ShowTablePropertiesExec(
     catalogTable: Table,
     propertyKey: Option[String]) extends V2CommandExec {
 
-  override def producedAttributes: AttributeSet = AttributeSet(output)
-
   override protected def run(): Seq[InternalRow] = {
     import scala.collection.JavaConverters._
     val toRow = RowEncoder(schema).resolveAndBind().createSerializer()
-    // The reserved properties are accessible through DESCRIBE
     val properties = catalogTable.properties.asScala
-      .filter { case (k, v) => !CatalogV2Util.TABLE_RESERVED_PROPERTIES.contains(k) }
 
     propertyKey match {
       case Some(p) =>
         val propValue = properties
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala
index cf00b3b5e4410..249b27c28b072 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.connector
 
-import org.apache.spark.sql.{DataFrame, Row, SaveMode}
+import org.apache.spark.sql.{DataFrame, SaveMode}
 import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog}
 
 class DataSourceV2SQLSessionCatalogSuite
@@ -63,20 +63,4 @@ class DataSourceV2SQLSessionCatalogSuite
       }
     }
   }
-
-  test("SPARK-31624: SHOW TBLPROPERTIES working with V2 tables and the session catalog") {
-    val t1 = "tbl"
-    withTable(t1) {
-      sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format TBLPROPERTIES " +
-        "(key='v', key2='v2')")
-
-      checkAnswer(sql(s"SHOW TBLPROPERTIES $t1"), Seq(Row("key", "v"), Row("key2", "v2")))
-
-      checkAnswer(sql(s"SHOW TBLPROPERTIES $t1('key')"), Row("key", "v"))
-
-      checkAnswer(
-        sql(s"SHOW TBLPROPERTIES $t1('keyX')"),
-        Row("keyX", s"Table default.$t1 does not have property: keyX"))
-    }
-  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
index e947e15a179e8..3244684c33965 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala
@@ -2122,6 +2122,8 @@ class DataSourceV2SQLSuite
       .add("value", StringType, nullable = false)
 
     val expected = Seq(
+      Row(TableCatalog.PROP_OWNER, defaultUser),
+      Row("provider", provider),
       Row("status", status),
       Row("user", user))
 

From 368cdc8f8f6a3d7f3ae6304b62d506ff237763c4 Mon Sep 17 00:00:00 2001
From: Huaxin Gao
Date: Mon, 4 May 2020 11:23:27 -0700
Subject: [PATCH 14/19] Revert "[MINOR][DOCS] Fix typo in documents"

This reverts commit ffd69c69772172311ad6969eaeac956cc72d30c6.
--- docs/ml-classification-regression.md | 4 ++-- docs/spark-standalone.md | 2 +- docs/sql-migration-guide.md | 6 +++--- docs/sql-ref-functions-udf-hive.md | 14 +++++++------- docs/sql-ref-functions.md | 2 +- docs/sql-ref-syntax-aux-describe-query.md | 2 +- docs/web-ui.md | 4 ++-- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index 247989d16bcd7..ce0831d3fb1a2 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -567,7 +567,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.classificat Refer to the [R API docs](api/R/spark.fmClassifier.html) for more details. -Note: At the moment SparkR doesn't support feature scaling. +Note: At the moment SparkR doesn't suport feature scaling. {% include_example r/ml/fmClassifier.R %} @@ -1105,7 +1105,7 @@ Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.regression. Refer to the [R API documentation](api/R/spark.fmRegressor.html) for more details. -Note: At the moment SparkR doesn't support feature scaling. +Note: At the moment SparkR doesn't suport feature scaling. {% include_example r/ml/fmRegressor.R %} diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 1e6f8c586d546..2c2ed53b478c3 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -335,7 +335,7 @@ SPARK_WORKER_OPTS supports the following system properties: overlap with `spark.worker.cleanup.enabled`, as this enables cleanup of non-shuffle files in local directories of a dead executor, while `spark.worker.cleanup.enabled` enables cleanup of all files/subdirectories of a stopped and timeout application. - This only affects Standalone mode, support of other cluster managers can be added in the future. + This only affects Standalone mode, support of other cluster manangers can be added in the future. 2.4.0 diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 9c113d658f090..0d4075ab93001 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -42,7 +42,7 @@ license: | - In Spark 3.0, `CREATE TABLE` without a specific provider uses the value of `spark.sql.sources.default` as its provider. In Spark version 2.4 and below, it was Hive. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createHiveTableByDefault.enabled` to `true`. - - In Spark 3.0, when inserting a value into a table column with a different data type, the type coercion is performed as per ANSI SQL standard. Certain unreasonable type conversions such as converting `string` to `int` and `double` to `boolean` are disallowed. A runtime exception is thrown if the value is out-of-range for the data type of the column. In Spark version 2.4 and below, type conversions during table insertion are allowed as long as they are valid `Cast`. When inserting an out-of-range value to an integral field, the low-order bits of the value is inserted(the same as Java/Scala numeric type casting). For example, if 257 is inserted to a field of byte type, the result is 1. The behavior is controlled by the option `spark.sql.storeAssignmentPolicy`, with a default value as "ANSI". Setting the option as "Legacy" restores the previous behavior. + - In Spark 3.0, when inserting a value into a table column with a different data type, the type coercion is performed as per ANSI SQL standard. 
Certain unreasonable type conversions such as converting `string` to `int` and `double` to `boolean` are disallowed. A runtime exception is thrown if the value is out-of-range for the data type of the column. In Spark version 2.4 and below, type conversions during table insertion are allowed as long as they are valid `Cast`. When inserting an out-of-range value to a integral field, the low-order bits of the value is inserted(the same as Java/Scala numeric type casting). For example, if 257 is inserted to a field of byte type, the result is 1. The behavior is controlled by the option `spark.sql.storeAssignmentPolicy`, with a default value as "ANSI". Setting the option as "Legacy" restores the previous behavior. - The `ADD JAR` command previously returned a result set with the single value 0. It now returns an empty result set. @@ -50,7 +50,7 @@ license: | - Refreshing a cached table would trigger a table uncache operation and then a table cache (lazily) operation. In Spark version 2.4 and below, the cache name and storage level are not preserved before the uncache operation. Therefore, the cache name and storage level could be changed unexpectedly. In Spark 3.0, cache name and storage level are first preserved for cache recreation. It helps to maintain a consistent cache behavior upon table refreshing. - - In Spark 3.0, the properties listing below become reserved; commands fail if you specify reserved properties in places like `CREATE DATABASE ... WITH DBPROPERTIES` and `ALTER TABLE ... SET TBLPROPERTIES`. You need their specific clauses to specify them, for example, `CREATE DATABASE test COMMENT 'any comment' LOCATION 'some path'`. You can set `spark.sql.legacy.notReserveProperties` to `true` to ignore the `ParseException`, in this case, these properties will be silently removed, for example: `SET DBPROPERTIES('location'='/tmp')` will have no effect. In Spark version 2.4 and below, these properties are neither reserved nor have side effects, for example, `SET DBPROPERTIES('location'='/tmp')` do not change the location of the database but only create a headless property just like `'a'='b'`. + - In Spark 3.0, the properties listing below become reserved; commands fail if you specify reserved properties in places like `CREATE DATABASE ... WITH DBPROPERTIES` and `ALTER TABLE ... SET TBLPROPERTIES`. You need their specific clauses to specify them, for example, `CREATE DATABASE test COMMENT 'any comment' LOCATION 'some path'`. You can set `spark.sql.legacy.notReserveProperties` to `true` to ignore the `ParseException`, in this case, these properties will be silently removed, for example: `SET DBPROTERTIES('location'='/tmp')` will have no effect. In Spark version 2.4 and below, these properties are neither reserved nor have side effects, for example, `SET DBPROTERTIES('location'='/tmp')` do not change the location of the database but only create a headless property just like `'a'='b'`. | Property (case sensitive) | Database Reserved | Table Reserved | Remarks | | ------------------------- | ----------------- | -------------- | ------- | @@ -130,7 +130,7 @@ license: | - In Spark 3.0, negative scale of decimal is not allowed by default, for example, data type of literal like `1E10BD` is `DecimalType(11, 0)`. In Spark version 2.4 and below, it was `DecimalType(2, -9)`. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.allowNegativeScaleOfDecimal` to `true`. 
- - In Spark 3.0, the unary arithmetic operator plus(`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with an integral string representation is coerced to a double value, for example, `+'1'` returns `1.0`. In Spark version 2.4 and below, this operator is ignored. There is no type checking for it, thus, all type values with a `+` prefix are valid, for example, `+ array(1, 2)` is valid and results `[1, 2]`. Besides, there is no type coercion for it at all, for example, in Spark 2.4, the result of `+'1'` is string `1`. + - In Spark 3.0, the unary arithmetic operator plus(`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with a integral string representation is coerced to a double value, for example, `+'1'` returns `1.0`. In Spark version 2.4 and below, this operator is ignored. There is no type checking for it, thus, all type values with a `+` prefix are valid, for example, `+ array(1, 2)` is valid and results `[1, 2]`. Besides, there is no type coercion for it at all, for example, in Spark 2.4, the result of `+'1'` is string `1`. - In Spark 3.0, Dataset query fails if it contains ambiguous column reference that is caused by self join. A typical example: `val df1 = ...; val df2 = df1.filter(...);`, then `df1.join(df2, df1("a") > df2("a"))` returns an empty result which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and `df1("a")` is exactly the same as `df2("a")` in Spark. To restore the behavior before Spark 3.0, you can set `spark.sql.analyzer.failAmbiguousSelfJoin` to `false`. diff --git a/docs/sql-ref-functions-udf-hive.md b/docs/sql-ref-functions-udf-hive.md index 7a7129de23836..d3d2a221c94d8 100644 --- a/docs/sql-ref-functions-udf-hive.md +++ b/docs/sql-ref-functions-udf-hive.md @@ -28,9 +28,9 @@ Spark SQL supports integration of Hive UDFs, UDAFs and UDTFs. Similar to Spark U Hive has two UDF interfaces: [UDF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDF.java) and [GenericUDF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). An example below uses [GenericUDFAbs](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAbs.java) derived from `GenericUDF`. -{% highlight sql %} +```sql -- Register `GenericUDFAbs` and use it in Spark SQL. --- Note that, if you use your own programmed one, you need to add a JAR containing it +-- Note that, if you use your own programmed one, you need to add a JAR containig it -- into a classpath, -- e.g., ADD JAR yourHiveUDF.jar; CREATE TEMPORARY FUNCTION testUDF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFAbs'; @@ -52,12 +52,12 @@ SELECT testUDF(value) FROM t; | 2.0| | 3.0| +--------------+ -{% endhighlight %} +``` An example below uses [GenericUDTFExplode](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFExplode.java) derived from [GenericUDTF](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java). 
-{% highlight sql %} +```sql -- Register `GenericUDTFExplode` and use it in Spark SQL CREATE TEMPORARY FUNCTION hiveUDTF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDTFExplode'; @@ -79,12 +79,12 @@ SELECT hiveUDTF(value) FROM t; | 3| | 4| +---+ -{% endhighlight %} +``` Hive has two UDAF interfaces: [UDAF](https://github.com/apache/hive/blob/master/udf/src/java/org/apache/hadoop/hive/ql/exec/UDAF.java) and [GenericUDAFResolver](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java). An example below uses [GenericUDAFSum](https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java) derived from `GenericUDAFResolver`. -{% highlight sql %} +```sql -- Register `GenericUDAFSum` and use it in Spark SQL CREATE TEMPORARY FUNCTION hiveUDAF AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum'; @@ -105,4 +105,4 @@ SELECT key, hiveUDAF(value) FROM t GROUP BY key; | b| 3| | a| 3| +---+---------------+ -{% endhighlight %} +``` \ No newline at end of file diff --git a/docs/sql-ref-functions.md b/docs/sql-ref-functions.md index 67951a9695f5e..7493b8bbb7cdf 100644 --- a/docs/sql-ref-functions.md +++ b/docs/sql-ref-functions.md @@ -24,7 +24,7 @@ Built-in functions are commonly used routines that Spark SQL predefines and a co ### Built-in Functions -Spark SQL has some categories of frequently-used built-in functions for aggregation, arrays/maps, date/timestamp, and JSON data. +Spark SQL has some categories of frequently-used built-in functions for aggregtion, arrays/maps, date/timestamp, and JSON data. This subsection presents the usages and descriptions of these functions. #### Scalar Functions diff --git a/docs/sql-ref-syntax-aux-describe-query.md b/docs/sql-ref-syntax-aux-describe-query.md index b2a74cbd06078..65e101d3dbf13 100644 --- a/docs/sql-ref-syntax-aux-describe-query.md +++ b/docs/sql-ref-syntax-aux-describe-query.md @@ -73,7 +73,7 @@ DESCRIBE QUERY WITH all_names_cte | name| string| null| +--------+---------+-------+ --- Returns column metadata information for an inline table. +-- Returns column metadata information for a inline table. DESC QUERY VALUES(100, 'John', 10000.20D) AS employee(id, name, salary); +--------+---------+-------+ |col_name|data_type|comment| diff --git a/docs/web-ui.md b/docs/web-ui.md index 3c35dbeec86a2..c53af804d8d59 100644 --- a/docs/web-ui.md +++ b/docs/web-ui.md @@ -99,7 +99,7 @@ This page displays the details of a specific job identified by its job ID. The Stages tab displays a summary page that shows the current state of all stages of all jobs in the Spark application. -At the beginning of the page is the summary with the count of all stages by status (active, pending, completed, skipped, and failed) +At the beginning of the page is the summary with the count of all stages by status (active, pending, completed, sikipped, and failed)

Stages header @@ -136,7 +136,7 @@ Summary metrics for all task are represented in a table and in a timeline. * **[Tasks deserialization time](configuration.html#compression-and-serialization)** * **Duration of tasks**. * **GC time** is the total JVM garbage collection time. -* **Result serialization time** is the time spent serializing the task result on an executor before sending it back to the driver. +* **Result serialization time** is the time spent serializing the task result on a executor before sending it back to the driver. * **Getting result time** is the time that the driver spends fetching task results from workers. * **Scheduler delay** is the time the task waits to be scheduled for execution. * **Peak execution memory** is the maximum memory used by the internal data structures created during shuffles, aggregations and joins. From 0bd2ca5fc79cf88d6d9248c8fbc02f43a387b832 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 11:23:32 -0700 Subject: [PATCH 15/19] Revert "[SPARK-31606][SQL] Reduce the perf regression of vectorized parquet reader caused by datetime rebase" This reverts commit 931c0bcd95087716c633eb9fd065d580a1d47353. --- .../sql/catalyst/util/RebaseDateTime.scala | 4 - .../DateTimeRebaseBenchmark-jdk11-results.txt | 104 +++++++++--------- .../DateTimeRebaseBenchmark-results.txt | 104 +++++++++--------- .../parquet/VectorizedColumnReader.java | 22 +++- .../parquet/VectorizedPlainValuesReader.java | 55 --------- .../parquet/VectorizedRleValuesReader.java | 85 -------------- .../parquet/VectorizedValuesReader.java | 2 - .../benchmark/DateTimeRebaseBenchmark.scala | 79 +++++++------ 8 files changed, 160 insertions(+), 295 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala index 040a97a14d451..6848b0fa39c7c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala @@ -71,8 +71,6 @@ object RebaseDateTime { -719164, -682945, -646420, -609895, -536845, -500320, -463795, -390745, -354220, -317695, -244645, -208120, -171595, -141427) - final val lastSwitchJulianDay: Int = julianGregDiffSwitchDay.last - // The first days of Common Era (CE) which is mapped to the '0001-01-01' date in Julian calendar. private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0) @@ -418,8 +416,6 @@ object RebaseDateTime { // in the interval: [julianGregDiffSwitchMicros(i), julianGregDiffSwitchMicros(i+1)) private val julianGregRebaseMap = loadRebaseRecords("julian-gregorian-rebase-micros.json") - final val lastSwitchJulianTs: Long = julianGregRebaseMap.values.map(_.switches.last).max - /** * An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]]. This method leverages * the pre-calculated rebasing maps to save calculation. 
If the rebasing map doesn't contain diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt index 03e0d7b8bc575..2a9322a4b462a 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-jdk11-results.txt @@ -2,93 +2,93 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 20073 20073 0 5.0 200.7 1.0X -before 1582, noop 10985 10985 0 9.1 109.9 1.8X -after 1582, rebase off 32245 32245 0 3.1 322.4 0.6X -after 1582, rebase on 31434 31434 0 3.2 314.3 0.6X -before 1582, rebase off 21590 21590 0 4.6 215.9 0.9X -before 1582, rebase on 22963 22963 0 4.4 229.6 0.9X +after 1582, noop 21171 21171 0 4.7 211.7 1.0X +before 1582, noop 11036 11036 0 9.1 110.4 1.9X +after 1582, rebase off 34321 34321 0 2.9 343.2 0.6X +after 1582, rebase on 33269 33269 0 3.0 332.7 0.6X +before 1582, rebase off 22016 22016 0 4.5 220.2 1.0X +before 1582, rebase on 23338 23338 0 4.3 233.4 0.9X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 12815 12858 40 7.8 128.1 1.0X -after 1582, vec off, rebase on 13030 13167 148 7.7 130.3 1.0X -after 1582, vec on, rebase off 3705 3712 6 27.0 37.1 3.5X -after 1582, vec on, rebase on 3788 3791 3 26.4 37.9 3.4X -before 1582, vec off, rebase off 12873 12943 61 7.8 128.7 1.0X -before 1582, vec off, rebase on 14072 14165 80 7.1 140.7 0.9X -before 1582, vec on, rebase off 3694 3708 15 27.1 36.9 3.5X -before 1582, vec on, rebase on 4403 4484 81 22.7 44.0 2.9X +after 1582, vec off, rebase off 12791 13089 287 7.8 127.9 1.0X +after 1582, vec off, rebase on 13203 13271 81 7.6 132.0 1.0X +after 1582, vec on, rebase off 3709 3764 49 27.0 37.1 3.4X +after 1582, vec on, rebase on 5082 5114 29 19.7 50.8 2.5X +before 1582, vec off, rebase off 13059 13153 87 7.7 130.6 1.0X +before 1582, vec off, rebase on 14211 14236 27 7.0 142.1 0.9X +before 1582, vec on, rebase off 3687 3749 72 27.1 36.9 3.5X +before 1582, vec on, rebase on 5449 5497 56 18.4 54.5 2.3X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 3032 3032 0 33.0 30.3 1.0X -before 1900, noop 3043 3043 0 32.9 30.4 1.0X -after 1900, rebase off 15634 15634 0 6.4 156.3 0.2X 
-after 1900, rebase on 18233 18233 0 5.5 182.3 0.2X -before 1900, rebase off 15820 15820 0 6.3 158.2 0.2X -before 1900, rebase on 19921 19921 0 5.0 199.2 0.2X +after 1582, noop 2831 2831 0 35.3 28.3 1.0X +before 1582, noop 2816 2816 0 35.5 28.2 1.0X +after 1582, rebase off 15543 15543 0 6.4 155.4 0.2X +after 1582, rebase on 18391 18391 0 5.4 183.9 0.2X +before 1582, rebase off 15747 15747 0 6.4 157.5 0.2X +before 1582, rebase on 18846 18846 0 5.3 188.5 0.2X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase off 14987 15008 18 6.7 149.9 1.0X -after 1900, vec off, rebase on 17500 17628 210 5.7 175.0 0.9X -after 1900, vec on, rebase off 5030 5036 7 19.9 50.3 3.0X -after 1900, vec on, rebase on 5066 5109 44 19.7 50.7 3.0X -before 1900, vec off, rebase off 15094 15213 121 6.6 150.9 1.0X -before 1900, vec off, rebase on 18098 18175 101 5.5 181.0 0.8X -before 1900, vec on, rebase off 5008 5012 4 20.0 50.1 3.0X -before 1900, vec on, rebase on 8803 8848 55 11.4 88.0 1.7X +after 1582, vec off, rebase off 16126 16216 78 6.2 161.3 1.0X +after 1582, vec off, rebase on 18277 18453 165 5.5 182.8 0.9X +after 1582, vec on, rebase off 5030 5067 42 19.9 50.3 3.2X +after 1582, vec on, rebase on 8553 8583 43 11.7 85.5 1.9X +before 1582, vec off, rebase off 15828 15872 39 6.3 158.3 1.0X +before 1582, vec off, rebase on 18899 18959 103 5.3 189.0 0.9X +before 1582, vec on, rebase off 4961 5009 43 20.2 49.6 3.3X +before 1582, vec on, rebase on 9099 9140 40 11.0 91.0 1.8X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 19593 19593 0 5.1 195.9 1.0X -before 1582, noop 10581 10581 0 9.5 105.8 1.9X -after 1582 27843 27843 0 3.6 278.4 0.7X -before 1582 19435 19435 0 5.1 194.4 1.0X +after 1582, noop 21026 21026 0 4.8 210.3 1.0X +before 1582, noop 11040 11040 0 9.1 110.4 1.9X +after 1582 28171 28171 0 3.5 281.7 0.7X +before 1582 18955 18955 0 5.3 189.5 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 10395 10507 119 9.6 103.9 1.0X -after 1582, vec on 3921 3945 22 25.5 39.2 2.7X -before 1582, vec off 10762 10860 127 9.3 107.6 1.0X -before 1582, vec on 4194 4226 41 23.8 41.9 2.5X +after 1582, vec 
off 10876 10931 49 9.2 108.8 1.0X +after 1582, vec on 3900 3913 20 25.6 39.0 2.8X +before 1582, vec off 11165 11174 12 9.0 111.6 1.0X +before 1582, vec on 4208 4214 7 23.8 42.1 2.6X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 3003 3003 0 33.3 30.0 1.0X -before 1900, noop 3016 3016 0 33.2 30.2 1.0X -after 1900 21804 21804 0 4.6 218.0 0.1X -before 1900 23920 23920 0 4.2 239.2 0.1X +after 1582, noop 2924 2924 0 34.2 29.2 1.0X +before 1582, noop 2820 2820 0 35.5 28.2 1.0X +after 1582 22228 22228 0 4.5 222.3 0.1X +before 1582 22590 22590 0 4.4 225.9 0.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 11.0.6+10-post-Ubuntu-1ubuntu118.04.1 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off 14112 14128 17 7.1 141.1 1.0X -after 1900, vec on 7347 7459 134 13.6 73.5 1.9X -before 1900, vec off 15170 15192 27 6.6 151.7 0.9X -before 1900, vec on 8280 8312 52 12.1 82.8 1.7X +after 1582, vec off 13591 13658 59 7.4 135.9 1.0X +after 1582, vec on 7399 7488 126 13.5 74.0 1.8X +before 1582, vec off 14065 14096 30 7.1 140.7 1.0X +before 1582, vec on 7950 8127 249 12.6 79.5 1.7X diff --git a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt index a32a1ad8af89e..050950571511d 100644 --- a/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeRebaseBenchmark-results.txt @@ -2,93 +2,93 @@ Rebasing dates/timestamps in Parquet datasource ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 23088 23088 0 4.3 230.9 1.0X -before 1582, noop 10782 10782 0 9.3 107.8 2.1X -after 1582, rebase off 34821 34821 0 2.9 348.2 0.7X -after 1582, rebase on 35040 35040 0 2.9 350.4 0.7X -before 1582, rebase off 22151 22151 0 4.5 221.5 1.0X -before 1582, rebase on 24677 24677 0 4.1 246.8 0.9X +after 1582, noop 24114 24114 0 4.1 241.1 1.0X +before 1582, noop 10250 10250 0 9.8 102.5 2.4X +after 1582, rebase off 36672 36672 0 2.7 366.7 0.7X +after 1582, rebase on 37123 37123 0 2.7 371.2 0.6X +before 1582, rebase off 21925 21925 0 4.6 219.2 1.1X +before 1582, rebase on 22341 22341 0 4.5 223.4 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from parquet: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off, rebase off 13559 13650 79 7.4 135.6 1.0X -after 1582, vec off, rebase on 12942 12973 28 7.7 129.4 1.0X -after 1582, vec on, rebase off 3657 3689 29 27.3 36.6 3.7X -after 1582, vec on, rebase on 3859 3902 53 25.9 38.6 3.5X -before 1582, vec off, rebase off 12588 12607 17 7.9 125.9 1.1X -before 1582, vec off, rebase on 13396 13420 25 7.5 134.0 1.0X -before 1582, vec on, rebase off 3631 3650 19 27.5 36.3 3.7X -before 1582, vec on, rebase on 4706 4755 77 21.3 47.1 2.9X +after 1582, vec off, rebase off 12456 12601 126 8.0 124.6 1.0X +after 1582, vec off, rebase on 13299 13336 32 7.5 133.0 0.9X +after 1582, vec on, rebase off 3623 3660 40 27.6 36.2 3.4X +after 1582, vec on, rebase on 5160 5177 15 19.4 51.6 2.4X +before 1582, vec off, rebase off 13177 13264 76 7.6 131.8 0.9X +before 1582, vec off, rebase on 14102 14149 46 7.1 141.0 0.9X +before 1582, vec on, rebase off 3649 3670 34 27.4 36.5 3.4X +before 1582, vec on, rebase on 5652 5667 15 17.7 56.5 2.2X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2681 2681 0 37.3 26.8 1.0X -before 1900, noop 3051 3051 0 32.8 30.5 0.9X -after 1900, rebase off 16901 16901 0 5.9 169.0 0.2X -after 1900, rebase on 19725 19725 0 5.1 197.3 0.1X -before 1900, rebase off 16900 16900 0 5.9 169.0 0.2X -before 1900, rebase on 20381 20381 0 4.9 203.8 0.1X +after 1582, noop 2871 2871 0 34.8 28.7 1.0X +before 1582, noop 2753 2753 0 36.3 27.5 1.0X +after 1582, rebase off 15927 15927 0 6.3 159.3 0.2X +after 1582, rebase on 19138 19138 0 5.2 191.4 0.1X +before 1582, rebase off 16137 16137 0 6.2 161.4 0.2X +before 1582, rebase on 19584 19584 0 5.1 195.8 0.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from parquet: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off, rebase off 15236 15291 62 6.6 152.4 1.0X -after 1900, vec off, rebase on 17832 18047 187 5.6 178.3 0.9X -after 1900, vec on, rebase off 4875 4901 31 20.5 48.7 3.1X -after 1900, vec on, rebase on 5354 5386 37 18.7 53.5 2.8X -before 1900, vec off, rebase off 15229 15338 108 6.6 152.3 1.0X -before 1900, vec off, rebase on 18626 18668 44 5.4 186.3 0.8X -before 1900, vec on, rebase off 4968 4975 6 20.1 49.7 3.1X -before 1900, vec on, rebase on 9913 9932 16 10.1 99.1 1.5X +after 1582, vec off, rebase off 14995 15047 47 6.7 150.0 1.0X +after 1582, vec off, rebase on 18111 18146 37 5.5 181.1 0.8X +after 1582, vec on, rebase off 4837 4873 44 20.7 48.4 3.1X +after 1582, vec on, rebase on 9542 9669 111 10.5 95.4 1.6X +before 1582, vec off, rebase off 14993 15090 94 6.7 149.9 1.0X +before 1582, vec off, rebase on 18675 18712 64 5.4 186.7 0.8X +before 1582, vec on, rebase off 4908 4923 15 20.4 49.1 3.1X 
+before 1582, vec on, rebase on 10128 10148 19 9.9 101.3 1.5X ================================================================================================ Rebasing dates/timestamps in ORC datasource ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save dates to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, noop 22942 22942 0 4.4 229.4 1.0X -before 1582, noop 11035 11035 0 9.1 110.4 2.1X -after 1582 31341 31341 0 3.2 313.4 0.7X -before 1582 20376 20376 0 4.9 203.8 1.1X +after 1582, noop 23977 23977 0 4.2 239.8 1.0X +before 1582, noop 10094 10094 0 9.9 100.9 2.4X +after 1582 33115 33115 0 3.0 331.2 0.7X +before 1582 19430 19430 0 5.1 194.3 1.2X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load dates from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1582, vec off 10361 10378 29 9.7 103.6 1.0X -after 1582, vec on 3820 3828 11 26.2 38.2 2.7X -before 1582, vec off 10709 10720 13 9.3 107.1 1.0X -before 1582, vec on 4136 4153 15 24.2 41.4 2.5X +after 1582, vec off 10217 10241 21 9.8 102.2 1.0X +after 1582, vec on 3671 3691 31 27.2 36.7 2.8X +before 1582, vec off 10800 10874 114 9.3 108.0 0.9X +before 1582, vec on 4118 4165 74 24.3 41.2 2.5X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Save timestamps to ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, noop 2888 2888 0 34.6 28.9 1.0X -before 1900, noop 2823 2823 0 35.4 28.2 1.0X -after 1900 19790 19790 0 5.1 197.9 0.1X -before 1900 20774 20774 0 4.8 207.7 0.1X +after 1582, noop 2691 2691 0 37.2 26.9 1.0X +before 1582, noop 2743 2743 0 36.5 27.4 1.0X +after 1582 21409 21409 0 4.7 214.1 0.1X +before 1582 22554 22554 0 4.4 225.5 0.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws +OpenJDK 64-Bit Server VM 1.8.0_242-8u242-b08-0ubuntu3~18.04-b08 on Linux 4.15.0-1063-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Load timestamps from ORC: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -after 1900, vec off 14649 14687 38 6.8 146.5 1.0X -after 1900, vec on 7850 7937 130 12.7 78.5 1.9X -before 1900, vec off 15354 15417 108 6.5 153.5 1.0X -before 1900, vec on 8382 8408 22 11.9 83.8 1.7X +after 1582, vec off 14752 14855 103 6.8 147.5 1.0X +after 1582, vec on 8146 8185 34 12.3 81.5 1.8X +before 1582, vec off 15247 15294 46 6.6 152.5 1.0X +before 1582, vec on 8414 8466 52 11.9 84.1 1.8X diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 7ae60f22aa790..cfb873ff37379 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -423,8 +423,15 @@ private void readIntBatch(int rowId, int num, WritableColumnVector column) throw num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else if (column.dataType() == DataTypes.DateType ) { if (rebaseDateTime) { - defColumn.readIntegersWithRebase( - num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); + for (int i = 0; i < num; i++) { + if (defColumn.readInteger() == maxDefLevel) { + column.putInt( + rowId + i, + RebaseDateTime.rebaseJulianToGregorianDays(dataColumn.readInteger())); + } else { + column.putNull(rowId + i); + } + } } else { defColumn.readIntegers( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); @@ -442,8 +449,15 @@ private void readLongBatch(int rowId, int num, WritableColumnVector column) thro num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); } else if (originalType == OriginalType.TIMESTAMP_MICROS) { if (rebaseDateTime) { - defColumn.readLongsWithRebase( - num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); + for (int i = 0; i < num; i++) { + if (defColumn.readInteger() == maxDefLevel) { + column.putLong( + rowId + i, + RebaseDateTime.rebaseJulianToGregorianMicros(dataColumn.readLong())); + } else { + column.putNull(rowId + i); + } + } } else { defColumn.readLongs( num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java index 2ed2e11b60c03..c62dc3d86386e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedPlainValuesReader.java @@ -22,7 +22,6 @@ import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.io.ParquetDecodingException; -import org.apache.spark.sql.catalyst.util.RebaseDateTime; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.parquet.column.values.ValuesReader; @@ -82,33 +81,6 @@ public final void readIntegers(int total, WritableColumnVector c, int rowId) { } } - // A fork of `readIntegers` to rebase the date values. For performance reasons, this method - // iterates the values twice: check if we need to rebase first, then go to the optimized branch - // if rebase is not needed. 
- @Override - public final void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) { - int requiredBytes = total * 4; - ByteBuffer buffer = getBuffer(requiredBytes); - boolean rebase = false; - for (int i = 0; i < total; i += 1) { - rebase |= buffer.getInt(buffer.position() + i * 4) < RebaseDateTime.lastSwitchJulianDay(); - } - if (rebase) { - for (int i = 0; i < total; i += 1) { - c.putInt(rowId + i, RebaseDateTime.rebaseJulianToGregorianDays(buffer.getInt())); - } - } else { - if (buffer.hasArray()) { - int offset = buffer.arrayOffset() + buffer.position(); - c.putIntsLittleEndian(rowId, total, buffer.array(), offset); - } else { - for (int i = 0; i < total; i += 1) { - c.putInt(rowId + i, buffer.getInt()); - } - } - } - } - @Override public final void readLongs(int total, WritableColumnVector c, int rowId) { int requiredBytes = total * 8; @@ -124,33 +96,6 @@ public final void readLongs(int total, WritableColumnVector c, int rowId) { } } - // A fork of `readLongs` to rebase the timestamp values. For performance reasons, this method - // iterates the values twice: check if we need to rebase first, then go to the optimized branch - // if rebase is not needed. - @Override - public final void readLongsWithRebase(int total, WritableColumnVector c, int rowId) { - int requiredBytes = total * 8; - ByteBuffer buffer = getBuffer(requiredBytes); - boolean rebase = false; - for (int i = 0; i < total; i += 1) { - rebase |= buffer.getLong(buffer.position() + i * 8) < RebaseDateTime.lastSwitchJulianTs(); - } - if (rebase) { - for (int i = 0; i < total; i += 1) { - c.putLong(rowId + i, RebaseDateTime.rebaseJulianToGregorianMicros(buffer.getLong())); - } - } else { - if (buffer.hasArray()) { - int offset = buffer.arrayOffset() + buffer.position(); - c.putLongsLittleEndian(rowId, total, buffer.array(), offset); - } else { - for (int i = 0; i < total; i += 1) { - c.putLong(rowId + i, buffer.getLong()); - } - } - } - } - @Override public final void readFloats(int total, WritableColumnVector c, int rowId) { int requiredBytes = total * 4; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java index 4d72a33fcf774..fe3d31ae8e746 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedRleValuesReader.java @@ -26,7 +26,6 @@ import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.io.api.Binary; -import org.apache.spark.sql.catalyst.util.RebaseDateTime; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import java.io.IOException; @@ -204,43 +203,6 @@ public void readIntegers( } } - // A fork of `readIntegers`, which rebases the date int value (days) before filling - // the Spark column vector. 
- public void readIntegersWithRebase( - int total, - WritableColumnVector c, - int rowId, - int level, - VectorizedValuesReader data) throws IOException { - int left = total; - while (left > 0) { - if (this.currentCount == 0) this.readNextGroup(); - int n = Math.min(left, this.currentCount); - switch (mode) { - case RLE: - if (currentValue == level) { - data.readIntegersWithRebase(n, c, rowId); - } else { - c.putNulls(rowId, n); - } - break; - case PACKED: - for (int i = 0; i < n; ++i) { - if (currentBuffer[currentBufferIdx++] == level) { - c.putInt(rowId + i, - RebaseDateTime.rebaseJulianToGregorianDays(data.readInteger())); - } else { - c.putNull(rowId + i); - } - } - break; - } - rowId += n; - left -= n; - currentCount -= n; - } - } - // TODO: can this code duplication be removed without a perf penalty? public void readBooleans( int total, @@ -380,43 +342,6 @@ public void readLongs( } } - // A fork of `readLongs`, which rebases the timestamp long value (microseconds) before filling - // the Spark column vector. - public void readLongsWithRebase( - int total, - WritableColumnVector c, - int rowId, - int level, - VectorizedValuesReader data) throws IOException { - int left = total; - while (left > 0) { - if (this.currentCount == 0) this.readNextGroup(); - int n = Math.min(left, this.currentCount); - switch (mode) { - case RLE: - if (currentValue == level) { - data.readLongsWithRebase(n, c, rowId); - } else { - c.putNulls(rowId, n); - } - break; - case PACKED: - for (int i = 0; i < n; ++i) { - if (currentBuffer[currentBufferIdx++] == level) { - c.putLong(rowId + i, - RebaseDateTime.rebaseJulianToGregorianMicros(data.readLong())); - } else { - c.putNull(rowId + i); - } - } - break; - } - rowId += n; - left -= n; - currentCount -= n; - } - } - public void readFloats( int total, WritableColumnVector c, @@ -583,11 +508,6 @@ public void readIntegers(int total, WritableColumnVector c, int rowId) { } } - @Override - public void readIntegersWithRebase(int total, WritableColumnVector c, int rowId) { - throw new UnsupportedOperationException("only readInts is valid."); - } - @Override public byte readByte() { throw new UnsupportedOperationException("only readInts is valid."); @@ -603,11 +523,6 @@ public void readLongs(int total, WritableColumnVector c, int rowId) { throw new UnsupportedOperationException("only readInts is valid."); } - @Override - public void readLongsWithRebase(int total, WritableColumnVector c, int rowId) { - throw new UnsupportedOperationException("only readInts is valid."); - } - @Override public void readBinary(int total, WritableColumnVector c, int rowId) { throw new UnsupportedOperationException("only readInts is valid."); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java index 809ac44cc8272..57d92ae27ece8 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedValuesReader.java @@ -40,9 +40,7 @@ public interface VectorizedValuesReader { void readBooleans(int total, WritableColumnVector c, int rowId); void readBytes(int total, WritableColumnVector c, int rowId); void readIntegers(int total, WritableColumnVector c, int rowId); - void readIntegersWithRebase(int total, WritableColumnVector c, int rowId); void readLongs(int total, WritableColumnVector c, int rowId); - 
void readLongsWithRebase(int total, WritableColumnVector c, int rowId); void readFloats(int total, WritableColumnVector c, int rowId); void readDoubles(int total, WritableColumnVector c, int rowId); void readBinary(int total, WritableColumnVector c, int rowId); diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala index 7968836a00d0f..077ac28c149ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeRebaseBenchmark.scala @@ -49,15 +49,15 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { .select($"seconds".cast("timestamp").as("ts")) } - private def genTsAfter1900(cardinality: Int): DataFrame = { - val start = LocalDateTime.of(1900, 1, 31, 0, 0, 0) + private def genTsAfter1582(cardinality: Int): DataFrame = { + val start = LocalDateTime.of(1582, 10, 15, 0, 0, 0) val end = LocalDateTime.of(3000, 1, 1, 0, 0, 0) genTs(cardinality, start, end) } - private def genTsBefore1900(cardinality: Int): DataFrame = { + private def genTsBefore1582(cardinality: Int): DataFrame = { val start = LocalDateTime.of(10, 1, 1, 0, 0, 0) - val end = LocalDateTime.of(1900, 1, 1, 0, 0, 0) + val end = LocalDateTime.of(1580, 1, 1, 0, 0, 0) genTs(cardinality, start, end) } @@ -71,35 +71,34 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { } private def genDateAfter1582(cardinality: Int): DataFrame = { - val start = LocalDate.of(1582, 10, 31) + val start = LocalDate.of(1582, 10, 15) val end = LocalDate.of(3000, 1, 1) genDate(cardinality, start, end) } private def genDateBefore1582(cardinality: Int): DataFrame = { val start = LocalDate.of(10, 1, 1) - val end = LocalDate.of(1580, 10, 1) + val end = LocalDate.of(1580, 1, 1) genDate(cardinality, start, end) } - private def genDF(cardinality: Int, dateTime: String, modernDates: Boolean): DataFrame = { - (dateTime, modernDates) match { + private def genDF(cardinality: Int, dateTime: String, after1582: Boolean): DataFrame = { + (dateTime, after1582) match { case ("date", true) => genDateAfter1582(cardinality) case ("date", false) => genDateBefore1582(cardinality) - case ("timestamp", true) => genTsAfter1900(cardinality) - case ("timestamp", false) => genTsBefore1900(cardinality) + case ("timestamp", true) => genTsAfter1582(cardinality) + case ("timestamp", false) => genTsBefore1582(cardinality) case _ => throw new IllegalArgumentException( - s"cardinality = $cardinality dateTime = $dateTime modernDates = $modernDates") + s"cardinality = $cardinality dateTime = $dateTime after1582 = $after1582") } } private def benchmarkInputs(benchmark: Benchmark, rowsNum: Int, dateTime: String): Unit = { - val year = if (dateTime == "date") 1582 else 1900 - benchmark.addCase(s"after $year, noop", 1) { _ => - genDF(rowsNum, dateTime, modernDates = true).noop() + benchmark.addCase("after 1582, noop", 1) { _ => + genDF(rowsNum, dateTime, after1582 = true).noop() } - benchmark.addCase(s"before $year, noop", 1) { _ => - genDF(rowsNum, dateTime, modernDates = false).noop() + benchmark.addCase("before 1582, noop", 1) { _ => + genDF(rowsNum, dateTime, after1582 = false).noop() } } @@ -108,26 +107,23 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { } private def caseName( - modernDates: Boolean, - dateTime: String, + after1582: Boolean, rebase: Option[Boolean] = None, vec: 
Option[Boolean] = None): String = { - val period = if (modernDates) "after" else "before" - val year = if (dateTime == "date") 1582 else 1900 + val period = if (after1582) "after" else "before" val vecFlag = vec.map(flagToStr).map(flag => s", vec $flag").getOrElse("") val rebaseFlag = rebase.map(flagToStr).map(flag => s", rebase $flag").getOrElse("") - s"$period $year$vecFlag$rebaseFlag" + s"$period 1582$vecFlag$rebaseFlag" } private def getPath( basePath: File, dateTime: String, - modernDates: Boolean, + after1582: Boolean, rebase: Option[Boolean] = None): String = { - val period = if (modernDates) "after" else "before" - val year = if (dateTime == "date") 1582 else 1900 + val period = if (after1582) "after" else "before" val rebaseFlag = rebase.map(flagToStr).map(flag => s"_$flag").getOrElse("") - basePath.getAbsolutePath + s"/${dateTime}_${period}_$year$rebaseFlag" + basePath.getAbsolutePath + s"/${dateTime}_${period}_1582$rebaseFlag" } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { @@ -143,16 +139,16 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { rowsNum, output = output) benchmarkInputs(benchmark, rowsNum, dateTime) - Seq(true, false).foreach { modernDates => + Seq(true, false).foreach { after1582 => Seq(false, true).foreach { rebase => - benchmark.addCase(caseName(modernDates, dateTime, Some(rebase)), 1) { _ => + benchmark.addCase(caseName(after1582, Some(rebase)), 1) { _ => withSQLConf( SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_WRITE.key -> rebase.toString) { - genDF(rowsNum, dateTime, modernDates) + genDF(rowsNum, dateTime, after1582) .write .mode("overwrite") .format("parquet") - .save(getPath(path, dateTime, modernDates, Some(rebase))) + .save(getPath(path, dateTime, after1582, Some(rebase))) } } } @@ -161,15 +157,16 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { val benchmark2 = new Benchmark( s"Load ${dateTime}s from parquet", rowsNum, output = output) - Seq(true, false).foreach { modernDates => + Seq(true, false).foreach { after1582 => Seq(false, true).foreach { vec => Seq(false, true).foreach { rebase => - val name = caseName(modernDates, dateTime, Some(rebase), Some(vec)) - benchmark2.addCase(name, 3) { _ => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString) { + benchmark2.addCase(caseName(after1582, Some(rebase), Some(vec)), 3) { _ => + withSQLConf( + SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vec.toString, + SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) { spark.read .format("parquet") - .load(getPath(path, dateTime, modernDates, Some(rebase))) + .load(getPath(path, dateTime, after1582, Some(rebase))) .noop() } } @@ -186,13 +183,13 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { Seq("date", "timestamp").foreach { dateTime => val benchmark = new Benchmark(s"Save ${dateTime}s to ORC", rowsNum, output = output) benchmarkInputs(benchmark, rowsNum, dateTime) - Seq(true, false).foreach { modernDates => - benchmark.addCase(caseName(modernDates, dateTime), 1) { _ => - genDF(rowsNum, dateTime, modernDates) + Seq(true, false).foreach { after1582 => + benchmark.addCase(caseName(after1582), 1) { _ => + genDF(rowsNum, dateTime, after1582) .write .mode("overwrite") .format("orc") - .save(getPath(path, dateTime, modernDates)) + .save(getPath(path, dateTime, after1582)) } } benchmark.run() @@ -201,14 +198,14 @@ object DateTimeRebaseBenchmark extends SqlBasedBenchmark { s"Load ${dateTime}s from ORC", rowsNum, output = output) - Seq(true, false).foreach { modernDates => + 
Seq(true, false).foreach { after1582 => Seq(false, true).foreach { vec => - benchmark2.addCase(caseName(modernDates, dateTime, vec = Some(vec)), 3) { _ => + benchmark2.addCase(caseName(after1582, vec = Some(vec)), 3) { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vec.toString) { spark .read .format("orc") - .load(getPath(path, dateTime, modernDates)) + .load(getPath(path, dateTime, after1582)) .noop() } } From d3d175b8c1f66ae0b20d6a203cab3988c5457002 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 11:23:35 -0700 Subject: [PATCH 16/19] Revert "[SPARK-31626][SQL] Port HIVE-10415: hive.start.cleanup.scratchdir configuration is not taking effect" This reverts commit 065871c30f8d505125f5197aa8ce6691ee8af92b. --- .../hive/thriftserver/HiveThriftServer2.scala | 3 -- .../HiveThriftServer2Suites.scala | 33 ------------------- 2 files changed, 36 deletions(-) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index f9f2ceeed8a75..f15193b0dc3cc 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.hive.thriftserver import java.util.Locale import java.util.concurrent.atomic.AtomicBoolean -import org.apache.hadoop.hive.common.ServerUtils import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} @@ -102,8 +101,6 @@ object HiveThriftServer2 extends Logging { SparkSQLEnv.sqlContext.sessionState.newHadoopConf()) try { - // Cleanup the scratch dir before starting - ServerUtils.cleanUpScratchDir(executionHive.conf) val server = new HiveThriftServer2(SparkSQLEnv.sqlContext) server.init(executionHive.conf) server.start() diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 0cec63460814c..639dc4d13f673 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -924,39 +924,6 @@ class SingleSessionSuite extends HiveThriftJdbcTest { } } -class HiveThriftCleanUpScratchDirSuite extends HiveThriftJdbcTest{ - var tempScratchDir: File = _ - - override protected def beforeAll(): Unit = { - tempScratchDir = Utils.createTempDir() - tempScratchDir.setWritable(true, false) - assert(tempScratchDir.list().isEmpty) - new File(tempScratchDir.getAbsolutePath + File.separator + "SPARK-31626").createNewFile() - assert(tempScratchDir.list().nonEmpty) - super.beforeAll() - } - - override def mode: ServerMode.Value = ServerMode.binary - - override protected def extraConf: Seq[String] = - s" --hiveconf ${ConfVars.HIVE_START_CLEANUP_SCRATCHDIR}=true " :: - s"--hiveconf ${ConfVars.SCRATCHDIR}=${tempScratchDir.getAbsolutePath}" :: Nil - - test("Cleanup the Hive scratchdir when starting the Hive Server") { - assert(!tempScratchDir.exists()) - withJdbcStatement() { statement => - val rs = statement.executeQuery("SELECT id FROM range(1)") - assert(rs.next()) - 
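// For reference, the startup behavior this revert removes can still be
// reproduced directly against Hive's own API. A standalone sketch under
// stated assumptions: Hive client libraries on the classpath and a
// throwaway scratch directory (the path below is a placeholder).
import org.apache.hadoop.hive.common.ServerUtils
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars

val conf = new HiveConf()
conf.setBoolVar(ConfVars.HIVE_START_CLEANUP_SCRATCHDIR, true)
conf.setVar(ConfVars.SCRATCHDIR, "/tmp/hive_scratch_demo")
// cleanUpScratchDir is a no-op unless hive.start.cleanup.scratchdir is true;
// with this revert, Spark's thrift server simply never calls it at startup.
ServerUtils.cleanUpScratchDir(conf)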
assert(rs.getLong(1) === 0L) - } - } - - override protected def afterAll(): Unit = { - Utils.deleteRecursively(tempScratchDir) - super.afterAll() - } -} - class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { override def mode: ServerMode.Value = ServerMode.http From 24eb041a098a86d5f8a6137c25ba32a5eb141eba Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 11:23:37 -0700 Subject: [PATCH 17/19] Revert "[SPARK-31267][SQL] Flaky test: WholeStageCodegenSparkSubmitSuite.Generated code on driver should not embed platform-specific constant" This reverts commit da32137d37d032a3fd2507b229e3f494cc612246. --- .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 5 ++--- .../sql/execution/WholeStageCodegenSparkSubmitSuite.scala | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index fd2d1f56ed9b6..1f3243400a918 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -31,7 +31,6 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FSDataInputStream, Path} import org.scalatest.{BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} -import org.scalatest.time.Span import org.scalatest.time.SpanSugar._ import org.apache.spark._ @@ -1420,7 +1419,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { implicit val defaultSignaler: Signaler = ThreadSignaler // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. - def runSparkSubmit(args: Seq[String], root: String = "..", timeout: Span = 1.minute): Unit = { + def runSparkSubmit(args: Seq[String], root: String = ".."): Unit = { val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) val sparkSubmitFile = if (Utils.isWindows) { new File(s"$root\\bin\\spark-submit.cmd") @@ -1433,7 +1432,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome)) try { - val exitCode = failAfter(timeout) { process.waitFor() } + val exitCode = failAfter(1.minute) { process.waitFor() } if (exitCode != 0) { fail(s"Process returned with exit code $exitCode. 
See the log4j logs for more detail.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala index c5a01de911962..f6814d8ff8a3d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.TimeLimits -import org.scalatest.time.SpanSugar._ import org.apache.spark.{SparkFunSuite, TestUtils} import org.apache.spark.deploy.SparkSubmitSuite @@ -51,7 +50,7 @@ class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", "--conf", "spark.sql.adaptive.enabled=false", unusedJar.toString) - SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..", 3.minutes) + SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") } } From 8f9e9b2dc167c275543f1584695118a4b231f87e Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 11:23:40 -0700 Subject: [PATCH 18/19] Revert "[SPARK-31527][SQL][TESTS][FOLLOWUP] Fix the number of rows in `DateTimeBenchmark`" This reverts commit d8a2fa0e5b45d99ff0cbd4c869242fedb8541b55. --- .../DateTimeBenchmark-jdk11-results.txt | 474 +++++++++--------- .../benchmarks/DateTimeBenchmark-results.txt | 474 +++++++++--------- .../benchmark/DateTimeBenchmark.scala | 2 +- 3 files changed, 475 insertions(+), 475 deletions(-) diff --git a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt index 61b4c762a752e..1004bcf1aa286 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt @@ -2,456 +2,456 @@ datetime +/- interval ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 1485 1567 116 6.7 148.5 1.0X -date + interval(m, d) 1504 1510 9 6.6 150.4 1.0X -date + interval(m, d, ms) 7000 7013 18 1.4 700.0 0.2X -date - interval(m) 1466 1478 17 6.8 146.6 1.0X -date - interval(m, d) 1533 1534 1 6.5 153.3 1.0X -date - interval(m, d, ms) 7014 7019 7 1.4 701.4 0.2X -timestamp + interval(m) 3062 3064 3 3.3 306.2 0.5X -timestamp + interval(m, d) 3133 3136 5 3.2 313.3 0.5X -timestamp + interval(m, d, ms) 3401 3402 3 2.9 340.1 0.4X -timestamp - interval(m) 3025 3037 17 3.3 302.5 0.5X -timestamp - interval(m, d) 3083 3120 51 3.2 308.3 0.5X -timestamp - interval(m, d, ms) 3371 3379 11 3.0 337.1 0.4X +date + interval(m) 919 933 22 0.0 306237514.3 1.0X +date + interval(m, d) 910 916 9 0.0 303338619.0 1.0X +date + interval(m, d, ms) 3912 3923 16 0.0 1303942791.7 0.2X +date - interval(m) 883 887 6 0.0 294268789.3 1.0X +date - interval(m, d) 898 911 18 0.0 299453403.0 1.0X +date - interval(m, d, ms) 3937 3944 11 0.0 1312269472.0 0.2X 
+timestamp + interval(m) 2226 2236 14 0.0 741972014.3 0.4X +timestamp + interval(m, d) 2264 2274 13 0.0 754709121.0 0.4X +timestamp + interval(m, d, ms) 2202 2223 30 0.0 734001075.0 0.4X +timestamp - interval(m) 1992 2005 17 0.0 664152744.7 0.5X +timestamp - interval(m, d) 2069 2075 9 0.0 689631159.0 0.4X +timestamp - interval(m, d, ms) 2240 2244 6 0.0 746538728.0 0.4X ================================================================================================ Extract components ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 339 339 1 29.5 33.9 1.0X -cast to timestamp wholestage on 330 337 9 30.3 33.0 1.0X +cast to timestamp wholestage off 178 180 3 56.1 17.8 1.0X +cast to timestamp wholestage on 189 192 4 53.0 18.9 0.9X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 1226 1237 15 8.2 122.6 1.0X -year of timestamp wholestage on 1230 1242 9 8.1 123.0 1.0X +year of timestamp wholestage off 760 761 1 13.2 76.0 1.0X +year of timestamp wholestage on 731 741 10 13.7 73.1 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 1602 1606 7 6.2 160.2 1.0X -quarter of timestamp wholestage on 1511 1514 3 6.6 151.1 1.1X +quarter of timestamp wholestage off 1005 1013 10 9.9 100.5 1.0X +quarter of timestamp wholestage on 981 986 3 10.2 98.1 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 1227 1233 8 8.1 122.7 1.0X -month of timestamp wholestage on 1226 1242 28 8.2 122.6 1.0X +month of timestamp wholestage off 754 758 6 13.3 75.4 1.0X +month of timestamp wholestage on 719 729 11 13.9 71.9 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 
10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1965 1980 20 5.1 196.5 1.0X -weekofyear of timestamp wholestage on 1816 1833 17 5.5 181.6 1.1X +weekofyear of timestamp wholestage off 1085 1088 4 9.2 108.5 1.0X +weekofyear of timestamp wholestage on 1075 1091 13 9.3 107.5 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 1229 1231 3 8.1 122.9 1.0X -day of timestamp wholestage on 1222 1230 10 8.2 122.2 1.0X +day of timestamp wholestage off 751 770 27 13.3 75.1 1.0X +day of timestamp wholestage on 735 741 7 13.6 73.5 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 1294 1297 4 7.7 129.4 1.0X -dayofyear of timestamp wholestage on 1257 1264 6 8.0 125.7 1.0X +dayofyear of timestamp wholestage off 765 769 5 13.1 76.5 1.0X +dayofyear of timestamp wholestage on 762 770 7 13.1 76.2 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 1247 1253 8 8.0 124.7 1.0X -dayofmonth of timestamp wholestage on 1225 1229 4 8.2 122.5 1.0X +dayofmonth of timestamp wholestage off 780 782 2 12.8 78.0 1.0X +dayofmonth of timestamp wholestage on 720 736 12 13.9 72.0 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 1416 1416 0 7.1 141.6 1.0X -dayofweek of timestamp wholestage on 1376 1382 8 7.3 137.6 1.0X +dayofweek of timestamp wholestage off 887 899 17 11.3 88.7 1.0X +dayofweek of timestamp wholestage on 820 847 20 12.2 82.0 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS 
X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 1350 1351 1 7.4 135.0 1.0X -weekday of timestamp wholestage on 1308 1318 13 7.6 130.8 1.0X +weekday of timestamp wholestage off 821 825 5 12.2 82.1 1.0X +weekday of timestamp wholestage on 802 814 9 12.5 80.2 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 1004 1007 3 10.0 100.4 1.0X -hour of timestamp wholestage on 928 938 7 10.8 92.8 1.1X +hour of timestamp wholestage off 611 622 14 16.4 61.1 1.0X +hour of timestamp wholestage on 571 577 8 17.5 57.1 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 1009 1020 15 9.9 100.9 1.0X -minute of timestamp wholestage on 933 935 2 10.7 93.3 1.1X +minute of timestamp wholestage off 607 615 12 16.5 60.7 1.0X +minute of timestamp wholestage on 573 580 6 17.5 57.3 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 995 995 0 10.0 99.5 1.0X -second of timestamp wholestage on 932 937 8 10.7 93.2 1.1X +second of timestamp wholestage off 615 616 2 16.3 61.5 1.0X +second of timestamp wholestage on 564 575 8 17.7 56.4 1.1X ================================================================================================ Current date and time ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 292 316 34 34.2 29.2 1.0X -current_date wholestage on 270 276 6 37.0 27.0 1.1X +current_date wholestage off 166 169 4 60.4 16.6 1.0X +current_date wholestage on 150 153 3 66.7 15.0 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 
4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 313 328 20 31.9 31.3 1.0X -current_timestamp wholestage on 270 331 95 37.0 27.0 1.2X +current_timestamp wholestage off 179 181 2 55.8 17.9 1.0X +current_timestamp wholestage on 162 324 138 61.9 16.2 1.1X ================================================================================================ Date arithmetic ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 1078 1081 3 9.3 107.8 1.0X -cast to date wholestage on 1035 1040 7 9.7 103.5 1.0X +cast to date wholestage off 658 661 5 15.2 65.8 1.0X +cast to date wholestage on 644 654 10 15.5 64.4 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 1265 1266 3 7.9 126.5 1.0X -last_day wholestage on 1236 1246 10 8.1 123.6 1.0X +last_day wholestage off 768 772 5 13.0 76.8 1.0X +last_day wholestage on 737 750 12 13.6 73.7 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 1118 1118 1 8.9 111.8 1.0X -next_day wholestage on 1085 1090 8 9.2 108.5 1.0X +next_day wholestage off 691 704 17 14.5 69.1 1.0X +next_day wholestage on 664 676 10 15.1 66.4 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 1052 1054 4 9.5 105.2 1.0X -date_add wholestage on 1046 1051 6 9.6 104.6 1.0X +date_add wholestage off 646 646 0 15.5 64.6 1.0X +date_add wholestage on 623 640 13 16.1 62.3 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz 
+Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 1075 1075 0 9.3 107.5 1.0X -date_sub wholestage on 1043 1046 3 9.6 104.3 1.0X +date_sub wholestage off 638 645 9 15.7 63.8 1.0X +date_sub wholestage on 618 629 8 16.2 61.8 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 1409 1409 0 7.1 140.9 1.0X -add_months wholestage on 1448 1453 4 6.9 144.8 1.0X +add_months wholestage off 892 896 5 11.2 89.2 1.0X +add_months wholestage on 926 938 7 10.8 92.6 1.0X ================================================================================================ Formatting dates ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 5373 5390 24 1.9 537.3 1.0X -format date wholestage on 5337 5346 12 1.9 533.7 1.0X +format date wholestage off 3395 3439 62 2.9 339.5 1.0X +format date wholestage on 3418 3438 14 2.9 341.8 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 7302 7308 9 1.4 730.2 1.0X -from_unixtime wholestage on 7298 7319 16 1.4 729.8 1.0X +from_unixtime wholestage off 4565 4592 38 2.2 456.5 1.0X +from_unixtime wholestage on 4608 4635 32 2.2 460.8 1.0X ================================================================================================ Convert timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 1322 1355 48 7.6 132.2 1.0X -from_utc_timestamp wholestage on 1290 1294 5 7.8 129.0 1.0X +from_utc_timestamp wholestage off 801 807 9 12.5 80.1 1.0X +from_utc_timestamp wholestage on 819 830 7 12.2 81.9 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 1692 1705 18 5.9 169.2 1.0X -to_utc_timestamp wholestage on 1653 1657 4 6.1 165.3 1.0X +to_utc_timestamp wholestage off 1108 1114 8 9.0 110.8 1.0X +to_utc_timestamp wholestage on 1067 1078 13 9.4 106.7 1.0X ================================================================================================ Intervals ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 340 356 22 29.4 34.0 1.0X -cast interval wholestage on 293 296 2 34.1 29.3 1.2X +cast interval wholestage off 211 213 2 47.4 21.1 1.0X +cast interval wholestage on 185 188 3 54.1 18.5 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1843 1862 28 5.4 184.3 1.0X -datediff wholestage on 1766 1780 16 5.7 176.6 1.0X +datediff wholestage off 1120 1120 1 8.9 112.0 1.0X +datediff wholestage on 1174 1205 19 8.5 117.4 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 5856 5858 2 1.7 585.6 1.0X -months_between wholestage on 5799 5815 14 1.7 579.9 1.0X +months_between wholestage off 3669 3688 26 2.7 366.9 1.0X +months_between wholestage on 3687 3819 181 2.7 368.7 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per 
Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 2017 2147 183 0.5 2017.4 1.0X -window wholestage on 47789 47910 91 0.0 47788.6 0.0X +window wholestage off 1147 1148 1 0.9 1146.6 1.0X +window wholestage on 16997 17207 226 0.1 16996.7 0.1X ================================================================================================ Truncation ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 2689 2689 1 3.7 268.9 1.0X -date_trunc YEAR wholestage on 2655 2670 17 3.8 265.5 1.0X +date_trunc YEAR wholestage off 1824 1859 50 5.5 182.4 1.0X +date_trunc YEAR wholestage on 1844 1942 71 5.4 184.4 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 2698 2700 3 3.7 269.8 1.0X -date_trunc YYYY wholestage on 2654 2660 6 3.8 265.4 1.0X +date_trunc YYYY wholestage off 1808 1815 11 5.5 180.8 1.0X +date_trunc YYYY wholestage on 1833 1864 49 5.5 183.3 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 2692 2697 7 3.7 269.2 1.0X -date_trunc YY wholestage on 2653 2662 7 3.8 265.3 1.0X +date_trunc YY wholestage off 1867 1883 23 5.4 186.7 1.0X +date_trunc YY wholestage on 1843 1861 15 5.4 184.3 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 2752 2756 6 3.6 275.2 1.0X -date_trunc MON wholestage on 2666 2675 15 3.8 266.6 1.0X +date_trunc MON wholestage off 1845 1858 18 5.4 184.5 1.0X +date_trunc MON wholestage on 1830 1893 42 5.5 183.0 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MONTH: Best Time(ms) 
Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 2743 2746 4 3.6 274.3 1.0X -date_trunc MONTH wholestage on 2667 2673 8 3.7 266.7 1.0X +date_trunc MONTH wholestage off 1822 1855 47 5.5 182.2 1.0X +date_trunc MONTH wholestage on 1832 1863 20 5.5 183.2 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 2741 2741 1 3.6 274.1 1.0X -date_trunc MM wholestage on 2670 2678 7 3.7 267.0 1.0X +date_trunc MM wholestage off 1843 1848 7 5.4 184.3 1.0X +date_trunc MM wholestage on 1886 1905 14 5.3 188.6 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 2338 2342 6 4.3 233.8 1.0X -date_trunc DAY wholestage on 2269 2277 7 4.4 226.9 1.0X +date_trunc DAY wholestage off 1542 1545 4 6.5 154.2 1.0X +date_trunc DAY wholestage on 1610 1616 5 6.2 161.0 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 2324 2325 1 4.3 232.4 1.0X -date_trunc DD wholestage on 2270 2273 2 4.4 227.0 1.0X +date_trunc DD wholestage off 1521 1529 11 6.6 152.1 1.0X +date_trunc DD wholestage on 1595 1611 21 6.3 159.5 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 2325 2326 1 4.3 232.5 1.0X -date_trunc HOUR wholestage on 2284 2295 8 4.4 228.4 1.0X +date_trunc HOUR wholestage off 1496 1543 67 6.7 149.6 1.0X +date_trunc HOUR wholestage on 1567 1594 18 6.4 156.7 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 407 408 0 24.5 40.7 1.0X -date_trunc MINUTE wholestage on 382 386 3 26.1 38.2 1.1X +date_trunc MINUTE wholestage off 230 230 1 43.5 23.0 1.0X +date_trunc MINUTE wholestage on 288 295 7 34.7 28.8 0.8X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 404 404 1 24.8 40.4 1.0X -date_trunc SECOND wholestage on 386 390 4 25.9 38.6 1.0X +date_trunc SECOND wholestage off 247 249 4 40.5 24.7 1.0X +date_trunc SECOND wholestage on 297 314 12 33.6 29.7 0.8X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 2693 2694 2 3.7 269.3 1.0X -date_trunc WEEK wholestage on 2619 2629 10 3.8 261.9 1.0X +date_trunc WEEK wholestage off 1786 1788 3 5.6 178.6 1.0X +date_trunc WEEK wholestage on 1786 1832 46 5.6 178.6 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 3454 3466 17 2.9 345.4 1.0X -date_trunc QUARTER wholestage on 3384 3404 24 3.0 338.4 1.0X +date_trunc QUARTER wholestage off 2319 2365 66 4.3 231.9 1.0X +date_trunc QUARTER wholestage on 2424 2551 182 4.1 242.4 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 339 340 1 29.5 33.9 1.0X -trunc year wholestage on 337 347 9 29.7 33.7 1.0X +trunc year wholestage off 180 189 12 55.5 18.0 1.0X +trunc year wholestage on 271 277 5 36.9 27.1 0.7X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 347 348 
2 28.8 34.7 1.0X -trunc yyyy wholestage on 334 335 2 29.9 33.4 1.0X +trunc yyyy wholestage off 189 191 4 52.9 18.9 1.0X +trunc yyyy wholestage on 276 284 6 36.2 27.6 0.7X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 339 346 11 29.5 33.9 1.0X -trunc yy wholestage on 333 338 5 30.0 33.3 1.0X +trunc yy wholestage off 189 190 1 52.9 18.9 1.0X +trunc yy wholestage on 279 294 11 35.9 27.9 0.7X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 339 347 11 29.5 33.9 1.0X -trunc mon wholestage on 331 336 4 30.2 33.1 1.0X +trunc mon wholestage off 185 186 1 54.1 18.5 1.0X +trunc mon wholestage on 272 285 13 36.8 27.2 0.7X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 341 344 3 29.3 34.1 1.0X -trunc month wholestage on 332 338 9 30.1 33.2 1.0X +trunc month wholestage off 190 190 1 52.6 19.0 1.0X +trunc month wholestage on 293 300 4 34.1 29.3 0.6X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 337 338 1 29.6 33.7 1.0X -trunc mm wholestage on 332 336 5 30.1 33.2 1.0X +trunc mm wholestage off 178 182 6 56.3 17.8 1.0X +trunc mm wholestage on 306 312 5 32.7 30.6 0.6X ================================================================================================ Parsing ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 184 187 4 5.4 183.9 1.0X -to timestamp str wholestage on 159 162 2 6.3 159.4 1.2X +to timestamp str wholestage off 111 
117 8 9.0 110.9 1.0X +to timestamp str wholestage on 101 109 6 9.9 100.6 1.1X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 1683 1689 8 0.6 1683.3 1.0X -to_timestamp wholestage on 1722 1725 4 0.6 1721.6 1.0X +to_timestamp wholestage off 735 746 15 1.4 734.9 1.0X +to_timestamp wholestage on 708 725 11 1.4 708.2 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 1733 1736 4 0.6 1733.1 1.0X -to_unix_timestamp wholestage on 1687 1690 4 0.6 1686.6 1.0X +to_unix_timestamp wholestage off 718 727 12 1.4 717.9 1.0X +to_unix_timestamp wholestage on 739 755 12 1.4 739.1 1.0X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 218 220 4 4.6 217.6 1.0X -to date str wholestage on 213 215 2 4.7 212.6 1.0X +to date str wholestage off 124 125 1 8.0 124.4 1.0X +to date str wholestage on 134 138 3 7.5 133.9 0.9X -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 3697 3699 2 0.3 3697.2 1.0X -to_date wholestage on 3603 3624 15 0.3 3602.7 1.0X +to_date wholestage off 1510 1544 48 0.7 1510.4 1.0X +to_date wholestage on 1544 1557 15 0.6 1544.2 1.0X ================================================================================================ Conversion from/to external types ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 432 436 7 11.6 86.4 1.0X -From java.time.LocalDate 343 347 6 14.6 68.6 1.3X -Collect java.sql.Date 1888 2453 
971 2.6 377.6 0.2X -Collect java.time.LocalDate 1779 1820 42 2.8 355.7 0.2X -From java.sql.Timestamp 375 384 9 13.3 75.0 1.2X -From java.time.Instant 317 326 8 15.8 63.5 1.4X -Collect longs 1338 1428 115 3.7 267.6 0.3X -Collect java.sql.Timestamp 1716 2014 281 2.9 343.1 0.3X -Collect java.time.Instant 1832 1970 122 2.7 366.5 0.2X +From java.sql.Date 269 278 9 18.6 53.7 1.0X +From java.time.LocalDate 227 234 7 22.0 45.4 1.2X +Collect java.sql.Date 1164 1272 141 4.3 232.8 0.2X +Collect java.time.LocalDate 1070 1130 59 4.7 214.1 0.3X +From java.sql.Timestamp 246 248 2 20.3 49.2 1.1X +From java.time.Instant 214 216 2 23.4 42.8 1.3X +Collect longs 814 831 15 6.1 162.7 0.3X +Collect java.sql.Timestamp 1016 1096 78 4.9 203.2 0.3X +Collect java.time.Instant 1012 1093 86 4.9 202.4 0.3X diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index 3ef2f922f95bf..dba6c909be637 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -2,456 +2,456 @@ datetime +/- interval ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz datetime +/- interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date + interval(m) 1613 1622 13 6.2 161.3 1.0X -date + interval(m, d) 1729 1752 32 5.8 172.9 0.9X -date + interval(m, d, ms) 6421 6424 5 1.6 642.1 0.3X -date - interval(m) 1441 1443 2 6.9 144.1 1.1X -date - interval(m, d) 1687 1689 2 5.9 168.7 1.0X -date - interval(m, d, ms) 6617 6625 11 1.5 661.7 0.2X -timestamp + interval(m) 2713 2733 28 3.7 271.3 0.6X -timestamp + interval(m, d) 3027 3032 8 3.3 302.7 0.5X -timestamp + interval(m, d, ms) 3501 3509 12 2.9 350.1 0.5X -timestamp - interval(m) 2892 2895 4 3.5 289.2 0.6X -timestamp - interval(m, d) 3190 3196 9 3.1 319.0 0.5X -timestamp - interval(m, d, ms) 3497 3500 5 2.9 349.7 0.5X +date + interval(m) 1003 1012 13 0.0 334353721.7 1.0X +date + interval(m, d) 1154 1174 29 0.0 384575202.7 0.9X +date + interval(m, d, ms) 4338 4366 40 0.0 1446002701.3 0.2X +date - interval(m) 850 858 8 0.0 283424914.7 1.2X +date - interval(m, d) 1017 1031 19 0.0 339034354.7 1.0X +date - interval(m, d, ms) 4699 4717 25 0.0 1566218686.3 0.2X +timestamp + interval(m) 2044 2046 3 0.0 681382301.0 0.5X +timestamp + interval(m, d) 2215 2249 48 0.0 738464286.7 0.5X +timestamp + interval(m, d, ms) 2053 2063 13 0.0 684393366.0 0.5X +timestamp - interval(m) 1668 1677 12 0.0 556138256.7 0.6X +timestamp - interval(m, d) 1865 1882 25 0.0 621574795.3 0.5X +timestamp - interval(m, d, ms) 2075 2077 3 0.0 691569937.3 0.5X ================================================================================================ Extract components ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz cast to timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -cast to timestamp wholestage off 321 323 2 31.1 32.1 1.0X -cast to timestamp wholestage on 294 306 10 34.0 29.4 1.1X +cast to timestamp wholestage off 192 192 0 52.2 19.2 1.0X +cast to timestamp wholestage on 163 166 3 61.3 16.3 1.2X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz year of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -year of timestamp wholestage off 1235 1242 9 8.1 123.5 1.0X -year of timestamp wholestage on 1208 1216 8 8.3 120.8 1.0X +year of timestamp wholestage off 743 745 4 13.5 74.3 1.0X +year of timestamp wholestage on 708 715 5 14.1 70.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz quarter of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -quarter of timestamp wholestage off 1415 1424 12 7.1 141.5 1.0X -quarter of timestamp wholestage on 1338 1341 4 7.5 133.8 1.1X +quarter of timestamp wholestage off 848 857 12 11.8 84.8 1.0X +quarter of timestamp wholestage on 803 813 6 12.5 80.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz month of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -month of timestamp wholestage off 1224 1225 1 8.2 122.4 1.0X -month of timestamp wholestage on 1193 1202 8 8.4 119.3 1.0X +month of timestamp wholestage off 740 745 7 13.5 74.0 1.0X +month of timestamp wholestage on 703 710 5 14.2 70.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz weekofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekofyear of timestamp wholestage off 1864 1866 3 5.4 186.4 1.0X -weekofyear of timestamp wholestage on 1827 1840 7 5.5 182.7 1.0X +weekofyear of timestamp wholestage off 1162 1182 28 8.6 116.2 1.0X +weekofyear of timestamp wholestage on 1093 1102 9 9.2 109.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz day of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -day of timestamp wholestage off 1209 1211 2 8.3 120.9 1.0X -day of timestamp wholestage on 1191 1194 6 8.4 119.1 1.0X +day of timestamp wholestage off 742 748 9 13.5 74.2 1.0X +day of timestamp wholestage on 703 713 7 14.2 70.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz dayofyear of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofyear of timestamp wholestage off 1270 1271 2 7.9 127.0 1.0X -dayofyear of timestamp wholestage on 1241 1250 12 8.1 124.1 1.0X +dayofyear of timestamp wholestage off 791 799 11 12.6 79.1 1.0X +dayofyear of timestamp wholestage on 732 744 9 13.7 73.2 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz dayofmonth of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofmonth of timestamp wholestage off 1236 1250 20 8.1 123.6 1.0X -dayofmonth of timestamp wholestage on 1193 1195 3 8.4 119.3 1.0X +dayofmonth of timestamp wholestage off 738 752 20 13.6 73.8 1.0X +dayofmonth of timestamp wholestage on 695 712 9 14.4 69.5 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz dayofweek of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -dayofweek of timestamp wholestage off 1402 1405 4 7.1 140.2 1.0X -dayofweek of timestamp wholestage on 1352 1359 7 7.4 135.2 1.0X +dayofweek of timestamp wholestage off 854 856 3 11.7 85.4 1.0X +dayofweek of timestamp wholestage on 819 839 16 12.2 81.9 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz weekday of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -weekday of timestamp wholestage off 1346 1347 2 7.4 134.6 1.0X -weekday of timestamp wholestage on 1294 1299 7 7.7 129.4 1.0X +weekday of timestamp wholestage off 816 821 7 12.3 81.6 1.0X +weekday of timestamp wholestage on 788 800 8 12.7 78.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz hour of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -hour of timestamp wholestage off 1000 1008 11 10.0 100.0 1.0X -hour of timestamp wholestage on 936 941 6 10.7 93.6 1.1X +hour of timestamp wholestage off 595 595 1 16.8 59.5 1.0X +hour of timestamp wholestage on 533 541 10 18.8 53.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz minute of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -minute of timestamp wholestage off 969 976 10 10.3 96.9 1.0X -minute of timestamp wholestage on 933 936 4 10.7 93.3 1.0X +minute of timestamp wholestage off 585 588 4 17.1 58.5 1.0X +minute of timestamp wholestage on 532 545 11 18.8 53.2 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz second of timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -second of timestamp wholestage off 1002 1005 3 10.0 100.2 1.0X -second of timestamp wholestage on 935 938 2 10.7 93.5 1.1X +second of timestamp wholestage off 579 589 13 17.3 57.9 1.0X +second of timestamp wholestage on 529 537 6 18.9 52.9 1.1X ================================================================================================ Current date and time ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz current_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_date wholestage off 308 316 11 32.5 30.8 1.0X -current_date wholestage on 265 275 12 37.8 26.5 1.2X +current_date wholestage off 171 174 4 58.4 17.1 1.0X +current_date wholestage on 152 155 3 65.6 15.2 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz current_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -current_timestamp wholestage off 307 312 7 32.6 30.7 1.0X -current_timestamp wholestage on 263 268 5 38.1 26.3 1.2X +current_timestamp wholestage off 178 181 4 56.2 17.8 1.0X +current_timestamp wholestage on 138 149 7 72.6 13.8 1.3X ================================================================================================ Date arithmetic ================================================================================================ -OpenJDK 64-Bit Server VM 
1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz cast to date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast to date wholestage off 1061 1065 5 9.4 106.1 1.0X -cast to date wholestage on 985 991 11 10.2 98.5 1.1X +cast to date wholestage off 630 640 14 15.9 63.0 1.0X +cast to date wholestage on 591 594 5 16.9 59.1 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz last_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -last_day wholestage off 1261 1262 1 7.9 126.1 1.0X -last_day wholestage on 1223 1235 12 8.2 122.3 1.0X +last_day wholestage off 759 778 26 13.2 75.9 1.0X +last_day wholestage on 727 736 9 13.8 72.7 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz next_day: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -next_day wholestage off 1114 1119 7 9.0 111.4 1.0X -next_day wholestage on 1034 1038 3 9.7 103.4 1.1X +next_day wholestage off 649 659 15 15.4 64.9 1.0X +next_day wholestage on 628 629 1 15.9 62.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_add: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_add wholestage off 1059 1076 25 9.4 105.9 1.0X -date_add wholestage on 1012 1021 9 9.9 101.2 1.0X +date_add wholestage off 621 622 1 16.1 62.1 1.0X +date_add wholestage on 600 606 6 16.7 60.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_sub: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_sub wholestage off 1046 1046 0 9.6 104.6 1.0X -date_sub wholestage on 1019 1023 3 9.8 101.9 1.0X +date_sub wholestage off 611 626 21 16.4 61.1 1.0X +date_sub wholestage on 588 600 7 17.0 58.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz add_months: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -add_months wholestage off 1392 1393 1 7.2 139.2 1.0X -add_months wholestage on 1335 1346 14 7.5 133.5 1.0X +add_months wholestage off 843 845 2 11.9 84.3 1.0X +add_months wholestage on 818 831 11 12.2 81.8 1.0X ================================================================================================ Formatting dates ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz format date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -format date wholestage off 5959 5994 50 1.7 595.9 1.0X -format date wholestage on 5991 6008 28 1.7 599.1 1.0X +format date wholestage off 3557 3569 18 2.8 355.7 1.0X +format date wholestage on 3564 3588 17 2.8 356.4 1.0X ================================================================================================ Formatting timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz from_unixtime: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_unixtime wholestage off 8851 8872 29 1.1 885.1 1.0X -from_unixtime wholestage on 8855 8872 10 1.1 885.5 1.0X +from_unixtime wholestage off 4875 4887 17 2.1 487.5 1.0X +from_unixtime wholestage on 4845 4870 16 2.1 484.5 1.0X ================================================================================================ Convert timestamps ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz from_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -from_utc_timestamp wholestage off 1105 1107 2 9.0 110.5 1.0X -from_utc_timestamp wholestage on 1072 1084 11 9.3 107.2 1.0X +from_utc_timestamp wholestage off 665 671 8 15.0 66.5 1.0X +from_utc_timestamp wholestage on 654 672 14 15.3 65.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_utc_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_utc_timestamp wholestage off 1531 1534 3 6.5 153.1 1.0X -to_utc_timestamp wholestage on 1451 1463 14 6.9 
145.1 1.1X +to_utc_timestamp wholestage off 982 983 1 10.2 98.2 1.0X +to_utc_timestamp wholestage on 877 889 9 11.4 87.7 1.1X ================================================================================================ Intervals ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz cast interval: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -cast interval wholestage off 360 366 8 27.8 36.0 1.0X -cast interval wholestage on 286 292 7 35.0 28.6 1.3X +cast interval wholestage off 200 206 9 50.1 20.0 1.0X +cast interval wholestage on 157 163 5 63.6 15.7 1.3X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz datediff: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -datediff wholestage off 1809 1814 8 5.5 180.9 1.0X -datediff wholestage on 1742 1751 8 5.7 174.2 1.0X +datediff wholestage off 1065 1068 4 9.4 106.5 1.0X +datediff wholestage on 1028 1047 15 9.7 102.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz months_between: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -months_between wholestage off 5007 5007 1 2.0 500.7 1.0X -months_between wholestage on 4957 4980 35 2.0 495.7 1.0X +months_between wholestage off 3102 3111 13 3.2 310.2 1.0X +months_between wholestage on 2970 3028 46 3.4 297.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz window: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -window wholestage off 1945 2027 116 0.5 1945.3 1.0X -window wholestage on 45637 45648 8 0.0 45637.2 0.0X +window wholestage off 1142 1154 16 0.9 1142.2 1.0X +window wholestage on 14817 15049 257 0.1 14816.5 0.1X ================================================================================================ Truncation ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc YEAR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -date_trunc YEAR wholestage off 2463 2465 2 4.1 246.3 1.0X -date_trunc YEAR wholestage on 2406 2409 3 4.2 240.6 1.0X +date_trunc YEAR wholestage off 1516 1518 2 6.6 151.6 1.0X +date_trunc YEAR wholestage on 1458 1468 9 6.9 145.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc YYYY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YYYY wholestage off 2462 2463 1 4.1 246.2 1.0X -date_trunc YYYY wholestage on 2407 2411 6 4.2 240.7 1.0X +date_trunc YYYY wholestage off 1535 1535 1 6.5 153.5 1.0X +date_trunc YYYY wholestage on 1453 1461 7 6.9 145.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc YY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc YY wholestage off 2462 2466 6 4.1 246.2 1.0X -date_trunc YY wholestage on 2401 2406 4 4.2 240.1 1.0X +date_trunc YY wholestage off 1561 1567 9 6.4 156.1 1.0X +date_trunc YY wholestage on 1452 1467 16 6.9 145.2 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MON: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MON wholestage off 2437 2437 0 4.1 243.7 1.0X -date_trunc MON wholestage on 2416 2421 6 4.1 241.6 1.0X +date_trunc MON wholestage off 1522 1531 13 6.6 152.2 1.0X +date_trunc MON wholestage on 1458 1467 7 6.9 145.8 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MONTH: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MONTH wholestage off 2430 2437 9 4.1 243.0 1.0X -date_trunc MONTH wholestage on 2417 2423 5 4.1 241.7 1.0X +date_trunc MONTH wholestage off 1518 1519 0 6.6 151.8 1.0X +date_trunc MONTH wholestage on 1452 1465 16 6.9 145.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MM: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MM wholestage off 2429 2431 3 4.1 242.9 
1.0X -date_trunc MM wholestage on 2417 2421 4 4.1 241.7 1.0X +date_trunc MM wholestage off 1531 1532 1 6.5 153.1 1.0X +date_trunc MM wholestage on 1453 1463 8 6.9 145.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc DAY: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DAY wholestage off 2074 2075 2 4.8 207.4 1.0X -date_trunc DAY wholestage on 2001 2010 16 5.0 200.1 1.0X +date_trunc DAY wholestage off 1287 1309 31 7.8 128.7 1.0X +date_trunc DAY wholestage on 1310 1337 16 7.6 131.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc DD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc DD wholestage off 2067 2067 0 4.8 206.7 1.0X -date_trunc DD wholestage on 2000 2003 3 5.0 200.0 1.0X +date_trunc DD wholestage off 1322 1328 9 7.6 132.2 1.0X +date_trunc DD wholestage on 1282 1324 28 7.8 128.2 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc HOUR: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc HOUR wholestage off 2074 2084 14 4.8 207.4 1.0X -date_trunc HOUR wholestage on 2057 2067 10 4.9 205.7 1.0X +date_trunc HOUR wholestage off 1379 1393 20 7.3 137.9 1.0X +date_trunc HOUR wholestage on 1288 1302 11 7.8 128.8 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc MINUTE: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc MINUTE wholestage off 362 364 3 27.6 36.2 1.0X -date_trunc MINUTE wholestage on 319 333 14 31.3 31.9 1.1X +date_trunc MINUTE wholestage off 243 245 2 41.2 24.3 1.0X +date_trunc MINUTE wholestage on 213 219 8 47.0 21.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc SECOND: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc SECOND wholestage off 361 366 7 27.7 36.1 1.0X -date_trunc SECOND wholestage on 324 341 23 30.9 32.4 1.1X +date_trunc SECOND wholestage off 238 245 11 42.1 23.8 1.0X +date_trunc SECOND wholestage on 201 210 9 49.7 
20.1 1.2X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc WEEK: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc WEEK wholestage off 2385 2393 11 4.2 238.5 1.0X -date_trunc WEEK wholestage on 2313 2322 6 4.3 231.3 1.0X +date_trunc WEEK wholestage off 1443 1477 49 6.9 144.3 1.0X +date_trunc WEEK wholestage on 1491 1516 17 6.7 149.1 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz date_trunc QUARTER: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -date_trunc QUARTER wholestage off 3278 3280 2 3.1 327.8 1.0X -date_trunc QUARTER wholestage on 3228 3234 8 3.1 322.8 1.0X +date_trunc QUARTER wholestage off 2017 2039 32 5.0 201.7 1.0X +date_trunc QUARTER wholestage on 1966 2005 36 5.1 196.6 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc year: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc year wholestage off 328 331 4 30.5 32.8 1.0X -trunc year wholestage on 286 295 9 35.0 28.6 1.1X +trunc year wholestage off 206 206 1 48.6 20.6 1.0X +trunc year wholestage on 175 178 2 57.2 17.5 1.2X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc yyyy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yyyy wholestage off 317 319 3 31.5 31.7 1.0X -trunc yyyy wholestage on 283 287 6 35.3 28.3 1.1X +trunc yyyy wholestage off 188 189 2 53.2 18.8 1.0X +trunc yyyy wholestage on 176 180 4 56.9 17.6 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc yy: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc yy wholestage off 321 321 0 31.1 32.1 1.0X -trunc yy wholestage on 284 293 11 35.2 28.4 1.1X +trunc yy wholestage off 191 191 0 52.4 19.1 1.0X +trunc yy wholestage on 175 180 4 57.0 17.5 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz 
trunc mon: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mon wholestage off 318 319 1 31.4 31.8 1.0X -trunc mon wholestage on 283 287 4 35.4 28.3 1.1X +trunc mon wholestage off 203 205 3 49.3 20.3 1.0X +trunc mon wholestage on 183 186 2 54.8 18.3 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc month: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc month wholestage off 319 321 3 31.3 31.9 1.0X -trunc month wholestage on 286 293 7 35.0 28.6 1.1X +trunc month wholestage off 199 199 0 50.3 19.9 1.0X +trunc month wholestage on 177 179 2 56.4 17.7 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz trunc mm: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -trunc mm wholestage off 317 319 2 31.5 31.7 1.0X -trunc mm wholestage on 282 285 3 35.4 28.2 1.1X +trunc mm wholestage off 198 198 1 50.5 19.8 1.0X +trunc mm wholestage on 180 183 3 55.7 18.0 1.1X ================================================================================================ Parsing ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to timestamp str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to timestamp str wholestage off 219 220 0 4.6 219.4 1.0X -to timestamp str wholestage on 214 218 6 4.7 214.1 1.0X +to timestamp str wholestage off 138 139 2 7.2 138.2 1.0X +to timestamp str wholestage on 129 138 7 7.8 128.9 1.1X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_timestamp wholestage off 1912 1913 2 0.5 1912.0 1.0X -to_timestamp wholestage on 1671 1675 7 0.6 1670.8 1.1X +to_timestamp wholestage off 885 889 5 1.1 885.3 1.0X +to_timestamp wholestage on 854 866 10 1.2 854.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_unix_timestamp: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -to_unix_timestamp wholestage off 1761 1763 3 0.6 1761.1 1.0X -to_unix_timestamp wholestage on 1695 1697 2 0.6 1695.4 1.0X +to_unix_timestamp wholestage off 848 856 12 1.2 848.1 1.0X +to_unix_timestamp wholestage on 826 850 18 1.2 826.4 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to date str: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to date str wholestage off 267 272 7 3.7 266.9 1.0X -to date str wholestage on 266 267 2 3.8 265.8 1.0X +to date str wholestage off 167 171 5 6.0 167.2 1.0X +to date str wholestage on 165 173 4 6.1 165.0 1.0X -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz to_date: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -to_date wholestage off 3705 3743 55 0.3 3704.6 1.0X -to_date wholestage on 3736 3746 11 0.3 3736.4 1.0X +to_date wholestage off 1612 1633 31 0.6 1611.7 1.0X +to_date wholestage on 1588 1605 19 0.6 1588.2 1.0X ================================================================================================ Conversion from/to external types ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_251-b08 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz To/from Java's date-time: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -From java.sql.Date 400 406 6 12.5 80.1 1.0X -From java.time.LocalDate 343 349 7 14.6 68.6 1.2X -Collect java.sql.Date 1904 2739 1170 2.6 380.9 0.2X -Collect java.time.LocalDate 1477 1495 19 3.4 295.3 0.3X -From java.sql.Timestamp 376 388 10 13.3 75.2 1.1X -From java.time.Instant 237 239 3 21.1 47.4 1.7X -Collect longs 1258 1356 111 4.0 251.7 0.3X -Collect java.sql.Timestamp 1878 1937 64 2.7 375.6 0.2X -Collect java.time.Instant 1667 1904 238 3.0 333.4 0.2X +From java.sql.Date 245 247 1 20.4 49.0 1.0X +From java.time.LocalDate 228 233 4 21.9 45.6 1.1X +Collect java.sql.Date 1239 1361 209 4.0 247.9 0.2X +Collect java.time.LocalDate 1049 1107 54 4.8 209.8 0.2X +From java.sql.Timestamp 247 252 4 20.2 49.5 1.0X +From java.time.Instant 156 158 3 32.1 31.2 1.6X +Collect longs 854 910 59 5.9 170.8 0.3X +Collect java.sql.Timestamp 1133 1140 12 4.4 226.6 0.2X +Collect java.time.Instant 1108 1159 74 4.5 221.7 0.2X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala index f56efa3bba600..0034819b58893 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala @@ -61,7 +61,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> LA.getId) { val N = 10000000 runBenchmark("datetime +/- interval") { - val benchmark = new Benchmark("datetime +/- interval", 3, output = output) + val benchmark = new Benchmark("datetime +/- interval", N, output = output) val ts = "cast(id as timestamp)" val dt = s"cast($ts as date)" benchmark.addCase("date + interval(m)") { _ => From 760decec89c9dc42eb0d1a62c45ecbdf9a69a80d Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 4 May 2020 11:23:43 -0700 Subject: [PATCH 19/19] Revert "[SPARK-31571][R] Overhaul stop/message/warning calls to be more canonical" This reverts commit 0c8146a062e14042a8f865282636d157c0e4e029. --- R/pkg/R/DataFrame.R | 40 +++++++++-------- R/pkg/R/RDD.R | 2 +- R/pkg/R/SQLContext.R | 17 ++++---- R/pkg/R/client.R | 7 +-- R/pkg/R/context.R | 8 ++-- R/pkg/R/deserialize.R | 2 +- R/pkg/R/group.R | 4 +- R/pkg/R/install.R | 62 ++++++++++++++++----------- R/pkg/R/mllib_classification.R | 4 +- R/pkg/R/mllib_stat.R | 3 +- R/pkg/R/pairRDD.R | 2 +- R/pkg/R/schema.R | 2 +- R/pkg/R/serialize.R | 4 +- R/pkg/R/sparkR.R | 9 ++-- R/pkg/R/utils.R | 53 ++++++++++++----------- R/pkg/tests/fulltests/test_sparkSQL.R | 2 +- 16 files changed, 118 insertions(+), 103 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 15b3ce2935427..09e831814b893 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -431,7 +431,7 @@ setMethod("coltypes", if (is.null(type)) { specialtype <- specialtypeshandle(x) if (is.null(specialtype)) { - stop("Unsupported data type: ", x) + stop(paste("Unsupported data type: ", x)) } type <- PRIMITIVE_TYPES[[specialtype]] } @@ -829,8 +829,8 @@ setMethod("repartitionByRange", jcol <- lapply(cols, function(c) { c@jc }) sdf <- callJMethod(x@sdf, "repartitionByRange", numToInt(numPartitions), jcol) } else { - stop("numPartitions and col must be numeric and Column; however, got ", - class(numPartitions), " and ", class(col)) + stop(paste("numPartitions and col must be numeric and Column; however, got", + class(numPartitions), "and", class(col))) } } else if (!is.null(col)) { # only columns are specified @@ -839,7 +839,7 @@ jcol <- lapply(cols, function(c) { c@jc }) sdf <- callJMethod(x@sdf, "repartitionByRange", jcol) } else { - stop("col must be Column; however, got ", class(col)) + stop(paste("col must be Column; however, got", class(col))) } } else if (!is.null(numPartitions)) { # only numPartitions is specified @@ -1068,10 +1068,10 @@ setMethod("sample", signature(x = "SparkDataFrame"), function(x, withReplacement = FALSE, fraction, seed) { if (!is.numeric(fraction)) { - stop("fraction must be numeric; however, got ", class(fraction)) + stop(paste("fraction must be numeric; however, got", class(fraction))) } if (!is.logical(withReplacement)) { - stop("withReplacement must be logical; however, got ", class(withReplacement)) + stop(paste("withReplacement must be logical; however, got", class(withReplacement))) } if (!missing(seed)) { @@ -1211,10 +1211,11 @@ setMethod("collect", checkSchemaInArrow(schema(x)) TRUE }, error = function(e) { - warning("The conversion from Spark DataFrame to R DataFrame was attempted ", - "with Arrow optimization because ", - "'spark.sql.execution.arrow.sparkr.enabled' is set to true; 
", - "however, failed, attempting non-optimization. Reason: ", e) + warning(paste0("The conversion from Spark DataFrame to R DataFrame was attempted ", + "with Arrow optimization because ", + "'spark.sql.execution.arrow.sparkr.enabled' is set to true; ", + "however, failed, attempting non-optimization. Reason: ", + e)) FALSE }) } @@ -1507,8 +1508,8 @@ dapplyInternal <- function(x, func, schema) { if (inherits(schema, "structType")) { checkSchemaInArrow(schema) } else if (is.null(schema)) { - stop("Arrow optimization does not support 'dapplyCollect' yet. Please disable ", - "Arrow optimization or use 'collect' and 'dapply' APIs instead.") + stop(paste0("Arrow optimization does not support 'dapplyCollect' yet. Please disable ", + "Arrow optimization or use 'collect' and 'dapply' APIs instead.")) } else { stop("'schema' should be DDL-formatted string or structType.") } @@ -2011,8 +2012,8 @@ setMethod("[", signature(x = "SparkDataFrame"), x } else { if (class(i) != "Column") { - stop("Expressions other than filtering predicates are not supported ", - "in the first parameter of extract operator [ or subset() method.") + stop(paste0("Expressions other than filtering predicates are not supported ", + "in the first parameter of extract operator [ or subset() method.")) } filter(x, i) } @@ -2603,17 +2604,18 @@ setMethod("join", if (is.null(joinType)) { sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc) } else { - validJoinTypes <- c("inner", "cross", + if (joinType %in% c("inner", "cross", "outer", "full", "fullouter", "full_outer", "left", "leftouter", "left_outer", "right", "rightouter", "right_outer", - "semi", "leftsemi", "left_semi", "anti", "leftanti", "left_anti") - if (joinType %in% validJoinTypes) { + "semi", "left_semi", "leftsemi", "anti", "left_anti", "leftanti")) { joinType <- gsub("_", "", joinType, fixed = TRUE) sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType) } else { - stop("joinType must be one of the following types: ", - "'", paste(validJoinTypes, collapse = "', '"), "'") + stop(paste("joinType must be one of the following types:", + "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',", + "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", + "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")) } } } diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 7a1d157bb8a36..7ee725d90d550 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -947,7 +947,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical", MAXINT <- .Machine$integer.max if (num < 0) - stop("Negative number of elements requested") + stop(paste("Negative number of elements requested")) if (initialCount > MAXINT - 1) { maxSelected <- MAXINT - 1 diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index c0ac68332ec41..1ef2641742704 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -34,7 +34,7 @@ getInternalType <- function(x) { Date = "date", POSIXlt = "timestamp", POSIXct = "timestamp", - stop("Unsupported type for SparkDataFrame: ", class(x))) + stop(paste("Unsupported type for SparkDataFrame:", class(x)))) } #' return the SparkSession @@ -112,9 +112,9 @@ sparkR.conf <- function(key, defaultValue) { error = function(e) { estr <- as.character(e) if (any(grepl("java.util.NoSuchElementException", estr, fixed = TRUE))) { - stop("Config '", key, "' is not set") + stop(paste0("Config '", key, "' is not set")) } else { - stop("Unknown error: ", estr) + stop(paste0("Unknown error: ", estr)) } }) } else { @@ -208,7 +208,7 @@ 
getSchema <- function(schema, firstRow = NULL, rdd = NULL) { names <- lapply(names, function(n) { nn <- gsub(".", "_", n, fixed = TRUE) if (nn != n) { - warning("Use ", nn, " instead of ", n, " as column name") + warning(paste("Use", nn, "instead of", n, "as column name")) } nn }) @@ -290,9 +290,10 @@ createDataFrame <- function(data, schema = NULL, samplingRatio = 1.0, TRUE }, error = function(e) { - warning("createDataFrame attempted Arrow optimization because ", - "'spark.sql.execution.arrow.sparkr.enabled' is set to true; however, ", - "failed, attempting non-optimization. Reason: ", e) + warning(paste0("createDataFrame attempted Arrow optimization because ", + "'spark.sql.execution.arrow.sparkr.enabled' is set to true; however, ", + "failed, attempting non-optimization. Reason: ", + e)) FALSE }) } @@ -325,7 +326,7 @@ createDataFrame <- function(data, schema = NULL, samplingRatio = 1.0, } else if (inherits(data, "RDD")) { rdd <- data } else { - stop("unexpected type: ", class(data)) + stop(paste("unexpected type:", class(data))) } schema <- getSchema(schema, firstRow, rdd) diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R index 797a5c7da1549..872b21443eaad 100644 --- a/R/pkg/R/client.R +++ b/R/pkg/R/client.R @@ -102,9 +102,10 @@ checkJavaVersion <- function() { javaVersionNum <- as.integer(versions[1]) } if (javaVersionNum < minJavaVersion || javaVersionNum >= maxJavaVersion) { - stop("Java version, greater than or equal to ", minJavaVersion, - " and less than ", maxJavaVersion, ", is required for this ", - "package; found version: ", javaVersionStr) + stop(paste0("Java version, greater than or equal to ", minJavaVersion, + " and less than ", maxJavaVersion, + ", is required for this package; found version: ", + javaVersionStr)) } return(javaVersionNum) } diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index e3c9d9f8793d6..d96a287f818a2 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -144,13 +144,13 @@ parallelize <- function(sc, coll, numSlices = 1) { if ((!is.list(coll) && !is.vector(coll)) || is.data.frame(coll)) { # nolint end if (is.data.frame(coll)) { - message("context.R: A data frame is parallelized by columns.") + message(paste("context.R: A data frame is parallelized by columns.")) } else { if (is.matrix(coll)) { - message("context.R: A matrix is parallelized by elements.") + message(paste("context.R: A matrix is parallelized by elements.")) } else { - message("context.R: parallelize() currently only supports lists and vectors. ", - "Calling as.list() to coerce coll into a list.") + message(paste("context.R: parallelize() currently only supports lists and vectors.", + "Calling as.list() to coerce coll into a list.")) } } coll <- as.list(coll) diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 3e7c456bd548d..ca4a6e342d772 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -57,7 +57,7 @@ readTypedObject <- function(con, type) { "s" = readStruct(con), "n" = NULL, "j" = getJobj(readString(con)), - stop("Unsupported type for deserialization ", type)) + stop(paste("Unsupported type for deserialization", type))) } readStringData <- function(con, len) { diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 99d62240a3b2a..2b7995e1e37f6 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -234,8 +234,8 @@ gapplyInternal <- function(x, func, schema) { if (inherits(schema, "structType")) { checkSchemaInArrow(schema) } else if (is.null(schema)) { - stop("Arrow optimization does not support 'gapplyCollect' yet. 
Please disable ", - "Arrow optimization or use 'collect' and 'gapply' APIs instead.") + stop(paste0("Arrow optimization does not support 'gapplyCollect' yet. Please disable ", + "Arrow optimization or use 'collect' and 'gapply' APIs instead.")) } else { stop("'schema' should be DDL-formatted string or structType.") } diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index ea2c0b4c0f42f..8c5355a8324f9 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -89,8 +89,8 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, } if (overwrite) { - message("Overwrite = TRUE: download and overwrite the tar file", - "and Spark package directory if they exist.") + message(paste0("Overwrite = TRUE: download and overwrite the tar file", + "and Spark package directory if they exist.")) } releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL") @@ -103,11 +103,12 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, # can use dir.exists(packageLocalDir) under R 3.2.0 or later if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) { if (releaseUrl != "") { - message(packageName, " found, setting SPARK_HOME to ", packageLocalDir) + message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir)) } else { - message(version, " for Hadoop ", - if (hadoopVersion == "without") "Free build" else hadoopVersion, - " found, setting SPARK_HOME to ", packageLocalDir) + fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s" + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), + packageLocalDir) + message(msg) } Sys.setenv(SPARK_HOME = packageLocalDir) return(invisible(packageLocalDir)) @@ -126,23 +127,26 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, success <- downloadUrl(releaseUrl, packageLocalPath) if (!success) { unlink(packageLocalPath) - stop("Fetch failed from ", releaseUrl) + stop(paste0("Fetch failed from ", releaseUrl)) } } else { robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) } } - message("Installing to ", localDir) + message(sprintf("Installing to %s", localDir)) # There are two ways untar can fail - untar could stop() on errors like incomplete block on file # or, tar command can return failure code success <- tryCatch(untar(tarfile = packageLocalPath, exdir = localDir) == 0, error = function(e) { - message(e, "\n") + message(e) + message() FALSE }, warning = function(w) { - message(w, "\n") + # Treat warning as error, add an empty line with message() + message(w) + message() FALSE }) if (!tarExists || overwrite || !success) { @@ -156,7 +160,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, if (!success) stop("Extract archive failed.") message("DONE.") Sys.setenv(SPARK_HOME = packageLocalDir) - message("SPARK_HOME set to ", packageLocalDir) + message(paste("SPARK_HOME set to", packageLocalDir)) invisible(packageLocalDir) } @@ -169,7 +173,7 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa if (success) { return() } else { - message("Unable to download from mirrorUrl: ", mirrorUrl) + message(paste0("Unable to download from mirrorUrl: ", mirrorUrl)) } } else { message("MirrorUrl not provided.") @@ -197,9 +201,11 @@ robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, pa # remove any partially downloaded file unlink(packageLocalPath) message("Unable to download from default mirror site: ", mirrorUrl) - stop("Unable to download Spark ", version, - " for Hadoop ", if 
(hadoopVersion == "without") "Free build" else hadoopVersion, - ". Please check network connection, Hadoop version, or provide other mirror sites.") + msg <- sprintf(paste("Unable to download Spark %s for Hadoop %s.", + "Please check network connection, Hadoop version,", + "or provide other mirror sites."), + version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion)) + stop(msg) } } @@ -216,7 +222,7 @@ getPreferredMirror <- function(version, packageName) { endPos <- matchInfo + attr(matchInfo, "match.length") - 2 mirrorPreferred <- base::substr(linePreferred, startPos, endPos) mirrorPreferred <- paste0(mirrorPreferred, "spark") - message("Preferred mirror site found: ", mirrorPreferred) + message(sprintf("Preferred mirror site found: %s", mirrorPreferred)) } else { mirrorPreferred <- NULL } @@ -225,20 +231,24 @@ getPreferredMirror <- function(version, packageName) { directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) { packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz") - message("Downloading ", version, " for Hadoop ", - if (hadoopVersion == "without") "Free build" else hadoopVersion, - " from:\n- ", packageRemotePath) + fmt <- "Downloading %s for Hadoop %s from:\n- %s" + msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion), + packageRemotePath) + message(msg) downloadUrl(packageRemotePath, packageLocalPath) } downloadUrl <- function(remotePath, localPath) { isFail <- tryCatch(download.file(remotePath, localPath), error = function(e) { - message(e, "\n") + message(e) + message() TRUE }, warning = function(w) { - message(w, "\n") + # Treat warning as error, add an empty line with message() + message(w) + message() TRUE }) !isFail @@ -269,9 +279,9 @@ sparkCachePath <- function() { winAppPath <- Sys.getenv("USERPROFILE", unset = NA) } if (is.na(winAppPath)) { - stop("%LOCALAPPDATA% and %USERPROFILE% not found. 
", - "Please define the environment variable ", - "or restart and enter an installation path in localDir.") + stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.", + "Please define the environment variable", + "or restart and enter an installation path in localDir.")) } else { path <- file.path(winAppPath, "Apache", "Spark", "Cache") } @@ -283,7 +293,7 @@ sparkCachePath <- function() { Sys.getenv("XDG_CACHE_HOME", file.path(Sys.getenv("HOME"), ".cache")), "spark") } } else { - stop("Unknown OS: ", .Platform$OS.type) + stop(sprintf("Unknown OS: %s", .Platform$OS.type)) } normalizePath(path, mustWork = FALSE) } @@ -312,7 +322,7 @@ installInstruction <- function(mode) { "If you need further help, ", "contact the administrators of the cluster.") } else { - stop("No instruction found for mode ", mode) + stop(paste0("No instruction found for ", mode, " mode.")) } } diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index ec83b6bd406a7..5cc97ea723afc 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -337,8 +337,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients) || col != ncol(upperBoundsOnCoefficients))) { - stop("dimension of upperBoundsOnCoefficients ", - "is not the same as lowerBoundsOnCoefficients") + stop(paste0("dimension of upperBoundsOnCoefficients ", + "is not the same as lowerBoundsOnCoefficients")) } if (is.null(lowerBoundsOnCoefficients)) { diff --git a/R/pkg/R/mllib_stat.R b/R/pkg/R/mllib_stat.R index 6db4d5d4831dd..f8c3329359961 100644 --- a/R/pkg/R/mllib_stat.R +++ b/R/pkg/R/mllib_stat.R @@ -69,7 +69,8 @@ setMethod("spark.kstest", signature(data = "SparkDataFrame"), function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) { tryCatch(match.arg(nullHypothesis), error = function(e) { - stop("Distribution ", nullHypothesis, " is not supported.") + msg <- paste("Distribution", nullHypothesis, "is not supported.") + stop(msg) }) if (nullHypothesis == "norm") { distParams <- as.numeric(distParams) diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index b29381bb900fb..9c2e57d3067db 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -906,7 +906,7 @@ setMethod("sampleByKey", for (elem in fractions) { if (elem < 0.0) { - stop("Negative fraction value ", fractions[which(fractions == elem)]) + stop(paste("Negative fraction value ", fractions[which(fractions == elem)])) } } diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 7044ede0cc58b..89d5c2cd1a5e2 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -199,7 +199,7 @@ checkType <- function(type) { }) } - stop("Unsupported type for SparkDataframe: ", type) + stop(paste("Unsupported type for SparkDataframe:", type)) } #' @param type The data type of the field diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 7760d9be16f0b..cb3c1c59d12ed 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -84,7 +84,7 @@ writeObject <- function(con, object, writeType = TRUE) { Date = writeDate(con, object), POSIXlt = writeTime(con, object), POSIXct = writeTime(con, object), - stop("Unsupported type for serialization ", type)) + stop(paste("Unsupported type for serialization", type))) } writeVoid <- function(con) { @@ -158,7 +158,7 @@ writeType <- function(con, class) { Date = "D", POSIXlt = "t", POSIXct = "t", - stop("Unsupported type for serialization ", class)) + stop(paste("Unsupported type for serialization", class))) 
writeBin(charToRaw(type), con) } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index e4a11a5f78a71..9ba36ad46740a 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -154,8 +154,8 @@ sparkR.sparkContext <- function( connectionTimeout <- as.numeric(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000")) if (existingPort != "") { if (length(packages) != 0) { - warning("sparkPackages has no effect when using spark-submit or sparkR shell, ", - "please use the --packages commandline instead") + warning(paste("sparkPackages has no effect when using spark-submit or sparkR shell", + " please use the --packages commandline instead", sep = ",")) } backendPort <- existingPort authSecret <- Sys.getenv("SPARKR_BACKEND_AUTH_SECRET") @@ -439,9 +439,8 @@ sparkR.session <- function( rPackageVersion <- paste0(packageVersion("SparkR")) if (jvmVersionStrip != rPackageVersion) { - warning("Version mismatch between Spark JVM and SparkR package. ", - "JVM version was ", jvmVersion, - ", while R package version was ", rPackageVersion) + warning(paste("Version mismatch between Spark JVM and SparkR package. JVM version was", + jvmVersion, ", while R package version was", rPackageVersion)) } sparkSession diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 65db9c21d9dbb..c60e4db1496d0 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -46,9 +46,9 @@ convertJListToRList <- function(jList, flatten, logicalUpperBound = NULL, res <- list(unserialize(keyBytes), unserialize(valBytes)) } else { - stop("utils.R: convertJListToRList only supports ", - "RDD[Array[Byte]] and ", - "JavaPairRDD[Array[Byte], Array[Byte]] for now") + stop(paste("utils.R: convertJListToRList only supports", + "RDD[Array[Byte]] and", + "JavaPairRDD[Array[Byte], Array[Byte]] for now")) } } else { if (inherits(obj, "raw")) { @@ -354,8 +354,8 @@ varargsToStrEnv <- function(...) { } else { value <- pairs[[name]] if (!(is.logical(value) || is.numeric(value) || is.character(value) || is.null(value))) { - stop("Unsupported type for ", name, " : ", toString(class(value)), ". ", - "Supported types are logical, numeric, character and NULL.", call. = FALSE) + stop(paste0("Unsupported type for ", name, " : ", class(value), + ". Supported types are logical, numeric, character and NULL."), call. = FALSE) } if (is.logical(value)) { env[[name]] <- tolower(as.character(value)) @@ -369,7 +369,8 @@ varargsToStrEnv <- function(...) { } if (length(ignoredNames) != 0) { - warning("Unnamed arguments ignored: ", toString(ignoredNames), ".", call. = FALSE) + warning(paste0("Unnamed arguments ignored: ", paste(ignoredNames, collapse = ", "), "."), + call. = FALSE) } env } @@ -448,7 +449,7 @@ storageLevelToString <- function(levelObj) { # the user to type (for example) `5` instead of `5L` to avoid a confusing error message. numToInt <- function(num) { if (as.integer(num) != num) { - warning("Coercing ", as.list(sys.call())[[2L]], " to integer.") + warning(paste("Coercing", as.list(sys.call())[[2]], "to integer.")) } as.integer(num) } @@ -649,8 +650,8 @@ mergePartitions <- function(rdd, zip) { # For zip operation, check if corresponding partitions # of both RDDs have the same number of elements. if (zip && lengthOfKeys != lengthOfValues) { - stop("Can only zip RDDs with same number of elements ", - "in each pair of corresponding partitions.") + stop(paste("Can only zip RDDs with same number of elements", + "in each pair of corresponding partitions.")) } if (lengthOfKeys > 1) { @@ -803,7 +804,7 @@ handledCallJMethod <- function(obj, method, ...) 
{ captureJVMException <- function(e, method) { rawmsg <- as.character(e) - if (any(grepl("^Error in .*?: ", rawmsg))) { + if (any(grep("^Error in .*?: ", rawmsg))) { # If the exception message starts with "Error in ...", this is possibly # "Error in invokeJava(...)". Here, it replaces the characters to # `paste("Error in", method, ":")` in order to identify which function @@ -817,58 +818,58 @@ captureJVMException <- function(e, method) { } # StreamingQueryException could wrap an IllegalArgumentException, so look for that first - if (any(grepl("org.apache.spark.sql.streaming.StreamingQueryException: ", - stacktrace, fixed = TRUE))) { + if (any(grep("org.apache.spark.sql.streaming.StreamingQueryException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.streaming.StreamingQueryException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(rmsg, "streaming query error - ", first, call. = FALSE) - } else if (any(grepl("java.lang.IllegalArgumentException: ", stacktrace, fixed = TRUE))) { + stop(paste0(rmsg, "streaming query error - ", first), call. = FALSE) + } else if (any(grep("java.lang.IllegalArgumentException: ", stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "java.lang.IllegalArgumentException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(rmsg, "illegal argument - ", first, call. = FALSE) - } else if (any(grepl("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))) { + stop(paste0(rmsg, "illegal argument - ", first), call. = FALSE) + } else if (any(grep("org.apache.spark.sql.AnalysisException: ", stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.AnalysisException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(rmsg, "analysis error - ", first, call. = FALSE) + stop(paste0(rmsg, "analysis error - ", first), call. = FALSE) } else - if (any(grepl("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", - stacktrace, fixed = TRUE))) { + if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(rmsg, "no such database - ", first, call. = FALSE) + stop(paste0(rmsg, "no such database - ", first), call. = FALSE) } else - if (any(grepl("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", - stacktrace, fixed = TRUE))) { + if (any(grep("org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.analysis.NoSuchTableException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(rmsg, "no such table - ", first, call. = FALSE) - } else if (any(grepl("org.apache.spark.sql.catalyst.parser.ParseException: ", - stacktrace, fixed = TRUE))) { + stop(paste0(rmsg, "no such table - ", first), call. 
= FALSE) + } else if (any(grep("org.apache.spark.sql.catalyst.parser.ParseException: ", + stacktrace, fixed = TRUE))) { msg <- strsplit(stacktrace, "org.apache.spark.sql.catalyst.parser.ParseException: ", fixed = TRUE)[[1]] # Extract "Error in ..." message. rmsg <- msg[1] # Extract the first message of JVM exception. first <- strsplit(msg[2], "\r?\n\tat")[[1]][1] - stop(rmsg, "parse error - ", first, call. = FALSE) + stop(paste0(rmsg, "parse error - ", first), call. = FALSE) } else { stop(stacktrace, call. = FALSE) } diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 611d9057c0f13..b7172b2ae0774 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2558,7 +2558,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { error_msg <- paste("joinType must be one of the following types:", "'inner', 'cross', 'outer', 'full', 'fullouter', 'full_outer',", "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", - "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti', 'left_anti'") + "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.") expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg) merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
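For reference, a minimal R sketch (not part of the patch; the value `x` is hypothetical) of the two message-building styles the revert above swaps. The reverted canonical style passes message parts directly to stop()/warning()/message(), which concatenate them with no separator; the restored style pre-builds the string with paste()/paste0(), where paste()'s default separator inserts an extra space after a part that already ends in one.

x <- Sys.time()
# Canonical style (removed by this revert): stop() concatenates its arguments itself.
# Yields "Unsupported data type: POSIXct"
tryCatch(stop("Unsupported data type: ", class(x)[1]), error = conditionMessage)
# Restored style: build the message first; note the double space from paste()'s sep = " ".
# Yields "Unsupported data type:  POSIXct"
tryCatch(stop(paste("Unsupported data type: ", class(x)[1])), error = conditionMessage)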