diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 36e0b99a07ffd..e469b44c94db8 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -15,6 +15,8 @@ url: sql-getting-started.html#creating-datasets - text: Interoperating with RDDs url: sql-getting-started.html#interoperating-with-rdds + - text: Scalar Functions + url: sql-getting-started.html#scalar-functions - text: Aggregations url: sql-getting-started.html#aggregations - text: Data Sources @@ -34,6 +36,8 @@ url: sql-data-sources-jdbc.html - text: Avro Files url: sql-data-sources-avro.html + - text: Whole Binary Files + url: sql-data-sources-binaryFile.html - text: Troubleshooting url: sql-data-sources-troubleshooting.html - text: Performance Tuning @@ -43,8 +47,8 @@ url: sql-performance-tuning.html#caching-data-in-memory - text: Other Configuration Options url: sql-performance-tuning.html#other-configuration-options - - text: Broadcast Hint for SQL Queries - url: sql-performance-tuning.html#broadcast-hint-for-sql-queries + - text: Join Strategy Hints for SQL Queries + url: sql-performance-tuning.html#join-strategy-hints-for-sql-queries - text: Distributed SQL Engine url: sql-distributed-sql-engine.html subitems: diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 9bcd36ce41271..661c693db708e 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -519,7 +519,7 @@ license: | Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. 
For details, see the section [Broadcast Hint](sql-performance-tuning.html#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Join Strategy Hints for SQL Queries](sql-performance-tuning.html#join-strategy-hints-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. diff --git a/docs/sql-performance-tuning.md b/docs/sql-performance-tuning.md index 2a1edda84252c..e289854c7acc7 100644 --- a/docs/sql-performance-tuning.md +++ b/docs/sql-performance-tuning.md @@ -129,8 +129,7 @@ a specific strategy may not support all join types.
{% highlight scala %} -import org.apache.spark.sql.functions.broadcast -broadcast(spark.table("src")).join(spark.table("records"), "key").show() +spark.table("src").join(spark.table("records").hint("broadcast"), "key").show() {% endhighlight %}
@@ -138,8 +137,7 @@ broadcast(spark.table("src")).join(spark.table("records"), "key").show()
{% highlight java %} -import static org.apache.spark.sql.functions.broadcast; -broadcast(spark.table("src")).join(spark.table("records"), "key").show(); +spark.table("src").join(spark.table("records").hint("broadcast"), "key").show(); {% endhighlight %}
@@ -147,8 +145,7 @@ broadcast(spark.table("src")).join(spark.table("records"), "key").show();
{% highlight python %} -from pyspark.sql.functions import broadcast -broadcast(spark.table("src")).join(spark.table("records"), "key").show() +spark.table("src").join(spark.table("records").hint("broadcast"), "key").show() {% endhighlight %}
@@ -158,7 +155,7 @@ broadcast(spark.table("src")).join(spark.table("records"), "key").show() {% highlight r %} src <- sql("SELECT * FROM src") records <- sql("SELECT * FROM records") -head(join(broadcast(src), records, src$key == records$key)) +head(join(src, hint(records, "broadcast"), src$key == records$key)) {% endhighlight %} @@ -172,3 +169,18 @@ SELECT /*+ BROADCAST(r) */ * FROM records r JOIN src s ON r.key = s.key + +## Coalesce Hints for SQL Queries + +Coalesce hints allow Spark SQL users to control the number of output files just like the +`coalesce`, `repartition` and `repartitionByRange` in the Dataset API; they can be used for performance +tuning and for reducing the number of output files. The "COALESCE" hint only has a partition number as a +parameter. The "REPARTITION" hint has a partition number, columns, or both of them as parameters. +The "REPARTITION_BY_RANGE" hint must have column names, and a partition number is optional. + + SELECT /*+ COALESCE(3) */ * FROM t + SELECT /*+ REPARTITION(3) */ * FROM t + SELECT /*+ REPARTITION(c) */ * FROM t + SELECT /*+ REPARTITION(3, c) */ * FROM t + SELECT /*+ REPARTITION_BY_RANGE(c) */ * FROM t + SELECT /*+ REPARTITION_BY_RANGE(3, c) */ * FROM t