Closed

Changes from 8 commits (35 commits total)
708e9ad
point to python 3 download
nchammas Dec 11, 2023
7d7f39b
both ruby 1 and 2 are EOL
nchammas Dec 11, 2023
be00f99
allow configs to have tags / groups for documentation
nchammas Dec 11, 2023
482f9bc
add method to export all configs with tags to python
nchammas Dec 11, 2023
6747836
add tags to a few configs
nchammas Dec 11, 2023
83b7d69
typo
nchammas Dec 11, 2023
3eff490
replace manual table with generated table
nchammas Dec 11, 2023
fb7097f
generate config table per config tag / group
nchammas Dec 11, 2023
807f526
revert heading change for now
nchammas Dec 11, 2023
84e27dc
aqe coalesce partitions -> generated table
nchammas Dec 12, 2023
5dcc747
make anchors unique even if config is repeated
nchammas Dec 12, 2023
88a200c
validate tags with a regex instead
nchammas Dec 13, 2023
c7eb234
replace remaining html tables with includes
nchammas Dec 13, 2023
9d9e79b
put generated tables under _generated/
nchammas Dec 13, 2023
dda1109
move default processing to separate function
nchammas Dec 13, 2023
b8e8f00
prevent break after `#` anchor
nchammas Dec 13, 2023
9d68b41
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 14, 2023
5adaa4f
tags should be a set, not a list
nchammas Dec 16, 2023
f32cd37
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 17, 2023
88e5ed2
remove unnecessary whitespace
nchammas Dec 17, 2023
2cf958c
remove new section on cbo
nchammas Dec 17, 2023
a6d7bab
remove cbo tags
nchammas Dec 17, 2023
9390439
tweak generated table locations and names
nchammas Dec 17, 2023
de637da
remove cbo sidebar item
nchammas Dec 17, 2023
9f0db38
automatically tag static vs. runtime sql configs
nchammas Dec 18, 2023
c96899b
restore generated-* ignore
nchammas Dec 18, 2023
9656c93
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 22, 2023
1d7db08
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 23, 2023
3815b9e
clarify some details with comments
nchammas Dec 23, 2023
fa22dd4
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 26, 2023
7a360b8
Merge branch 'master' into sql-config-groups
nchammas Jan 9, 2024
d486ee2
Merge branch 'master' into sql-config-groups
nchammas Jan 10, 2024
0e5401a
use include_api_gen + _generated_config_tables -> _generated/config_t…
nchammas Jan 10, 2024
fe7e9d7
generate parent dir too
nchammas Jan 10, 2024
2abe799
Merge branch 'master' into sql-config-groups
nchammas Jan 11, 2024

ConfigBuilder.scala

@@ -17,6 +17,7 @@

package org.apache.spark.internal.config

import java.util.Locale
import java.util.concurrent.TimeUnit
import java.util.regex.PatternSyntaxException

@@ -131,7 +132,7 @@ private[spark] class TypedConfigBuilder[T](
def createOptional: OptionalConfigEntry[T] = {
val entry = new OptionalConfigEntry[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, converter, stringConverter, parent._doc,
parent._public, parent._version)
parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_(entry))
entry
}
@@ -146,7 +147,7 @@ private[spark] class TypedConfigBuilder[T](
val transformedDefault = converter(stringConverter(default))
val entry = new ConfigEntryWithDefault[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, transformedDefault, converter,
stringConverter, parent._doc, parent._public, parent._version)
stringConverter, parent._doc, parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_ (entry))
entry
}
@@ -156,7 +157,7 @@ private[spark] class TypedConfigBuilder[T](
def createWithDefaultFunction(defaultFunc: () => T): ConfigEntry[T] = {
val entry = new ConfigEntryWithDefaultFunction[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, defaultFunc, converter, stringConverter,
parent._doc, parent._public, parent._version)
parent._doc, parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_ (entry))
entry
}
@@ -168,7 +169,7 @@ private[spark] class TypedConfigBuilder[T](
def createWithDefaultString(default: String): ConfigEntry[T] = {
val entry = new ConfigEntryWithDefaultString[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, default, converter, stringConverter,
parent._doc, parent._public, parent._version)
parent._doc, parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_(entry))
entry
}
@@ -187,6 +188,7 @@ private[spark] case class ConfigBuilder(key: String) {
private[config] var _prependedKey: Option[String] = None
private[config] var _prependSeparator: String = ""
private[config] var _public = true
private[config] var _tags = List.empty[String]
private[config] var _doc = ""
private[config] var _version = ""
private[config] var _onCreate: Option[ConfigEntry[_] => Unit] = None
@@ -197,6 +199,13 @@ private[spark] case class ConfigBuilder(key: String) {
this
}

def withTag(tag: String): ConfigBuilder = {

nchammas (Contributor Author) commented:
I am on the fence regarding this name. Maybe something more explicit like withDocumentationGroup would be better?

nchammas (Contributor Author) commented:
We could also do away with custom group names and instead select certain prefixes for use as documentation groups, like spark.sql.cbo, spark.sql.statistics, etc.

However, this won't work for groupings that don't align with config name prefixes, like the breakdown of runtime vs. static configurations that was added in #28274.

require(!tag.contains(" "))
require(tag.toLowerCase(Locale.ROOT) == tag)
_tags = _tags :+ tag
this
}
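
A hypothetical sketch of the prefix-based alternative floated in the comments above (the prefixes and helper below are assumptions, not part of this change):

```python
# Derive a documentation group from the config key's prefix instead of
# tagging it explicitly with withTag(...). Hypothetical illustration only.
from typing import Optional

DOC_GROUP_PREFIXES = ["spark.sql.cbo", "spark.sql.statistics"]  # assumed examples

def group_for(config_key: str) -> Optional[str]:
    """Return the documentation group implied by the key's prefix, if any."""
    for prefix in DOC_GROUP_PREFIXES:
        if config_key == prefix or config_key.startswith(prefix + "."):
            return prefix
    # Groupings such as runtime vs. static configs cannot be derived this way.
    return None

print(group_for("spark.sql.cbo.enabled"))      # -> spark.sql.cbo
print(group_for("spark.sql.adaptive.enabled")) # -> None
```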

def doc(s: String): ConfigBuilder = {
_doc = s
this
@@ -263,7 +272,7 @@ private[spark] case class ConfigBuilder(key: String) {

def fallbackConf[T](fallback: ConfigEntry[T]): ConfigEntry[T] = {
val entry = new FallbackConfigEntry(key, _prependedKey, _prependSeparator, _alternatives, _doc,
_public, _version, fallback)
_public, _tags, _version, fallback)
_onCreate.foreach(_(entry))
entry
}
ConfigEntry.scala

@@ -80,6 +80,7 @@ private[spark] abstract class ConfigEntry[T] (
val stringConverter: T => String,
val doc: String,
val isPublic: Boolean,
val tags: List[String],
val version: String) {

import ConfigEntry._
@@ -120,6 +121,7 @@ private class ConfigEntryWithDefault[T] (
stringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry(
key,
@@ -130,6 +132,7 @@ private class ConfigEntryWithDefault[T] (
stringConverter,
doc,
isPublic,
tags,
version
) {

@@ -152,6 +155,7 @@ private class ConfigEntryWithDefaultFunction[T] (
stringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry(
key,
@@ -162,6 +166,7 @@ private class ConfigEntryWithDefaultFunction[T] (
stringConverter,
doc,
isPublic,
tags,
version
) {

@@ -184,6 +189,7 @@ private class ConfigEntryWithDefaultString[T] (
stringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry(
key,
@@ -194,6 +200,7 @@ private class ConfigEntryWithDefaultString[T] (
stringConverter,
doc,
isPublic,
tags,
version
) {

@@ -220,6 +227,7 @@ private[spark] class OptionalConfigEntry[T](
val rawStringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry[Option[T]](
key,
@@ -230,6 +238,7 @@ private[spark] class OptionalConfigEntry[T](
v => v.map(rawStringConverter).orNull,
doc,
isPublic,
tags,
version
) {

@@ -250,6 +259,7 @@ private[spark] class FallbackConfigEntry[T] (
alternatives: List[String],
doc: String,
isPublic: Boolean,
tags: List[String],
version: String,
val fallback: ConfigEntry[T])
extends ConfigEntry[T](
@@ -261,6 +271,7 @@ private[spark] class FallbackConfigEntry[T] (
fallback.stringConverter,
doc,
isPublic,
tags,
version
) {

4 changes: 1 addition & 3 deletions docs/README.md

@@ -32,7 +32,7 @@ The Spark documentation build uses a number of tools to build HTML docs and API
Python, R and SQL.

You need to have [Ruby](https://www.ruby-lang.org/en/documentation/installation/) and
[Python](https://docs.python.org/2/using/unix.html#getting-and-installing-the-latest-version-of-python)
[Python](https://www.python.org/downloads/)
installed. Make sure the `bundle` command is available, if not install the Gem containing it:

```sh
@@ -46,8 +46,6 @@ $ cd docs
$ bundle install
```

Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0.

### SQL and Python API Documentation (Optional)

To generate SQL and Python API docs, you'll need to install these libraries:
2 changes: 1 addition & 1 deletion docs/configuration.md

@@ -3675,7 +3675,7 @@ deep learning and signal processing. Spark now supports requesting and schedulin

There are configurations available to request resources for the driver: <code>spark.driver.resource.{resourceName}.amount</code>, request resources for the executor(s): <code>spark.executor.resource.{resourceName}.amount</code> and specify the requirements for each task: <code>spark.task.resource.{resourceName}.amount</code>. The <code>spark.driver.resource.{resourceName}.discoveryScript</code> config is required on YARN, Kubernetes and a client side Driver on Spark Standalone. <code>spark.executor.resource.{resourceName}.discoveryScript</code> config is required for YARN and Kubernetes. Kubernetes also requires <code>spark.driver.resource.{resourceName}.vendor</code> and/or <code>spark.executor.resource.{resourceName}.vendor</code>. See the config descriptions above for more information on each.

Spark will use the configurations specified to first request containers with the corresponding resources from the cluster manager. Once it gets the container, Spark launches an Executor in that container which will discover what resources the container has and the addresses associated with each resource. The Executor will register with the Driver and report back the resources available to that Executor. The Spark scheduler can then schedule tasks to each Executor and assign specific resource addresses based on the resource requirements the user specified. The user can see the resources assigned to a task using the <code>TaskContext.get().resources</code> api. On the driver, the user can see the resources assigned with the SparkContext <code>resources</code> call. It's then up to the user to use the assignedaddresses to do the processing they want or pass those into the ML/AI framework they are using.
Spark will use the configurations specified to first request containers with the corresponding resources from the cluster manager. Once it gets the container, Spark launches an Executor in that container which will discover what resources the container has and the addresses associated with each resource. The Executor will register with the Driver and report back the resources available to that Executor. The Spark scheduler can then schedule tasks to each Executor and assign specific resource addresses based on the resource requirements the user specified. The user can see the resources assigned to a task using the <code>TaskContext.get().resources</code> api. On the driver, the user can see the resources assigned with the SparkContext <code>resources</code> call. It's then up to the user to use the assigned addresses to do the processing they want or pass those into the ML/AI framework they are using.
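
As an illustrative aside (not part of this change), a minimal PySpark sketch of reading the assigned addresses, assuming a resource named <code>gpu</code> has been configured as described above:

```python
# Minimal sketch; assumes spark.{driver,executor,task}.resource.gpu.* have been
# set so that the driver and each task are assigned GPU addresses.
from pyspark import SparkContext, TaskContext

sc = SparkContext(appName="resource-addresses-demo")

def use_assigned_gpus(rows):
    # Inside a task: the addresses assigned to this task, e.g. ["0", "1"].
    gpu_addresses = TaskContext.get().resources()["gpu"].addresses
    # Hand gpu_addresses to the ML/AI framework of your choice here.
    yield sum(1 for _ in rows)

counts = sc.parallelize(range(8), 2).mapPartitions(use_assigned_gpus).collect()

# On the driver: the addresses assigned to the driver itself.
driver_gpu_addresses = sc.resources["gpu"].addresses
```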

See your cluster manager specific page for requirements and details on each of - [YARN](running-on-yarn.html#resource-allocation-and-configuration-overview), [Kubernetes](running-on-kubernetes.html#resource-allocation-and-configuration-overview) and [Standalone Mode](spark-standalone.html#resource-allocation-and-configuration-overview). It is currently not available with local mode. And please also note that local-cluster mode with multiple workers is not supported(see Standalone documentation).

25 changes: 2 additions & 23 deletions docs/sql-performance-tuning.md

@@ -34,30 +34,9 @@ memory usage and GC pressure. You can call `spark.catalog.uncacheTable("tableNam
Configuration of in-memory caching can be done using the `setConf` method on `SparkSession` or by running
`SET key=value` commands using SQL.

<table>
<thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead>
<tr>
<td><code>spark.sql.inMemoryColumnarStorage.compressed</code></td>
<td>true</td>
<td>
When set to true, Spark SQL will automatically select a compression codec for each column based
on statistics of the data.
</td>
<td>1.0.1</td>
</tr>
<tr>
<td><code>spark.sql.inMemoryColumnarStorage.batchSize</code></td>
<td>10000</td>
<td>
Controls the size of batches for columnar caching. Larger batch sizes can improve memory utilization
and compression, but risk OOMs when caching data.
</td>
<td>1.1.1</td>
</tr>

</table>
{% include_relative generated-sql-config-table-caching-data.html %}

nchammas (Contributor Author) commented on Dec 11, 2023:
This diff demonstrates the main benefit of this PR. Instead of needing to copy and maintain the full HTML table of some configs, we tag the ones we want to group together in SQLConf.scala and then reference that group's table here.
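
A rough sketch of the wiring, using the "caching-data" group from this diff (the file-name pattern comes from sql/gen-sql-config-docs.py further down; illustrative only):

```python
# In SQLConf.scala a config opts into a documentation group, e.g.:
#   .withTag("caching-data")
# The generator script then writes one HTML table per group:
group = "caching-data"
table_file = f"generated-sql-config-table-{group}.html"
print(table_file)  # -> generated-sql-config-table-caching-data.html
# and the docs page pulls it in with:
#   {% include_relative generated-sql-config-table-caching-data.html %}
```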


## Other Configuration Options
## Tuning Partitions

The following options can also be used to tune the performance of query execution. It is possible
that these options will be deprecated in future release as more optimizations are performed automatically.
SQLConf.scala

@@ -470,13 +470,15 @@ object SQLConf {
.doc("When set to true Spark SQL will automatically select a compression codec for each " +
"column based on statistics of the data.")
.version("1.0.1")
.withTag("caching-data")
.booleanConf
.createWithDefault(true)

val COLUMN_BATCH_SIZE = buildConf("spark.sql.inMemoryColumnarStorage.batchSize")
.doc("Controls the size of batches for columnar caching. Larger batch sizes can improve " +
"memory utilization and compression, but risk OOMs when caching data.")
.version("1.1.1")
.withTag("caching-data")
.intConf
.createWithDefault(10000)

@@ -596,6 +598,7 @@
"run, and file-based data source tables where the statistics are computed directly on " +
"the files of data.")
.version("1.1.0")
.withTag("tuning")
.bytesConf(ByteUnit.BYTE)
.createWithDefaultString("10MB")

@@ -2695,6 +2698,7 @@
buildConf("spark.sql.cbo.enabled")
.doc("Enables CBO for estimation of plan statistics when set true.")
.version("2.2.0")
.withTag("cbo")
.booleanConf
.createWithDefault(false)

@@ -5610,6 +5614,14 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
def getAllConfs: immutable.Map[String, String] =
settings.synchronized { settings.asScala.toMap }

def getAllDefinedConfsWithTags: Seq[(String, String, String, String, Seq[String])] = {
loadDefinedConfs()
getConfigEntries().asScala.filter(_.isPublic).map { entry =>
val displayValue = Option(getConfString(entry.key, null)).getOrElse(entry.defaultValueString)
(entry.key, displayValue, entry.doc, entry.version, entry.tags)
}.toSeq
}

/**
* Return all the configuration definitions that have been defined in [[SQLConf]]. Each
* definition contains key, defaultValue and doc.
PythonSQLUtils.scala

@@ -72,6 +72,14 @@ private[sql] object PythonSQLUtils extends Logging {
FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
}

def listAllSQLConfigsWithTags():
Array[(String, String, String, String, Array[String])] = {
val conf = new SQLConf()
conf.getAllDefinedConfsWithTags.map(c =>
Tuple5(c._1, c._2, c._3, c._4, c._5.toArray)
).toArray
}

private def listAllSQLConfigs(): Seq[(String, String, String, String)] = {
val conf = new SQLConf()
conf.getAllDefinedConfs
67 changes: 41 additions & 26 deletions sql/gen-sql-config-docs.py

@@ -18,7 +18,7 @@
import os
import re

from collections import namedtuple
from collections import namedtuple, defaultdict
from textwrap import dedent

# To avoid adding a new direct dependency, we import markdown from within mkdocs.
@@ -28,23 +28,38 @@


SQLConfEntry = namedtuple(
"SQLConfEntry", ["name", "default", "description", "version"])


def get_sql_configs(jvm, group):
if group == "static":
config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listStaticSQLConfigs()
else:
config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listRuntimeSQLConfigs()
sql_configs = [
SQLConfEntry(
name=_sql_config._1(),
default=_sql_config._2(),
description=_sql_config._3(),
version=_sql_config._4()
)
for _sql_config in config_set
"SQLConfEntry", [
"name",
"default",
"description",
"version",
"tags",
]
)


def get_sql_configs(jvm):
sql_configs = defaultdict(
list, {
"__all": [],
"__no_group": [],
}
)
config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listAllSQLConfigsWithTags()
for raw_config in config_set:
sql_config = SQLConfEntry(
name=raw_config._1(),
default=raw_config._2(),
description=raw_config._3(),
version=raw_config._4(),
tags=list(raw_config._5()),
)
sql_configs["__all"].append(sql_config)
if not sql_config.tags:
sql_configs["__no_group"].append(sql_config)
else:
for tag in sql_config.tags:
sql_configs[tag].append(sql_config)
return sql_configs


@@ -105,8 +120,11 @@ def generate_sql_configs_table_html(sql_configs, path):

f.write(dedent(
"""
<tr>
<td><code>{name}</code></td>
<tr id="{name}">
<td>
<a href="#{name}"><code>#</code></a>
<code>{name}</code>
</td>
<td>{default}</td>
<td>{description}</td>
<td>{version}</td>
@@ -126,10 +144,7 @@ def generate_sql_configs_table_html(sql_configs, path):
jvm = launch_gateway().jvm
docs_root_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "docs")

sql_configs = get_sql_configs(jvm, "runtime")
sql_configs_table_path = os.path.join(docs_root_dir, "generated-runtime-sql-config-table.html")
generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path)

sql_configs = get_sql_configs(jvm, "static")
sql_configs_table_path = os.path.join(docs_root_dir, "generated-static-sql-config-table.html")
generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path)
sql_configs = get_sql_configs(jvm)
for group in sql_configs:
html_table_path = os.path.join(docs_root_dir, f"generated-sql-config-table-{group}.html")
generate_sql_configs_table_html(sql_configs[group], path=html_table_path)
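
For reference, a rough sketch (assumed output, not actually generated) of the single table row this template would produce for the spark.sql.inMemoryColumnarStorage.compressed entry tagged earlier in this diff:

```python
from textwrap import dedent

# Approximates one row emitted by generate_sql_configs_table_html above,
# including the new per-config id and "#" anchor link.
name = "spark.sql.inMemoryColumnarStorage.compressed"
row = dedent(
    f"""
    <tr id="{name}">
      <td>
        <a href="#{name}"><code>#</code></a>
        <code>{name}</code>
      </td>
      <td>true</td>
      <td>When set to true Spark SQL will automatically select a compression codec
      for each column based on statistics of the data.</td>
      <td>1.0.1</td>
    </tr>
    """
)
print(row)
```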