Closed

Changes from 8 commits (35 commits total)
708e9ad
point to python 3 download
nchammas Dec 11, 2023
7d7f39b
both ruby 1 and 2 are EOL
nchammas Dec 11, 2023
be00f99
allow configs to have tags / groups for documentation
nchammas Dec 11, 2023
482f9bc
add method to export all configs with tags to python
nchammas Dec 11, 2023
6747836
add tags to a few configs
nchammas Dec 11, 2023
83b7d69
typo
nchammas Dec 11, 2023
3eff490
replace manual table with generated table
nchammas Dec 11, 2023
fb7097f
generate config table per config tag / group
nchammas Dec 11, 2023
807f526
revert heading change for now
nchammas Dec 11, 2023
84e27dc
aqe coalesce partitions -> generated table
nchammas Dec 12, 2023
5dcc747
make anchors unique even if config is repeated
nchammas Dec 12, 2023
88a200c
validate tags with a regex instead
nchammas Dec 13, 2023
c7eb234
replace remaining html tables with includes
nchammas Dec 13, 2023
9d9e79b
put generated tables under _generated/
nchammas Dec 13, 2023
dda1109
move default processing to separate function
nchammas Dec 13, 2023
b8e8f00
prevent break after `#` anchor
nchammas Dec 13, 2023
9d68b41
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 14, 2023
5adaa4f
tags should be a set, not a list
nchammas Dec 16, 2023
f32cd37
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 17, 2023
88e5ed2
remove unnecessary whitespace
nchammas Dec 17, 2023
2cf958c
remove new section on cbo
nchammas Dec 17, 2023
a6d7bab
remove cbo tags
nchammas Dec 17, 2023
9390439
tweak generated table locations and names
nchammas Dec 17, 2023
de637da
remove cbo sidebar item
nchammas Dec 17, 2023
9f0db38
automatically tag static vs. runtime sql configs
nchammas Dec 18, 2023
c96899b
restore generated-* ignore
nchammas Dec 18, 2023
9656c93
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 22, 2023
1d7db08
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 23, 2023
3815b9e
clarify some details with comments
nchammas Dec 23, 2023
fa22dd4
Merge branch 'master' of github.com:apache/spark into sql-config-groups
nchammas Dec 26, 2023
7a360b8
Merge branch 'master' into sql-config-groups
nchammas Jan 9, 2024
d486ee2
Merge branch 'master' into sql-config-groups
nchammas Jan 10, 2024
0e5401a
use include_api_gen + _generated_config_tables -> _generated/config_t…
nchammas Jan 10, 2024
fe7e9d7
generate parent dir too
nchammas Jan 10, 2024
2abe799
Merge branch 'master' into sql-config-groups
nchammas Jan 11, 2024

ConfigBuilder.scala

@@ -17,6 +17,7 @@

package org.apache.spark.internal.config

import java.util.Locale
import java.util.concurrent.TimeUnit
import java.util.regex.PatternSyntaxException

@@ -131,7 +132,7 @@ private[spark] class TypedConfigBuilder[T](
def createOptional: OptionalConfigEntry[T] = {
val entry = new OptionalConfigEntry[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, converter, stringConverter, parent._doc,
parent._public, parent._version)
parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_(entry))
entry
}
@@ -146,7 +147,7 @@ private[spark] class TypedConfigBuilder[T](
val transformedDefault = converter(stringConverter(default))
val entry = new ConfigEntryWithDefault[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, transformedDefault, converter,
stringConverter, parent._doc, parent._public, parent._version)
stringConverter, parent._doc, parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_ (entry))
entry
}
@@ -156,7 +157,7 @@ private[spark] class TypedConfigBuilder[T](
def createWithDefaultFunction(defaultFunc: () => T): ConfigEntry[T] = {
val entry = new ConfigEntryWithDefaultFunction[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, defaultFunc, converter, stringConverter,
parent._doc, parent._public, parent._version)
parent._doc, parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_ (entry))
entry
}
@@ -168,7 +169,7 @@ private[spark] class TypedConfigBuilder[T](
def createWithDefaultString(default: String): ConfigEntry[T] = {
val entry = new ConfigEntryWithDefaultString[T](parent.key, parent._prependedKey,
parent._prependSeparator, parent._alternatives, default, converter, stringConverter,
parent._doc, parent._public, parent._version)
parent._doc, parent._public, parent._tags, parent._version)
parent._onCreate.foreach(_(entry))
entry
}
@@ -187,6 +188,7 @@ private[spark] case class ConfigBuilder(key: String) {
private[config] var _prependedKey: Option[String] = None
private[config] var _prependSeparator: String = ""
private[config] var _public = true
private[config] var _tags = List.empty[String]
private[config] var _doc = ""
private[config] var _version = ""
private[config] var _onCreate: Option[ConfigEntry[_] => Unit] = None
@@ -197,6 +199,13 @@ private[spark] case class ConfigBuilder(key: String) {
this
}

def withTag(tag: String): ConfigBuilder = {

nchammas (Contributor Author) commented:
I am on the fence regarding this name. Maybe something more explicit like withDocumentationGroup would be better?

nchammas (Contributor Author) commented:
We could also do away with custom group names and instead select certain prefixes for use as documentation groups, like spark.sql.cbo, spark.sql.statistics, etc.

However, this won't work for groupings that don't align with config name prefixes, like the breakdown of runtime vs. static configurations that was added in #28274.

require(!tag.contains(" "))
require(tag.toLowerCase(Locale.ROOT) == tag)
_tags = _tags :+ tag
this
}
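
A hypothetical sketch of the prefix-based alternative floated in the comments above (the prefixes and helper below are assumptions, not part of this change):

```python
# Derive a documentation group from the config key's prefix instead of
# tagging it explicitly with withTag(...). Hypothetical illustration only.
from typing import Optional

DOC_GROUP_PREFIXES = ["spark.sql.cbo", "spark.sql.statistics"]  # assumed examples

def group_for(config_key: str) -> Optional[str]:
    """Return the documentation group implied by the key's prefix, if any."""
    for prefix in DOC_GROUP_PREFIXES:
        if config_key == prefix or config_key.startswith(prefix + "."):
            return prefix
    # Groupings such as runtime vs. static configs cannot be derived this way.
    return None

print(group_for("spark.sql.cbo.enabled"))      # -> spark.sql.cbo
print(group_for("spark.sql.adaptive.enabled")) # -> None
```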

def doc(s: String): ConfigBuilder = {
_doc = s
this
@@ -263,7 +272,7 @@ private[spark] case class ConfigBuilder(key: String) {

def fallbackConf[T](fallback: ConfigEntry[T]): ConfigEntry[T] = {
val entry = new FallbackConfigEntry(key, _prependedKey, _prependSeparator, _alternatives, _doc,
_public, _version, fallback)
_public, _tags, _version, fallback)
_onCreate.foreach(_(entry))
entry
}
ConfigEntry.scala

@@ -80,6 +80,7 @@ private[spark] abstract class ConfigEntry[T] (
val stringConverter: T => String,
val doc: String,
val isPublic: Boolean,
val tags: List[String],
val version: String) {

import ConfigEntry._
@@ -120,6 +121,7 @@ private class ConfigEntryWithDefault[T] (
stringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry(
key,
@@ -130,6 +132,7 @@ private class ConfigEntryWithDefault[T] (
stringConverter,
doc,
isPublic,
tags,
version
) {

@@ -152,6 +155,7 @@ private class ConfigEntryWithDefaultFunction[T] (
stringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry(
key,
@@ -162,6 +166,7 @@ private class ConfigEntryWithDefaultFunction[T] (
stringConverter,
doc,
isPublic,
tags,
version
) {

@@ -184,6 +189,7 @@ private class ConfigEntryWithDefaultString[T] (
stringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry(
key,
@@ -194,6 +200,7 @@ private class ConfigEntryWithDefaultString[T] (
stringConverter,
doc,
isPublic,
tags,
version
) {

@@ -220,6 +227,7 @@ private[spark] class OptionalConfigEntry[T](
val rawStringConverter: T => String,
doc: String,
isPublic: Boolean,
tags: List[String],
version: String)
extends ConfigEntry[Option[T]](
key,
@@ -230,6 +238,7 @@ private[spark] class OptionalConfigEntry[T](
v => v.map(rawStringConverter).orNull,
doc,
isPublic,
tags,
version
) {

@@ -250,6 +259,7 @@ private[spark] class FallbackConfigEntry[T] (
alternatives: List[String],
doc: String,
isPublic: Boolean,
tags: List[String],
version: String,
val fallback: ConfigEntry[T])
extends ConfigEntry[T](
@@ -261,6 +271,7 @@ private[spark] class FallbackConfigEntry[T] (
fallback.stringConverter,
doc,
isPublic,
tags,
version
) {

4 changes: 1 addition & 3 deletions docs/README.md

@@ -32,7 +32,7 @@ The Spark documentation build uses a number of tools to build HTML docs and API
Python, R and SQL.

You need to have [Ruby](https://www.ruby-lang.org/en/documentation/installation/) and
[Python](https://docs.python.org/2/using/unix.html#getting-and-installing-the-latest-version-of-python)
[Python](https://www.python.org/downloads/)
installed. Make sure the `bundle` command is available, if not install the Gem containing it:

```sh
@@ -46,8 +46,6 @@ $ cd docs
$ bundle install
```

Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0.

### SQL and Python API Documentation (Optional)

To generate SQL and Python API docs, you'll need to install these libraries:
2 changes: 1 addition & 1 deletion docs/configuration.md

@@ -3675,7 +3675,7 @@ deep learning and signal processing. Spark now supports requesting and schedulin

There are configurations available to request resources for the driver: <code>spark.driver.resource.{resourceName}.amount</code>, request resources for the executor(s): <code>spark.executor.resource.{resourceName}.amount</code> and specify the requirements for each task: <code>spark.task.resource.{resourceName}.amount</code>. The <code>spark.driver.resource.{resourceName}.discoveryScript</code> config is required on YARN, Kubernetes and a client side Driver on Spark Standalone. <code>spark.executor.resource.{resourceName}.discoveryScript</code> config is required for YARN and Kubernetes. Kubernetes also requires <code>spark.driver.resource.{resourceName}.vendor</code> and/or <code>spark.executor.resource.{resourceName}.vendor</code>. See the config descriptions above for more information on each.

Spark will use the configurations specified to first request containers with the corresponding resources from the cluster manager. Once it gets the container, Spark launches an Executor in that container which will discover what resources the container has and the addresses associated with each resource. The Executor will register with the Driver and report back the resources available to that Executor. The Spark scheduler can then schedule tasks to each Executor and assign specific resource addresses based on the resource requirements the user specified. The user can see the resources assigned to a task using the <code>TaskContext.get().resources</code> api. On the driver, the user can see the resources assigned with the SparkContext <code>resources</code> call. It's then up to the user to use the assignedaddresses to do the processing they want or pass those into the ML/AI framework they are using.
Spark will use the configurations specified to first request containers with the corresponding resources from the cluster manager. Once it gets the container, Spark launches an Executor in that container which will discover what resources the container has and the addresses associated with each resource. The Executor will register with the Driver and report back the resources available to that Executor. The Spark scheduler can then schedule tasks to each Executor and assign specific resource addresses based on the resource requirements the user specified. The user can see the resources assigned to a task using the <code>TaskContext.get().resources</code> api. On the driver, the user can see the resources assigned with the SparkContext <code>resources</code> call. It's then up to the user to use the assigned addresses to do the processing they want or pass those into the ML/AI framework they are using.
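
As an illustrative aside (not part of this change), a minimal PySpark sketch of reading the assigned addresses, assuming a resource named <code>gpu</code> has been configured as described above:

```python
# Minimal sketch; assumes spark.{driver,executor,task}.resource.gpu.* have been
# set so that the driver and each task are assigned GPU addresses.
from pyspark import SparkContext, TaskContext

sc = SparkContext(appName="resource-addresses-demo")

def use_assigned_gpus(rows):
    # Inside a task: the addresses assigned to this task, e.g. ["0", "1"].
    gpu_addresses = TaskContext.get().resources()["gpu"].addresses
    # Hand gpu_addresses to the ML/AI framework of your choice here.
    yield sum(1 for _ in rows)

counts = sc.parallelize(range(8), 2).mapPartitions(use_assigned_gpus).collect()

# On the driver: the addresses assigned to the driver itself.
driver_gpu_addresses = sc.resources["gpu"].addresses
```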

See your cluster manager specific page for requirements and details on each of - [YARN](running-on-yarn.html#resource-allocation-and-configuration-overview), [Kubernetes](running-on-kubernetes.html#resource-allocation-and-configuration-overview) and [Standalone Mode](spark-standalone.html#resource-allocation-and-configuration-overview). It is currently not available with local mode. And please also note that local-cluster mode with multiple workers is not supported(see Standalone documentation).

25 changes: 2 additions & 23 deletions docs/sql-performance-tuning.md

@@ -34,30 +34,9 @@ memory usage and GC pressure. You can call `spark.catalog.uncacheTable("tableNam
Configuration of in-memory caching can be done using the `setConf` method on `SparkSession` or by running
`SET key=value` commands using SQL.

<table>
<thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead>
<tr>
<td><code>spark.sql.inMemoryColumnarStorage.compressed</code></td>
<td>true</td>
<td>
When set to true, Spark SQL will automatically select a compression codec for each column based
on statistics of the data.
</td>
<td>1.0.1</td>
</tr>
<tr>
<td><code>spark.sql.inMemoryColumnarStorage.batchSize</code></td>
<td>10000</td>
<td>
Controls the size of batches for columnar caching. Larger batch sizes can improve memory utilization
and compression, but risk OOMs when caching data.
</td>
<td>1.1.1</td>
</tr>

</table>
{% include_relative generated-sql-config-table-caching-data.html %}

nchammas (Contributor Author) commented on Dec 11, 2023:
This diff demonstrates the main benefit of this PR. Instead of needing to copy and maintain the full HTML table of some configs, we tag the ones we want to group together in SQLConf.scala and then reference that group's table here.
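
A rough sketch of the wiring, using the "caching-data" group from this diff (the file-name pattern comes from sql/gen-sql-config-docs.py further down; illustrative only):

```python
# In SQLConf.scala a config opts into a documentation group, e.g.:
#   .withTag("caching-data")
# The generator script then writes one HTML table per group:
group = "caching-data"
table_file = f"generated-sql-config-table-{group}.html"
print(table_file)  # -> generated-sql-config-table-caching-data.html
# and the docs page pulls it in with:
#   {% include_relative generated-sql-config-table-caching-data.html %}
```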


## Other Configuration Options
## Tuning Partitions

The following options can also be used to tune the performance of query execution. It is possible
that these options will be deprecated in future release as more optimizations are performed automatically.
SQLConf.scala

@@ -470,13 +470,15 @@ object SQLConf {
.doc("When set to true Spark SQL will automatically select a compression codec for each " +
"column based on statistics of the data.")
.version("1.0.1")
.withTag("caching-data")
.booleanConf
.createWithDefault(true)

val COLUMN_BATCH_SIZE = buildConf("spark.sql.inMemoryColumnarStorage.batchSize")
.doc("Controls the size of batches for columnar caching. Larger batch sizes can improve " +
"memory utilization and compression, but risk OOMs when caching data.")
.version("1.1.1")
.withTag("caching-data")
.intConf
.createWithDefault(10000)

@@ -596,6 +598,7 @@
"run, and file-based data source tables where the statistics are computed directly on " +
"the files of data.")
.version("1.1.0")
.withTag("tuning")
.bytesConf(ByteUnit.BYTE)
.createWithDefaultString("10MB")

@@ -2695,6 +2698,7 @@
buildConf("spark.sql.cbo.enabled")
.doc("Enables CBO for estimation of plan statistics when set true.")
.version("2.2.0")
.withTag("cbo")
.booleanConf
.createWithDefault(false)

@@ -5610,6 +5614,14 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
def getAllConfs: immutable.Map[String, String] =
settings.synchronized { settings.asScala.toMap }

def getAllDefinedConfsWithTags: Seq[(String, String, String, String, Seq[String])] = {
loadDefinedConfs()
getConfigEntries().asScala.filter(_.isPublic).map { entry =>
val displayValue = Option(getConfString(entry.key, null)).getOrElse(entry.defaultValueString)
(entry.key, displayValue, entry.doc, entry.version, entry.tags)
}.toSeq
}

/**
* Return all the configuration definitions that have been defined in [[SQLConf]]. Each
* definition contains key, defaultValue and doc.
PythonSQLUtils.scala

@@ -72,6 +72,14 @@ private[sql] object PythonSQLUtils extends Logging {
FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
}

def listAllSQLConfigsWithTags():
Array[(String, String, String, String, Array[String])] = {
val conf = new SQLConf()
conf.getAllDefinedConfsWithTags.map(c =>
Tuple5(c._1, c._2, c._3, c._4, c._5.toArray)
).toArray
}

private def listAllSQLConfigs(): Seq[(String, String, String, String)] = {
val conf = new SQLConf()
conf.getAllDefinedConfs
67 changes: 41 additions & 26 deletions sql/gen-sql-config-docs.py

@@ -18,7 +18,7 @@
import os
import re

from collections import namedtuple
from collections import namedtuple, defaultdict
from textwrap import dedent

# To avoid adding a new direct dependency, we import markdown from within mkdocs.
@@ -28,23 +28,38 @@


SQLConfEntry = namedtuple(
"SQLConfEntry", ["name", "default", "description", "version"])


def get_sql_configs(jvm, group):
if group == "static":
config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listStaticSQLConfigs()
else:
config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listRuntimeSQLConfigs()
sql_configs = [
SQLConfEntry(
name=_sql_config._1(),
default=_sql_config._2(),
description=_sql_config._3(),
version=_sql_config._4()
)
for _sql_config in config_set
"SQLConfEntry", [
"name",
"default",
"description",
"version",
"tags",
]
)


def get_sql_configs(jvm):
sql_configs = defaultdict(
list, {
"__all": [],
"__no_group": [],
}
)
config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listAllSQLConfigsWithTags()
for raw_config in config_set:
sql_config = SQLConfEntry(
name=raw_config._1(),
default=raw_config._2(),
description=raw_config._3(),
version=raw_config._4(),
tags=list(raw_config._5()),
)
sql_configs["__all"].append(sql_config)
if not sql_config.tags:
sql_configs["__no_group"].append(sql_config)
else:
for tag in sql_config.tags:
sql_configs[tag].append(sql_config)
return sql_configs


@@ -105,8 +120,11 @@ def generate_sql_configs_table_html(sql_configs, path):

f.write(dedent(
"""
<tr>
<td><code>{name}</code></td>
<tr id="{name}">
<td>
<a href="#{name}"><code>#</code></a>
<code>{name}</code>
</td>
<td>{default}</td>
<td>{description}</td>
<td>{version}</td>
@@ -126,10 +144,7 @@ def generate_sql_configs_table_html(sql_configs, path):
jvm = launch_gateway().jvm
docs_root_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "docs")

sql_configs = get_sql_configs(jvm, "runtime")
sql_configs_table_path = os.path.join(docs_root_dir, "generated-runtime-sql-config-table.html")
generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path)

sql_configs = get_sql_configs(jvm, "static")
sql_configs_table_path = os.path.join(docs_root_dir, "generated-static-sql-config-table.html")
generate_sql_configs_table_html(sql_configs, path=sql_configs_table_path)
sql_configs = get_sql_configs(jvm)
for group in sql_configs:
html_table_path = os.path.join(docs_root_dir, f"generated-sql-config-table-{group}.html")
generate_sql_configs_table_html(sql_configs[group], path=html_table_path)
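
For reference, a rough sketch (assumed output, not actually generated) of the single table row this template would produce for the spark.sql.inMemoryColumnarStorage.compressed entry tagged earlier in this diff:

```python
from textwrap import dedent

# Approximates one row emitted by generate_sql_configs_table_html above,
# including the new per-config id and "#" anchor link.
name = "spark.sql.inMemoryColumnarStorage.compressed"
row = dedent(
    f"""
    <tr id="{name}">
      <td>
        <a href="#{name}"><code>#</code></a>
        <code>{name}</code>
      </td>
      <td>true</td>
      <td>When set to true Spark SQL will automatically select a compression codec
      for each column based on statistics of the data.</td>
      <td>1.0.1</td>
    </tr>
    """
)
print(row)
```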