[SPARK-20196][PYTHON][SQL] update doc for catalog functions for all languages, add pyspark refreshByPath API #17512
R/pkg/R/SQLContext.R

```diff
@@ -544,12 +544,15 @@ sql <- function(x, ...) {
   dispatchFunc("sql(sqlQuery)", x, ...)
 }
 
-#' Create a SparkDataFrame from a SparkSQL Table
+#' Create a SparkDataFrame from a SparkSQL table or temporary view
 #'
-#' Returns the specified Table as a SparkDataFrame. The Table must have already been registered
-#' in the SparkSession.
+#' Returns the specified table or temporary view as a SparkDataFrame. The temporary view must have
+#' already been registered in the SparkSession.
 #'
-#' @param tableName The SparkSQL Table to convert to a SparkDataFrame.
+#' @param tableName the qualified or unqualified name that designates a table or view. If a database
+#'                  is specified, it identifies the table/view from the database.
+#'                  Otherwise, it first attempts to find a temporary view with the given name
+#'                  and then match the table/view from the current database.
 #' @return SparkDataFrame
 #' @rdname tableToDF
 #' @name tableToDF
```
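The lookup order spelled out in the new `@param tableName` text is the same across the language APIs. A minimal PySpark sketch of it, assuming a running SparkSession bound to `spark` (`spark.table` follows the same rules as `tableToDF`; the view and database names are made up):

```python
spark.range(5).createOrReplaceTempView("people")  # temporary view

spark.table("people")       # unqualified: a temporary view named "people"
                            # is found before any table in the current database
spark.table("db1.people")   # qualified: resolved directly against database db1
```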
python/pyspark/sql/catalog.py

```diff
@@ -72,10 +72,10 @@ def listDatabases(self):
     @ignore_unicode_prefix
     @since(2.0)
     def listTables(self, dbName=None):
-        """Returns a list of tables in the specified database.
+        """Returns a list of tables/views in the specified database.
 
         If no database is specified, the current database is used.
-        This includes all temporary tables.
+        This includes all temporary views.
         """
         if dbName is None:
             dbName = self.currentDatabase()
```
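A small sketch of why the docstring now says tables/views: temporary views show up in the result alongside persistent tables (assumes a SparkSession `spark`; names are illustrative):

```python
spark.range(3).createOrReplaceTempView("tmp_nums")

for t in spark.catalog.listTables():  # current database, plus temp views
    print(t.name, t.tableType, t.isTemporary)
# e.g. tmp_nums TEMPORARY True
```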
```diff
@@ -115,7 +115,7 @@ def listFunctions(self, dbName=None):
     @ignore_unicode_prefix
     @since(2.0)
     def listColumns(self, tableName, dbName=None):
-        """Returns a list of columns for the given table in the specified database.
+        """Returns a list of columns for the given table/view in the specified database.
 
         If no database is specified, the current database is used.
```
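Likewise for the renamed table/view wording here: per this PR's doc changes, `listColumns` resolves temporary views too, and `dbName` defaults to the current database. A hedged sketch (names made up):

```python
spark.range(3).createOrReplaceTempView("tmp_nums")

for c in spark.catalog.listColumns("tmp_nums"):
    print(c.name, c.dataType, c.isPartition)
# e.g. id bigint False
```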
```diff
@@ -161,14 +161,15 @@ def createExternalTable(self, tableName, path=None, source=None, schema=None, **
     def createTable(self, tableName, path=None, source=None, schema=None, **options):
         """Creates a table based on the dataset in a data source.
 
-        It returns the DataFrame associated with the external table.
+        It returns the DataFrame associated with the table.
 
         The data source is specified by the ``source`` and a set of ``options``.
         If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
+        ``spark.sql.sources.default`` will be used. When ``path`` is specified, an external table is
+        created from the data at the given path. Otherwise a managed table is created.
 
         Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
-        created external table.
+        created table.
 
         :return: :class:`DataFrame`
         """
```
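A hedged sketch of the path rule the new sentence describes: with `path`, the table is external over existing data; without it, the table is managed. The paths, names, and schema here are hypothetical:

```python
from pyspark.sql.types import LongType, StructField, StructType

# path given -> external table over the files at that location
ext_df = spark.catalog.createTable("ext_events", path="/data/events",
                                   source="parquet")

# no path -> managed table, created under the warehouse directory
schema = StructType([StructField("id", LongType())])
managed_df = spark.catalog.createTable("managed_events", source="parquet",
                                       schema=schema)
```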
```diff
@@ -276,14 +277,24 @@ def clearCache(self):
     @since(2.0)
     def refreshTable(self, tableName):
-        """Invalidate and refresh all the cached metadata of the given table."""
+        """Invalidates and refreshes all the cached data and metadata of the given table."""
         self._jcatalog.refreshTable(tableName)
 
     @since('2.1.1')
     def recoverPartitions(self, tableName):
-        """Recover all the partitions of the given table and update the catalog."""
+        """Recovers all the partitions of the given table and update the catalog.
+
+        Only works with a partitioned table, and not a temporary view.
+        """
         self._jcatalog.recoverPartitions(tableName)
 
+    @since('2.2.0')
+    def refreshByPath(self, path):
+        """Invalidates and refreshes all the cached data (and the associated metadata) for any
+        DataFrame that contains the given data source path.
+        """
+        self._jcatalog.refreshByPath(path)
+
     def _reset(self):
         """(Internal use only) Drop all existing databases (except "default"), tables,
         partitions and functions, and set the current database to "default".
```
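The two refresh entry points above, sketched together; `refreshByPath` is the API this PR adds to PySpark (`@since('2.2.0')`). Table and path names are made up, and a SparkSession `spark` is assumed:

```python
# After a table's underlying files were rewritten outside of Spark SQL:
spark.catalog.refreshTable("db1.clicks")          # refresh by table name

# When only the storage location is known, refresh every cached DataFrame
# whose data source path contains the given path:
spark.catalog.refreshByPath("/warehouse/clicks")
```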
sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala

```diff
@@ -283,7 +283,7 @@ abstract class Catalog {
   /**
    * :: Experimental ::
-   * Creates a table from the given path based on a data source and a set of options.
+   * Creates a table based on the dataset in a data source and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -321,7 +321,7 @@ abstract class Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Creates a table from the given path based on a data source and a set of options.
+   * Creates a table based on the dataset in a data source and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -357,7 +357,7 @@ abstract class Catalog {
   /**
    * :: Experimental ::
-   * Create a table from the given path based on a data source, a schema and a set of options.
+   * Create a table based on the dataset in a data source, a schema and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -397,7 +397,7 @@ abstract class Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Create a table from the given path based on a data source, a schema and a set of options.
+   * Create a table based on the dataset in a data source, a schema and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
@@ -447,6 +447,7 @@ abstract class Catalog {
   /**
    * Recovers all the partitions in the directory of a table and update the catalog.
+   * Only works with a partitioned table, and not a temporary view.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
    *                  If no database identifier is provided, it refers to a table in the
@@ -493,10 +494,10 @@ abstract class Catalog {
   def clearCache(): Unit
 
   /**
-   * Invalidates and refreshes all the cached metadata of the given table. For performance reasons,
-   * Spark SQL or the external data source library it uses might cache certain metadata about a
-   * table, such as the location of blocks. When those change outside of Spark SQL, users should
-   * call this function to invalidate the cache.
+   * Invalidates and refreshes all the cached data and metadata of the given table. For performance
+   * reasons, Spark SQL or the external data source library it uses might cache certain metadata
+   * about a table, such as the location of blocks. When those change outside of Spark SQL, users
+   * should call this function to invalidate the cache.
    *
    * If this table is cached as an InMemoryRelation, drop the original cached version and make the
    * new version cached lazily.
```
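The `recoverPartitions` contract just documented ("Only works with a partitioned table, and not a temporary view"), in PySpark terms. A hedged sketch with made-up names; this is the API counterpart of SQL's `ALTER TABLE ... RECOVER PARTITIONS` / `MSCK REPAIR TABLE`:

```python
# Partition directories (e.g. /warehouse/logs/year=2017/month=04/) were
# written straight to storage, so the catalog does not know about them yet:
spark.catalog.recoverPartitions("db1.logs")   # re-scans and registers them

# Per the new doc line, calling this on a non-partitioned table or a
# temporary view is expected to fail with an AnalysisException.
```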
sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala

```diff
@@ -141,7 +141,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   }
 
   /**
-   * Returns a list of columns for the given table temporary view.
+   * Returns a list of columns for the given table/view or temporary view.
    */
   @throws[AnalysisException]("table does not exist")
   override def listColumns(tableName: String): Dataset[Column] = {
@@ -150,7 +150,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   }
 
   /**
-   * Returns a list of columns for the given table in the specified database.
+   * Returns a list of columns for the given table/view or temporary view in the specified database.
    */
   @throws[AnalysisException]("database or table does not exist")
   override def listColumns(dbName: String, tableName: String): Dataset[Column] = {
@@ -273,7 +273,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * :: Experimental ::
-   * Creates a table from the given path based on a data source and returns the corresponding
+   * Creates a table from the given path and returns the corresponding
    * DataFrame.
    *
    * @group ddl_ops
@@ -287,7 +287,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Creates a table from the given path based on a data source and a set of options.
+   * Creates a table based on the dataset in a data source and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @group ddl_ops
@@ -304,7 +304,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * :: Experimental ::
    * (Scala-specific)
-   * Creates a table from the given path based on a data source, a schema and a set of options.
+   * Creates a table based on the dataset in a data source, a schema and a set of options.
    * Then, returns the corresponding DataFrame.
    *
    * @group ddl_ops
@@ -367,6 +367,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * Recovers all the partitions in the directory of a table and update the catalog.
+   * Only works with a partitioned table, and not a temporary view.
    *
    * @param tableName is either a qualified or unqualified name that designates a table.
    *                  If no database identifier is provided, it refers to a table in the
```

Review comment (Member), on the added "Only works with a partitioned table" line: The same here.
```diff
@@ -434,6 +435,9 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
    * Refreshes the cache entry for a table or view, if any. For Hive metastore table, the metadata
    * is refreshed. For data source tables, the schema will not be inferred and refreshed.
    *
+   * If this table is cached as an InMemoryRelation, drop the original cached version and make the
+   * new version cached lazily.
+   *
    * @group cachemgmt
    * @since 2.0.0
    */
```
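What the added InMemoryRelation sentence means in practice, as a hedged PySpark sketch (the table name is hypothetical):

```python
spark.catalog.cacheTable("db1.events")
spark.table("db1.events").count()         # materializes the cache

# The underlying files change outside Spark SQL, then:
spark.catalog.refreshTable("db1.events")  # drops the stale cached version
# The table stays marked as cached, but the fresh version is only
# materialized lazily, on the next action that touches it:
spark.table("db1.events").count()
```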
```diff
@@ -456,7 +460,8 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog {
   /**
    * Refreshes the cache entry and the associated metadata for all Dataset (if any), that contain
-   * the given data source path.
+   * the given data source path. Path matching is by prefix, i.e. "/" would invalidate
+   * everything that is cached.
    *
    * @group cachemgmt
    * @since 2.0.0
```

Review comment (Member): We also do the re-cache, but the new version cached lazily.

Review comment (Member, Author): For some reason in here, CatalogImpl.scala is very different from Catalog.scala - let me know if you want me to change them - for now I've updated the first sentence.

Review comment (Member): Yes. I found this sentence is copied from Catalog.scala. Maybe, we can update them to
Review comment (Member), on "table or view": Here, actually, it includes both temporary views and persistent views.
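And the prefix matching added to the `refreshByPath` doc above, sketched in PySpark (paths made up); per the new doc text, `"/"` as the argument would touch every cached entry:

```python
df = spark.read.parquet("/data/events/2017")
df.cache()
df.count()                                    # cache is materialized

spark.catalog.refreshByPath("/data/events")   # prefix-matches /data/events/2017
# spark.catalog.refreshByPath("/")            # would invalidate everything cached
```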