[SPARK-4912][SQL] Persistent tables for the Spark SQL data sources api #3752
```
@@ -20,15 +20,16 @@ package org.apache.spark.sql.hive

import java.io.IOException
import java.util.{List => JList}

import com.google.common.cache.{CacheLoader, CacheBuilder}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource, BaseRelation}

import scala.util.parsing.combinator.RegexParsers

import org.apache.hadoop.util.ReflectionUtils

import org.apache.hadoop.hive.metastore.TableType
import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition}
import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition, SerDeInfo, FieldSchema}
import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table, HiveException}
import org.apache.hadoop.hive.ql.plan.CreateTableDesc
import org.apache.hadoop.hive.serde.serdeConstants
```

```
@@ -55,8 +56,60 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
  /** Connection to hive metastore. Usages should lock on `this`. */
  protected[hive] val client = Hive.get(hive.hiveconf)

  // TODO: Use this everywhere instead of tuples or databaseName, tableName,.
  /** A fully qualified identifier for a table (i.e., database.tableName) */
  case class TableIdent(database: String, name: String) {
    def toLowerCase = TableIdent(database.toLowerCase, name.toLowerCase)
  }

  /** A cache of Spark SQL data source tables that have been accessed. */
  protected[hive] val cachedDataSourceTables = CacheBuilder.newBuilder()
    .maximumSize(1000)
    .build(
      new CacheLoader[TableIdent, LogicalPlan]() {
        override def load(in: TableIdent): LogicalPlan = {
          logDebug(s"Creating new cached data source for $in")
          val table = client.getTable(in.database, in.name)

          // It does not appear that the ql client for the metastore has a way to enumerate all the
          // SerDe properties directly...
          val method = classOf[Table].getDeclaredMethod("getSerdeInfo")
          method.setAccessible(true)
          val serdeInfo = method.invoke(table).asInstanceOf[SerDeInfo]

          val resolvedRelation =
            ResolvedDataSource(
              hive,
              table.getProperty("spark.sql.sources.provider"),
              serdeInfo.getParameters.toMap)

          LogicalRelation(resolvedRelation.relation)
        }
      })

  def refreshTable(databaseName: String, tableName: String): Unit = {
    cachedDataSourceTables.refresh(TableIdent(databaseName, tableName).toLowerCase)
  }

  def invalidateTable(databaseName: String, tableName: String): Unit = {
    cachedDataSourceTables.invalidate(TableIdent(databaseName, tableName).toLowerCase)
  }

  val caseSensitive: Boolean = false

  def createDataSourceTable(tableName: String, provider: String, options: Map[String, String]) = {
    val (dbName, tblName) = processDatabaseAndTableName("default", tableName)
    val tbl = new Table(dbName, tblName)

    tbl.setProperty("spark.sql.sources.provider", provider)
    options.foreach { case (key, value) => tbl.setSerdeParam(key, value) }
```
Contributor: We are using SerDe properties to store all parameters that will be passed to a relation provider (for creating a relation), right? Probably we can add a comment here.
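One possible shape for that explanatory comment, shown against the line from the diff it would sit above (illustrative only; it restates what the patch already does, and is not part of the diff itself):

```scala
// Data source options are stored as SerDe properties so that the CacheLoader above can read
// them back (via Table.getSerdeInfo) and pass them to ResolvedDataSource when the table is
// looked up again from the metastore.
options.foreach { case (key, value) => tbl.setSerdeParam(key, value) }
```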
```
    // create the table
    synchronized {
      client.createTable(tbl, false)
```
Contributor: Should we mark tables from external data sources as external tables? We can do this by:

```scala
tbl.putToParameters("EXTERNAL", "TRUE")
tbl.setTableType(TableType.EXTERNAL_TABLE.toString())
```

Contributor (Author): Yeah, that's a good idea, at least for the types of tables you can create now. We should think about options here and whether we want to support non-external tables. Also, I assume you meant something like this?

```scala
tbl.setProperty("EXTERNAL", "TRUE")
tbl.setTableType(TableType.EXTERNAL_TABLE)
```

Contributor: Oh yes. I was reading part of the HCatalog code, which is actually manipulating […]
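A sketch of how `createDataSourceTable` could fold in that suggestion, assuming the `ql.metadata.Table` and `metastore.TableType` APIs already imported in this hunk; this reflects the reviewers' proposal rather than the code as it stands in this diff:

```scala
def createDataSourceTable(tableName: String, provider: String, options: Map[String, String]) = {
  val (dbName, tblName) = processDatabaseAndTableName("default", tableName)
  val tbl = new Table(dbName, tblName)

  // Record which data source implementation backs this table, plus the options it needs.
  tbl.setProperty("spark.sql.sources.provider", provider)
  options.foreach { case (key, value) => tbl.setSerdeParam(key, value) }

  // Mark the table as external so that dropping it from the metastore does not delete the
  // data owned by the external data source.
  tbl.setProperty("EXTERNAL", "TRUE")
  tbl.setTableType(TableType.EXTERNAL_TABLE)

  // create the table
  synchronized {
    client.createTable(tbl, false)
  }
}
```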
```
    }
  }

  def tableExists(db: Option[String], tableName: String): Boolean = {
    val (databaseName, tblName) = processDatabaseAndTableName(
      db.getOrElse(hive.sessionState.getCurrentDatabase), tableName)
```
```
@@ -70,7 +123,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
    val (databaseName, tblName) =
      processDatabaseAndTableName(db.getOrElse(hive.sessionState.getCurrentDatabase), tableName)
    val table = client.getTable(databaseName, tblName)
    if (table.isView) {

    if (table.getProperty("spark.sql.sources.provider") != null) {
      cachedDataSourceTables(TableIdent(databaseName, tblName).toLowerCase)
    } else if (table.isView) {
      // if the unresolved relation is from hive view
      // parse the text into logic node.
      HiveQl.createPlanForView(table, alias)
```
This can be used to retrieve all SerDe properties: […]

Also I think `SerDeInfo` might not be a proper place to put external data source table options. Semantically, these options are more like general table properties, thus using `table.putProperty` might be better. @yhuai Are there notable differences between SerDe properties and general table properties in Hive? Or, to be more specific, differences between properties saved in `metastore.Table.getTTable.getParameters` and those in `metastore.Table.getTTable.getSd.getSerdeInfo.getParameters`?

My reasoning here was that table properties might have other things that could conflict with the options that a data source requires. It seems like SerDe properties are scoped to hold just the options that describe how the serialization library should read the data (which seems analogous to our data sources).

Thanks for the explanations, then this sounds good to me.
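For reference, a hedged sketch contrasting the two property maps this thread is comparing, using the accessor chains named above (`dataSourceOptions` is a hypothetical helper, not part of the patch):

```scala
import scala.collection.JavaConversions._

import org.apache.hadoop.hive.ql.metadata.Table

// Hypothetical helper: return general table properties alongside SerDe properties, the two
// candidate homes for data source options discussed in this thread.
def dataSourceOptions(table: Table): (Map[String, String], Map[String, String]) = {
  // General table properties (metastore.Table.getTTable.getParameters).
  val tableProps = table.getTTable.getParameters.toMap
  // SerDe properties (metastore.Table.getTTable.getSd.getSerdeInfo.getParameters); this is
  // where the patch stores the options handed to ResolvedDataSource.
  val serdeProps = table.getTTable.getSd.getSerdeInfo.getParameters.toMap
  (tableProps, serdeProps)
}
```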