-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-18675][SQL] CTAS for hive serde table should work for all hive versions #16104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,6 @@ import java.net.URI | |
| import java.text.SimpleDateFormat | ||
| import java.util.{Date, Locale, Random} | ||
|
|
||
| import org.apache.hadoop.conf.Configuration | ||
| import org.apache.hadoop.fs.{FileSystem, Path} | ||
| import org.apache.hadoop.hive.common.FileUtils | ||
| import org.apache.hadoop.hive.ql.exec.TaskRunner | ||
|
|
@@ -86,14 +85,15 @@ case class InsertIntoHiveTable( | |
|
|
||
| val hadoopConf = sessionState.newHadoopConf() | ||
| val stagingDir = hadoopConf.get("hive.exec.stagingdir", ".hive-staging") | ||
| val scratchDir = hadoopConf.get("hive.exec.scratchdir", "/tmp/hive") | ||
|
|
||
| private def executionId: String = { | ||
| val rand: Random = new Random | ||
| val format = new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_SSS", Locale.US) | ||
| "hive_" + format.format(new Date) + "_" + Math.abs(rand.nextLong) | ||
| } | ||
|
|
||
| private def getStagingDir(inputPath: Path, hadoopConf: Configuration): Path = { | ||
| private def getStagingDir(inputPath: Path): Path = { | ||
| val inputPathUri: URI = inputPath.toUri | ||
| val inputPathName: String = inputPathUri.getPath | ||
| val fs: FileSystem = inputPath.getFileSystem(hadoopConf) | ||
|
|
@@ -121,21 +121,61 @@ case class InsertIntoHiveTable( | |
| return dir | ||
| } | ||
|
|
||
| private def getExternalScratchDir(extURI: URI, hadoopConf: Configuration): Path = { | ||
| getStagingDir(new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath), hadoopConf) | ||
| private def getExternalScratchDir(extURI: URI): Path = { | ||
| getStagingDir(new Path(extURI.getScheme, extURI.getAuthority, extURI.getPath)) | ||
| } | ||
|
|
||
| def getExternalTmpPath(path: Path, hadoopConf: Configuration): Path = { | ||
| def getExternalTmpPath(path: Path): Path = { | ||
| val hiveVersion = externalCatalog.asInstanceOf[HiveExternalCatalog].client.version.fullVersion | ||
| if (hiveVersion.startsWith("0.12") || | ||
| hiveVersion.startsWith("0.13") || | ||
| hiveVersion.startsWith("0.14") || | ||
| hiveVersion.startsWith("1.0")) { | ||
| oldStyleExternalTempPath(path) | ||
| } else if (hiveVersion.startsWith("1.1") || hiveVersion.startsWith("1.2")) { | ||
| newStyleExternalTempPath(path) | ||
| } else { | ||
| throw new IllegalStateException("Unsupported hive version: " + hiveVersion) | ||
|
||
| } | ||
| } | ||
|
|
||
| // Mostly copied from Context.java#getExternalTmpPath of Hive 0.13 | ||
| def oldStyleExternalTempPath(path: Path): Path = { | ||
| val extURI: URI = path.toUri | ||
| val scratchPath = new Path(scratchDir, executionId) | ||
| var dirPath = new Path( | ||
| extURI.getScheme, | ||
| extURI.getAuthority, | ||
| scratchPath.toUri.getPath + "-" + TaskRunner.getTaskRunnerID()) | ||
|
|
||
| try { | ||
| val fs: FileSystem = dirPath.getFileSystem(hadoopConf) | ||
| dirPath = new Path(fs.makeQualified(dirPath).toString()) | ||
|
|
||
| if (!FileUtils.mkdir(fs, dirPath, true, hadoopConf)) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We create the directory, but we never drop it? I checked all the related Hive versions, they have the same context clear function to remove these temporary directories.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. uh, we drop it after the normal termination of the VM by calling `deleteOnExit`.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Keeping these useless directories and files might not make sense, right? Many many files could accumulate?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I followed the way we clean up the staging dir in https://github.com/apache/spark/pull/16104/files#diff-d579db9a8f27e0bbef37720ab14ec3f6R114
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My above concern is not directly related to this PR. Just submitted a PR to resolve the existing issue: #16134. I think it is a very serious bug. |
||
| throw new IllegalStateException("Cannot create staging directory: " + dirPath.toString) | ||
| } | ||
| fs.deleteOnExit(dirPath) | ||
| } catch { | ||
| case e: IOException => | ||
| throw new RuntimeException("Cannot create staging directory: " + dirPath.toString, e) | ||
|
|
||
| } | ||
| dirPath | ||
| } | ||
|
|
||
| // Mostly copied from Context.java#getExternalTmpPath of Hive 1.2 | ||
| def newStyleExternalTempPath(path: Path): Path = { | ||
| val extURI: URI = path.toUri | ||
| if (extURI.getScheme == "viewfs") { | ||
| getExtTmpPathRelTo(path.getParent, hadoopConf) | ||
| getExtTmpPathRelTo(path.getParent) | ||
| } else { | ||
| new Path(getExternalScratchDir(extURI, hadoopConf), "-ext-10000") | ||
| new Path(getExternalScratchDir(extURI), "-ext-10000") | ||
| } | ||
| } | ||
|
|
||
| def getExtTmpPathRelTo(path: Path, hadoopConf: Configuration): Path = { | ||
| new Path(getStagingDir(path, hadoopConf), "-ext-10000") // Hive uses 10000 | ||
| def getExtTmpPathRelTo(path: Path): Path = { | ||
| new Path(getStagingDir(path), "-ext-10000") // Hive uses 10000 | ||
| } | ||
|
|
||
| private def saveAsHiveFile( | ||
|
|
@@ -172,7 +212,7 @@ case class InsertIntoHiveTable( | |
| // instances within the closure, since Serializer is not serializable while TableDesc is. | ||
| val tableDesc = table.tableDesc | ||
| val tableLocation = table.hiveQlTable.getDataLocation | ||
| val tmpLocation = getExternalTmpPath(tableLocation, hadoopConf) | ||
| val tmpLocation = getExternalTmpPath(tableLocation) | ||
| val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) | ||
| val isCompressed = hadoopConf.get("hive.exec.compress.output", "false").toBoolean | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default value of the
hive.exec.scratchdir also depends on the version.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yea that's true, but I don't think the default value here is a big deal and need to worry about.