[SPARK-5979][SPARK-6031][SPARK-6032][SPARK-6047] Refactoring for --packages -> Move to SparkSubmitDriverBootstrapper #4754
Changes from all commits: 838d301, 9bacee8, 941c65e, e3ca1b7, 5191f3a, 0070af8, d9e3cf0, c9c11ef, f25c55b, 43c3cb2, a4bc489, 12e2764, f06a754, c73aabe, 7f958c1, b7a9e93, 994869e, 44dbf67
First changed file (object SparkSubmit / SparkSubmitUtils):

@@ -251,24 +251,19 @@ object SparkSubmit {
     }

     val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER

     // Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files
     // too for packages that include Python code
-    val resolvedMavenCoordinates =
-      SparkSubmitUtils.resolveMavenCoordinates(
-        args.packages, Option(args.repositories), Option(args.ivyRepoPath))
-    if (!resolvedMavenCoordinates.trim.isEmpty) {
-      if (args.jars == null || args.jars.trim.isEmpty) {
-        args.jars = resolvedMavenCoordinates
-      } else {
-        args.jars += s",$resolvedMavenCoordinates"
-      }
+    val packagesResolved =
+      if (args.packagesResolved != null) {
+        // SparkSubmitDriverBootstrapper already downloaded the jars for us
+        args.packagesResolved
+      } else {
+        SparkSubmitUtils.resolveMavenCoordinates(args.packages, Option(args.repositories),
+          Option(args.ivyRepoPath)).mkString(",")
+      }
+
+    if (packagesResolved.nonEmpty) {
+      args.jars = mergeFileLists(args.jars, packagesResolved)
       if (args.isPython) {
-        if (args.pyFiles == null || args.pyFiles.trim.isEmpty) {
-          args.pyFiles = resolvedMavenCoordinates
-        } else {
-          args.pyFiles += s",$resolvedMavenCoordinates"
-        }
+        args.pyFiles = mergeFileLists(args.pyFiles, packagesResolved)
       }
     }
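The refactored branch merges the resolved package jars into args.jars and args.pyFiles with mergeFileLists rather than the null/empty-string special cases it replaces. A minimal sketch of what such a comma-list merge could look like (an illustrative stand-in, not Spark's mergeFileLists; its behavior is assumed from how the hunk uses it):

```scala
object MergeFileListsExample {
  // Hypothetical stand-in for SparkSubmit.mergeFileLists: join comma-delimited
  // lists while skipping null or blank entries, so callers need no null checks.
  def mergeFileLists(lists: String*): String =
    lists.filter(s => s != null && s.trim.nonEmpty).mkString(",")

  def main(args: Array[String]): Unit = {
    println(mergeFileLists(null, "dep1.jar,dep2.jar"))   // dep1.jar,dep2.jar
    println(mergeFileLists("app.jar", "dep1.jar"))       // app.jar,dep1.jar
  }
}
```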
@@ -655,8 +650,7 @@ private[spark] object SparkSubmitUtils {

   /**
    * Extracts maven coordinates from a comma-delimited string. Coordinates should be provided
-   * in the format `groupId:artifactId:version` or `groupId/artifactId:version`. The latter provides
-   * simplicity for Spark Package users.
+   * in the format `groupId:artifactId:version` or `groupId/artifactId:version`.
    * @param coordinates Comma-delimited string of maven coordinates
    * @return Sequence of Maven coordinates
    */
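The doc comment above describes the two accepted coordinate formats. A hedged sketch of that parsing, not the actual extractMavenCoordinates implementation (the MavenCoordinate case class and the error message are assumptions):

```scala
object CoordinateParsingExample {
  // Hypothetical sketch: both "groupId:artifactId:version" and
  // "groupId/artifactId:version" are accepted, per the doc comment.
  case class MavenCoordinate(groupId: String, artifactId: String, version: String)

  def extractCoordinates(coordinates: String): Seq[MavenCoordinate] = {
    coordinates.split(",").map { p =>
      // Normalize the Spark Packages style "groupId/artifactId:version" first.
      val splits = p.replace("/", ":").split(":")
      require(splits.length == 3,
        s"Coordinates must be of the form 'groupId:artifactId:version'; got: $p")
      MavenCoordinate(splits(0), splits(1), splits(2))
    }
  }

  def main(args: Array[String]): Unit = {
    extractCoordinates("com.databricks/spark-csv_2.10:1.0.0").foreach(println)
    // MavenCoordinate(com.databricks,spark-csv_2.10,1.0.0)
  }
}
```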
@@ -721,17 +715,17 @@ private[spark] object SparkSubmitUtils {
    * after a '!' by Ivy. It also sometimes contains '(bundle)' after '.jar'. Remove that as well.
    * @param artifacts Sequence of dependencies that were resolved and retrieved
    * @param cacheDirectory directory where jars are cached
-   * @return a comma-delimited list of paths for the dependencies
+   * @return A sequence of paths for the dependencies
    */
   private[spark] def resolveDependencyPaths(
       artifacts: Array[AnyRef],
-      cacheDirectory: File): String = {
+      cacheDirectory: File): Seq[String] = {
     artifacts.map { artifactInfo =>
       val artifactString = artifactInfo.toString
       val jarName = artifactString.drop(artifactString.lastIndexOf("!") + 1)
       cacheDirectory.getAbsolutePath + File.separator +
         jarName.substring(0, jarName.lastIndexOf(".jar") + 4)
-    }.mkString(",")
+    }
   }

   /** Adds the given maven coordinates to Ivy's module descriptor. */

Contributor comment (on the changed return type of resolveDependencyPaths): same here, update java doc
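The transformation in resolveDependencyPaths takes Ivy's artifact toString, which prefixes the jar name with the module id and a '!' and may append '(bundle)', and turns it into a path under the cache directory. A small, self-contained illustration of the same string munging (the sample artifact string is invented):

```scala
import java.io.File

object ResolveDependencyPathsExample {
  // Same transformation as in resolveDependencyPaths: keep only what follows the
  // last '!' and cut everything after ".jar" (e.g. a trailing "(bundle)").
  def toPath(artifactString: String, cacheDirectory: File): String = {
    val jarName = artifactString.drop(artifactString.lastIndexOf("!") + 1)
    cacheDirectory.getAbsolutePath + File.separator +
      jarName.substring(0, jarName.lastIndexOf(".jar") + 4)
  }

  def main(args: Array[String]): Unit = {
    // Invented example input resembling an Ivy artifact's toString
    val artifact = "com.example#my-lib;1.0!my-lib.jar(bundle)"
    println(toPath(artifact, new File("/tmp/ivy/jars")))
    // -> /tmp/ivy/jars/my-lib.jar
  }
}
```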
@@ -748,6 +742,35 @@ private[spark] object SparkSubmitUtils {
     }
   }

+  /** Add exclusion rules for dependencies already included in the spark-assembly */
+  private[spark] def addExclusionRules(
+      ivySettings: IvySettings,
+      ivyConfName: String,
+      md: DefaultModuleDescriptor): Unit = {
+    // Add scala exclusion rule
+    val scalaArtifacts = new ArtifactId(new ModuleId("*", "scala-library"), "*", "*", "*")
+    val scalaDependencyExcludeRule =
+      new DefaultExcludeRule(scalaArtifacts, ivySettings.getMatcher("glob"), null)
+    scalaDependencyExcludeRule.addConfiguration(ivyConfName)
+    md.addExcludeRule(scalaDependencyExcludeRule)
+
+    // We need to specify each component explicitly, otherwise we miss spark-streaming-kafka and
+    // other spark-streaming utility components. Underscore is there to differentiate between
+    // spark-streaming_2.1x and spark-streaming-kafka-assembly_2.1x
+    val components = Seq("bagel_", "catalyst_", "core_", "graphx_", "hive_", "mllib_", "repl_",
+      "sql_", "streaming_", "yarn_", "network-common_", "network-shuffle_", "network-yarn_")
+
+    components.foreach { comp =>
+      val sparkArtifacts =
+        new ArtifactId(new ModuleId("org.apache.spark", s"spark-$comp*"), "*", "*", "*")
+      val sparkDependencyExcludeRule =
+        new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null)
+      sparkDependencyExcludeRule.addConfiguration(ivyConfName)
+      md.addExcludeRule(sparkDependencyExcludeRule)
+    }
+  }
+
   /** A nice function to use in tests as well. Values are dummy strings. */
   private[spark] def getModuleDescriptor = DefaultModuleDescriptor.newDefaultInstance(
     ModuleRevisionId.newInstance("org.apache.spark", "spark-submit-parent", "1.0"))
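The comment in the added method explains why each Spark component is listed with a trailing underscore: a glob like `spark-streaming_*` should exclude `spark-streaming_2.10` (already in the assembly) while still letting `spark-streaming-kafka-assembly_2.10` be fetched. A small, self-contained check of that intent, with a naive string glob standing in for Ivy's glob matcher:

```scala
object ExclusionGlobCheck {
  // Naive glob supporting only '*' wildcards, standing in for Ivy's "glob" matcher.
  def globMatches(pattern: String, name: String): Boolean = {
    val regex = pattern.split("\\*", -1).map(java.util.regex.Pattern.quote).mkString(".*")
    name.matches(regex)
  }

  def main(args: Array[String]): Unit = {
    val pattern = "spark-streaming_*" // built from the "streaming_" entry in `components`
    // Excluded: the assembly already ships spark-streaming itself.
    println(globMatches(pattern, "spark-streaming_2.10"))                 // true
    // Not excluded: the kafka assembly is exactly what --packages should fetch.
    println(globMatches(pattern, "spark-streaming-kafka-assembly_2.10"))  // false
  }
}
```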
@@ -757,17 +780,20 @@ private[spark] object SparkSubmitUtils {
    * @param coordinates Comma-delimited string of maven coordinates
    * @param remoteRepos Comma-delimited string of remote repositories other than maven central
    * @param ivyPath The path to the local ivy repository
-   * @return The comma-delimited path to the jars of the given maven artifacts including their
+   * @return A sequence of paths to the jars of the given maven artifacts including their
    *         transitive dependencies
    */
   private[spark] def resolveMavenCoordinates(
       coordinates: String,
       remoteRepos: Option[String],
       ivyPath: Option[String],
-      isTest: Boolean = false): String = {
+      isTest: Boolean = false): Seq[String] = {
     if (coordinates == null || coordinates.trim.isEmpty) {
-      ""
+      Seq.empty
     } else {
       val sysOut = System.out
       // To prevent ivy from logging to system out
       System.setOut(printStream)
       val artifacts = extractMavenCoordinates(coordinates)
       // Default configuration name for ivy
       val ivyConfName = "default"

Contributor comment (on the changed signature of resolveMavenCoordinates): you'll need to update the javadoc to reflect this change. Right now it still says comma-delimited path
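With the return type changed from a comma-delimited String to Seq[String], each caller picks its own separator, which is what the two call sites in this PR do: SparkSubmit joins with commas for --jars / --py-files, and the bootstrapper joins with the platform path separator for the driver classpath. A self-contained sketch of those two joins (the paths are invented):

```scala
object ResolvedPathsJoinExample {
  def main(args: Array[String]): Unit = {
    // Pretend result of resolveMavenCoordinates (paths are invented):
    val jarPaths: Seq[String] = Seq("/home/user/.ivy2/jars/a.jar", "/home/user/.ivy2/jars/b.jar")

    // SparkSubmit merges these into --jars / --py-files with commas:
    println(jarPaths.mkString(","))
    // SparkSubmitDriverBootstrapper appends them to the driver classpath with the
    // platform path separator:
    println(jarPaths.mkString(sys.props("path.separator")))
  }
}
```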
@@ -811,19 +837,9 @@ private[spark] object SparkSubmitUtils {
       val md = getModuleDescriptor
       md.setDefaultConf(ivyConfName)

-      // Add an exclusion rule for Spark and Scala Library
-      val sparkArtifacts = new ArtifactId(new ModuleId("org.apache.spark", "*"), "*", "*", "*")
-      val sparkDependencyExcludeRule =
-        new DefaultExcludeRule(sparkArtifacts, ivySettings.getMatcher("glob"), null)
-      sparkDependencyExcludeRule.addConfiguration(ivyConfName)
-      val scalaArtifacts = new ArtifactId(new ModuleId("*", "scala-library"), "*", "*", "*")
-      val scalaDependencyExcludeRule =
-        new DefaultExcludeRule(scalaArtifacts, ivySettings.getMatcher("glob"), null)
-      scalaDependencyExcludeRule.addConfiguration(ivyConfName)
-
-      // Exclude any Spark dependencies, and add all supplied maven artifacts as dependencies
-      md.addExcludeRule(sparkDependencyExcludeRule)
-      md.addExcludeRule(scalaDependencyExcludeRule)
+      // Add exclusion rules for Spark and Scala Library
+      addExclusionRules(ivySettings, ivyConfName, md)
+      // add all supplied maven artifacts as dependencies
       addDependenciesToIvy(md, artifacts, ivyConfName)

       // resolve dependencies
@@ -836,6 +852,7 @@ private[spark] object SparkSubmitUtils {
         packagesDirectory.getAbsolutePath + File.separator + "[artifact](-[classifier]).[ext]",
         retrieveOptions.setConfs(Array(ivyConfName)))

+      System.setOut(sysOut)
       resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory)
     }
   }
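The added `System.setOut(sysOut)` restores the real stdout after resolution; earlier in the method, System.out is redirected to the utility's own printStream so Ivy's console output does not pollute stdout. A minimal, self-contained sketch of that redirect-and-restore pattern (using try/finally here, which the PR code does not; the buffer and messages are invented):

```scala
import java.io.{ByteArrayOutputStream, PrintStream}

object StdoutRedirectExample {
  def main(args: Array[String]): Unit = {
    val sysOut = System.out                    // remember the real stdout
    val buffer = new ByteArrayOutputStream()
    System.setOut(new PrintStream(buffer))     // silence noisy output (Ivy, in the PR)
    try {
      println("chatty resolver output")        // goes to the buffer, not the console
    } finally {
      System.setOut(sysOut)                    // always restore stdout
    }
    println(s"captured ${buffer.size()} bytes of resolver output")
  }
}
```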
Second changed file (object SparkSubmitDriverBootstrapper):

@@ -44,7 +44,7 @@ private[spark] object SparkSubmitDriverBootstrapper {
       System.exit(1)
     }

-    val submitArgs = args
+    var submitArgs = args
     val runner = sys.env("RUNNER")
     val classpath = sys.env("CLASSPATH")
     val javaOpts = sys.env("JAVA_OPTS")
@@ -58,6 +58,8 @@ private[spark] object SparkSubmitDriverBootstrapper {
     val submitLibraryPath = sys.env.get("SPARK_SUBMIT_LIBRARY_PATH")
     val submitClasspath = sys.env.get("SPARK_SUBMIT_CLASSPATH")
     val submitJavaOpts = sys.env.get("SPARK_SUBMIT_OPTS")
+    val submitPackages = sys.env.getOrElse("SPARK_SUBMIT_PACKAGES", "")
+    val submitRepositories = sys.env.get("SPARK_SUBMIT_REPOSITORIES")

     assume(runner != null, "RUNNER must be set")
     assume(classpath != null, "CLASSPATH must be set")
@@ -73,6 +75,7 @@ private[spark] object SparkSubmitDriverBootstrapper {
     val confLibraryPath = properties.get("spark.driver.extraLibraryPath")
     val confClasspath = properties.get("spark.driver.extraClassPath")
     val confJavaOpts = properties.get("spark.driver.extraJavaOptions")
+    val confIvyRepo = properties.get("spark.jars.ivy")

     // Favor Spark submit arguments over the equivalent configs in the properties file.
     // Note that we do not actually use the Spark submit values for library path, classpath,
@@ -82,13 +85,25 @@ private[spark] object SparkSubmitDriverBootstrapper {
       .orElse(confDriverMemory)
       .getOrElse(defaultDriverMemory)

-    val newClasspath =
+    var newClasspath =
       if (submitClasspath.isDefined) {
         classpath
       } else {
         classpath + confClasspath.map(sys.props("path.separator") + _).getOrElse("")
       }

+    // Resolve maven dependencies if there are any and add them to classpath. Also send them
+    // to SparkSubmit so that they can be shipped to executors.
+    val resolvedMavenCoordinates =
+      SparkSubmitUtils.resolveMavenCoordinates(
+        submitPackages, submitRepositories, confIvyRepo)
+    if (resolvedMavenCoordinates.nonEmpty) {
+      newClasspath += sys.props("path.separator") +
+        resolvedMavenCoordinates.mkString(sys.props("path.separator"))
+      submitArgs =
+        Array("--packages-resolved", resolvedMavenCoordinates.mkString(",")) ++ submitArgs
+    }
+
     val newJavaOpts =
       if (submitJavaOpts.isDefined) {
         // SPARK_SUBMIT_OPTS is already captured in JAVA_OPTS

Contributor comment (on the `--packages-resolved` prepend): Can we thread this through using an environment variable
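The reviewer's suggestion would pass the resolved jar list to the child spark-submit process through an environment variable instead of prepending a `--packages-resolved` flag. A hedged sketch of what that alternative might look like (the variable name SPARK_SUBMIT_PACKAGES_RESOLVED and this wiring are purely illustrative, not what the PR implements):

```scala
import scala.collection.JavaConverters._

object EnvThreadingSketch {
  // Illustrative only: hand the resolved jars to the child JVM via an environment
  // variable that a hypothetical SparkSubmit counterpart would read.
  def launchWithResolvedPackages(command: Seq[String], resolvedJars: Seq[String]): Process = {
    val builder = new ProcessBuilder(command.asJava).inheritIO()
    if (resolvedJars.nonEmpty) {
      builder.environment().put("SPARK_SUBMIT_PACKAGES_RESOLVED", resolvedJars.mkString(","))
    }
    builder.start()
  }
}
```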
Reviewer comment: Looks like there is an opportunity for abstracting the two cases here:
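From the surrounding hunk, "the two cases" appear to be the parallel blocks that keep the submit-side value when it is set and otherwise fall back to the spark-defaults property (newClasspath and newJavaOpts). A hedged sketch of one way such a helper could look (name and exact semantics are assumptions, not part of the PR):

```scala
object PreferSubmitValueExample {
  // Hypothetical helper: return `base` untouched when the submit-side value is set,
  // otherwise append the value from the properties file (if any) using `sep`.
  def withConfFallback(base: String,
                       submitValue: Option[String],
                       confValue: Option[String],
                       sep: String): String = {
    if (submitValue.isDefined) base
    else base + confValue.map(sep + _).getOrElse("")
  }

  def main(args: Array[String]): Unit = {
    val pathSep = sys.props("path.separator")
    // newClasspath-style call: no submit-side classpath, so the conf value is appended.
    println(withConfFallback("spark-assembly.jar", None, Some("extra.jar"), pathSep))
    // newJavaOpts-style call: submit-side opts are already in `base`, conf value ignored.
    println(withConfFallback("-Xmx1g -XX:+UseG1GC", Some("already captured"), Some("-Dfoo=bar"), " "))
  }
}
```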