[SPARK-5341] Use maven coordinates as dependencies in spark-shell and spark-submit #4215
bin/utils.sh

```diff
@@ -26,14 +26,14 @@ function gatherSparkSubmitOpts() {
     exit 1
   fi
 
-  # NOTE: If you add or remove spark-sumbmit options,
+  # NOTE: If you add or remove spark-submit options,
   # modify NOT ONLY this script but also SparkSubmitArgument.scala
   SUBMISSION_OPTS=()
   APPLICATION_OPTS=()
   while (($#)); do
     case "$1" in
-      --master | --deploy-mode | --class | --name | --jars | --py-files | --files | \
-      --conf | --properties-file | --driver-memory | --driver-java-options | \
+      --master | --deploy-mode | --class | --name | --jars | --maven | --py-files | --files | \
+      --conf | --maven_repos | --properties-file | --driver-memory | --driver-java-options | \
       --driver-library-path | --driver-class-path | --executor-memory | --driver-cores | \
       --total-executor-cores | --executor-cores | --queue | --num-executors | --archives)
         if [[ $# -lt 2 ]]; then
```

Contributor (on `--maven_repos`): Rename this to `--maven-repos` with a dash instead of an underscore; everything else has a dash.
core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

```diff
@@ -21,10 +21,19 @@ import java.io.{File, PrintStream}
 import java.lang.reflect.{Modifier, InvocationTargetException}
 import java.net.URL
 
+import org.apache.ivy.Ivy
+import org.apache.ivy.core.module.descriptor.{DefaultDependencyDescriptor, DefaultModuleDescriptor}
+import org.apache.ivy.core.module.id.ModuleRevisionId
+import org.apache.ivy.core.report.ResolveReport
+import org.apache.ivy.core.resolve.ResolveOptions
+import org.apache.ivy.core.retrieve.RetrieveOptions
+import org.apache.ivy.core.settings.IvySettings
+import org.apache.ivy.plugins.resolver.IBiblioResolver
+
 import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
 
 import org.apache.spark.executor.ExecutorURLClassLoader
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{MavenCoordinate, Utils}
 
 /**
  * Main gateway of launching a Spark application.
```
```diff
@@ -56,6 +65,12 @@ object SparkSubmit {
 
   private val CLASS_NOT_FOUND_EXIT_STATUS = 101
 
+  // Directories for caching downloads through ivy and storing the jars when maven coordinates are
+  // supplied to spark-submit
+  // TODO: Take these as arguments? For example, on AWS /mnt/ is a better location.
+  private val IVY_CACHE = new File("ivy/cache")
+  private val MAVEN_JARS = new File("ivy/jars")
+
   // Exposed for testing
   private[spark] var exitFn: () => Unit = () => System.exit(-1)
   private[spark] var printStream: PrintStream = System.err
```
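Note that both locations are relative paths: `java.io.File` resolves them against the JVM's working directory, which is what the TODO above is getting at. A quick illustration of that behavior (the launch directory below is only an example):

```scala
import java.io.File

// A relative File resolves against the process working directory (user.dir)
val ivyCache = new File("ivy/cache")
println(ivyCache.getAbsolutePath)
// e.g. /home/user/ivy/cache if spark-submit was launched from /home/user
```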
```diff
@@ -168,6 +183,16 @@
     // Special flag to avoid deprecation warnings at the client
     sysProps("SPARK_SUBMIT") = "true"
 
+    // Resolve maven dependencies if there are any and add classpath to jars
+    val resolvedMavenCoordinates = resolveMavenCoordinates(args.maven, args.maven_repos)
+    if (!resolvedMavenCoordinates.trim.isEmpty) {
+      if (args.jars == null || args.jars.trim.isEmpty) {
+        args.jars = resolvedMavenCoordinates
+      } else {
+        args.jars += s",$resolvedMavenCoordinates"
+      }
+    }
+
     // A list of rules to map each argument to system properties or command-line options in
     // each deploy mode; we iterate through these below
     val options = List[OptionAssigner](
```
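The merge above can be read as a small pure function; here is a standalone sketch of the same rule (the function name is ours, not part of the diff):

```scala
// Append Ivy-resolved jar paths to an existing comma-separated --jars value,
// mirroring the branch structure in the hunk above.
def mergeJars(jars: String, resolvedMavenCoordinates: String): String =
  if (resolvedMavenCoordinates.trim.isEmpty) jars
  else if (jars == null || jars.trim.isEmpty) resolvedMavenCoordinates
  else s"$jars,$resolvedMavenCoordinates"

assert(mergeJars(null, "ivy/jars/b.jar,ivy/jars/c.jar") == "ivy/jars/b.jar,ivy/jars/c.jar")
assert(mergeJars("a.jar", "ivy/jars/b.jar") == "a.jar,ivy/jars/b.jar")
```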
```diff
@@ -429,6 +454,80 @@
       .mkString(",")
     if (merged == "") null else merged
   }
+
+  /**
+   * Resolves any dependencies that were supplied through maven coordinates
+   * @param coordinates Comma-delimited string of maven coordinates
+   * @param remoteRepos Comma-delimited string of remote repositories other than maven central
+   * @return The comma-delimited path to the jars of the given maven artifacts including their
+   *         transitive dependencies
+   */
+  private def resolveMavenCoordinates(coordinates: String, remoteRepos: String): String = {
+    if (coordinates == null || coordinates.trim.isEmpty) {
+      ""
+    } else {
+      val artifacts = coordinates.split(",").map { p =>
+        val splits = p.split(":")
+        require(splits.length == 3, s"Provided Maven Coordinates must be in the form " +
+          s"'groupId:artifactId:version'. The coordinate provided is: $p")
+        new MavenCoordinate(splits(0), splits(1), splits(2))
+      }
+      // create an ivy instance
+      val ivySettings: IvySettings = new IvySettings
+      ivySettings.setDefaultCache(IVY_CACHE)
+
+      // the biblio resolver resolves POM declared dependencies
+      val br: IBiblioResolver = new IBiblioResolver
+      br.setM2compatible(true)
+      br.setUsepoms(true)
+      br.setName("central")
+      ivySettings.addResolver(br)
+      // add any other remote repositories other than maven central
+      if (remoteRepos != null && !remoteRepos.trim.isEmpty) {
+        remoteRepos.split(",").foreach { repo =>
+          val brr: IBiblioResolver = new IBiblioResolver
+          brr.setM2compatible(true)
+          brr.setUsepoms(true)
+          brr.setRoot(repo)
+          ivySettings.addResolver(brr)
+        }
+      }
+      ivySettings.setDefaultResolver(br.getName)
+
+      val ivy = Ivy.newInstance(ivySettings)
+      // Set resolve options to download transitive dependencies as well
+      val ro = new ResolveOptions
+      ro.setTransitive(true)
+      ro.setDownload(true)
+      // A Module descriptor must be specified. Entries are dummy strings
+      val md = DefaultModuleDescriptor.newDefaultInstance(
+        ModuleRevisionId.newInstance("org.apache.spark", "spark-submit-envelope", "1.0"))
+
+      artifacts.foreach { mvn =>
+        val ri = ModuleRevisionId.newInstance(mvn.groupId, mvn.artifactId, mvn.version)
+        val dd = new DefaultDependencyDescriptor(ri, false, false)
+        dd.addDependencyConfiguration("default", "default")
+        md.addDependency(dd)
+      }
+      // resolve dependencies
+      val rr: ResolveReport = ivy.resolve(md, ro)
+      if (rr.hasError) {
+        throw new RuntimeException(rr.getAllProblemMessages.toString)
+      }
+      // retrieve all resolved dependencies
+      val m = rr.getModuleDescriptor
```

Contributor (on `val m = rr.getModuleDescriptor`): I find a lot of these single-letter variable names to be really hard to read. In a few cases, such as this one, I think they're not necessary: we can just inline this on the next line, for example.
```diff
+      ivy.retrieve(m.getModuleRevisionId,
+        MAVEN_JARS.getAbsolutePath + "/[artifact](-[classifier]).[ext]",
+        new RetrieveOptions().setConfs(Array("default")))
+
+      // output downloaded jars to classpath (will append to jars). The name of the jar is given
+      // after a `!` by Ivy.
+      rr.getArtifacts.toArray.map { case artifactInfo =>
+        val artifactString = artifactInfo.toString
+        MAVEN_JARS.getAbsolutePath + "/" + artifactString.drop(artifactString.lastIndexOf("!") + 1)
+      }.mkString(",")
+    }
+  }
 }
 
 /**
```
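To make the data flow concrete, here is a hedged sketch of how `resolveMavenCoordinates` gets called from the earlier hunk; the coordinate is purely illustrative, and the returned jar names depend on what Ivy actually resolves:

```scala
// Illustrative call, matching the call site in the @@ -168,6 hunk above
val jarPaths = resolveMavenCoordinates(
  "com.databricks:spark-avro_2.10:0.1", // value a user would pass via --maven
  null)                                 // no additional --maven_repos
// Returns something like "ivy/jars/spark-avro_2.10.jar,ivy/jars/avro.jar":
// comma-delimited local paths under MAVEN_JARS, ready to append to args.jars
```

Because the result is a plain comma-delimited string, it can be merged into `args.jars` without any special handling downstream.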
core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

```diff
@@ -50,6 +50,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
   var name: String = null
   var childArgs: ArrayBuffer[String] = new ArrayBuffer[String]()
   var jars: String = null
+  var maven: String = null
+  var maven_repos: String = null
   var verbose: Boolean = false
   var isPython: Boolean = false
   var pyFiles: String = null
```
```diff
@@ -224,6 +226,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
    |  name                    $name
    |  childArgs               [${childArgs.mkString(" ")}]
    |  jars                    $jars
+   |  maven                   $maven
+   |  maven_repos             $maven_repos
    |  verbose                 $verbose
    |
    |Spark properties used, including those specified through
```
```diff
@@ -330,6 +334,14 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
         jars = Utils.resolveURIs(value)
         parse(tail)
 
+      case ("--maven") :: value :: tail =>
+        maven = value
+        parse(tail)
+
+      case ("--maven_repos") :: value :: tail =>
+        maven_repos = value
+        parse(tail)
+
       case ("--conf" | "-c") :: value :: tail =>
         value.split("=", 2).toSeq match {
           case Seq(k, v) => sparkProperties(k) = v
```
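The parser consumes the argument list recursively, peeling off one flag and its value per case. A self-contained sketch of that cons-pattern technique (`parseOpts` and its accumulator are ours, not Spark's):

```scala
// Recursive option parsing via list cons-patterns, as in parse(...) above
def parseOpts(opts: List[String],
              acc: Map[String, String] = Map.empty): Map[String, String] = opts match {
  case "--maven" :: value :: tail       => parseOpts(tail, acc + ("maven" -> value))
  case "--maven_repos" :: value :: tail => parseOpts(tail, acc + ("maven_repos" -> value))
  case _ :: tail                        => parseOpts(tail, acc) // ignore everything else
  case Nil                              => acc
}

// parseOpts(List("--maven", "g:a:1.0", "--maven_repos", "http://repo.example.com"))
//   == Map("maven" -> "g:a:1.0", "maven_repos" -> "http://repo.example.com")
```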
```diff
@@ -380,6 +392,12 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St
        |  --name NAME                 A name of your application.
        |  --jars JARS                 Comma-separated list of local jars to include on the driver
        |                              and executor classpaths.
+       |  --maven                     Comma-separated list of maven coordinates of jars to include
+       |                              on the driver and executor classpaths. Will search the local
+       |                              maven repo, then maven central and any additional remote
+       |                              repositories given by --maven_repos.
+       |  --maven_repos               Supply additional remote repositories to search for the
+       |                              maven coordinates given with --maven.
        |  --py-files PY_FILES         Comma-separated list of .zip, .egg, or .py files to place
        |                              on the PYTHONPATH for Python apps.
        |  --files FILES               Comma-separated list of files to be placed in the working
```

Contributor (on the `--maven` help text): Instead of taking one parameter with a list of all Maven packages, we might want to allow separate packages to be passed with separate `--maven` flags.

Contributor: Also this should say the format, i.e. `groupId:artifactId:version`.

Contributor (on `--maven_repos`): You should say this is a comma-separated list.
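As the review comments note, the help text should spell out the expected coordinate format. Per the `require` check in SparkSubmit.scala above, each coordinate must be a `groupId:artifactId:version` triple; a minimal illustration (the coordinate value is an example):

```scala
val coordinate = "org.apache.spark:spark-streaming-kafka_2.10:1.2.0"
val splits = coordinate.split(":")
require(splits.length == 3,
  s"Provided Maven Coordinates must be in the form 'groupId:artifactId:version'. " +
  s"The coordinate provided is: $coordinate")
val (groupId, artifactId, version) = (splits(0), splits(1), splits(2))
```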
Contributor (on the `--maven` option name): For this one maybe we could call it `--packages`. IMO "maven" is a little confusing because it's also the name of a piece of software. I'd also just say `--repositories` below.