diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 9bdbfb33bf54..58d686681709 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -35,15 +35,28 @@ class SparkHadoopUtil {
   val conf: Configuration = newConfiguration()
   UserGroupInformation.setConfiguration(conf)
 
-  def runAsUser(user: String)(func: () => Unit) {
+  /** Creates a UserGroupInformation for Spark based on SPARK_USER environment variable. */
+  def createSparkUser(): Option[UserGroupInformation] = {
+    val user = Option(System.getenv("SPARK_USER")).getOrElse(SparkContext.SPARK_UNKNOWN_USER)
     if (user != SparkContext.SPARK_UNKNOWN_USER) {
-      val ugi = UserGroupInformation.createRemoteUser(user)
-      transferCredentials(UserGroupInformation.getCurrentUser(), ugi)
-      ugi.doAs(new PrivilegedExceptionAction[Unit] {
-        def run: Unit = func()
-      })
+      Some(UserGroupInformation.createRemoteUser(user))
     } else {
-      func()
+      None
+    }
+  }
+
+  /**
+   * If a user is specified, we will run the function as that user. We additionally transfer
+   * Spark's tokens to the given UGI to ensure it has access to data written by Spark.
+   */
+  def runAsUser(user: Option[UserGroupInformation])(func: () => Unit) {
+    user match {
+      case Some(ugi) => {
+        transferCredentials(UserGroupInformation.getCurrentUser(), ugi)
+        ugi.doAs(new PrivilegedExceptionAction[Unit] {
+          def run: Unit = func()
+        })}
+      case None => func()
     }
   }
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
index 272bcda5f8f2..e2a83ba42513 100644
--- a/core/src/main/scala/org/apache/spark/executor/Executor.scala
+++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -128,7 +128,13 @@ private[spark] class Executor(
   // Maintains the list of running tasks.
   private val runningTasks = new ConcurrentHashMap[Long, TaskRunner]
 
-  val sparkUser = Option(System.getenv("SPARK_USER")).getOrElse(SparkContext.SPARK_UNKNOWN_USER)
+  // NB: Workaround for SPARK-1676. Caching UGIs prevents continuously creating FileSystem
+  // objects with "unique" UGIs, but is not a good solution if real UGIs and tokens are needed,
+  // mainly because expired tokens cannot be removed from the UGI.
+  val cacheUgi = conf.getBoolean("spark.user.cacheUserGroupInformation", true)
+
+  val cachedSparkUser = SparkHadoopUtil.get.createSparkUser()
+  def getSparkUser = if (cacheUgi) cachedSparkUser else SparkHadoopUtil.get.createSparkUser()
 
   def launchTask(context: ExecutorBackend, taskId: Long, serializedTask: ByteBuffer) {
     val tr = new TaskRunner(context, taskId, serializedTask)
@@ -172,7 +178,7 @@
     }
   }
 
-  override def run(): Unit = SparkHadoopUtil.get.runAsUser(sparkUser) { () =>
+  override def run(): Unit = SparkHadoopUtil.get.runAsUser(getSparkUser) { () =>
     val startTime = System.currentTimeMillis()
     SparkEnv.set(env)
     Thread.currentThread.setContextClassLoader(replClassLoader)
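
Note (illustration, not part of the patch): the leak this change works around stems from Hadoop's FileSystem cache being keyed on the UGI in addition to the URI. Two separately created UGIs never compare equal, so creating a fresh UGI per task means every FileSystem.get() call can add another cached FileSystem. A minimal standalone sketch of the difference; the object name and the "spark" user are illustrative:

```scala
import java.security.PrivilegedExceptionAction

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.security.UserGroupInformation

object UgiCacheSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()

    // Cached UGI: every doAs block reuses the same UGI, so FileSystem.get()
    // keeps returning the same cached FileSystem instance.
    val cachedUgi = UserGroupInformation.createRemoteUser("spark")
    (1 to 3).foreach { _ =>
      cachedUgi.doAs(new PrivilegedExceptionAction[FileSystem] {
        def run(): FileSystem = FileSystem.get(conf)
      })
    }

    // Fresh UGI per call: each UGI is a distinct cache key, so each
    // FileSystem.get() can create and retain another FileSystem instance.
    (1 to 3).foreach { _ =>
      val freshUgi = UserGroupInformation.createRemoteUser("spark")
      freshUgi.doAs(new PrivilegedExceptionAction[FileSystem] {
        def run(): FileSystem = FileSystem.get(conf)
      })
    }
  }
}
```

This is why the patch reuses a single cachedSparkUser unless spark.user.cacheUserGroupInformation is set to false.
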
diff --git a/docs/configuration.md b/docs/configuration.md
index e7e1dd56cf12..af3c3e32fc98 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -679,6 +679,17 @@ Apart from these, the following properties are also available, and may be useful
     Set a special library path to use when launching executor JVM's.
   </td>
 </tr>
+<tr>
+  <td>spark.user.cacheUserGroupInformation</td>
+  <td>true</td>
+  <td>
+    Caching UGIs is a workaround for [SPARK-1676](https://issues.apache.org/jira/browse/SPARK-1676)
+    for users who do not rely heavily on Hadoop security. Caching UGIs can produce security-related
+    exceptions when tokens expire or are shared between users. On the other hand, not caching UGIs
+    means that every FileSystem.get() call can create and cache a new FileSystem object, which
+    leads to leaked memory and file descriptors.
+  </td>
+</tr>
diff --git a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 2f74965900ba..edb2de1fa4b6 100644
--- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -70,8 +70,7 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
   private var registered = false
 
-  private val sparkUser = Option(System.getenv("SPARK_USER")).getOrElse(
-    SparkContext.SPARK_UNKNOWN_USER)
+  private val sparkUser = SparkHadoopUtil.get.createSparkUser()
 
   def run() {
     // Setup the directories so things go to yarn approved directories rather
diff --git a/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
index 90e807160d4b..64fba45ef28a 100644
--- a/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
+++ b/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala
@@ -72,8 +72,7 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
   private var registered = false
 
-  private val sparkUser = Option(System.getenv("SPARK_USER")).getOrElse(
-    SparkContext.SPARK_UNKNOWN_USER)
+  private val sparkUser = SparkHadoopUtil.get.createSparkUser()
 
   def run() {
     // Setup the directories so things go to YARN approved directories rather
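
For reference, a caller of the new API ends up with roughly the shape below. This is a sketch mirroring how Executor wires createSparkUser, the cache flag, and runAsUser together; the TaskHost class and runTask method are hypothetical names, not part of the patch:

```scala
import org.apache.spark.deploy.SparkHadoopUtil

// Hypothetical host class showing the intended usage pattern of the new API.
class TaskHost(cacheUgi: Boolean) {
  // Created once and reused for every task when caching is enabled.
  private val cachedSparkUser = SparkHadoopUtil.get.createSparkUser()

  // Either the cached UGI or a freshly created one per invocation.
  private def sparkUser = if (cacheUgi) cachedSparkUser else SparkHadoopUtil.get.createSparkUser()

  def runTask(body: () => Unit): Unit = {
    // runAsUser transfers Spark's tokens to the UGI (if any) and runs body inside doAs.
    SparkHadoopUtil.get.runAsUser(sparkUser) { () =>
      body()
    }
  }
}
```

With spark.user.cacheUserGroupInformation left at its default of true, every task reuses the cached UGI; setting it to false restores the previous one-UGI-per-task behavior at the cost of the FileSystem cache growth described above.
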