Address review feedback

Andrew Or · Andrew Or · commit 27c9a6c1c554 · 2015-03-01T22:11:28.000-08:00
Things changed in this commit:
(1) No more metadata in log content
(2) No more Spark version in log file name
(3) Use short name for compression codec in log file name
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -237,7 +237,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   private[spark] val eventLogCodec: Option[String] = {
     val compress = conf.getBoolean("spark.eventLog.compress", false)
     if (compress && isEventLogEnabled) {
-      Some(CompressionCodec.createCodec(conf)).map(_.getClass.getCanonicalName)
+      Some(CompressionCodec.getCodecName(conf)).map(CompressionCodec.getShortName)
     } else {
       None
     }
diff --git a/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala b/core/src/main/scala/org/apache/spark/deploy/ApplicationDescription.scala
@@ -23,8 +23,8 @@ private[spark] class ApplicationDescription(
     val memoryPerSlave: Int,
     val command: Command,
     var appUiUrl: String,
-    val sparkVersion: String,
     val eventLogDir: Option[String] = None,
+    // short name of compression codec used when writing event logs, if any (e.g. lzf)
     val eventLogCodec: Option[String] = None)
   extends Serializable {
 
@@ -36,11 +36,10 @@ private[spark] class ApplicationDescription(
       memoryPerSlave: Int = memoryPerSlave,
       command: Command = command,
       appUiUrl: String = appUiUrl,
-      sparkVersion: String = sparkVersion,
       eventLogDir: Option[String] = eventLogDir,
       eventLogCodec: Option[String] = eventLogCodec): ApplicationDescription =
     new ApplicationDescription(
-      name, maxCores, memoryPerSlave, command, appUiUrl, sparkVersion, eventLogDir, eventLogCodec)
+      name, maxCores, memoryPerSlave, command, appUiUrl, eventLogDir, eventLogCodec)
 
   override def toString: String = "ApplicationDescription(" + name + ")"
 }
diff --git a/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/TestClient.scala
@@ -49,8 +49,7 @@ private[spark] object TestClient {
     val (actorSystem, _) = AkkaUtils.createActorSystem("spark", Utils.localIpAddress, 0,
       conf = conf, securityManager = new SecurityManager(conf))
     val desc = new ApplicationDescription("TestClient", Some(1), 512,
-      Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()),
-      "ignored", "1.2.3")
+      Command("spark.deploy.client.TestExecutor", Seq(), Map(), Seq(), Seq(), Seq()), "ignored")
     val listener = new TestListener
     val client = new AppClient(actorSystem, Array(url), desc, listener, new SparkConf)
     client.start()
diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala
@@ -291,7 +291,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
   private def replay(eventLog: FileStatus, bus: ReplayListenerBus): FsApplicationHistoryInfo = {
     val logPath = eventLog.getPath()
     logInfo(s"Replaying log path: $logPath")
-    val (logInput, sparkVersion) =
+    val logInput =
       if (isLegacyLogDirectory(eventLog)) {
         openLegacyEventLog(logPath)
       } else {
@@ -300,7 +300,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
     try {
       val appListener = new ApplicationEventListener
       bus.addListener(appListener)
-      bus.replay(logInput, sparkVersion, logPath.toString)
+      bus.replay(logInput, logPath.toString)
       new FsApplicationHistoryInfo(
         logPath.getName(),
         appListener.appId.getOrElse(logPath.getName()),
@@ -322,28 +322,22 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
    *
    * @return input stream of the events
    */
-  private[history] def openLegacyEventLog(dir: Path): (InputStream, String) = {
+  private[history] def openLegacyEventLog(dir: Path): InputStream = {
     val children = fs.listStatus(dir)
     var eventLogPath: Path = null
     var codecName: Option[String] = None
-    var sparkVersion: String = null
 
     children.foreach { child =>
       child.getPath().getName() match {
         case name if name.startsWith(LOG_PREFIX) =>
           eventLogPath = child.getPath()
-
         case codec if codec.startsWith(COMPRESSION_CODEC_PREFIX) =>
           codecName = Some(codec.substring(COMPRESSION_CODEC_PREFIX.length()))
-
-        case version if version.startsWith(SPARK_VERSION_PREFIX) =>
-          sparkVersion = version.substring(SPARK_VERSION_PREFIX.length())
-
         case _ =>
       }
     }
 
-    if (eventLogPath == null || sparkVersion == null) {
+    if (eventLogPath == null) {
       throw new IllegalArgumentException(s"$dir is not a Spark application log directory.")
     }
 
@@ -355,7 +349,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
       }
 
     val in = new BufferedInputStream(fs.open(eventLogPath))
-    (codec.map(_.compressedInputStream(in)).getOrElse(in), sparkVersion)
+    codec.map(_.compressedInputStream(in)).getOrElse(in)
   }
 
   /**
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -756,12 +756,12 @@ private[spark] class Master(
         return false
       }
 
-      val (logInput, sparkVersion) = EventLoggingListener.openEventLog(new Path(eventLogFile), fs)
+      val logInput = EventLoggingListener.openEventLog(new Path(eventLogFile), fs)
       val replayBus = new ReplayListenerBus()
       val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf),
         appName + " (completed)", HistoryServer.UI_PATH_PREFIX + s"/${app.id}")
       try {
-        replayBus.replay(logInput, sparkVersion, eventLogFile)
+        replayBus.replay(logInput, eventLogFile)
       } finally {
         logInput.close()
       }
diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -26,7 +26,6 @@ import org.xerial.snappy.{Snappy, SnappyInputStream, SnappyOutputStream}
 import org.apache.spark.SparkConf
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.util.Utils
-import org.apache.spark.Logging
 
 /**
  * :: DeveloperApi ::
@@ -53,8 +52,12 @@ private[spark] object CompressionCodec {
     "lzf" -> classOf[LZFCompressionCodec].getName,
     "snappy" -> classOf[SnappyCompressionCodec].getName)
 
+  def getCodecName(conf: SparkConf): String = {
+    conf.get(configKey, DEFAULT_COMPRESSION_CODEC)
+  }
+
   def createCodec(conf: SparkConf): CompressionCodec = {
-    createCodec(conf, conf.get(configKey, DEFAULT_COMPRESSION_CODEC))
+    createCodec(conf, getCodecName(conf))
   }
 
   def createCodec(conf: SparkConf, codecName: String): CompressionCodec = {
@@ -71,6 +74,21 @@ private[spark] object CompressionCodec {
       s"Consider setting $configKey=$FALLBACK_COMPRESSION_CODEC"))
   }
 
+  /**
+   * Return the short version of the given codec name.
+   * If it is already a short name, just return it.
+   */
+  def getShortName(codecName: String): String = {
+    if (shortCompressionCodecNames.contains(codecName)) {
+      codecName
+    } else {
+      shortCompressionCodecNames
+        .collect { case (k, v) if v == codecName => k }
+        .headOption
+        .getOrElse { throw new IllegalArgumentException(s"No short name for codec $codecName.") }
+    }
+  }
+
   val FALLBACK_COMPRESSION_CODEC = "lzf"
   val DEFAULT_COMPRESSION_CODEC = "snappy"
   val ALL_COMPRESSION_CODECS = shortCompressionCodecNames.values.toSeq
diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala
@@ -23,14 +23,13 @@ import java.net.URI
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
-import com.google.common.base.Charsets
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
 import org.apache.hadoop.fs.permission.FsPermission
 import org.json4s.JsonAST.JValue
 import org.json4s.jackson.JsonMethods._
 
-import org.apache.spark.{Logging, SparkConf, SPARK_VERSION}
+import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.io.CompressionCodec
 import org.apache.spark.util.{JsonProtocol, Utils}
@@ -68,7 +67,9 @@ private[spark] class EventLoggingListener(
     } else {
       None
     }
-  private val compressionCodecName = compressionCodec.map(_.getClass.getCanonicalName)
+  private val compressionCodecName = compressionCodec.map { c =>
+    CompressionCodec.getShortName(c.getClass.getName)
+  }
 
   // Only defined if the file system scheme is not local
   private var hadoopDataStream: Option[FSDataOutputStream] = None
@@ -121,11 +122,8 @@ private[spark] class EventLoggingListener(
     try {
       val cstream = compressionCodec.map(_.compressedOutputStream(dstream)).getOrElse(dstream)
       val bstream = new BufferedOutputStream(cstream, outputBufferSize)
-
       fileSystem.setPermission(path, LOG_FILE_PERMISSIONS)
-
-      val logStream = initEventLog(bstream, compressionCodec)
-      writer = Some(new PrintWriter(logStream))
+      writer = Some(new PrintWriter(bstream))
       logInfo("Logging events to %s".format(logPath))
     } catch {
       case e: Exception =>
@@ -214,56 +212,21 @@ private[spark] object EventLoggingListener extends Logging {
 
   private val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort)
 
-  // To avoid corrupted files causing the heap to fill up. Value is arbitrary.
-  private val MAX_HEADER_LINE_LENGTH = 4096
-
   // A cache for compression codecs to avoid creating the same codec many times
   private val codecMap = new mutable.HashMap[String, CompressionCodec]
 
-  /**
-   * Write metadata about the event log to the given stream.
-   *
-   * The header is a single line of JSON in the beginning of the file. Note that this
-   * assumes all metadata necessary to parse the log is also included in the file name.
-   * The format needs to be kept in sync with the `openEventLog()` method below. Also, it
-   * cannot change in new Spark versions without some other way of detecting the change.
-   *
-   * @param logStream Raw output stream to the event log file.
-   * @param compressionCodec Optional compression codec to use.
-   * @return A stream to which event log data is written. This may be a wrapper around the original
-   *         stream (for example, when compression is enabled).
-   */
-  def initEventLog(
-      logStream: OutputStream,
-      compressionCodec: Option[CompressionCodec]): OutputStream = {
-    val metadata = new mutable.HashMap[String, String]
-    // Some of these metadata are already encoded in the file name
-    // Here we include them again within the file itself for completeness
-    metadata += ("Event" -> Utils.getFormattedClassName(SparkListenerMetadataIdentifier))
-    metadata += (SPARK_VERSION_KEY -> SPARK_VERSION)
-    compressionCodec.foreach { codec =>
-      metadata += (COMPRESSION_CODEC_KEY -> codec.getClass.getCanonicalName)
-    }
-    val metadataJson = compact(render(JsonProtocol.mapToJson(metadata)))
-    val metadataBytes = (metadataJson + "\n").getBytes(Charsets.UTF_8)
-    if (metadataBytes.length > MAX_HEADER_LINE_LENGTH) {
-      throw new IOException(s"Event log metadata too long: $metadataJson")
-    }
-    logStream.write(metadataBytes, 0, metadataBytes.length)
-    logStream
-  }
-
   /**
    * Return a file-system-safe path to the log file for the given application.
    *
    * Note that because we currently only create a single log file for each application,
    * we must encode all the information needed to parse this event log in the file name
    * instead of within the file itself. Otherwise, if the file is compressed, for instance,
-   * we won't know which codec to use to decompress the metadata.
+   * we won't know which codec to use to decompress the metadata needed to open the file in
+   * the first place.
    *
    * @param logBaseDir Directory where the log file will be written.
    * @param appId A unique app ID.
-   * @param compressionCodecName Name of the compression codec used to compress the contents
+   * @param compressionCodecName Name to identify the codec used to compress the contents
    *                             of the log, or None if compression is not enabled.
    * @return A path which consists of file-system-safe characters.
    */
@@ -272,22 +235,19 @@ private[spark] object EventLoggingListener extends Logging {
       appId: String,
       compressionCodecName: Option[String] = None): String = {
     val sanitizedAppId = appId.replaceAll("[ :/]", "-").replaceAll("[${}'\"]", "_").toLowerCase
-    // e.g. EVENT_LOG_app_123_SPARK_VERSION_1.3.1
-    // e.g. EVENT_LOG_ {...} _COMPRESSION_CODEC_org.apache.spark.io.LZFCompressionCodec
-    val logName = s"${sanitizedAppId}_${SPARK_VERSION_KEY}_$SPARK_VERSION" +
-      compressionCodecName.map { c => s"_${COMPRESSION_CODEC_KEY}_$c" }.getOrElse("")
+    // e.g. app_123, app_123_COMPRESSION_CODEC_lzf
+    val logName = sanitizedAppId + compressionCodecName
+      .map { c => s"_${COMPRESSION_CODEC_KEY}_$c" }
+      .getOrElse("")
     Utils.resolveURI(logBaseDir).toString.stripSuffix("/") + "/" + logName
   }
 
   /**
    * Opens an event log file and returns an input stream that contains the event data.
    *
-   * The first line of the returned input stream is a JSON header that describes the metadata
-   * of the event log.
-   *
-   * @return 2-tuple (event input stream, Spark version of event data)
+   * @return input stream that holds one JSON serialized event per line
    */
-  def openEventLog(log: Path, fs: FileSystem): (InputStream, String) = {
+  def openEventLog(log: Path, fs: FileSystem): InputStream = {
     // It's not clear whether FileSystem.open() throws FileNotFoundException or just plain
     // IOException when a file does not exist, so try our best to throw a proper exception.
     if (!fs.exists(log)) {
@@ -296,21 +256,19 @@ private[spark] object EventLoggingListener extends Logging {
 
     val in = new BufferedInputStream(fs.open(log))
 
-    // Parse information from the log name
+    // Parse compression codec from the log name
     val logName = log.getName
-    val baseRegex = s"(.*)_${SPARK_VERSION_KEY}_(.*)".r
-    val compressionRegex = (baseRegex + s"_${COMPRESSION_CODEC_KEY}_(.*)").r
-    val (sparkVersion, codecName) = logName match {
-      case compressionRegex(_, version, _codecName) => (version, Some(_codecName))
-      case baseRegex(_, version) => (version, None)
-      case _ => throw new IllegalArgumentException(s"Malformed event log name: $logName")
+    val compressionRegex = s".*_${COMPRESSION_CODEC_KEY}_(.*)".r
+    val codecName: Option[String] = logName match {
+      case compressionRegex(_codecName) => Some(_codecName)
+      case _ => None
     }
     val codec = codecName.map { c =>
       codecMap.getOrElseUpdate(c, CompressionCodec.createCodec(new SparkConf, c))
     }
 
     try {
-      (codec.map(_.compressedInputStream(in)).getOrElse(in), sparkVersion)
+      codec.map(_.compressedInputStream(in)).getOrElse(in)
     } catch {
       case e: Exception =>
         in.close()
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala
@@ -39,20 +39,16 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging {
    * error is thrown by this method.
    *
    * @param logData Stream containing event log data.
-   * @param version Spark version that generated the events.
    * @param sourceName Filename (or other source identifier) from whence @logData is being read
    */
-  def replay(logData: InputStream, version: String, sourceName: String) {
+  def replay(logData: InputStream, sourceName: String): Unit = {
     var currentLine: String = null
     var lineNumber: Int = 1
     try {
       val lines = Source.fromInputStream(logData).getLines()
       lines.foreach { line =>
         currentLine = line
-        JsonProtocol.sparkEventFromJson(parse(line)) match {
-          case SparkListenerMetadataIdentifier => // Ignore metadata for now
-          case event => postToAll(event)
-        }
+        postToAll(JsonProtocol.sparkEventFromJson(parse(line)))
         lineNumber += 1
       }
     } catch {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala
@@ -98,12 +98,6 @@ case class SparkListenerExecutorAdded(time: Long, executorId: String, executorIn
 case class SparkListenerExecutorRemoved(time: Long, executorId: String, reason: String)
   extends SparkListenerEvent
 
-/**
- * A special dummy event used to identify the metadata header in event logs.
- * This is not actually posted anywhere.
- */
-private[spark] case object SparkListenerMetadataIdentifier extends SparkListenerEvent
-
 /**
  * Periodic updates from executors.
  * @param execId executor id
diff --git a/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/SparkListenerBus.scala
@@ -58,7 +58,6 @@ private[spark] trait SparkListenerBus extends ListenerBus[SparkListener, SparkLi
         listener.onExecutorAdded(executorAdded)
       case executorRemoved: SparkListenerExecutorRemoved =>
         listener.onExecutorRemoved(executorRemoved)
-      case SparkListenerMetadataIdentifier =>
     }
   }
 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala
@@ -85,7 +85,7 @@ private[spark] class SparkDeploySchedulerBackend(
       args, sc.executorEnvs, classPathEntries ++ testingClassPath, libraryPathEntries, javaOpts)
     val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")
     val appDesc = new ApplicationDescription(sc.appName, maxCores, sc.executorMemory, command,
-      appUIAddress, SPARK_VERSION, sc.eventLogDir, sc.eventLogCodec)
+      appUIAddress, sc.eventLogDir, sc.eventLogCodec)
 
     client = new AppClient(sc.env.actorSystem, masters, appDesc, this, conf)
     client.start()
diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala
@@ -91,7 +91,6 @@ private[spark] object JsonProtocol {
         executorRemovedToJson(executorRemoved)
       // These aren't used, but keeps compiler happy
       case SparkListenerExecutorMetricsUpdate(_, _) => JNothing
-      case SparkListenerMetadataIdentifier => JNothing
     }
   }
 
@@ -448,7 +447,6 @@ private[spark] object JsonProtocol {
     val applicationEnd = Utils.getFormattedClassName(SparkListenerApplicationEnd)
     val executorAdded = Utils.getFormattedClassName(SparkListenerExecutorAdded)
     val executorRemoved = Utils.getFormattedClassName(SparkListenerExecutorRemoved)
-    val metadataIdentifier = Utils.getFormattedClassName(SparkListenerMetadataIdentifier)
 
     (json \ "Event").extract[String] match {
       case `stageSubmitted` => stageSubmittedFromJson(json)
@@ -466,7 +464,6 @@ private[spark] object JsonProtocol {
       case `applicationEnd` => applicationEndFromJson(json)
       case `executorAdded` => executorAddedFromJson(json)
       case `executorRemoved` => executorRemovedFromJson(json)
-      case `metadataIdentifier` => SparkListenerMetadataIdentifier
     }
   }
 
diff --git a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala
diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala
diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ExecutorRunnerTest.scala
diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala
diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala

Original file line number	Diff line number	Diff line change
`@@ -237,7 +237,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli`
`237`	`237`	`private[spark] val eventLogCodec: Option[String] = {`
`238`	`238`	`val compress = conf.getBoolean("spark.eventLog.compress", false)`
`239`	`239`	`if (compress && isEventLogEnabled) {`
`240`		`- Some(CompressionCodec.createCodec(conf)).map(_.getClass.getCanonicalName)`
	`240`	`+ Some(CompressionCodec.getCodecName(conf)).map(CompressionCodec.getShortName)`
`241`	`241`	`} else {`
`242`	`242`	`None`
`243`	`243`	`}`
Original file line number	Diff line number	Diff line change
`@@ -756,12 +756,12 @@ private[spark] class Master(`
`756`	`756`	`return false`
`757`	`757`	`}`
`758`	`758`
`759`		`- val (logInput, sparkVersion) = EventLoggingListener.openEventLog(new Path(eventLogFile), fs)`
	`759`	`+ val logInput = EventLoggingListener.openEventLog(new Path(eventLogFile), fs)`
`760`	`760`	`val replayBus = new ReplayListenerBus()`
`761`	`761`	`val ui = SparkUI.createHistoryUI(new SparkConf, replayBus, new SecurityManager(conf),`
`762`	`762`	`appName + " (completed)", HistoryServer.UI_PATH_PREFIX + s"/${app.id}")`
`763`	`763`	`try {`
`764`		`- replayBus.replay(logInput, sparkVersion, eventLogFile)`
	`764`	`+ replayBus.replay(logInput, eventLogFile)`
`765`	`765`	`} finally {`
`766`	`766`	`logInput.close()`
`767`	`767`	`}`
Original file line number	Diff line number	Diff line change
`@@ -58,7 +58,6 @@ private[spark] trait SparkListenerBus extends ListenerBus[SparkListener, SparkLi`
`58`	`58`	`listener.onExecutorAdded(executorAdded)`
`59`	`59`	`case executorRemoved: SparkListenerExecutorRemoved =>`
`60`	`60`	`listener.onExecutorRemoved(executorRemoved)`
`61`		`- case SparkListenerMetadataIdentifier =>`
`62`	`61`	`}`
`63`	`62`	`}`
`64`	`63`