@@ -49,7 +49,7 @@ import org.apache.spark.scheduler.cluster.mesos.{CoarseMesosSchedulerBackend, Me
4949import org .apache .spark .scheduler .local .LocalBackend
5050import org .apache .spark .storage .{BlockManagerSource , RDDInfo , StorageStatus , StorageUtils }
5151import org .apache .spark .ui .SparkUI
52- import org .apache .spark .util .{ClosureCleaner , MetadataCleaner , MetadataCleanerType , TimeStampedWeakValueHashMap , Utils }
52+ import org .apache .spark .util .{CallSite , ClosureCleaner , MetadataCleaner , MetadataCleanerType , TimeStampedWeakValueHashMap , Utils }
5353
5454/**
5555 * Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
@@ -434,12 +434,21 @@ class SparkContext(config: SparkConf) extends Logging {
434434
435435 // Methods for creating RDDs
436436
437- /** Distribute a local Scala collection to form an RDD. */
437+ /** Distribute a local Scala collection to form an RDD.
438+ *
439+ * @note `parallelize` acts lazily. If `seq` is a mutable collection and is
440+ * altered after the call to `parallelize` and before the first action on the
441+ * RDD, the resultant RDD will reflect the modified collection. Pass a copy of
442+ * the argument to avoid this.
443+ */
438444 def parallelize [T : ClassTag ](seq : Seq [T ], numSlices : Int = defaultParallelism): RDD [T ] = {
439445 new ParallelCollectionRDD [T ](this , seq, numSlices, Map [Int , Seq [String ]]())
440446 }
441447
442- /** Distribute a local Scala collection to form an RDD. */
448+ /** Distribute a local Scala collection to form an RDD.
449+ *
450+ * This method is identical to `parallelize`.
451+ */
443452 def makeRDD [T : ClassTag ](seq : Seq [T ], numSlices : Int = defaultParallelism): RDD [T ] = {
444453 parallelize(seq, numSlices)
445454 }
@@ -1027,9 +1036,11 @@ class SparkContext(config: SparkConf) extends Logging {
10271036 * Capture the current user call site and return it as a CallSite. If the user
10281037 * has overridden the call site, this will return the user's version (with an empty long form).
10291038 */
1030- private [spark] def getCallSite (): String = {
1031- val defaultCallSite = Utils .getCallSiteInfo
1032- Option (getLocalProperty(" externalCallSite" )).getOrElse(defaultCallSite.toString)
1039+ private [spark] def getCallSite (): CallSite = {
1040+ Option (getLocalProperty(" externalCallSite" )) match {
1041+ case Some (callSite) => CallSite (callSite, long = " " )
1042+ case None => Utils .getCallSite
1043+ }
10331044 }
10341045
10351046 /**
@@ -1049,11 +1060,11 @@ class SparkContext(config: SparkConf) extends Logging {
10491060 }
10501061 val callSite = getCallSite
10511062 val cleanedFunc = clean(func)
1052- logInfo(" Starting job: " + callSite)
1063+ logInfo(" Starting job: " + callSite.short )
10531064 val start = System .nanoTime
10541065 dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
10551066 resultHandler, localProperties.get)
1056- logInfo(" Job finished: " + callSite + " , took " + (System .nanoTime - start) / 1e9 + " s" )
1067+ logInfo(" Job finished: " + callSite.short + " , took " + (System .nanoTime - start) / 1e9 + " s" )
10571068 rdd.doCheckpoint()
10581069 }
10591070
@@ -1134,11 +1145,11 @@ class SparkContext(config: SparkConf) extends Logging {
11341145 evaluator : ApproximateEvaluator [U , R ],
11351146 timeout : Long ): PartialResult [R ] = {
11361147 val callSite = getCallSite
1137- logInfo(" Starting job: " + callSite)
1148+ logInfo(" Starting job: " + callSite.short )
11381149 val start = System .nanoTime
11391150 val result = dagScheduler.runApproximateJob(rdd, func, evaluator, callSite, timeout,
11401151 localProperties.get)
1141- logInfo(" Job finished: " + callSite + " , took " + (System .nanoTime - start) / 1e9 + " s" )
1152+ logInfo(" Job finished: " + callSite.short + " , took " + (System .nanoTime - start) / 1e9 + " s" )
11421153 result
11431154 }
11441155
0 commit comments