[SPARK-51185][CORE][3.5] Revert simplifications to PartitionedFileUtil API to reduce memory requirements

LukasRupprecht · cloud-fan · commit 23637fe83e1f · 2025-02-21T09:36:24.000+08:00
### What changes were proposed in this pull request? This PR reverts an earlier change (#41632) that converted FileStatusWithMetadata.getPath from a def to a lazy val in order to simplify the PartitionedFileUtil helpers. This is the 3.5 PR. The main PR for 4.0 is #49915. ### Why are the changes needed? Converting getPath from a def to a lazy val increases memory requirements, because each computed path must now be kept in memory for as long as its FileStatusWithMetadata exists. As paths are expensive to store, this can lead to higher memory utilization and increase the risk of OOMs. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? This is a small revert to code that already existed before, so the existing tests are sufficient. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #49995 from LukasRupprecht/def_get-path_3.5. Authored-by: Lukas Rupprecht <lukas.l.rupprecht@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -624,7 +624,7 @@ case class FileSourceScanExec(
     logInfo(s"Planning with ${bucketSpec.numBuckets} buckets")
     val filesGroupedToBuckets =
       selectedPartitions.flatMap { p =>
-        p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, p.values))
+        p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values))
       }.groupBy { f =>
         BucketingUtils
           .getBucketId(f.toPath.getName)
@@ -689,12 +689,15 @@ case class FileSourceScanExec(
 
     val splitFiles = selectedPartitions.flatMap { partition =>
       partition.files.flatMap { file =>
-        if (shouldProcess(file.getPath)) {
+        // getPath() is very expensive so we only want to call it once in this block:
+        val filePath = file.getPath
+        if (shouldProcess(filePath)) {
           val isSplitable = relation.fileFormat.isSplitable(
-              relation.sparkSession, relation.options, file.getPath)
+              relation.sparkSession, relation.options, filePath)
           PartitionedFileUtil.splitFiles(
             sparkSession = relation.sparkSession,
             file = file,
+            filePath = filePath,
             isSplitable = isSplitable,
             maxSplitBytes = maxSplitBytes,
             partitionValues = partition.values
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution
 
-import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus}
+import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}
 
 import org.apache.spark.paths.SparkPath
 import org.apache.spark.sql.SparkSession
@@ -28,6 +28,7 @@ object PartitionedFileUtil {
   def splitFiles(
       sparkSession: SparkSession,
       file: FileStatusWithMetadata,
+      filePath: Path,
       isSplitable: Boolean,
       maxSplitBytes: Long,
       partitionValues: InternalRow): Seq[PartitionedFile] = {
@@ -36,19 +37,20 @@ object PartitionedFileUtil {
         val remaining = file.getLen - offset
         val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining
         val hosts = getBlockHosts(getBlockLocations(file.fileStatus), offset, size)
-        PartitionedFile(partitionValues, SparkPath.fromPath(file.getPath), offset, size, hosts,
+        PartitionedFile(partitionValues, SparkPath.fromPath(filePath), offset, size, hosts,
           file.getModificationTime, file.getLen, file.metadata)
       }
     } else {
-      Seq(getPartitionedFile(file, partitionValues))
+      Seq(getPartitionedFile(file, filePath, partitionValues))
     }
   }
 
   def getPartitionedFile(
       file: FileStatusWithMetadata,
+      filePath: Path,
       partitionValues: InternalRow): PartitionedFile = {
     val hosts = getBlockHosts(getBlockLocations(file.fileStatus), 0, file.getLen)
-    PartitionedFile(partitionValues, SparkPath.fromPath(file.getPath), 0, file.getLen, hosts,
+    PartitionedFile(partitionValues, SparkPath.fromPath(filePath), 0, file.getLen, hosts,
       file.getModificationTime, file.getLen, file.metadata)
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndex.scala
@@ -30,8 +30,7 @@ import org.apache.spark.sql.types.StructType
  */
 case class FileStatusWithMetadata(fileStatus: FileStatus, metadata: Map[String, Any] = Map.empty) {
   // Wrapper methods to improve source compatibility in code that still expects a [[FileStatus]].
-  // NOTE: getPath() is very expensive, so we only want to call it once (if accessed at all).
-  lazy val getPath: Path = fileStatus.getPath
+  def getPath: Path = fileStatus.getPath
   def getLen: Long = fileStatus.getLen
   def getModificationTime: Long = fileStatus.getModificationTime
   def isDirectory: Boolean = fileStatus.isDirectory
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala
@@ -151,10 +151,12 @@ trait FileScan extends Scan
         partition.values
       }
       partition.files.flatMap { file =>
+        val filePath = file.getPath
         PartitionedFileUtil.splitFiles(
           sparkSession = sparkSession,
           file = file,
-          isSplitable = isSplitable(file.getPath),
+          filePath = filePath,
+          isSplitable = isSplitable(filePath),
           maxSplitBytes = maxSplitBytes,
           partitionValues = partitionValues
         )

Original file line number	Diff line number	Diff line change
`@@ -151,10 +151,12 @@ trait FileScan extends Scan`
`151`	`151`	`partition.values`
`152`	`152`	`}`
`153`	`153`	`partition.files.flatMap { file =>`
	`154`	`+ val filePath = file.getPath`
`154`	`155`	`PartitionedFileUtil.splitFiles(`
`155`	`156`	`sparkSession = sparkSession,`
`156`	`157`	`file = file,`
`157`		`- isSplitable = isSplitable(file.getPath),`
	`158`	`+ filePath = filePath,`
	`159`	`+ isSplitable = isSplitable(filePath),`
`158`	`160`	`maxSplitBytes = maxSplitBytes,`
`159`	`161`	`partitionValues = partitionValues`
`160`	`162`	`)`