Skip to content

Commit 23637fe

Browse files
LukasRupprecht authored and cloud-fan committed
[SPARK-51185][CORE][3.5] Revert simplifications to PartitionedFileUtil API to reduce memory requirements
### What changes were proposed in this pull request?
This PR reverts an earlier change (#41632) that converted `FileStatusWithMetadata.getPath` from a `def` to a `lazy val` in order to simplify the `PartitionedFileUtil` helpers. This is the 3.5 PR. The main PR for 4.0 is #49915.

### Why are the changes needed?
The conversion of `getPath` from a `def` to a `lazy val` increases the memory requirements because the paths now need to be kept in memory for as long as the `FileStatusWithMetadata` exists. As paths are expensive to store, this can lead to higher memory utilization and increase the risk of OOMs.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
This is a small revert to code that already existed before, so the existing tests are sufficient.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #49995 from LukasRupprecht/def_get-path_3.5.

Authored-by: Lukas Rupprecht <lukas.l.rupprecht@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 4d15f64 commit 23637fe

4 files changed

Lines changed: 16 additions & 10 deletions

File tree

sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -624,7 +624,7 @@ case class FileSourceScanExec(
624624
logInfo(s"Planning with ${bucketSpec.numBuckets} buckets")
625625
val filesGroupedToBuckets =
626626
selectedPartitions.flatMap { p =>
627-
p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, p.values))
627+
p.files.map(f => PartitionedFileUtil.getPartitionedFile(f, f.getPath, p.values))
628628
}.groupBy { f =>
629629
BucketingUtils
630630
.getBucketId(f.toPath.getName)
@@ -689,12 +689,15 @@ case class FileSourceScanExec(
689689

690690
val splitFiles = selectedPartitions.flatMap { partition =>
691691
partition.files.flatMap { file =>
692-
if (shouldProcess(file.getPath)) {
692+
// getPath() is very expensive so we only want to call it once in this block:
693+
val filePath = file.getPath
694+
if (shouldProcess(filePath)) {
693695
val isSplitable = relation.fileFormat.isSplitable(
694-
relation.sparkSession, relation.options, file.getPath)
696+
relation.sparkSession, relation.options, filePath)
695697
PartitionedFileUtil.splitFiles(
696698
sparkSession = relation.sparkSession,
697699
file = file,
700+
filePath = filePath,
698701
isSplitable = isSplitable,
699702
maxSplitBytes = maxSplitBytes,
700703
partitionValues = partition.values

sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717

1818
package org.apache.spark.sql.execution
1919

20-
import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus}
20+
import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}
2121

2222
import org.apache.spark.paths.SparkPath
2323
import org.apache.spark.sql.SparkSession
@@ -28,6 +28,7 @@ object PartitionedFileUtil {
2828
def splitFiles(
2929
sparkSession: SparkSession,
3030
file: FileStatusWithMetadata,
31+
filePath: Path,
3132
isSplitable: Boolean,
3233
maxSplitBytes: Long,
3334
partitionValues: InternalRow): Seq[PartitionedFile] = {
@@ -36,19 +37,20 @@ object PartitionedFileUtil {
3637
val remaining = file.getLen - offset
3738
val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining
3839
val hosts = getBlockHosts(getBlockLocations(file.fileStatus), offset, size)
39-
PartitionedFile(partitionValues, SparkPath.fromPath(file.getPath), offset, size, hosts,
40+
PartitionedFile(partitionValues, SparkPath.fromPath(filePath), offset, size, hosts,
4041
file.getModificationTime, file.getLen, file.metadata)
4142
}
4243
} else {
43-
Seq(getPartitionedFile(file, partitionValues))
44+
Seq(getPartitionedFile(file, filePath, partitionValues))
4445
}
4546
}
4647

4748
def getPartitionedFile(
4849
file: FileStatusWithMetadata,
50+
filePath: Path,
4951
partitionValues: InternalRow): PartitionedFile = {
5052
val hosts = getBlockHosts(getBlockLocations(file.fileStatus), 0, file.getLen)
51-
PartitionedFile(partitionValues, SparkPath.fromPath(file.getPath), 0, file.getLen, hosts,
53+
PartitionedFile(partitionValues, SparkPath.fromPath(filePath), 0, file.getLen, hosts,
5254
file.getModificationTime, file.getLen, file.metadata)
5355
}
5456

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileIndex.scala

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,7 @@ import org.apache.spark.sql.types.StructType
3030
*/
3131
case class FileStatusWithMetadata(fileStatus: FileStatus, metadata: Map[String, Any] = Map.empty) {
3232
// Wrapper methods to improve source compatibility in code that still expects a [[FileStatus]].
33-
// NOTE: getPath() is very expensive, so we only want to call it once (if accessed at all).
34-
lazy val getPath: Path = fileStatus.getPath
33+
def getPath: Path = fileStatus.getPath
3534
def getLen: Long = fileStatus.getLen
3635
def getModificationTime: Long = fileStatus.getModificationTime
3736
def isDirectory: Boolean = fileStatus.isDirectory

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -151,10 +151,12 @@ trait FileScan extends Scan
151151
partition.values
152152
}
153153
partition.files.flatMap { file =>
154+
val filePath = file.getPath
154155
PartitionedFileUtil.splitFiles(
155156
sparkSession = sparkSession,
156157
file = file,
157-
isSplitable = isSplitable(file.getPath),
158+
filePath = filePath,
159+
isSplitable = isSplitable(filePath),
158160
maxSplitBytes = maxSplitBytes,
159161
partitionValues = partitionValues
160162
)

0 commit comments

Comments
 (0)