Skip to content

Commit 92df232

Browse files
committed
[SPARK-53057][CORE] Support sizeOf in SparkFileUtils and JavaUtils
### What changes were proposed in this pull request?

This PR aims to support `sizeOf` in `SparkFileUtils` and `JavaUtils`.

### Why are the changes needed?

To provide a better implementation than `FileUtils.sizeOf` and `FileUtils.sizeOfDirectory`.

```scala
scala> spark.time(org.apache.spark.network.util.JavaUtils.sizeOf(new java.io.File("/tmp/spark")))
Time taken: 143 ms
val res0: Long = 732733895

scala> spark.time(org.apache.commons.io.FileUtils.sizeOf(new java.io.File("/tmp/spark")))
Time taken: 208 ms
val res1: Long = 732733895
```

### Does this PR introduce _any_ user-facing change?

No behavior change.

### How was this patch tested?

Pass the CIs.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51765 from dongjoon-hyun/SPARK-53057.

Authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
1 parent fbebc20 commit 92df232

5 files changed

Lines changed: 42 additions & 7 deletions

File tree

common/utils/src/main/java/org/apache/spark/network/util/JavaUtils.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,13 @@
2323
import java.nio.charset.StandardCharsets;
2424
import java.nio.file.Files;
2525
import java.nio.file.LinkOption;
26+
import java.nio.file.Path;
27+
import java.nio.file.FileVisitResult;
28+
import java.nio.file.SimpleFileVisitor;
2629
import java.nio.file.attribute.BasicFileAttributes;
2730
import java.util.*;
2831
import java.util.concurrent.TimeUnit;
32+
import java.util.concurrent.atomic.AtomicLong;
2933
import java.util.regex.Matcher;
3034
import java.util.regex.Pattern;
3135

@@ -82,6 +86,25 @@ public static String bytesToString(ByteBuffer b) {
8286
return StandardCharsets.UTF_8.decode(b.slice()).toString();
8387
}
8488

89+
public static long sizeOf(File file) throws IOException {
90+
if (!file.exists()) {
91+
throw new IllegalArgumentException(file.getAbsolutePath() + " not found");
92+
}
93+
return sizeOf(file.toPath());
94+
}
95+
96+
public static long sizeOf(Path dirPath) throws IOException {
97+
AtomicLong size = new AtomicLong(0);
98+
Files.walkFileTree(dirPath, new SimpleFileVisitor<Path>() {
99+
@Override
100+
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
101+
size.addAndGet(attrs.size());
102+
return FileVisitResult.CONTINUE;
103+
}
104+
});
105+
return size.get();
106+
}
107+
85108
/**
86109
* Delete a file or directory and its contents recursively.
87110
* Don't follow directories if they are symlinks.

common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ private[spark] trait SparkFileUtils extends Logging {
4949
new File(path).getCanonicalFile().toURI()
5050
}
5151

52+
/**
 * Returns the size in bytes of `f`, computed recursively when `f` is a directory.
 * Delegates to [[JavaUtils.sizeOf]].
 *
 * @param f an existing file or directory
 * @return total size in bytes
 */
def sizeOf(f: File): Long = JavaUtils.sizeOf(f)
58+
5259
/**
5360
* Lists files recursively.
5461
*/

connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -760,12 +760,12 @@ abstract class AvroSuite
760760
spark.conf.set(SQLConf.AVRO_COMPRESSION_CODEC.key, ZSTANDARD.lowerCaseName())
761761
df.write.format("avro").save(zstandardDir)
762762

763-
val uncompressSize = FileUtils.sizeOfDirectory(new File(uncompressDir))
764-
val bzip2Size = FileUtils.sizeOfDirectory(new File(bzip2Dir))
765-
val xzSize = FileUtils.sizeOfDirectory(new File(xzDir))
766-
val deflateSize = FileUtils.sizeOfDirectory(new File(deflateDir))
767-
val snappySize = FileUtils.sizeOfDirectory(new File(snappyDir))
768-
val zstandardSize = FileUtils.sizeOfDirectory(new File(zstandardDir))
763+
val uncompressSize = Utils.sizeOf(new File(uncompressDir))
764+
val bzip2Size = Utils.sizeOf(new File(bzip2Dir))
765+
val xzSize = Utils.sizeOf(new File(xzDir))
766+
val deflateSize = Utils.sizeOf(new File(deflateDir))
767+
val snappySize = Utils.sizeOf(new File(snappyDir))
768+
val zstandardSize = Utils.sizeOf(new File(zstandardDir))
769769

770770
assert(uncompressSize > deflateSize)
771771
assert(snappySize > deflateSize)

core/src/main/scala/org/apache/spark/deploy/history/HistoryServerDiskManager.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ private class HistoryServerDiskManager(
286286
}
287287

288288
/** Visible for testing. Return the size of a directory. */
289-
private[history] def sizeOf(path: File): Long = FileUtils.sizeOf(path)
289+
private[history] def sizeOf(path: File): Long = Utils.sizeOf(path)
290290

291291
private[history] class Lease(val tmpPath: File, private val leased: Long) {
292292

scalastyle-config.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,11 @@ This file is divided into 3 sections:
292292
<customMessage>Use java.nio.file.Files.readAllBytes</customMessage>
293293
</check>
294294

295+
<check customId="sizeOf" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
296+
<parameters><parameter name="regex">FileUtils\.sizeOf(Directory)?</parameter></parameters>
297+
<customMessage>Use sizeOf of JavaUtils or Utils instead.</customMessage>
298+
</check>
299+
295300
<check customId="commonslang2" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
296301
<parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
297302
<customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead

0 commit comments

Comments (0)