-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-19112][CORE] Support for ZStandard codec #18805
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
287a9da
295f38a
ea7f4f6
029a753
0525307
2580633
eba3024
95e6b8b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ package org.apache.spark.io | |
| import java.io._ | ||
| import java.util.Locale | ||
|
|
||
| import com.github.luben.zstd.{ZstdInputStream, ZstdOutputStream} | ||
| import com.ning.compress.lzf.{LZFInputStream, LZFOutputStream} | ||
| import net.jpountz.lz4.LZ4BlockOutputStream | ||
| import org.xerial.snappy.{Snappy, SnappyInputStream, SnappyOutputStream} | ||
|
|
@@ -50,13 +51,14 @@ private[spark] object CompressionCodec { | |
|
|
||
| private[spark] def supportsConcatenationOfSerializedStreams(codec: CompressionCodec): Boolean = { | ||
| (codec.isInstanceOf[SnappyCompressionCodec] || codec.isInstanceOf[LZFCompressionCodec] | ||
| || codec.isInstanceOf[LZ4CompressionCodec]) | ||
| || codec.isInstanceOf[LZ4CompressionCodec] || codec.isInstanceOf[ZStdCompressionCodec]) | ||
| } | ||
|
|
||
| private val shortCompressionCodecNames = Map( | ||
| "lz4" -> classOf[LZ4CompressionCodec].getName, | ||
| "lzf" -> classOf[LZFCompressionCodec].getName, | ||
| "snappy" -> classOf[SnappyCompressionCodec].getName) | ||
| "snappy" -> classOf[SnappyCompressionCodec].getName, | ||
| "zstd" -> classOf[ZStdCompressionCodec].getName) | ||
|
|
||
| def getCodecName(conf: SparkConf): String = { | ||
| conf.get(configKey, DEFAULT_COMPRESSION_CODEC) | ||
|
|
@@ -216,3 +218,33 @@ private final class SnappyOutputStreamWrapper(os: SnappyOutputStream) extends Ou | |
| } | ||
| } | ||
| } | ||
|
|
||
/**
 * :: DeveloperApi ::
 * ZStandard implementation of [[org.apache.spark.io.CompressionCodec]]. For more
 * details see - http://facebook.github.io/zstd/
 *
 * Two settings are read from the supplied `SparkConf`:
 *  - `spark.io.compression.zstd.level` (default 1): the compression level handed to the
 *    zstd output stream.
 *  - `spark.io.compression.zstd.bufferSize` (default 32k): the size of the buffer wrapped
 *    around the zstd input and output streams.
 *
 * @note The wire protocol for this codec is not guaranteed to be compatible across versions
 * of Spark. This is intended for use as an internal compression utility within a single Spark
 * application.
 */
@DeveloperApi
class ZStdCompressionCodec(conf: SparkConf) extends CompressionCodec {

  // Size of the buffer placed in front of the zstd streams. Looked up on every call so the
  // value is read from the conf at the same point as before (when a stream is created).
  private def bufferSize: Int =
    conf.getSizeAsBytes("spark.io.compression.zstd.bufferSize", "32k").toInt

  /**
   * Wraps `s` in a zstd-compressing stream fronted by a buffer.
   *
   * @param s the stream that receives the compressed bytes
   * @return a buffered stream whose writes are compressed with zstd and forwarded to `s`
   */
  override def compressedOutputStream(s: OutputStream): OutputStream = {
    // Default compression level for zstd compression to 1 because it is
    // fastest of all with reasonably high compression ratio.
    // The level is a plain integer, not a byte size, so it is read with getInt; parsing it
    // as a size would turn a value such as "1k" into level 1024.
    val level = conf.getInt("spark.io.compression.zstd.level", 1)

    // Wrap the zstd output stream in a buffered output stream, so that we can
    // avoid excessive JNI call overhead while compressing small amounts of data.
    new BufferedOutputStream(new ZstdOutputStream(s, level), bufferSize)
  }

  /**
   * Wraps `s` in a zstd-decompressing stream fronted by a buffer.
   *
   * @param s the stream supplying zstd-compressed bytes
   * @return a buffered stream that yields the decompressed bytes read from `s`
   */
  override def compressedInputStream(s: InputStream): InputStream = {
    // Wrap the zstd input stream in a buffered input stream so that we can
    // avoid excessive JNI call overhead while uncompressing small amounts of data.
    new BufferedInputStream(new ZstdInputStream(s), bufferSize)
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -186,3 +186,4 @@ xercesImpl-2.9.1.jar | |
| xmlenc-0.52.jar | ||
| xz-1.0.jar | ||
| zookeeper-3.4.6.jar | ||
| zstd-jni-1.3.0-1.jar | ||
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -187,3 +187,4 @@ xercesImpl-2.9.1.jar | |
| xmlenc-0.52.jar | ||
| xz-1.0.jar | ||
| zookeeper-3.4.6.jar | ||
| zstd-jni-1.3.0-1.jar | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -866,7 +866,8 @@ Apart from these, the following properties are also available, and may be useful | |
| e.g. | ||
| <code>org.apache.spark.io.LZ4CompressionCodec</code>, | ||
| <code>org.apache.spark.io.LZFCompressionCodec</code>, | ||
| and <code>org.apache.spark.io.SnappyCompressionCodec</code>. | ||
| <code>org.apache.spark.io.SnappyCompressionCodec</code>, | ||
|
||
| and <code>org.apache.spark.io.ZStdCompressionCodec</code>. | ||
| </td> | ||
| </tr> | ||
| <tr> | ||
|
|
@@ -885,6 +886,23 @@ Apart from these, the following properties are also available, and may be useful | |
| is used. Lowering this block size will also lower shuffle memory usage when Snappy is used. | ||
| </td> | ||
| </tr> | ||
| <tr> | ||
| <td><code>spark.io.compression.zstd.level</code></td> | ||
| <td>1</td> | ||
| <td> | ||
| Compression level for Zstd compression codec. Increasing the compression level will result in better | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: leve -> level |
||
| compression at the expense of more CPU and memory. | ||
| </td> | ||
| </tr> | ||
| <tr> | ||
| <td><code>spark.io.compression.zstd.bufferSize</code></td> | ||
| <td>32k</td> | ||
| <td> | ||
| Buffer size used in Zstd compression, in the case when Zstd compression codec | ||
| is used. Lowering this size will lower the shuffle memory usage when Zstd is used, but it | ||
| might increase the compression cost because of excessive JNI call overhead. | ||
| </td> | ||
| </tr> | ||
| <tr> | ||
| <td><code>spark.kryo.classesToRegister</code></td> | ||
| <td>(none)</td> | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this `getInt` instead of `getSizeAsBytes`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good eye, fixed.