Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ private[sql] class OrcFileFormat
job: Job,
options: Map[String, String],
dataSchema: StructType): OutputWriterFactory = {
val orcOptions = new OrcOptions(options)

val configuration = job.getConfiguration

val orcOptions = new OrcOptions(options, configuration)

configuration.set(OrcRelation.ORC_COMPRESSION, orcOptions.compressionCodec)
configuration match {
case conf: JobConf =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,40 @@

package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration

/**
* Options for the ORC data source.
*/
private[orc] class OrcOptions(
@transient private val parameters: Map[String, String])
@transient private val parameters: Map[String, String],
@transient private val conf: Configuration)
extends Serializable {

import OrcOptions._

/**
* Compression codec to use. By default snappy compression.
* Compression codec to use. By default use the value specified in Hadoop configuration.
* If `orc.compress` is unset, then we use snappy.
* Acceptable values are defined in [[shortOrcCompressionCodecNames]].
*/
val compressionCodec: String = {
val codecName = parameters.getOrElse("compression", "snappy").toLowerCase
if (!shortOrcCompressionCodecNames.contains(codecName)) {
val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase)
throw new IllegalArgumentException(s"Codec [$codecName] " +
s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.")
val default = conf.get(OrcRelation.ORC_COMPRESSION, "SNAPPY")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, maybe I did not explain this clearly in the JIRA. The use case I mentioned was `df.write.option("orc.compress", ...)`. We do not need to look at the Hadoop conf.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. Then, it all adds up. Sorry for not reading your comments carefully.


// The ORC configuration value held in `default` is not guaranteed to match any of the
// keys of `shortOrcCompressionCodecNames` in Spark, so it must not be used as a lookup
// key for `shortOrcCompressionCodecNames`; it is only returned as-is.
parameters.get("compression") match {
case None => default
case Some(name) =>
val lowerCaseName = name.toLowerCase
if (!shortOrcCompressionCodecNames.contains(lowerCaseName)) {
val availableCodecs = shortOrcCompressionCodecNames.keys.map(_.toLowerCase)
throw new IllegalArgumentException(s"Codec [$lowerCaseName] " +
s"is not available. Available codecs are ${availableCodecs.mkString(", ")}.")
}
shortOrcCompressionCodecNames(lowerCaseName)
}
shortOrcCompressionCodecNames(codecName)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,29 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
}
}

test("SPARK-16610: Respect orc.compress configuration when compression is unset") {
  // Case 1: only `orc.compress` is supplied as a write option, so the written
  // file must report that codec.
  withTempPath { dir =>
    val outputPath = dir.getCanonicalPath
    spark.range(0, 10)
      .write
      .option("orc.compress", "ZLIB")
      .orc(outputPath)

    // Read the codec back from the file that was just written.
    val writtenCodec = OrcFileOperator.getFileReader(outputPath).get.getCompression
    assert("ZLIB" === writtenCodec.name())
  }

  // Case 2: both options are supplied; `compression` takes precedence over
  // `orc.compress`.
  withTempPath { dir =>
    val outputPath = dir.getCanonicalPath
    spark.range(0, 10)
      .write
      .option("compression", "ZLIB")
      .option("orc.compress", "SNAPPY")
      .orc(outputPath)

    // Read the codec back from the file that was just written.
    val writtenCodec = OrcFileOperator.getFileReader(outputPath).get.getCompression
    assert("ZLIB" === writtenCodec.name())
  }
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice. Thank you for adding this test.


// Hive 1.2.1 supports zlib, snappy and none.
test("Compression options for writing to an ORC file (SNAPPY, ZLIB and NONE)") {
withTempPath { file =>
Expand Down