Commit 057ccb1

dongjoon-hyun authored and fli committed
[SPARK-24322][BUILD] Upgrade Apache ORC to 1.4.4
ORC 1.4.4 includes [nine fixes](https://issues.apache.org/jira/issues/?filter=12342568&jql=project%20%3D%20ORC%20AND%20resolution%20%3D%20Fixed%20AND%20fixVersion%20%3D%201.4.4). One of them is a `Timestamp` bug (ORC-306) that occurs when the `native` ORC vectorized reader reads an ORC column vector's sub-vectors `times` and `nanos`. ORC-306 fixes the reader according to the [original definition](https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java#L45-L46), and this PR includes the updated interpretation of ORC column vectors. Note that the `hive` ORC reader and the ORC MR reader are not affected.

Before the fix, the `native` reader returns a timestamp that is one second off:

```scala
scala> spark.version
res0: String = 2.3.0

scala> spark.sql("set spark.sql.orc.impl=native")

scala> Seq(java.sql.Timestamp.valueOf("1900-05-05 12:34:56.000789")).toDF().write.orc("/tmp/orc")

scala> spark.read.orc("/tmp/orc").show(false)
+--------------------------+
|value                     |
+--------------------------+
|1900-05-05 12:34:55.000789|
+--------------------------+
```

This PR updates Apache Spark to use ORC 1.4.4.

**FULL LIST**

| ID | TITLE |
| -- | -- |
| ORC-281 | Fix compiler warnings from clang 5.0 |
| ORC-301 | `extractFileTail` should open a file in `try` statement |
| ORC-304 | Fix TestRecordReaderImpl to not fail with new storage-api |
| ORC-306 | Fix incorrect workaround for bug in java.sql.Timestamp |
| ORC-324 | Add support for ARM and PPC arch |
| ORC-330 | Remove unnecessary Hive artifacts from root pom |
| ORC-332 | Add syntax version to orc_proto.proto |
| ORC-336 | Remove avro and parquet dependency management entries |
| ORC-360 | Implement error checking on subtype fields in Java |

Pass the Jenkins.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes apache#21372 from dongjoon-hyun/SPARK_ORC144.
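The one-line fix in the two Java readers below follows from the column-vector layout the PR references: `time` already holds milliseconds since epoch (including the millisecond fraction), while `nanos` holds the full nanosecond-of-second, so the old conversion counted the millisecond fraction twice. A minimal standalone sketch of the arithmetic (class and method names are illustrative, not Spark's):

```java
// Illustrative sketch of the micros-since-epoch conversion fixed by this PR.
// Assumption (from the TimestampColumnVector definition linked above):
//   timeMillis = milliseconds since epoch, including the millisecond fraction
//   nanos      = full nanosecond-of-second, which repeats that fraction
public class TimestampMicros {

    // Old formula: nanos / 1000 yields all microseconds of the second,
    // so the millisecond part already in timeMillis is added twice.
    static long oldMicros(long timeMillis, int nanos) {
        return timeMillis * 1000L + nanos / 1000L;
    }

    // Fixed formula: "% 1000" keeps only the sub-millisecond microseconds.
    static long newMicros(long timeMillis, int nanos) {
        return timeMillis * 1000L + nanos / 1000L % 1000L;
    }

    public static void main(String[] args) {
        long timeMillis = 56_123L;  // 56.123 s, millisecond fraction = 123
        int nanos = 123_456_000;    // 0.123456 s as nanosecond-of-second
        System.out.println(oldMicros(timeMillis, nanos)); // 56246456 (123 ms double-counted)
        System.out.println(newMicros(timeMillis, nanos)); // 56123456 (correct)
    }
}
```

For a sub-millisecond-only fraction such as `.000789` the two formulas agree, which is why the reported symptom surfaces through the writer's pre-epoch `java.sql.Timestamp` workaround (ORC-306) rather than for every value.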
1 parent 20ed1f0 commit 057ccb1

6 files changed: 16 additions & 8 deletions


dev/deps/spark-deps-hadoop-2.6 (2 additions & 2 deletions)

```diff
@@ -156,8 +156,8 @@ objenesis-2.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.3-nohive.jar
-orc-mapreduce-1.4.3-nohive.jar
+orc-core-1.4.4-nohive.jar
+orc-mapreduce-1.4.4-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
```

dev/deps/spark-deps-hadoop-2.7 (2 additions & 2 deletions)

```diff
@@ -157,8 +157,8 @@ objenesis-2.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.3-nohive.jar
-orc-mapreduce-1.4.3-nohive.jar
+orc-core-1.4.4-nohive.jar
+orc-mapreduce-1.4.4-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
```

pom.xml (1 addition & 2 deletions)

```diff
@@ -130,8 +130,7 @@
     <!-- Version used for internal directory structure -->
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
-    <parquet.version>1.8.2</parquet.version>
-    <orc.version>1.4.3</orc.version>
+    <orc.version>1.4.4</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.20.v20170531</jetty.version>
```

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java (1 addition & 1 deletion)

```diff
@@ -136,7 +136,7 @@ public int getInt(int rowId) {
   public long getLong(int rowId) {
     int index = getRowIndex(rowId);
     if (isTimestamp) {
-      return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000;
+      return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000 % 1000;
    } else {
      return longData.vector[index];
    }
```
sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java (1 addition & 1 deletion)

```diff
@@ -495,7 +495,7 @@ private void putValues(
    * Returns the number of micros since epoch from an element of TimestampColumnVector.
    */
   private static long fromTimestampColumnVector(TimestampColumnVector vector, int index) {
-    return vector.time[index] * 1000L + vector.nanos[index] / 1000L;
+    return vector.time[index] * 1000 + (vector.nanos[index] / 1000 % 1000);
   }

   /**
```

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala (9 additions & 0 deletions)

```diff
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.execution.datasources.orc

 import java.io.File
+import java.sql.Timestamp
 import java.util.Locale

 import org.apache.orc.OrcConf.COMPRESS
@@ -169,6 +170,14 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
       }
     }
   }
+
+  test("SPARK-24322 Fix incorrect workaround for bug in java.sql.Timestamp") {
+    withTempPath { path =>
+      val ts = Timestamp.valueOf("1900-05-05 12:34:56.000789")
+      Seq(ts).toDF.write.orc(path.getCanonicalPath)
+      checkAnswer(spark.read.orc(path.getCanonicalPath), Row(ts))
+    }
+  }
 }

 class OrcSourceSuite extends OrcSuite with SharedSQLContext {
```
