3 changes: 2 additions & 1 deletion dev/deps/spark-deps-hadoop-3.2
@@ -50,7 +50,7 @@ curator-client-2.13.0.jar
curator-framework-2.13.0.jar
curator-recipes-2.13.0.jar
datanucleus-api-jdo-3.2.6.jar
datanucleus-core-3.2.10.jar
datanucleus-core-4.1.17.jar
datanucleus-rdbms-3.2.9.jar
derby-10.12.1.1.jar
dnsjava-2.1.7.jar
@@ -76,6 +76,7 @@ hadoop-yarn-common-3.2.0.jar
hadoop-yarn-registry-3.2.0.jar
hadoop-yarn-server-common-3.2.0.jar
hadoop-yarn-server-web-proxy-3.2.0.jar
hive-storage-api-2.6.0.jar
hk2-api-2.4.0-b34.jar
hk2-locator-2.4.0-b34.jar
hk2-utils-2.4.0-b34.jar
85 changes: 82 additions & 3 deletions pom.xml
@@ -130,6 +130,12 @@
<hive.version>1.2.1.spark2</hive.version>
<!-- Version used for internal directory structure -->
<hive.version.short>1.2.1</hive.version.short>
<!--
Hive 2.3 needs hive-common, hive-serde, hive-shims and hive-llap-client, but Hive 1.2 does not.
We use hive.extra.deps.scope to implement this; it should be removed once we drop Hive 1.2 support.
-->
<hive.extra.deps.scope>provided</hive.extra.deps.scope>
<!-- note that this should be compatible with Kafka brokers version 0.10 and up -->
<kafka.version>2.2.0</kafka.version>
<derby.version>10.12.1.1</derby.version>
@@ -1384,7 +1390,7 @@
<groupId>${hive.group}</groupId>
<artifactId>hive-common</artifactId>
<version>${hive.version}</version>
<scope>${hive.deps.scope}</scope>
<scope>${hive.extra.deps.scope}</scope>
<exclusions>
<exclusion>
<groupId>${hive.group}</groupId>
@@ -1414,6 +1420,18 @@
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
<!-- Begin of Hive 2.3.4 exclusion -->
<!-- jetty-all conflicts with jetty 9.4.12.v20180830 -->
<exclusion>
wangyum (Member, Author) commented on Mar 28, 2019:
Exclude jetty-all; it conflicts with jetty 9.4.12.v20180830:

build/sbt clean package -Phadoop-3.2 -Phive
...
[error] /home/yumwang/opensource/spark/core/src/main/scala/org/apache/spark/SSLOptions.scala:78: value setTrustStorePath is not a member of org.eclipse.jetty.util.ssl.SslContextFactory
[error]         trustStore.foreach(file => sslContextFactory.setTrustStorePath(file.getAbsolutePath))
[error]

<groupId>org.eclipse.jetty.aggregate</groupId>
<artifactId>jetty-all</artifactId>
</exclusion>
<!-- org.apache.logging.log4j:* conflicts with log4j 1.2.17 -->
<exclusion>
wangyum (Member, Author) commented:
org.apache.logging.log4j:* conflicts with log4j-1.2.17.jar:

build/sbt clean package -Phadoop-3.2 -Phive
...
[error] /home/yumwang/opensource/spark/core/src/main/scala/org/apache/spark/internal/Logging.scala:236: value getLevel is not a member of org.apache.log4j.spi.LoggingEvent
[error]     if (!loggingEvent.getLevel().eq(rootLevel)) {
[error]                       ^
[error] /home/yumwang/opensource/spark/core/src/main/scala/org/apache/spark/internal/Logging.scala:239: value getLogger is not a member of org.apache.log4j.spi.LoggingEvent
[error]     var logger = loggingEvent.getLogger()

<groupId>org.apache.logging.log4j</groupId>
<artifactId>*</artifactId>
</exclusion>
<!-- End of Hive 2.3.4 exclusion -->
</exclusions>
</dependency>

@@ -1532,6 +1550,27 @@
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</exclusion>
<!-- Begin of Hive 2.3.4 exclusion -->
<!-- Do not need Tez -->
<exclusion>
<groupId>${hive.group}</groupId>
<artifactId>hive-llap-tez</artifactId>
wangyum (Member, Author) commented: Do not need Tez.

Reviewer (Member) commented: Nit: indent.

</exclusion>
<!-- Do not need Calcite, see SPARK-27054 -->
<exclusion>
wangyum (Member, Author) commented:
Exclude calcite-druid and avatica. More details: https://issues.apache.org/jira/browse/SPARK-27054

<groupId>org.apache.calcite</groupId>
<artifactId>calcite-druid</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.calcite.avatica</groupId>
<artifactId>avatica</artifactId>
</exclusion>
<!-- org.apache.logging.log4j:* conflicts with log4j 1.2.17 -->
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>*</artifactId>
</exclusion>
<!-- End of Hive 2.3.4 exclusion -->
</exclusions>
</dependency>
<dependency>
@@ -1647,7 +1686,7 @@
<groupId>${hive.group}</groupId>
<artifactId>hive-serde</artifactId>
<version>${hive.version}</version>
<scope>${hive.deps.scope}</scope>
<scope>${hive.extra.deps.scope}</scope>
<exclusions>
<exclusion>
<groupId>${hive.group}</groupId>
@@ -1697,6 +1736,22 @@
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
</exclusion>
<!-- Begin of Hive 2.3.4 exclusion -->
<!-- parquet-hadoop-bundle:1.8.1 conflicts with 1.10.1 -->
<exclusion>
wangyum (Member, Author) commented:
Exclude parquet-hadoop-bundle, otherwise:

build/sbt clean package -Phadoop-3.2 -Phive
...
[error] /home/yumwang/opensource/spark/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala:36: value JobSummaryLevel is not a member of object org.apache.parquet.hadoop.ParquetOutputFormat
[error] import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel

Reviewer (Member) commented:
These several exclusions would apply to both Hive 2 and Hive 1 in the build as it is now. That's probably OK; maybe they don't even exist in Hive 1. But some like this one I'm not as sure about?

wangyum (Member, Author) replied:
Yes. org.apache.parquet:parquet-hadoop-bundle doesn't exist in Hive 1; there it is com.twitter:parquet-hadoop-bundle: https://github.com/apache/hive/blob/release-1.2.1/pom.xml#L256-L260

<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop-bundle</artifactId>
</exclusion>
<!-- Do not need Jasper, see HIVE-19799 -->
<exclusion>
<groupId>tomcat</groupId>
<artifactId>jasper-compiler</artifactId>
</exclusion>
<exclusion>
<groupId>tomcat</groupId>
<artifactId>jasper-runtime</artifactId>
</exclusion>
<!-- End of Hive 2.3.4 exclusion -->
</exclusions>
</dependency>

@@ -1720,7 +1775,7 @@
<groupId>${hive.group}</groupId>
<artifactId>hive-shims</artifactId>
<version>${hive.version}</version>
<scope>${hive.deps.scope}</scope>
<scope>${hive.extra.deps.scope}</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
@@ -1762,6 +1817,13 @@
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
</exclusion>
<!-- Begin of Hive 2.3.4 exclusion -->
<!-- Exclude log4j-slf4j-impl, otherwise spark-shell fails at startup with a NoClassDefFoundError -->
<exclusion>
wangyum (Member, Author) commented:
Exclude log4j-slf4j-impl, otherwise:

$ build/sbt clean package -Phadoop-3.2 -Phive
$ export SPARK_PREPEND_CLASSES=true
$ bin/spark-shell
NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark classes ahead of assembly.
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/logging/log4j/spi/AbstractLoggerAdapter
	at java.lang.ClassLoader.defineClass1(Native Method)
	at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
	at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
	at java.net.URLClassLoader.defineClass(URLClassLoader.java:468)
	at java.net.URLClassLoader.access$100(URLClassLoader.java:74)
	at java.net.URLClassLoader$1.run(URLClassLoader.java:369)
	at java.net.URLClassLoader$1.run(URLClassLoader.java:363)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.net.URLClassLoader.findClass(URLClassLoader.java:362)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.slf4j.impl.StaticLoggerBinder.<clinit>(StaticLoggerBinder.java:36)
	at org.apache.spark.internal.Logging$.org$apache$spark$internal$Logging$$isLog4j12(Logging.scala:217)
	at org.apache.spark.internal.Logging.initializeLogging(Logging.scala:122)
	at org.apache.spark.internal.Logging.initializeLogIfNecessary(Logging.scala:111)
	at org.apache.spark.internal.Logging.initializeLogIfNecessary$(Logging.scala:105)
	at org.apache.spark.deploy.SparkSubmit.initializeLogIfNecessary(SparkSubmit.scala:73)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:81)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:939)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:948)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.ClassNotFoundException: org.apache.logging.log4j.spi.AbstractLoggerAdapter
	at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	... 22 more

<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
</exclusion>
<!-- End of Hive 2.3.4 exclusion -->
</exclusions>
</dependency>
<dependency>
@@ -2656,7 +2718,24 @@
<hadoop.version>3.2.0</hadoop.version>
<curator.version>2.13.0</curator.version>
<zookeeper.version>3.4.13</zookeeper.version>
<hive.group>org.apache.hive</hive.group>
<hive.classifier>core</hive.classifier>
<hive.version>2.3.4</hive.version>
<hive.version.short>${hive.version}</hive.version.short>
<hive.extra.deps.scope>${hive.deps.scope}</hive.extra.deps.scope>
<hive.parquet.version>${parquet.version}</hive.parquet.version>
<orc.classifier></orc.classifier>
<hive.parquet.group>org.apache.parquet</hive.parquet.group>
<datanucleus-core.version>4.1.17</datanucleus-core.version>
</properties>
<dependencies>
<!-- Both ORC and Parquet need hive-storage-api, but it is excluded by orc-mapreduce -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-storage-api</artifactId>
<version>2.6.0</version>
Reviewer (Member) commented:
This matches what 2.3.4 needs. Should it be provided, or use hive.deps.scope?

wangyum (Member, Author) replied:
No. Both Hive and ORC need hive-storage-api:

  1. Remove hive-storage-api and save as table:
scala> spark.range(10).write.saveAsTable("test2")
java.lang.RuntimeException: java.lang.reflect.InvocationTargetException
  at org.apache.hive.common.util.ReflectionUtil.newInstance(ReflectionUtil.java:85)
  at org.apache.hadoop.hive.ql.exec.Registry.registerGenericUDF(Registry.java:177)
  at org.apache.hadoop.hive.ql.exec.Registry.registerGenericUDF(Registry.java:170)
  at org.apache.hadoop.hive.ql.exec.FunctionRegistry.<clinit>(FunctionRegistry.java:209)
  at org.apache.hadoop.hive.ql.metadata.Hive.reloadFunctions(Hive.java:247)
  at org.apache.hadoop.hive.ql.metadata.Hive.registerAllFunctionsOnce(Hive.java:231)
  at org.apache.hadoop.hive.ql.metadata.Hive.<init>(Hive.java:388)
  at org.apache.hadoop.hive.ql.metadata.Hive.create(Hive.java:332)
  at org.apache.hadoop.hive.ql.metadata.Hive.getInternal(Hive.java:312)
  at org.apache.hadoop.hive.ql.metadata.Hive.get(Hive.java:288)
  at org.apache.spark.sql.hive.client.HiveClientImpl.client(HiveClientImpl.scala:258)
  at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:280)
  at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:225)
  at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:224)
  at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:270)
  at org.apache.spark.sql.hive.client.HiveClientImpl.databaseExists(HiveClientImpl.scala:361)
  at org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$databaseExists$1(HiveExternalCatalog.scala:217)
  at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
  at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:99)
  at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:217)
  at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:139)
  at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:129)
  at org.apache.spark.sql.hive.HiveSessionStateBuilder.externalCatalog(HiveSessionStateBuilder.scala:40)
  at org.apache.spark.sql.hive.HiveSessionStateBuilder.$anonfun$catalog$1(HiveSessionStateBuilder.scala:55)
  at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:90)
  at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:90)
  at org.apache.spark.sql.catalyst.catalog.SessionCatalog.tableExists(SessionCatalog.scala:420)
  at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:446)
  at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:441)
  ... 47 elided
Caused by: java.lang.reflect.InvocationTargetException: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/serde2/io/HiveDecimalWritable
  at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
  at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
  at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
  at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
  at org.apache.hive.common.util.ReflectionUtil.newInstance(ReflectionUtil.java:83)
  ... 75 more
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/hive/serde2/io/HiveDecimalWritable
  at org.apache.hadoop.hive.ql.udf.generic.GenericUDFFloorCeilBase.<init>(GenericUDFFloorCeilBase.java:48)
  at org.apache.hadoop.hive.ql.udf.generic.GenericUDFFloor.<init>(GenericUDFFloor.java:41)
  ... 80 more
  2. Remove hive-storage-api and write to ORC:
scala> spark.range(10).write.orc("test3")
19/04/01 21:47:40 WARN DAGScheduler: Broadcasting large task binary with size 172.4 KiB
[Stage 0:>                                                          (0 + 4) / 4]19/04/01 21:47:41 ERROR Executor: Exception in task 1.0 in stage 0.0 (TID 1)
java.lang.NoClassDefFoundError: org/apache/hadoop/hive/ql/exec/vector/ColumnVector
	at org.apache.spark.sql.execution.datasources.orc.OrcSerializer.createOrcValue(OrcSerializer.scala:226)
	at org.apache.spark.sql.execution.datasources.orc.OrcSerializer.<init>(OrcSerializer.scala:36)
	at org.apache.spark.sql.execution.datasources.orc.OrcOutputWriter.<init>(OrcOutputWriter.scala:37)
	at org.apache.spark.sql.execution.datasources.orc.OrcFileFormat$$anon$1.newInstance(OrcFileFormat.scala:120)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:124)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:109)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:236)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$14(FileFormatWriter.scala:177)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:428)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1321)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:431)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hive.ql.exec.vector.ColumnVector
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:338)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	... 16 more
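
A quick, generic way to check whether a class such as HiveDecimalWritable or ColumnVector is actually on the classpath (and which jar serves it) is a small lookup like the one below. This is a minimal Scala sketch to paste into spark-shell, not part of the build change itself; locateClass is a hypothetical helper and the class names are taken from the stack traces above.

def locateClass(fqcn: String): String = {
  // Turn the fully qualified class name into a resource path and ask the class loader for it.
  val resource = fqcn.replace('.', '/') + ".class"
  val loader = Option(Thread.currentThread().getContextClassLoader)
    .getOrElse(ClassLoader.getSystemClassLoader)
  Option(loader.getResource(resource))
    .map(url => s"$fqcn loaded from $url")
    .getOrElse(s"$fqcn is NOT on the classpath")
}

println(locateClass("org.apache.hadoop.hive.serde2.io.HiveDecimalWritable"))
println(locateClass("org.apache.hadoop.hive.ql.exec.vector.ColumnVector"))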

</dependency>
</dependencies>
</profile>

<profile>
@@ -19,7 +19,7 @@

import java.math.BigDecimal;

import org.apache.orc.storage.ql.exec.vector.*;
import org.apache.hadoop.hive.ql.exec.vector.*;
Reviewer (Member) commented: Here.

Reviewer (Member) commented: Yes... we shouldn't do this.

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.Decimal;
@@ -17,11 +17,11 @@

package org.apache.spark.sql.execution.datasources.orc

import org.apache.orc.storage.common.`type`.HiveDecimal
import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument}
import org.apache.orc.storage.ql.io.sarg.SearchArgument.Builder
import org.apache.orc.storage.ql.io.sarg.SearchArgumentFactory.newBuilder
import org.apache.orc.storage.serde2.io.HiveDecimalWritable
import org.apache.hadoop.hive.common.`type`.HiveDecimal
import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument}
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable
Reviewer (Member) commented: Here.

import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types._
@@ -19,11 +19,11 @@ package org.apache.spark.sql.execution.datasources.orc

import java.sql.Date

import org.apache.orc.storage.common.`type`.HiveDecimal
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch
import org.apache.orc.storage.ql.io.sarg.{SearchArgument => OrcSearchArgument}
import org.apache.orc.storage.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator}
import org.apache.orc.storage.serde2.io.{DateWritable, HiveDecimalWritable}
import org.apache.hadoop.hive.common.`type`.HiveDecimal
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch
import org.apache.hadoop.hive.ql.io.sarg.{SearchArgument => OrcSearchArgument}
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator}
import org.apache.hadoop.hive.serde2.io.{DateWritable, HiveDecimalWritable}
Reviewer (Member) commented: Here.

import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
import org.apache.spark.sql.types.Decimal
@@ -23,7 +23,7 @@ import java.sql.{Date, Timestamp}

import scala.collection.JavaConverters._

import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument}
import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument}

import org.apache.spark.sql.{AnalysisException, Column, DataFrame}
import org.apache.spark.sql.catalyst.dsl.expressions._
42 changes: 31 additions & 11 deletions sql/hive/pom.xml
@@ -88,12 +88,10 @@
<version>${protobuf.version}</version>
</dependency>
-->
<!--
<dependency>
<groupId>${hive.group}</groupId>
<artifactId>hive-common</artifactId>
</dependency>
-->
<dependency>
<groupId>${hive.group}</groupId>
<artifactId>hive-exec</artifactId>
@@ -103,16 +101,38 @@
<groupId>${hive.group}</groupId>
<artifactId>hive-metastore</artifactId>
</dependency>
<!--
<dependency>
<groupId>${hive.group}</groupId>
<artifactId>hive-serde</artifactId>
</dependency>
<dependency>
<groupId>${hive.group}</groupId>
<artifactId>hive-shims</artifactId>
</dependency>
-->
<dependency>
<groupId>${hive.group}</groupId>
<artifactId>hive-serde</artifactId>
</dependency>
<dependency>
<groupId>${hive.group}</groupId>
<artifactId>hive-shims</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-llap-client</artifactId>
<version>2.3.4</version>
<scope>${hive.extra.deps.scope}</scope>
<exclusions>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-common</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-serde</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-framework</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>apache-curator</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- hive-serde already depends on avro, but this brings in customized config of avro deps from parent -->
<dependency>
<groupId>org.apache.avro</groupId>
73 changes: 49 additions & 24 deletions sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala
@@ -18,6 +18,7 @@
package org.apache.spark.sql.hive

import java.io.{InputStream, OutputStream}
import java.lang.reflect.Method
import java.rmi.server.UID

import scala.collection.JavaConverters._
@@ -28,15 +29,13 @@ import com.google.common.base.Objects
import org.apache.avro.Schema
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
import org.apache.hadoop.hive.ql.exec.UDF
import org.apache.hadoop.hive.ql.plan.{FileSinkDesc, TableDesc}
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFMacro
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils
import org.apache.hadoop.hive.serde2.avro.{AvroGenericRecordWritable, AvroSerdeUtils}
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector
import org.apache.hadoop.io.Writable
import org.apache.hive.com.esotericsoftware.kryo.Kryo
import org.apache.hive.com.esotericsoftware.kryo.io.{Input, Output}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.types.Decimal
@@ -146,34 +145,60 @@ private[hive] object HiveShim {
case _ => false
}

@transient
def deserializeObjectByKryo[T: ClassTag](
kryo: Kryo,
in: InputStream,
clazz: Class[_]): T = {
val inp = new Input(in)
val t: T = kryo.readObject(inp, clazz).asInstanceOf[T]
inp.close()
t
}
private lazy val serUtilClass =
Utils.classForName("org.apache.hadoop.hive.ql.exec.SerializationUtilities")
private lazy val utilClass = Utils.classForName("org.apache.hadoop.hive.ql.exec.Utilities")
private val deserializeMethodName = "deserializeObjectByKryo"
private val serializeMethodName = "serializeObjectByKryo"

@transient
def serializeObjectByKryo(
kryo: Kryo,
plan: Object,
out: OutputStream) {
val output: Output = new Output(out)
kryo.writeObject(output, plan)
output.close()
private def findMethod(klass: Class[_], name: String, args: Class[_]*): Method = {
val method = klass.getDeclaredMethod(name, args: _*)
method.setAccessible(true)
method
}

def deserializePlan[UDFType](is: java.io.InputStream, clazz: Class[_]): UDFType = {
deserializeObjectByKryo(Utilities.runtimeSerializationKryo.get(), is, clazz)
.asInstanceOf[UDFType]
if (HiveUtils.isSupportedHive2) {
val borrowKryo = serUtilClass.getMethod("borrowKryo")
val kryo = borrowKryo.invoke(serUtilClass)
val deserializeObjectByKryo = findMethod(serUtilClass, deserializeMethodName,
kryo.getClass.getSuperclass, classOf[InputStream], classOf[Class[_]])
try {
deserializeObjectByKryo.invoke(null, kryo, is, clazz).asInstanceOf[UDFType]
} finally {
serUtilClass.getMethod("releaseKryo", kryo.getClass.getSuperclass).invoke(null, kryo)
}
} else {
val runtimeSerializationKryo = utilClass.getField("runtimeSerializationKryo")
val threadLocalValue = runtimeSerializationKryo.get(utilClass)
val getMethod = threadLocalValue.getClass.getMethod("get")
val kryo = getMethod.invoke(threadLocalValue)
val deserializeObjectByKryo = findMethod(utilClass, deserializeMethodName,
kryo.getClass, classOf[InputStream], classOf[Class[_]])
deserializeObjectByKryo.invoke(null, kryo, is, clazz).asInstanceOf[UDFType]
}
}

def serializePlan(function: AnyRef, out: java.io.OutputStream): Unit = {
serializeObjectByKryo(Utilities.runtimeSerializationKryo.get(), function, out)
if (HiveUtils.isSupportedHive2) {
val borrowKryo = serUtilClass.getMethod("borrowKryo")
val kryo = borrowKryo.invoke(serUtilClass)
val serializeObjectByKryo = findMethod(serUtilClass, serializeMethodName,
kryo.getClass.getSuperclass, classOf[Object], classOf[OutputStream])
try {
serializeObjectByKryo.invoke(null, kryo, function, out)
} finally {
serUtilClass.getMethod("releaseKryo", kryo.getClass.getSuperclass).invoke(null, kryo)
}
} else {
val runtimeSerializationKryo = utilClass.getField("runtimeSerializationKryo")
val threadLocalValue = runtimeSerializationKryo.get(utilClass)
val getMethod = threadLocalValue.getClass.getMethod("get")
val kryo = getMethod.invoke(threadLocalValue)
val serializeObjectByKryo = findMethod(utilClass, serializeMethodName,
kryo.getClass, classOf[Object], classOf[OutputStream])
serializeObjectByKryo.invoke(null, kryo, function, out)
}
}

def writeExternal(out: java.io.ObjectOutput) {
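
The new HiveShim code above avoids any compile-time dependency on version-specific Hive classes: the Hive 2.3 path goes through SerializationUtilities.borrowKryo/releaseKryo and the non-public deserializeObjectByKryo/serializeObjectByKryo helpers (hence the findMethod helper that calls setAccessible(true)), the Hive 1.2 path goes through Utilities.runtimeSerializationKryo, and HiveUtils.isSupportedHive2 selects the path at runtime. Below is a minimal, self-contained Scala sketch of the underlying reflection pattern (getDeclaredMethod + setAccessible + invoke); the JDK class and the findStaticMethod helper are illustrative stand-ins, not the actual Hive classes or Spark code.

import java.lang.reflect.Method

object ReflectiveStaticCall {
  // Resolve a class by name and look up a static method, making it callable even if it is private.
  def findStaticMethod(className: String, methodName: String, argTypes: Class[_]*): Method = {
    val klass = Class.forName(className)
    val method = klass.getDeclaredMethod(methodName, argTypes: _*)
    method.setAccessible(true)
    method
  }

  def main(args: Array[String]): Unit = {
    // Illustrative stand-in: call java.lang.Integer.parseInt(String) reflectively.
    // Passing null as the receiver invokes the method as a static method.
    val parseInt = findStaticMethod("java.lang.Integer", "parseInt", classOf[String])
    println(parseInt.invoke(null, "42")) // prints 42
  }
}

The trade-off, as in the shim, is that method names and signatures become strings checked only at runtime, which is why the reflective lookups are confined to one place while callers keep using plain methods such as deserializePlan and serializePlan.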