
Commit 712e795

BryanCutler authored and curtishoward committed
[SPARK-23874][SQL][PYTHON] Upgrade Apache Arrow to 0.10.0
Upgrade Apache Arrow to 0.10.0. Version 0.10.0 has a number of bug fixes and improvements, with the following pertaining directly to usage in Spark:

* Allows adding BinaryType support ARROW-2141
* Bug fix related to array serialization ARROW-1973
* Python 2 str will be made into an Arrow string instead of bytes ARROW-2101
* Python bytearrays are supported as input to pyarrow ARROW-2141
* Java has a common interface for reset() to clean up complex vectors in Spark's ArrowWriter ARROW-1962
* Cleanup of pyarrow type equality checks ARROW-2423
* ArrowStreamWriter should not hold references to ArrowBlocks ARROW-2632, ARROW-2645
* Improved low-level handling of messages for RecordBatch ARROW-2704

Tested with existing tests.

Author: Bryan Cutler <[email protected]>

Closes apache#21939 from BryanCutler/arrow-upgrade-010.

(cherry picked from commit ed075e1)
1 parent 831ea82 commit 712e795
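To make the bytearray item above concrete, here is a minimal pyarrow sketch (not part of the commit, assuming pyarrow 0.10.0 or later is installed) of the input handling that ARROW-2141 adds:

    import pyarrow as pa

    # With pyarrow 0.10.0, bytearray values are accepted alongside bytes and None
    # when building a binary array (ARROW-2141).
    arr = pa.array([bytearray(b"\x00\x01"), b"\x02", None], type=pa.binary())
    print(arr.null_count)  # 1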

7 files changed

Lines changed: 20 additions & 32 deletions


dev/deps/spark-deps-hadoop-2.6

Lines changed: 3 additions & 3 deletions
@@ -14,9 +14,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar
 api-asn1-api-1.0.0-M20.jar
 api-util-1.0.0-M20.jar
 arpack_combined_all-0.1.jar
-arrow-format-0.8.0.jar
-arrow-memory-0.8.0.jar
-arrow-vector-0.8.0.jar
+arrow-format-0.10.0.jar
+arrow-memory-0.10.0.jar
+arrow-vector-0.10.0.jar
 automaton-1.11-8.jar
 avro-1.7.7.jar
 avro-ipc-1.7.7.jar

dev/deps/spark-deps-hadoop-2.7

Lines changed: 3 additions & 3 deletions
@@ -14,9 +14,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar
 api-asn1-api-1.0.0-M20.jar
 api-util-1.0.0-M20.jar
 arpack_combined_all-0.1.jar
-arrow-format-0.8.0.jar
-arrow-memory-0.8.0.jar
-arrow-vector-0.8.0.jar
+arrow-format-0.10.0.jar
+arrow-memory-0.10.0.jar
+arrow-vector-0.10.0.jar
 automaton-1.11-8.jar
 avro-1.7.7.jar
 avro-ipc-1.7.7.jar

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -193,7 +193,7 @@
     If you are changing Arrow version specification, please check ./python/pyspark/sql/utils.py,
     ./python/run-tests.py and ./python/setup.py too.
     -->
-    <arrow.version>0.8.0</arrow.version>
+    <arrow.version>0.10.0</arrow.version>
 
     <test.java.home>${java.home}</test.java.home>
     <test.exclude.tags></test.exclude.tags>
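The comment in this hunk points at the Python-side files that pin the pyarrow requirement. As a hedged sketch only (the helper name and minimum version here are illustrative, and this commit does not change those files), the guard they implement looks roughly like:

    from distutils.version import LooseVersion

    def require_minimum_pyarrow_version(minimum="0.8.0"):
        # Raise early if the installed pyarrow is older than what Spark was built against.
        import pyarrow
        if LooseVersion(pyarrow.__version__) < LooseVersion(minimum):
            raise ImportError(
                "pyarrow >= %s must be installed; found %s" % (minimum, pyarrow.__version__))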

python/pyspark/serializers.py

Lines changed: 2 additions & 0 deletions
@@ -228,12 +228,14 @@ def _create_batch(series, timezone):
     def create_array(s, t):
         mask = s.isnull()
         # Ensure timestamp series are in expected form for Spark internal representation
+        # TODO: maybe don't need None check anymore as of Arrow 0.9.1
         if t is not None and pa.types.is_timestamp(t):
             s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
             # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
             return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
         elif t is not None and pa.types.is_string(t) and sys.version < '3':
             # TODO: need decode before converting to Arrow in Python 2
+            # TODO: don't need as of Arrow 0.9.1
             return pa.Array.from_pandas(s.apply(
                 lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
         elif t is not None and pa.types.is_decimal(t) and \
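For context on the call pattern above, a small standalone sketch (not from the commit) of pa.Array.from_pandas with an explicit null mask and target type:

    import pandas as pd
    import pyarrow as pa

    s = pd.Series(["a", None, "c"])
    # The mask marks nulls explicitly; the type pins the Arrow type rather than
    # letting pyarrow infer it from the pandas dtype.
    arr = pa.Array.from_pandas(s, mask=s.isnull(), type=pa.string())
    print(arr)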

sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java

Lines changed: 6 additions & 6 deletions
@@ -161,13 +161,13 @@ public ArrowColumnVector(ValueVector vector) {
     } else if (vector instanceof ListVector) {
       ListVector listVector = (ListVector) vector;
       accessor = new ArrayAccessor(listVector);
-    } else if (vector instanceof NullableMapVector) {
-      NullableMapVector mapVector = (NullableMapVector) vector;
-      accessor = new StructAccessor(mapVector);
+    } else if (vector instanceof StructVector) {
+      StructVector structVector = (StructVector) vector;
+      accessor = new StructAccessor(structVector);
 
-      childColumns = new ArrowColumnVector[mapVector.size()];
+      childColumns = new ArrowColumnVector[structVector.size()];
       for (int i = 0; i < childColumns.length; ++i) {
-        childColumns[i] = new ArrowColumnVector(mapVector.getVectorById(i));
+        childColumns[i] = new ArrowColumnVector(structVector.getVectorById(i));
       }
     } else {
       throw new UnsupportedOperationException();

@@ -470,7 +470,7 @@ final ColumnarArray getArray(int rowId) {
    */
   private static class StructAccessor extends ArrowVectorAccessor {
 
-    StructAccessor(NullableMapVector vector) {
+    StructAccessor(StructVector vector) {
       super(vector);
     }
   }
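The rename here tracks Arrow's own renaming of NullableMapVector to StructVector: struct data is stored as one child vector per field, which is exactly what the childColumns wiring above exposes. As a hedged illustration (not part of the diff, assuming a recent pyarrow), the same layout on the Python side:

    import pyarrow as pa

    # A struct array is a thin wrapper over one child array per field.
    ints = pa.array([1, 2], type=pa.int32())
    longs = pa.array([10, 20], type=pa.int64())
    struct_arr = pa.StructArray.from_arrays([ints, longs], ["int", "long"])
    print(struct_arr.type)  # struct<int: int32, long: int64>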

sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala

Lines changed: 3 additions & 17 deletions
@@ -21,7 +21,6 @@ import scala.collection.JavaConverters._
 
 import org.apache.arrow.vector._
 import org.apache.arrow.vector.complex._
-import org.apache.arrow.vector.types.pojo.ArrowType
 
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.SpecializedGetters

@@ -62,7 +61,7 @@ object ArrowWriter {
       case (ArrayType(_, _), vector: ListVector) =>
         val elementVector = createFieldWriter(vector.getDataVector())
         new ArrayWriter(vector, elementVector)
-      case (StructType(_), vector: NullableMapVector) =>
+      case (StructType(_), vector: StructVector) =>
         val children = (0 until vector.size()).map { ordinal =>
           createFieldWriter(vector.getChildByOrdinal(ordinal))
         }

@@ -129,20 +128,7 @@ private[arrow] abstract class ArrowFieldWriter {
   }
 
   def reset(): Unit = {
-    // TODO: reset() should be in a common interface
-    valueVector match {
-      case fixedWidthVector: BaseFixedWidthVector => fixedWidthVector.reset()
-      case variableWidthVector: BaseVariableWidthVector => variableWidthVector.reset()
-      case listVector: ListVector =>
-        // Manual "reset" the underlying buffer.
-        // TODO: When we upgrade to Arrow 0.10.0, we can simply remove this and call
-        // `listVector.reset()`.
-        val buffers = listVector.getBuffers(false)
-        buffers.foreach(buf => buf.setZero(0, buf.capacity()))
-        listVector.setValueCount(0)
-        listVector.setLastSet(0)
-      case _ =>
-    }
+    valueVector.reset()
     count = 0
   }
 }

@@ -323,7 +309,7 @@ private[arrow] class ArrayWriter(
 }
 
 private[arrow] class StructWriter(
-    val valueVector: NullableMapVector,
+    val valueVector: StructVector,
     children: Array[ArrowFieldWriter]) extends ArrowFieldWriter {
 
   override def setNull(): Unit = {

sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -336,7 +336,7 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
     val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue)
     val schema = new StructType().add("int", IntegerType).add("long", LongType)
     val vector = ArrowUtils.toArrowField("struct", schema, nullable = false, null)
-      .createVector(allocator).asInstanceOf[NullableMapVector]
+      .createVector(allocator).asInstanceOf[StructVector]
 
     vector.allocateNew()
     val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector]

@@ -373,7 +373,7 @@
     val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue)
     val schema = new StructType().add("int", IntegerType).add("long", LongType)
     val vector = ArrowUtils.toArrowField("struct", schema, nullable = true, null)
-      .createVector(allocator).asInstanceOf[NullableMapVector]
+      .createVector(allocator).asInstanceOf[StructVector]
     vector.allocateNew()
     val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector]
     val longVector = vector.getChildByOrdinal(1).asInstanceOf[BigIntVector]
