Replace the Arrow file format with the Arrow stream format for pandas UDFs, instead of adding a new conf to switch between the two.

ueshin · ueshin · commit 7f6e43f7b291 · 2017-09-27T11:11:23.000+09:00
diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala
@@ -36,7 +36,6 @@ private[spark] object PythonEvalType {
   val NON_UDF = 0
   val SQL_BATCHED_UDF = 1
   val SQL_PANDAS_UDF = 2
-  val SQL_PANDAS_UDF_STREAM = 3
 }
 
 /**
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
@@ -86,7 +86,6 @@ class PythonEvalType(object):
     NON_UDF = 0
     SQL_BATCHED_UDF = 1
     SQL_PANDAS_UDF = 2
-    SQL_PANDAS_UDF_STREAM = 3
 
 
 class Serializer(object):
@@ -235,7 +234,7 @@ def cast_series(s, t):
 
 class ArrowPandasSerializer(ArrowSerializer):
     """
-    Serializes Pandas.Series as Arrow data.
+    Serializes Pandas.Series as Arrow data with Arrow file format.
     """
 
     def dumps(self, series):
@@ -259,7 +258,7 @@ def __repr__(self):
 
 class ArrowStreamPandasSerializer(Serializer):
     """
-    (De)serializes a vectorized(Apache Arrow) stream.
+    Serializes Pandas.Series as Arrow data with Arrow streaming format.
     """
 
     def load_stream(self, stream):
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -3376,16 +3376,6 @@ def test_vectorized_udf_empty_partition(self):
         res = df.select(f(col('id')))
         self.assertEquals(df.collect(), res.collect())
 
-
-@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
-class ArrowStreamVectorizedUDFTests(VectorizedUDFTests):
-
-    @classmethod
-    def setUpClass(cls):
-        VectorizedUDFTests.setUpClass()
-        cls.spark.conf.set("spark.sql.execution.arrow.stream.enable", "true")
-
-
 if __name__ == "__main__":
     from pyspark.sql.tests import *
     if xmlrunner:
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
@@ -31,7 +31,7 @@
 from pyspark.files import SparkFiles
 from pyspark.serializers import write_with_length, write_int, read_long, \
     write_long, read_int, SpecialLengths, PythonEvalType, UTF8Deserializer, PickleSerializer, \
-    BatchedSerializer, ArrowPandasSerializer, ArrowStreamPandasSerializer
+    BatchedSerializer, ArrowStreamPandasSerializer
 from pyspark.sql.types import toArrowType
 from pyspark import shuffle
 
@@ -98,10 +98,10 @@ def read_single_udf(pickleSer, infile, eval_type):
         else:
             row_func = chain(row_func, f)
     # the last returnType will be the return type of UDF
-    if eval_type == PythonEvalType.SQL_BATCHED_UDF:
-        return arg_offsets, wrap_udf(row_func, return_type)
-    else:
+    if eval_type == PythonEvalType.SQL_PANDAS_UDF:
         return arg_offsets, wrap_pandas_udf(row_func, return_type)
+    else:
+        return arg_offsets, wrap_udf(row_func, return_type)
 
 
 def read_udfs(pickleSer, infile, eval_type):
@@ -123,8 +123,6 @@ def read_udfs(pickleSer, infile, eval_type):
     func = lambda _, it: map(mapper, it)
 
     if eval_type == PythonEvalType.SQL_PANDAS_UDF:
-        ser = ArrowPandasSerializer()
-    elif eval_type == PythonEvalType.SQL_PANDAS_UDF_STREAM:
         ser = ArrowStreamPandasSerializer()
     else:
         ser = BatchedSerializer(PickleSerializer(), 100)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -925,13 +925,6 @@ object SQLConf {
       .intConf
       .createWithDefault(10000)
 
-  val ARROW_EXECUTION_STREAM_ENABLE =
-    buildConf("spark.sql.execution.arrow.stream.enable")
-      .internal()
-      .doc("When using Apache Arrow, use Arrow stream protocol if possible.")
-      .booleanConf
-      .createWithDefault(false)
-
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
   }
@@ -1210,8 +1203,6 @@ class SQLConf extends Serializable with Logging {
 
   def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH)
 
-  def arrowStreamEnable: Boolean = getConf(ARROW_EXECUTION_STREAM_ENABLE)
-
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -102,7 +102,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
 
   /** A sequence of rules that will be applied in order to the physical plan before execution. */
   protected def preparations: Seq[Rule[SparkPlan]] = Seq(
-    python.ExtractPythonUDFs(sparkSession.sessionState.conf),
+    python.ExtractPythonUDFs,
     PlanSubqueries(sparkSession),
     new ReorderJoinPredicates,
     EnsureRequirements(sparkSession.sessionState.conf),
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
@@ -17,12 +17,13 @@
 
 package org.apache.spark.sql.execution.python
 
+import scala.collection.JavaConverters._
+
 import org.apache.spark.TaskContext
 import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.arrow.{ArrowConverters, ArrowPayload}
 import org.apache.spark.sql.types.StructType
 
 /**
@@ -39,25 +40,36 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi
       iter: Iterator[InternalRow],
       schema: StructType,
       context: TaskContext): Iterator[InternalRow] = {
-    val inputIterator = ArrowConverters.toPayloadIterator(
-      iter, schema, conf.arrowMaxRecordsPerBatch, context).map(_.asPythonSerializable)
-
-    // Output iterator for results from Python.
-    val outputIterator = new PythonUDFRunner(
-        funcs, bufferSize, reuseWorker, PythonEvalType.SQL_PANDAS_UDF, argOffsets)
-      .compute(inputIterator, context.partitionId(), context)
-
-    val outputRowIterator = ArrowConverters.fromPayloadIterator(
-      outputIterator.map(new ArrowPayload(_)), context)
-
-    // Verify that the output schema is correct
-    if (outputRowIterator.hasNext) {
-      val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex
-        .map { case (attr, i) => attr.withName(s"_$i") })
-      assert(schemaOut.equals(outputRowIterator.schema),
-        s"Invalid schema from pandas_udf: expected $schemaOut, got ${outputRowIterator.schema}")
-    }
 
-    outputRowIterator
+    val schemaOut = StructType.fromAttributes(output.drop(child.output.length).zipWithIndex
+      .map { case (attr, i) => attr.withName(s"_$i") })
+
+    val columnarBatchIter = new ArrowPythonRunner(
+        funcs, conf.arrowMaxRecordsPerBatch, bufferSize, reuseWorker,
+        PythonEvalType.SQL_PANDAS_UDF, argOffsets, schema)
+      .compute(iter, context.partitionId(), context)
+
+    new Iterator[InternalRow] {
+
+      var currentIter = if (columnarBatchIter.hasNext) {
+        val batch = columnarBatchIter.next()
+        assert(schemaOut.equals(batch.schema),
+          s"Invalid schema from pandas_udf: expected $schemaOut, got ${batch.schema}")
+        batch.rowIterator.asScala
+      } else {
+        Iterator.empty
+      }
+
+      override def hasNext: Boolean = currentIter.hasNext || {
+        if (columnarBatchIter.hasNext) {
+          currentIter = columnarBatchIter.next().rowIterator.asScala
+          hasNext
+        } else {
+          false
+        }
+      }
+
+      override def next(): InternalRow = currentIter.next()
+    }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala
@@ -37,7 +37,7 @@ import org.apache.spark.util.Utils
 /**
  * Similar to `PythonUDFRunner`, but exchange data with Python worker via Arrow stream.
  */
-class ArrowStreamPythonUDFRunner(
+class ArrowPythonRunner(
     funcs: Seq[ChainedPythonFunctions],
     batchSize: Int,
     bufferSize: Int,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowStreamEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowStreamEvalPythonExec.scala
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala
@@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Proj
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution
 import org.apache.spark.sql.execution.{FilterExec, SparkPlan}
-import org.apache.spark.sql.internal.SQLConf
 
 
 /**
@@ -91,7 +90,7 @@ object ExtractPythonUDFFromAggregate extends Rule[LogicalPlan] {
  * This has the limitation that the input to the Python UDF is not allowed include attributes from
  * multiple child operators.
  */
-case class ExtractPythonUDFs(conf: SQLConf) extends Rule[SparkPlan] with PredicateHelper {
+object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper {
 
   private def hasPythonUDF(e: Expression): Boolean = {
     e.find(_.isInstanceOf[PythonUDF]).isDefined
@@ -142,11 +141,7 @@ case class ExtractPythonUDFs(conf: SQLConf) extends Rule[SparkPlan] with Predica
 
           val evaluation = validUdfs.partition(_.vectorized) match {
             case (vectorizedUdfs, plainUdfs) if plainUdfs.isEmpty =>
-              if (conf.arrowStreamEnable) {
-                ArrowStreamEvalPythonExec(vectorizedUdfs, child.output ++ resultAttrs, child)
-              } else {
-                ArrowEvalPythonExec(vectorizedUdfs, child.output ++ resultAttrs, child)
-              }
+              ArrowEvalPythonExec(vectorizedUdfs, child.output ++ resultAttrs, child)
             case (vectorizedUdfs, plainUdfs) if vectorizedUdfs.isEmpty =>
               BatchEvalPythonExec(plainUdfs, child.output ++ resultAttrs, child)
             case _ =>

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,6 @@ private[spark] object PythonEvalType {`
`36`	`36`	`val NON_UDF = 0`
`37`	`37`	`val SQL_BATCHED_UDF = 1`
`38`	`38`	`val SQL_PANDAS_UDF = 2`
`39`		`- val SQL_PANDAS_UDF_STREAM = 3`
`40`	`39`	`}`
`41`	`40`
`42`	`41`	`/**`