Skip to content

Commit 6b22fea

Browse files
committed
Address comments
1 parent 2bc906d commit 6b22fea

1 file changed

Lines changed: 16 additions & 49 deletions

File tree

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala

Lines changed: 16 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ object ExtractPythonUDFFromAggregate extends Rule[LogicalPlan] {
9595
*/
9696
object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper {
9797

98-
private case class LazyEvalType(var evalType: Int = -1) {
98+
private case class EvalTypeHolder(var evalType: Int = -1) {
9999

100100
def isSet: Boolean = evalType >= 0
101101

@@ -120,57 +120,24 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper {
120120
e.find(PythonUDF.isScalarPythonUDF).isDefined
121121
}
122122

123-
/**
124-
* Check whether a PythonUDF expression can be evaluated in Python.
125-
*
126-
* If the lazy eval type is not set, this method checks for either Batched Python UDF or Scalar
127-
* Pandas UDF. If the lazy eval type is set, this method checks for the expression of the
128-
* specified eval type.
129-
*
130-
* This method will also set the lazy eval type to be the type of the first evaluable expression,
131-
* i.e., if lazy eval type is not set and we find an evaluable Python UDF expression, lazy eval
132-
* type will be set to the eval type of the expression.
133-
*
134-
*/
135-
private def canEvaluateInPython(e: PythonUDF, lazyEvalType: LazyEvalType): Boolean = {
136-
if (!lazyEvalType.isSet) {
137-
e.children match {
138-
// single PythonUDF child could be chained and evaluated in Python if eval type is the same
139-
case Seq(u: PythonUDF) =>
140-
// Need to recheck the eval type because lazy eval type will be set if child Python UDF is
141-
// evaluable
142-
canEvaluateInPython(u, lazyEvalType) && lazyEvalType.get == e.evalType
143-
// Python UDF can't be evaluated directly in JVM
144-
case children => if (!children.exists(hasScalarPythonUDF)) {
145-
// We found the first evaluable expression, set lazy eval type to its eval type.
146-
lazyEvalType.set(e.evalType)
147-
true
148-
} else {
149-
false
150-
}
151-
}
152-
} else {
153-
if (e.evalType != lazyEvalType.get) {
154-
false
155-
} else {
156-
e.children match {
157-
case Seq(u: PythonUDF) => canEvaluateInPython(u, lazyEvalType)
158-
case children => !children.exists(hasScalarPythonUDF)
159-
}
160-
}
123+
private def canEvaluateInPython(e: PythonUDF): Boolean = {
124+
e.children match {
125+
// single PythonUDF child could be chained and evaluated in Python
126+
case Seq(u: PythonUDF) => e.evalType == u.evalType && canEvaluateInPython(u)
127+
// Python UDF can't be evaluated directly in JVM
128+
case children => !children.exists(hasScalarPythonUDF)
161129
}
162130
}
163131

164132
private def collectEvaluableUDFs(
165133
expr: Expression,
166-
evalType: LazyEvalType
167-
): Seq[PythonUDF] = {
168-
expr match {
169-
case udf: PythonUDF if
170-
PythonUDF.isScalarPythonUDF(udf) && canEvaluateInPython(udf, evalType) =>
171-
Seq(udf)
172-
case e => e.children.flatMap(collectEvaluableUDFs(_, evalType))
173-
}
134+
firstEvalType: EvalTypeHolder): Seq[PythonUDF] = expr match {
135+
case udf: PythonUDF if PythonUDF.isScalarPythonUDF(udf)
136+
&& (!firstEvalType.isSet || firstEvalType.get == udf.evalType)
137+
&& canEvaluateInPython(udf) =>
138+
firstEvalType.evalType = udf.evalType
139+
Seq(udf)
140+
case e => e.children.flatMap(collectEvaluableUDFs(_, firstEvalType))
174141
}
175142

176143
def apply(plan: SparkPlan): SparkPlan = plan transformUp {
@@ -181,8 +148,8 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper {
181148
* Extract all the PythonUDFs from the current operator and evaluate them before the operator.
182149
*/
183150
private def extract(plan: SparkPlan): SparkPlan = {
184-
val lazyEvalType = new LazyEvalType
185-
val udfs = plan.expressions.flatMap(collectEvaluableUDFs(_, lazyEvalType))
151+
val firstEvalType = new EvalTypeHolder
152+
val udfs = plan.expressions.flatMap(collectEvaluableUDFs(_, firstEvalType))
186153
// ignore the PythonUDF that come from second/third aggregate, which is not used
187154
.filter(udf => udf.references.subsetOf(plan.inputSet))
188155
if (udfs.isEmpty) {

0 commit comments

Comments
 (0)