[SPARK-35560][SQL] Remove redundant subexpression evaluation in nested subexpressions

viirya · viirya · commit dbf0b50757ce · 2021-06-01T19:13:12.000-07:00
### What changes were proposed in this pull request? This patch improves subexpression evaluation under whole-stage codegen in the case of nested subexpressions. ### Why are the changes needed? When common subexpressions are nested, whole-stage codegen's subexpression elimination evaluates the inner subexpression redundantly: once on its own and once more inside each enclosing subexpression. We should avoid that. For example, if we have two sub-exprs: 1. `simpleUDF($"id")` 2. `functions.length(simpleUDF($"id"))` We should only evaluate `simpleUDF($"id")` once, i.e. ```java subExpr1 = simpleUDF($"id"); subExpr2 = functions.length(subExpr1); ``` Snippets of the generated code: Before: ```java /* 040 */ private int project_subExpr_1(long project_expr_0_0) { /* 041 */ boolean project_isNull_6 = false; /* 042 */ UTF8String project_value_6 = null; /* 043 */ if (!false) { /* 044 */ project_value_6 = UTF8String.fromString(String.valueOf(project_expr_0_0)); /* 045 */ } /* 046 */ /* 047 */ Object project_arg_1 = null; /* 048 */ if (project_isNull_6) { /* 049 */ project_arg_1 = ((scala.Function1[]) references[3] /* converters */)[0].apply(null); /* 050 */ } else { /* 051 */ project_arg_1 = ((scala.Function1[]) references[3] /* converters */)[0].apply(project_value_6); /* 052 */ } /* 053 */ /* 054 */ UTF8String project_result_1 = null; /* 055 */ try { /* 056 */ project_result_1 = (UTF8String)((scala.Function1[]) references[3] /* converters */)[1].apply(((scala.Function1) references[4] /* udf */).apply(project_arg_1) ); /* 057 */ } catch (Throwable e) { /* 058 */ throw QueryExecutionErrors.failedExecuteUserDefinedFunctionError( /* 059 */ "DataFrameSuite$$Lambda$6418/1507986601", "string", "string", e); /* 060 */ } /* 061 */ /* 062 */ boolean project_isNull_5 = project_result_1 == null; /* 063 */ UTF8String project_value_5 = null; /* 064 */ if (!project_isNull_5) { /* 065 */ project_value_5 = project_result_1; /* 066 */ } /* 067 */ boolean project_isNull_4 = project_isNull_5; /* 068 */ int project_value_4 = -1; /* 069 */ /* 070 */ if 
(!project_isNull_5) { /* 071 */ project_value_4 = (project_value_5).numChars(); /* 072 */ } /* 073 */ project_subExprIsNull_1 = project_isNull_4; /* 074 */ return project_value_4; /* 075 */ } ... /* 149 */ private UTF8String project_subExpr_0(long project_expr_0_0) { /* 150 */ boolean project_isNull_2 = false; /* 151 */ UTF8String project_value_2 = null; /* 152 */ if (!false) { /* 153 */ project_value_2 = UTF8String.fromString(String.valueOf(project_expr_0_0)); /* 154 */ } /* 155 */ /* 156 */ Object project_arg_0 = null; /* 157 */ if (project_isNull_2) { /* 158 */ project_arg_0 = ((scala.Function1[]) references[1] /* converters */)[0].apply(null); /* 159 */ } else { /* 160 */ project_arg_0 = ((scala.Function1[]) references[1] /* converters */)[0].apply(project_value_2); /* 161 */ } /* 162 */ /* 163 */ UTF8String project_result_0 = null; /* 164 */ try { /* 165 */ project_result_0 = (UTF8String)((scala.Function1[]) references[1] /* converters */)[1].apply(((scala.Function1) references[2] /* udf */).apply(project_arg_0) ); /* 166 */ } catch (Throwable e) { /* 167 */ throw QueryExecutionErrors.failedExecuteUserDefinedFunctionError( /* 168 */ "DataFrameSuite$$Lambda$6418/1507986601", "string", "string", e); /* 169 */ } /* 170 */ /* 171 */ boolean project_isNull_1 = project_result_0 == null; /* 172 */ UTF8String project_value_1 = null; /* 173 */ if (!project_isNull_1) { /* 174 */ project_value_1 = project_result_0; /* 175 */ } /* 176 */ project_subExprIsNull_0 = project_isNull_1; /* 177 */ return project_value_1; /* 178 */ } ``` After: ```java /* 041 */ private void project_subExpr_1(long project_expr_0_0) { /* 042 */ boolean project_isNull_8 = project_subExprIsNull_0; /* 043 */ int project_value_8 = -1; /* 044 */ /* 045 */ if (!project_subExprIsNull_0) { /* 046 */ project_value_8 = (project_mutableStateArray_0[0]).numChars(); /* 047 */ } /* 048 */ project_subExprIsNull_1 = project_isNull_8; /* 049 */ project_subExprValue_0 = project_value_8; /* 050 */ } /* 056 */ ... 
/* 123 */ /* 124 */ private void project_subExpr_0(long project_expr_0_0) { /* 125 */ boolean project_isNull_6 = false; /* 126 */ UTF8String project_value_6 = null; /* 127 */ if (!false) { /* 128 */ project_value_6 = UTF8String.fromString(String.valueOf(project_expr_0_0)); /* 129 */ } /* 130 */ /* 131 */ Object project_arg_1 = null; /* 132 */ if (project_isNull_6) { /* 133 */ project_arg_1 = ((scala.Function1[]) references[3] /* converters */)[0].apply(null); /* 134 */ } else { /* 135 */ project_arg_1 = ((scala.Function1[]) references[3] /* converters */)[0].apply(project_value_6); /* 136 */ } /* 137 */ /* 138 */ UTF8String project_result_1 = null; /* 139 */ try { /* 140 */ project_result_1 = (UTF8String)((scala.Function1[]) references[3] /* converters */)[1].apply(((scala.Function1) references[4] /* udf */).apply(project_arg_1) ); /* 141 */ } catch (Throwable e) { /* 142 */ throw QueryExecutionErrors.failedExecuteUserDefinedFunctionError( /* 143 */ "DataFrameSuite$$Lambda$6430/2004847941", "string", "string", e); /* 144 */ } /* 145 */ /* 146 */ boolean project_isNull_5 = project_result_1 == null; /* 147 */ UTF8String project_value_5 = null; /* 148 */ if (!project_isNull_5) { /* 149 */ project_value_5 = project_result_1; /* 150 */ } /* 151 */ project_subExprIsNull_0 = project_isNull_5; /* 152 */ project_mutableStateArray_0[0] = project_value_5; /* 153 */ } ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Unit test. Closes #32699 from viirya/improve-subexpr. Authored-by: Liang-Chi Hsieh <viirya@gmail.com> Signed-off-by: Liang-Chi Hsieh <viirya@gmail.com>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -1039,21 +1039,25 @@ class CodegenContext extends Logging {
   def subexpressionEliminationForWholeStageCodegen(expressions: Seq[Expression]): SubExprCodes = {
     // Create a clear EquivalentExpressions and SubExprEliminationState mapping
     val equivalentExpressions: EquivalentExpressions = new EquivalentExpressions
-    val localSubExprEliminationExprs = mutable.HashMap.empty[Expression, SubExprEliminationState]
+    val localSubExprEliminationExprsForNonSplit =
+      mutable.HashMap.empty[Expression, SubExprEliminationState]
 
     // Add each expression tree and compute the common subexpressions.
     expressions.foreach(equivalentExpressions.addExprTree(_))
 
     // Get all the expressions that appear at least twice and set up the state for subexpression
     // elimination.
     val commonExprs = equivalentExpressions.getAllEquivalentExprs(1)
-    lazy val commonExprVals = commonExprs.map(_.head.genCode(this))
 
-    lazy val nonSplitExprCode = {
-      commonExprs.zip(commonExprVals).map { case (exprs, eval) =>
-        // Generate the code for this expression tree.
-        val state = SubExprEliminationState(eval.isNull, eval.value)
-        exprs.foreach(localSubExprEliminationExprs.put(_, state))
+    val nonSplitExprCode = {
+      commonExprs.map { exprs =>
+        val eval = withSubExprEliminationExprs(localSubExprEliminationExprsForNonSplit.toMap) {
+          val eval = exprs.head.genCode(this)
+          // Generate the code for this expression tree.
+          val state = SubExprEliminationState(eval.isNull, eval.value)
+          exprs.foreach(localSubExprEliminationExprsForNonSplit.put(_, state))
+          Seq(eval)
+        }.head
         eval.code.toString
       }
     }
@@ -1068,11 +1072,19 @@ class CodegenContext extends Logging {
     }.unzip
 
     val splitThreshold = SQLConf.get.methodSplitThreshold
-    val codes = if (commonExprVals.map(_.code.length).sum > splitThreshold) {
+
+    val (codes, subExprsMap, exprCodes) = if (nonSplitExprCode.map(_.length).sum > splitThreshold) {
       if (inputVarsForAllFuncs.map(calculateParamLengthFromExprValues).forall(isValidParamLength)) {
-        commonExprs.zipWithIndex.map { case (exprs, i) =>
+        val localSubExprEliminationExprs =
+          mutable.HashMap.empty[Expression, SubExprEliminationState]
+
+        val splitCodes = commonExprs.zipWithIndex.map { case (exprs, i) =>
           val expr = exprs.head
-          val eval = commonExprVals(i)
+          val eval = withSubExprEliminationExprs(localSubExprEliminationExprs.toMap) {
+            Seq(expr.genCode(this))
+          }.head
+
+          val value = addMutableState(javaType(expr.dataType), "subExprValue")
 
           val isNullLiteral = eval.isNull match {
             case TrueLiteral | FalseLiteral => true
@@ -1090,34 +1102,33 @@ class CodegenContext extends Logging {
           val inputVars = inputVarsForAllFuncs(i)
           val argList =
             inputVars.map(v => s"${CodeGenerator.typeName(v.javaType)} ${v.variableName}")
-          val returnType = javaType(expr.dataType)
           val fn =
             s"""
-               |private $returnType $fnName(${argList.mkString(", ")}) {
+               |private void $fnName(${argList.mkString(", ")}) {
                |  ${eval.code}
                |  $isNullEvalCode
-               |  return ${eval.value};
+               |  $value = ${eval.value};
                |}
                """.stripMargin
 
-          val value = freshName("subExprValue")
-          val state = SubExprEliminationState(isNull, JavaCode.variable(value, expr.dataType))
+          val state = SubExprEliminationState(isNull, JavaCode.global(value, expr.dataType))
           exprs.foreach(localSubExprEliminationExprs.put(_, state))
           val inputVariables = inputVars.map(_.variableName).mkString(", ")
-          s"$returnType $value = ${addNewFunction(fnName, fn)}($inputVariables);"
+          s"${addNewFunction(fnName, fn)}($inputVariables);"
         }
+        (splitCodes, localSubExprEliminationExprs, exprCodesNeedEvaluate)
       } else {
         if (Utils.isTesting) {
           throw QueryExecutionErrors.failedSplitSubExpressionError(MAX_JVM_METHOD_PARAMS_LENGTH)
         } else {
           logInfo(QueryExecutionErrors.failedSplitSubExpressionMsg(MAX_JVM_METHOD_PARAMS_LENGTH))
-          nonSplitExprCode
+          (nonSplitExprCode, localSubExprEliminationExprsForNonSplit, Seq.empty)
         }
       }
     } else {
-      nonSplitExprCode
+      (nonSplitExprCode, localSubExprEliminationExprsForNonSplit, Seq.empty)
     }
-    SubExprCodes(codes, localSubExprEliminationExprs.toMap, exprCodesNeedEvaluate.flatten)
+    SubExprCodes(codes, subExprsMap.toMap, exprCodes.flatten)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -2882,6 +2882,31 @@ class DataFrameSuite extends QueryTest
     df2.collect()
     assert(accum.value == 15)
   }
+
+  test("SPARK-35560: Remove redundant subexpression evaluation in nested subexpressions") {
+    Seq(1, Int.MaxValue).foreach { splitThreshold =>
+      withSQLConf(SQLConf.CODEGEN_METHOD_SPLIT_THRESHOLD.key -> splitThreshold.toString) {
+        val accum = sparkContext.longAccumulator("call")
+        val simpleUDF = udf((s: String) => {
+          accum.add(1)
+          s
+        })
+
+        // Common exprs: (1) simpleUDF($"id") and (2) functions.length(simpleUDF($"id")).
+        // (2) must reuse the result of (1) instead of running the UDF again, i.e.
+        //   subExpr1 = simpleUDF($"id");
+        //   subExpr2 = functions.length(subExpr1);
+        // Threshold 1 is meant to exercise the split-function path and Int.MaxValue the
+        // inline path; either way the UDF should run once per row, so 5 rows => 5 calls.
+        val df = spark.range(5).select(
+          when(functions.length(simpleUDF($"id")) === 1, lower(simpleUDF($"id")))
+            .when(functions.length(simpleUDF($"id")) === 0, upper(simpleUDF($"id")))
+            .otherwise(simpleUDF($"id")).as("output"))
+        df.collect()
+        assert(accum.value == 5)
+      }
+    }
+  }
 }
 
 case class GroupByKey(a: Int, b: Int)