Changes from 4 commits
@@ -403,13 +403,14 @@ class CodegenContext {
   * equivalentExpressions will match the tree containing `col1 + col2` and it will only
   * be evaluated once.
   */
-  val equivalentExpressions: EquivalentExpressions = new EquivalentExpressions
+  private val equivalentExpressions: EquivalentExpressions = new EquivalentExpressions

  // Foreach expression that is participating in subexpression elimination, the state to use.
-  var subExprEliminationExprs = Map.empty[Expression, SubExprEliminationState]
+  // Visible for testing.
+  private[expressions] var subExprEliminationExprs = Map.empty[Expression, SubExprEliminationState]

  // The collection of sub-expression result resetting methods that need to be called on each row.
-  val subexprFunctions = mutable.ArrayBuffer.empty[String]
+  private val subexprFunctions = mutable.ArrayBuffer.empty[String]

  val outerClassName = "OuterClass"
@@ -993,6 +994,14 @@ class CodegenContext {
}
}

+  /**
+   * Returns the code for subexpression elimination after splitting it if necessary.
+   */
+  def subexprFunctionsCode: String = {
+    // Whole-stage codegen's subexpression elimination is handled in another code path
+    splitExpressions(subexprFunctions, "subexprFunc_split", Seq("InternalRow" -> INPUT_ROW))
Member:
  We don't need to check if it's empty?

    if (subexprFunctions.nonEmpty) {
      splitExpressions(...
    } else {
      ""
    }

Contributor Author:
  It is not necessary: if it is empty, splitExpressions returns an empty string anyway. I can add the check if you think it is clearer.

+  }

/**
* Perform a function which generates a sequence of ExprCodes with a given mapping between
* expressions and common expressions, instead of using the mapping in current context.
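The new `subexprFunctionsCode` routes the accumulated snippets through `splitExpressions`, so that when the combined subexpression code would push the enclosing method past the JVM's 64KB per-method bytecode limit, it is broken into smaller helper methods and replaced by a sequence of calls. A minimal standalone sketch of that splitting idea (the class, method names, and size threshold here are hypothetical, not Spark's actual implementation):

```java
import java.util.ArrayList;
import java.util.List;

public class ExprSplitter {
    // Groups statement strings into helper methods whose source length stays
    // under maxChunkChars, then emits the helpers followed by a call to each
    // one in order. Returns "" for an empty input, which is why the explicit
    // nonEmpty check raised in the review is not strictly required.
    public static String splitExpressions(List<String> statements,
                                          String funcName,
                                          int maxChunkChars) {
        if (statements.isEmpty()) {
            return "";
        }
        List<StringBuilder> chunks = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        chunks.add(current);
        for (String stmt : statements) {
            // Start a new chunk when adding this statement would exceed the budget.
            if (current.length() > 0 && current.length() + stmt.length() > maxChunkChars) {
                current = new StringBuilder();
                chunks.add(current);
            }
            current.append(stmt).append("\n");
        }
        StringBuilder methods = new StringBuilder();
        StringBuilder calls = new StringBuilder();
        for (int i = 0; i < chunks.size(); i++) {
            String name = funcName + "_" + i;
            methods.append("private void ").append(name).append("(InternalRow i) {\n")
                   .append(chunks.get(i))
                   .append("}\n");
            calls.append(name).append("(i);\n");
        }
        // In real codegen the helper methods are registered on the generated
        // class and only the sequence of calls is returned; here both are
        // concatenated for illustration.
        return methods.append(calls).toString();
    }
}
```

Each helper method carries only its chunk of statements, so no single generated method grows without bound as the number of subexpressions increases.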
@@ -92,7 +92,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableProjection] {
}

// Evaluate all the subexpressions.
-    val evalSubexpr = ctx.subexprFunctions.mkString("\n")
+    val evalSubexpr = ctx.subexprFunctionsCode

val allProjections = ctx.splitExpressionsWithCurrentInputs(projectionCodes.map(_._1))
val allUpdates = ctx.splitExpressionsWithCurrentInputs(projectionCodes.map(_._2))
@@ -299,7 +299,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafeProjection] {
v => s"$v = new $rowWriterClass(${expressions.length}, ${numVarLenFields * 32});")

// Evaluate all the subexpression.
-    val evalSubexpr = ctx.subexprFunctions.mkString("\n")
+    val evalSubexpr = ctx.subexprFunctionsCode

val writeExpressions = writeExpressionsToBuffer(
ctx, ctx.INPUT_ROW, exprEvals, exprSchemas, rowWriter, isTopLevel = true)
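In both projection generators the change is the same: `evalSubexpr`, which is spliced into the generated `apply` method, now holds a short sequence of calls to the split helpers (or the empty string) instead of the concatenated function bodies. A rough sketch of what that assembled source could look like (the names and the exact generated shape are hypothetical):

```java
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class ProjectionAssembly {
    // After the change, the subexpression block is a list of calls to the
    // split helpers, so the enclosing apply() stays small no matter how many
    // subexpressions were collected. Zero chunks yields the empty string.
    public static String evalSubexprCalls(int numChunks) {
        return IntStream.range(0, numChunks)
                .mapToObj(n -> "subexprFunc_split_" + n + "(i);")
                .collect(Collectors.joining("\n"));
    }

    // Splices the subexpression calls ahead of the per-field projection code,
    // mirroring where evalSubexpr lands in the generated projection source.
    public static String applyMethod(String evalSubexpr, String projections) {
        return "public Object apply(InternalRow i) {\n"
                + evalSubexpr + "\n"
                + projections + "\n"
                + "return mutableRow;\n}";
    }
}
```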
@@ -2202,4 +2202,12 @@ class DataFrameSuite extends QueryTest with SharedSparkSession {
|*(1) Range (0, 10, step=1, splits=2)""".stripMargin))
}
}

+  test("SPARK-28916: subexpression elimination can cause 64kb code limit") {
Contributor:
  How long does this test run? We can write a unit test instead if the end-to-end test is too expensive to run.

Contributor Author:
  Yes, it takes about 3 mins. I'll try and find a way to create a UT...

+    val df = spark.range(2).selectExpr((0 to 5000).map(i => s"id as field_$i"): _*)
+    df.createOrReplaceTempView("spark64kb")
+    val data = spark.sql("select * from spark64kb limit 10")
Member:
  Is this a test for GenerateMutableProjection? How about the case GenerateUnsafeProjection?

+    // This fails if the 64KB limit is reached in code generation
+    data.describe()
Member:
  Shall we disable fallback to interpreter in this test?

Contributor Author (@mgaido91, Sep 4, 2019):
  I need to revisit it according to Wenchen's suggestion... unfortunately this might take some time as I am a bit busy these days...

+  }
 }
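The regression test builds a 5001-column row (field_0 through field_5000) so that `describe()` compiles a projection with thousands of generated statements. A back-of-the-envelope sketch of why one unsplit method overruns the JVM's 64KB per-method bytecode limit while split helpers do not (the per-expression cost and chunk budget below are invented for illustration; only the 65536-byte limit is fixed by the JVM):

```java
public class CodeSizeEstimate {
    public static void main(String[] args) {
        int columns = 5001;        // id as field_0 .. field_5000
        int bytesPerExpr = 64;     // assumed bytecode cost per generated statement
        int limit = 65536;         // JVM limit on bytecode per method (64KB)

        // One method holding every statement blows past the limit by ~5x.
        int unsplit = columns * bytesPerExpr;
        System.out.println("unsplit: ~" + unsplit + " bytes (limit " + limit + ")");

        // Splitting into bounded helpers keeps every method well under it.
        int maxPerChunk = 1024;    // assumed per-helper budget after splitting
        int exprsPerChunk = maxPerChunk / bytesPerExpr;
        int chunks = (columns + exprsPerChunk - 1) / exprsPerChunk;
        System.out.println(chunks + " helpers of <= " + maxPerChunk + " bytes each");
    }
}
```

Under these assumed numbers the unsplit method needs roughly 320KB of bytecode, which is exactly the failure mode the test exercises, while each split helper stays bounded regardless of column count.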