-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-18091] [SQL] Deep if expressions cause Generated SpecificUnsafeProjection code to exceed JVM code size limit #15620
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,19 +64,75 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi | |
| val trueEval = trueValue.genCode(ctx) | ||
| val falseEval = falseValue.genCode(ctx) | ||
|
|
||
| ev.copy(code = s""" | ||
| ${condEval.code} | ||
| boolean ${ev.isNull} = false; | ||
| ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; | ||
| if (!${condEval.isNull} && ${condEval.value}) { | ||
| ${trueEval.code} | ||
| ${ev.isNull} = ${trueEval.isNull}; | ||
| ${ev.value} = ${trueEval.value}; | ||
| } else { | ||
| ${falseEval.code} | ||
| ${ev.isNull} = ${falseEval.isNull}; | ||
| ${ev.value} = ${falseEval.value}; | ||
| }""") | ||
| // place generated code of condition, true value and false value in separate methods if | ||
| // their code combined is large | ||
| val combinedLength = condEval.code.length + trueEval.code.length + falseEval.code.length | ||
| val generatedCode = if (combinedLength > 1024 && | ||
| // Split these expressions only if they are created from a row object | ||
| (ctx.INPUT_ROW != null && ctx.currentVars == null)) { | ||
|
||
|
|
||
| val (condFuncName, condGlobalIsNull, condGlobalValue) = | ||
| createAndAddFunction(ctx, condEval, predicate.dataType, "evalIfCondExpr") | ||
| val (trueFuncName, trueGlobalIsNull, trueGlobalValue) = | ||
| createAndAddFunction(ctx, trueEval, trueValue.dataType, "evalIfTrueExpr") | ||
| val (falseFuncName, falseGlobalIsNull, falseGlobalValue) = | ||
| createAndAddFunction(ctx, falseEval, falseValue.dataType, "evalIfFalseExpr") | ||
| s""" | ||
| $condFuncName(${ctx.INPUT_ROW}); | ||
| boolean ${ev.isNull} = false; | ||
| ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; | ||
| if (!$condGlobalIsNull && $condGlobalValue) { | ||
| $trueFuncName(${ctx.INPUT_ROW}); | ||
| ${ev.isNull} = $trueGlobalIsNull; | ||
| ${ev.value} = $trueGlobalValue; | ||
| } else { | ||
| $falseFuncName(${ctx.INPUT_ROW}); | ||
| ${ev.isNull} = $falseGlobalIsNull; | ||
| ${ev.value} = $falseGlobalValue; | ||
| } | ||
| """ | ||
| } | ||
| else { | ||
| s""" | ||
| ${condEval.code} | ||
| boolean ${ev.isNull} = false; | ||
| ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; | ||
| if (!${condEval.isNull} && ${condEval.value}) { | ||
| ${trueEval.code} | ||
| ${ev.isNull} = ${trueEval.isNull}; | ||
| ${ev.value} = ${trueEval.value}; | ||
| } else { | ||
| ${falseEval.code} | ||
| ${ev.isNull} = ${falseEval.isNull}; | ||
| ${ev.value} = ${falseEval.value}; | ||
| } | ||
| """ | ||
| } | ||
|
|
||
| ev.copy(code = generatedCode) | ||
| } | ||
|
|
||
| private def createAndAddFunction( | ||
| ctx: CodegenContext, | ||
| ev: ExprCode, | ||
| dataType: DataType, | ||
| baseFuncName: String): (String, String, String) = { | ||
| val globalIsNull = ctx.freshName("isNull") | ||
| ctx.addMutableState("boolean", globalIsNull, s"$globalIsNull = false;") | ||
| val globalValue = ctx.freshName("value") | ||
| ctx.addMutableState(ctx.javaType(dataType), globalValue, | ||
| s"$globalValue = ${ctx.defaultValue(dataType)};") | ||
| val funcName = ctx.freshName(baseFuncName) | ||
| val funcBody = | ||
| s""" | ||
| |private void $funcName(InternalRow ${ctx.INPUT_ROW}) { | ||
| | ${ev.code.trim} | ||
| | $globalIsNull = ${ev.isNull}; | ||
| | $globalValue = ${ev.value}; | ||
| |} | ||
| """.stripMargin | ||
| ctx.addNewFunction(funcName, funcBody) | ||
| (funcName, globalIsNull, globalValue) | ||
| } | ||
|
|
||
| override def toString: String = s"if ($predicate) $trueValue else $falseValue" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -97,6 +97,27 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { | |
| assert(actual(0) == cases) | ||
| } | ||
|
|
||
| test("SPARK-18091: split large if expressions into blocks due to JVM code size limit") { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test looks better now. |
||
| val inStr = "StringForTesting" | ||
| val row = create_row(inStr) | ||
| val inputStrAttr = 'a.string.at(0) | ||
|
|
||
| var strExpr: Expression = inputStrAttr | ||
| for (_ <- 1 to 13) { | ||
| strExpr = If(EqualTo(Decode(Encode(strExpr, "utf-8"), "utf-8"), inputStrAttr), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cc @srowen I think this is the root cause. This test is an overkill, although this PR fixed the code size limitation problem, this test may still hit constants pool size limitation, which is a known limiation and has not been fixed yet. It seems that maven and sbt have different JVM settings when run test, so the problem only exists at maven side. I'm going to submit a PR to simplify this test a bit.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very interested in this, we know with 5 iterations instead of a 13 we don't get the problem (which makes sense as we'd only be generating so much code, not hitting the 64k constant pool size limit). I'm testing with IBM's SDK for Java so curious if it manifests itself in a different way for us or if we have a problem to fix on our end. I have log files exceeding 2 GB from the test printing the generated code on failure. If we add prints for the strExpr we see something like The problem is With my prints: Is this the same issue we're referring to? I've also seen timeouts and our Jenkins farm has 5h limits, I see the problem against branch-2.0, branch-2.1, master, but didn't see it for Spark 2.1.0 RC1.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @a-roberts I think this is merged into branch-2.1 after RC1 coming out. |
||
| strExpr, strExpr) | ||
| } | ||
|
|
||
| val expressions = Seq(strExpr) | ||
| val plan = GenerateUnsafeProjection.generate(expressions, true) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: other similar tests use
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The issue I had encountered involved GenerateUnsafeProjection and I was trying to reproduce the same in the unit test so I went with it. I can't think of any other reason. Any specific reason you guys use GenerateMutableProjection?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no specific reason too. |
||
| val actual = plan(row).toSeq(expressions.map(_.dataType)) | ||
| val expected = Seq(UTF8String.fromString(inStr)) | ||
|
|
||
| if (!checkResult(actual, expected)) { | ||
| fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected") | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-14793: split wide array creation into blocks due to JVM code size limit") { | ||
| val length = 5000 | ||
| val expressions = Seq(CreateArray(List.fill(length)(EqualTo(Literal(1), Literal(1))))) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The limitation is 64k, we don't need to be so conservative
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I used the same limit as in following change:
https://github.com/apache/spark/pull/14692/files#diff-8bcc5aea39c73d4bf38aef6f6951d42cL595
which is based on some benchmarks. In addition, for JIT to do its optimisations, I think the limit is 8k.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah i see