@@ -378,7 +378,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
378378 val numOutput = metricTerm(ctx, " numOutputRows" )
379379
380380 val initTerm = ctx.addMutableState(CodeGenerator .JAVA_BOOLEAN , " initRange" )
381- val number = ctx.addMutableState(CodeGenerator .JAVA_LONG , " number " )
381+ val nextIndex = ctx.addMutableState(CodeGenerator .JAVA_LONG , " nextIndex " )
382382
383383 val value = ctx.freshName(" value" )
384384 val ev = ExprCode .forNonNullValue(JavaCode .variable(value, LongType ))
@@ -397,7 +397,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
397397 // within a batch, while the code in the outer loop is setting batch parameters and updating
398398 // the metrics.
399399
400- // Once number == batchEnd, it's time to progress to the next batch.
400+ // Once nextIndex == batchEnd, it's time to progress to the next batch.
401401 val batchEnd = ctx.addMutableState(CodeGenerator .JAVA_LONG , " batchEnd" )
402402
403403 // How many values should still be generated by this range operator.
@@ -421,13 +421,13 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
421421 |
422422 | $BigInt st = index.multiply(numElement).divide(numSlice).multiply(step).add(start);
423423 | if (st.compareTo( $BigInt.valueOf(Long.MAX_VALUE)) > 0) {
424- | $number = Long.MAX_VALUE;
424+ | $nextIndex = Long.MAX_VALUE;
425425 | } else if (st.compareTo( $BigInt.valueOf(Long.MIN_VALUE)) < 0) {
426- | $number = Long.MIN_VALUE;
426+ | $nextIndex = Long.MIN_VALUE;
427427 | } else {
428- | $number = st.longValue();
428+ | $nextIndex = st.longValue();
429429 | }
430- | $batchEnd = $number ;
430+ | $batchEnd = $nextIndex ;
431431 |
432432 | $BigInt end = index.add( $BigInt.ONE).multiply(numElement).divide(numSlice)
433433 | .multiply(step).add(start);
@@ -440,7 +440,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
440440 | }
441441 |
442442 | $BigInt startToEnd = $BigInt.valueOf(partitionEnd).subtract(
443- | $BigInt.valueOf( $number ));
443+ | $BigInt.valueOf( $nextIndex ));
444444 | $numElementsTodo = startToEnd.divide(step).longValue();
445445 | if ( $numElementsTodo < 0) {
446446 | $numElementsTodo = 0;
@@ -452,46 +452,68 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
452452
453453 val localIdx = ctx.freshName(" localIdx" )
454454 val localEnd = ctx.freshName(" localEnd" )
455- val range = ctx.freshName(" range" )
456455 val shouldStop = if (parent.needStopCheck) {
457- s " if (shouldStop()) { $number = $value + ${step}L; return; } "
456+ s " if (shouldStop()) { $nextIndex = $value + ${step}L; return; } "
458457 } else {
459458 " // shouldStop check is eliminated"
460459 }
460+
461+ // An overview of the Range processing.
462+ //
463+ // For each partition, the Range task needs to produce records from partition start(inclusive)
464+ // to end(exclusive). For better performance, we separate the partition range into batches, and
465+ // use 2 loops to produce data. The outer while loop is used to iterate batches, and the inner
466+ // for loop is used to iterate records inside a batch.
467+ //
468+ // `nextIndex` tracks the index of the next record that is going to be consumed, initialized
469+ // with partition start. `batchEnd` tracks the end index of the current batch, initialized
470+ // with `nextIndex`. In the outer loop, we first check if `nextIndex == batchEnd`. If it's true,
471+ // it means the current batch is fully consumed, and we will update `batchEnd` to process the
472+ // next batch. If `batchEnd` reaches partition end, exit the outer loop. Finally, we enter the
473+ // inner loop. Note that when we enter the inner loop, `nextIndex` must be different from
474+ // `batchEnd`; otherwise the outer loop would have already exited.
475+ //
476+ // The inner loop iterates from 0 to `localEnd`, which is calculated by
477+ // `(batchEnd - nextIndex) / step`. Because `batchEnd` is initialized with `nextIndex` and is
478+ // increased by `nextBatchTodo * step` in the outer loop, `batchEnd - nextIndex` is always
479+ // divisible by `step`. Each iteration produces `nextIndex + localIdx * step`, and `nextIndex`
480+ // is set to `batchEnd` when the inner loop finishes.
481+ //
482+ // The inner loop can be interrupted if the query has produced at least one result row, so that
483+ // we don't buffer too many result rows and waste memory. It's ok to interrupt the inner loop,
484+ // because `nextIndex` is set to the next value to produce (`value + step`) before returning.
485+
461486 s """
462487 | // initialize Range
463488 | if (! $initTerm) {
464489 | $initTerm = true;
465490 | $initRangeFuncName(partitionIndex);
466491 | }
467492 |
468- | while (true) {
469- | long $range = $batchEnd - $number;
470- | if ( $range != 0L) {
471- | int $localEnd = (int)( $range / ${step}L);
472- | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) {
473- | long $value = ((long) $localIdx * ${step}L) + $number;
474- | ${consume(ctx, Seq (ev))}
475- | $shouldStop
493+ | while (true $keepProducingDataCond) {
494+ | if ( $nextIndex == $batchEnd) {
495+ | long $nextBatchTodo;
496+ | if ( $numElementsTodo > ${batchSize}L) {
497+ | $nextBatchTodo = ${batchSize}L;
498+ | $numElementsTodo -= ${batchSize}L;
499+ | } else {
500+ | $nextBatchTodo = $numElementsTodo;
501+ | $numElementsTodo = 0;
502+ | if ( $nextBatchTodo == 0) break;
476503 | }
477- | $number = $batchEnd;
504+ | $numOutput.add( $nextBatchTodo);
505+ | $inputMetrics.incRecordsRead( $nextBatchTodo);
506+ | $batchEnd += $nextBatchTodo * ${step}L;
478507 | }
479508 |
480- | $taskContext.killTaskIfInterrupted();
481- |
482- | long $nextBatchTodo;
483- | if ( $numElementsTodo > ${batchSize}L) {
484- | $nextBatchTodo = ${batchSize}L;
485- | $numElementsTodo -= ${batchSize}L;
486- | } else {
487- | $nextBatchTodo = $numElementsTodo;
488- | $numElementsTodo = 0;
489- | if ( $nextBatchTodo == 0) break;
509+ | int $localEnd = (int)(( $batchEnd - $nextIndex) / ${step}L);
510+ | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) {
511+ | long $value = ((long) $localIdx * ${step}L) + $nextIndex;
512+ | ${consume(ctx, Seq (ev))}
513+ | $shouldStop
490514 | }
491- | $numOutput.add( $nextBatchTodo);
492- | $inputMetrics.incRecordsRead( $nextBatchTodo);
493- |
494- | $batchEnd += $nextBatchTodo * ${step}L;
515+ | $nextIndex = $batchEnd;
516+ | $taskContext.killTaskIfInterrupted();
495517 | }
496518 """ .stripMargin
497519 }
0 commit comments