Commit 76ca6c6

Author: Davies Liu (authored and committed)

    Merge branch 'master' of github.com:apache/spark into gen_defer

    Conflicts:
        sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

2 parents: 62682b2 + a6e2bd3

12 files changed: 342 additions & 98 deletions

core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala

Lines changed: 45 additions & 39 deletions
@@ -86,7 +86,7 @@ private[ui] class ExecutorsPage(
         <th>Failed Tasks</th>
         <th>Complete Tasks</th>
         <th>Total Tasks</th>
-        <th data-toggle="tooltip" title={ToolTips.TASK_TIME}>Task Time (GC Time)</th>
+        <th><span data-toggle="tooltip" title={ToolTips.TASK_TIME}>Task Time (GC Time)</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.INPUT}>Input</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.SHUFFLE_READ}>Shuffle Read</span></th>
         <th>
@@ -109,13 +109,8 @@ private[ui] class ExecutorsPage(
     val content =
       <div class="row">
         <div class="span12">
-          <h4>Dead Executors({deadExecutorInfo.size})</h4>
-        </div>
-      </div>
-      <div class="row">
-        <div class="span12">
-          <h4>Active Executors({activeExecutorInfo.size})</h4>
-          {execSummary(activeExecutorInfo)}
+          <h4>Summary</h4>
+          {execSummary(activeExecutorInfo, deadExecutorInfo)}
         </div>
       </div>
       <div class = "row">
@@ -198,7 +193,7 @@ private[ui] class ExecutorsPage(
     </tr>
   }

-  private def execSummary(execInfo: Seq[ExecutorSummary]): Seq[Node] = {
+  private def execSummaryRow(execInfo: Seq[ExecutorSummary], rowName: String): Seq[Node] = {
     val maximumMemory = execInfo.map(_.maxMemory).sum
     val memoryUsed = execInfo.map(_.memoryUsed).sum
     val diskUsed = execInfo.map(_.diskUsed).sum
@@ -207,37 +202,46 @@ private[ui] class ExecutorsPage(
     val totalShuffleRead = execInfo.map(_.totalShuffleRead).sum
     val totalShuffleWrite = execInfo.map(_.totalShuffleWrite).sum

-    val sumContent =
-      <tr>
-        <td>{execInfo.map(_.rddBlocks).sum}</td>
-        <td sorttable_customkey={memoryUsed.toString}>
-          {Utils.bytesToString(memoryUsed)} /
-          {Utils.bytesToString(maximumMemory)}
-        </td>
-        <td sorttable_customkey={diskUsed.toString}>
-          {Utils.bytesToString(diskUsed)}
-        </td>
-        <td>{totalCores}</td>
-        {taskData(execInfo.map(_.maxTasks).sum,
-          execInfo.map(_.activeTasks).sum,
-          execInfo.map(_.failedTasks).sum,
-          execInfo.map(_.completedTasks).sum,
-          execInfo.map(_.totalTasks).sum,
-          execInfo.map(_.totalDuration).sum,
-          execInfo.map(_.totalGCTime).sum)}
-        <td sorttable_customkey={totalInputBytes.toString}>
-          {Utils.bytesToString(totalInputBytes)}
-        </td>
-        <td sorttable_customkey={totalShuffleRead.toString}>
-          {Utils.bytesToString(totalShuffleRead)}
-        </td>
-        <td sorttable_customkey={totalShuffleWrite.toString}>
-          {Utils.bytesToString(totalShuffleWrite)}
-        </td>
-      </tr>;
+    <tr>
+      <td><b>{rowName}({execInfo.size})</b></td>
+      <td>{execInfo.map(_.rddBlocks).sum}</td>
+      <td sorttable_customkey={memoryUsed.toString}>
+        {Utils.bytesToString(memoryUsed)} /
+        {Utils.bytesToString(maximumMemory)}
+      </td>
+      <td sorttable_customkey={diskUsed.toString}>
+        {Utils.bytesToString(diskUsed)}
+      </td>
+      <td>{totalCores}</td>
+      {taskData(execInfo.map(_.maxTasks).sum,
+        execInfo.map(_.activeTasks).sum,
+        execInfo.map(_.failedTasks).sum,
+        execInfo.map(_.completedTasks).sum,
+        execInfo.map(_.totalTasks).sum,
+        execInfo.map(_.totalDuration).sum,
+        execInfo.map(_.totalGCTime).sum)}
+      <td sorttable_customkey={totalInputBytes.toString}>
+        {Utils.bytesToString(totalInputBytes)}
+      </td>
+      <td sorttable_customkey={totalShuffleRead.toString}>
+        {Utils.bytesToString(totalShuffleRead)}
+      </td>
+      <td sorttable_customkey={totalShuffleWrite.toString}>
+        {Utils.bytesToString(totalShuffleWrite)}
+      </td>
+    </tr>
+  }
+
+  private def execSummary(activeExecInfo: Seq[ExecutorSummary], deadExecInfo: Seq[ExecutorSummary]):
+      Seq[Node] = {
+    val totalExecInfo = activeExecInfo ++ deadExecInfo
+    val activeRow = execSummaryRow(activeExecInfo, "Active");
+    val deadRow = execSummaryRow(deadExecInfo, "Dead");
+    val totalRow = execSummaryRow(totalExecInfo, "Total");

     <table class={UIUtils.TABLE_CLASS_STRIPED}>
       <thead>
+        <th></th>
         <th>RDD Blocks</th>
         <th><span data-toggle="tooltip" title={ToolTips.STORAGE_MEMORY}>Storage Memory</span></th>
         <th>Disk Used</th>
@@ -246,7 +250,7 @@ private[ui] class ExecutorsPage(
         <th>Failed Tasks</th>
         <th>Complete Tasks</th>
         <th>Total Tasks</th>
-        <th data-toggle="tooltip" title={ToolTips.TASK_TIME}>Task Time (GC Time)</th>
+        <th><span data-toggle="tooltip" title={ToolTips.TASK_TIME}>Task Time (GC Time)</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.INPUT}>Input</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.SHUFFLE_READ}>Shuffle Read</span></th>
         <th>
@@ -256,7 +260,9 @@ private[ui] class ExecutorsPage(
         </th>
       </thead>
      <tbody>
-        {sumContent}
+        {activeRow}
+        {deadRow}
+        {totalRow}
      </tbody>
    </table>
  }
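
The net effect: instead of two separate tables headed "Active Executors" and "Dead Executors", the page renders one "Summary" table whose rows all come from the same helper applied to three slices of executors. A minimal hand-written sketch of that idea (not part of the diff; `renderRow` is a hypothetical stand-in for execSummaryRow's XML output):

    def summaryRows(active: Seq[ExecutorSummary], dead: Seq[ExecutorSummary]) =
      Seq(("Active", active), ("Dead", dead), ("Total", active ++ dead)).flatMap {
        case (label, execs) => renderRow(label, execs)  // one <tr> per slice
      }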

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java

Lines changed: 24 additions & 5 deletions
@@ -37,7 +37,6 @@
 import org.apache.parquet.schema.Type;

 import org.apache.spark.memory.MemoryMode;
-import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
 import org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder;
 import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter;
@@ -57,10 +56,14 @@
  *
  * TODO: handle complex types, decimal requiring more than 8 bytes, INT96. Schema mismatch.
  * All of these can be handled efficiently and easily with codegen.
+ *
+ * This class can either return InternalRows or ColumnarBatches. With whole stage codegen
+ * enabled, this class returns ColumnarBatches which offers significant performance gains.
+ * TODO: make this always return ColumnarBatches.
  */
-public class UnsafeRowParquetRecordReader extends SpecificParquetRecordReaderBase<InternalRow> {
+public class UnsafeRowParquetRecordReader extends SpecificParquetRecordReaderBase<Object> {
   /**
-   * Batch of unsafe rows that we assemble and the current index we've returned. Everytime this
+   * Batch of unsafe rows that we assemble and the current index we've returned. Every time this
    * batch is used up (batchIdx == numBatched), we populated the batch.
    */
   private UnsafeRow[] rows = new UnsafeRow[64];
@@ -115,11 +118,15 @@ public class UnsafeRowParquetRecordReader extends SpecificParquetRecordReaderBas
    * code between the path that uses the MR decoders and the vectorized ones.
    *
    * TODOs:
-   *  - Implement all the encodings to support vectorized.
    *  - Implement v2 page formats (just make sure we create the correct decoders).
    */
   private ColumnarBatch columnarBatch;

+  /**
+   * If true, this class returns batches instead of rows.
+   */
+  private boolean returnColumnarBatch;
+
   /**
    * The default config on whether columnarBatch should be offheap.
    */
@@ -169,6 +176,8 @@ public void close() throws IOException {

   @Override
   public boolean nextKeyValue() throws IOException, InterruptedException {
+    if (returnColumnarBatch) return nextBatch();
+
     if (batchIdx >= numBatched) {
       if (vectorizedDecode()) {
         if (!nextBatch()) return false;
@@ -181,7 +190,9 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
   }

   @Override
-  public InternalRow getCurrentValue() throws IOException, InterruptedException {
+  public Object getCurrentValue() throws IOException, InterruptedException {
+    if (returnColumnarBatch) return columnarBatch;
+
     if (vectorizedDecode()) {
       return columnarBatch.getRow(batchIdx - 1);
     } else {
@@ -210,6 +221,14 @@ public ColumnarBatch resultBatch(MemoryMode memMode) {
     return columnarBatch;
   }

+  /**
+   * Can be called before any rows are returned to enable returning columnar batches directly.
+   */
+  public void enableReturningBatches() {
+    assert(vectorizedDecode());
+    returnColumnarBatch = true;
+  }
+
   /**
    * Advances to the next batch of rows. Returns false if there are no more.
    */
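
Taken together, these changes alter the reader's contract: the element type widens from InternalRow to Object, and once enableReturningBatches() has been called, nextKeyValue() advances a whole ColumnarBatch at a time and getCurrentValue() hands back the batch itself. A hedged Scala sketch of a caller (the initialize/InputSplit plumbing inherited from the Hadoop RecordReader base class is assumed and not shown in this diff):

    import org.apache.spark.sql.execution.datasources.parquet.UnsafeRowParquetRecordReader
    import org.apache.spark.sql.execution.vectorized.ColumnarBatch

    def countRows(reader: UnsafeRowParquetRecordReader): Long = {
      // Must be called before the first nextKeyValue(); asserts vectorized decode is on.
      reader.enableReturningBatches()
      var rows = 0L
      while (reader.nextKeyValue()) {   // now true once per batch, not once per row
        val batch = reader.getCurrentValue.asInstanceOf[ColumnarBatch]
        rows += batch.numRows()
      }
      rows
    }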

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java

Lines changed: 57 additions & 0 deletions
@@ -26,16 +26,73 @@

 import org.apache.spark.memory.MemoryMode;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.util.DateTimeUtils;
 import org.apache.spark.sql.types.*;
 import org.apache.spark.unsafe.types.CalendarInterval;
+import org.apache.spark.unsafe.types.UTF8String;

 /**
  * Utilities to help manipulate data associate with ColumnVectors. These should be used mostly
  * for debugging or other non-performance critical paths.
  * These utilities are mostly used to convert ColumnVectors into other formats.
  */
 public class ColumnVectorUtils {
+  /**
+   * Populates the entire `col` with `row[fieldIdx]`
+   */
+  public static void populate(ColumnVector col, InternalRow row, int fieldIdx) {
+    int capacity = col.capacity;
+    DataType t = col.dataType();
+
+    if (row.isNullAt(fieldIdx)) {
+      col.putNulls(0, capacity);
+    } else {
+      if (t == DataTypes.BooleanType) {
+        col.putBooleans(0, capacity, row.getBoolean(fieldIdx));
+      } else if (t == DataTypes.ByteType) {
+        col.putBytes(0, capacity, row.getByte(fieldIdx));
+      } else if (t == DataTypes.ShortType) {
+        col.putShorts(0, capacity, row.getShort(fieldIdx));
+      } else if (t == DataTypes.IntegerType) {
+        col.putInts(0, capacity, row.getInt(fieldIdx));
+      } else if (t == DataTypes.LongType) {
+        col.putLongs(0, capacity, row.getLong(fieldIdx));
+      } else if (t == DataTypes.FloatType) {
+        col.putFloats(0, capacity, row.getFloat(fieldIdx));
+      } else if (t == DataTypes.DoubleType) {
+        col.putDoubles(0, capacity, row.getDouble(fieldIdx));
+      } else if (t == DataTypes.StringType) {
+        UTF8String v = row.getUTF8String(fieldIdx);
+        byte[] bytes = v.getBytes();
+        for (int i = 0; i < capacity; i++) {
+          col.putByteArray(i, bytes);
+        }
+      } else if (t instanceof DecimalType) {
+        DecimalType dt = (DecimalType)t;
+        Decimal d = row.getDecimal(fieldIdx, dt.precision(), dt.scale());
+        if (dt.precision() <= Decimal.MAX_INT_DIGITS()) {
+          col.putInts(0, capacity, (int)d.toUnscaledLong());
+        } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) {
+          col.putLongs(0, capacity, d.toUnscaledLong());
+        } else {
+          final BigInteger integer = d.toJavaBigDecimal().unscaledValue();
+          byte[] bytes = integer.toByteArray();
+          for (int i = 0; i < capacity; i++) {
+            col.putByteArray(i, bytes, 0, bytes.length);
+          }
+        }
+      } else if (t instanceof CalendarIntervalType) {
+        CalendarInterval c = (CalendarInterval)row.get(fieldIdx, t);
+        col.getChildColumn(0).putInts(0, capacity, c.months);
+        col.getChildColumn(1).putLongs(0, capacity, c.microseconds);
+      } else if (t instanceof DateType) {
+        Date date = (Date)row.get(fieldIdx, t);
+        col.putInts(0, capacity, DateTimeUtils.fromJavaDate(date));
+      }
+    }
+  }
+
   /**
    * Returns the array data as the java primitive array.
    * For example, an array of IntegerType will return an int[].
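
populate() writes the single value row[fieldIdx] into every slot of the vector, so its natural use is a column whose value is constant across a batch, e.g. a partition value. A small hedged sketch (the allocation of `col` and the surrounding batch wiring are assumptions, not shown in this diff):

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.execution.vectorized.{ColumnVector, ColumnVectorUtils}

    // Fill `col` end to end with the value at `fieldIdx` of `partitionValues`;
    // a null at that field marks the whole column null via putNulls.
    def fillConstantColumn(col: ColumnVector, partitionValues: InternalRow, fieldIdx: Int): Unit =
      ColumnVectorUtils.populate(col, partitionValues, fieldIdx)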

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java

Lines changed: 12 additions & 0 deletions
@@ -22,6 +22,7 @@
 import org.apache.commons.lang.NotImplementedException;

 import org.apache.spark.memory.MemoryMode;
+import org.apache.spark.sql.Column;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericMutableRow;
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
@@ -315,6 +316,17 @@ public int numValidRows() {
    */
   public ColumnVector column(int ordinal) { return columns[ordinal]; }

+  /**
+   * Sets (replaces) the column at `ordinal` with column. This can be used to do very efficient
+   * projections.
+   */
+  public void setColumn(int ordinal, ColumnVector column) {
+    if (column instanceof OffHeapColumnVector) {
+      throw new NotImplementedException("Need to ref count columns.");
+    }
+    columns[ordinal] = column;
+  }
+
   /**
    * Returns the row in this batch at `rowId`. Returned row is reused across calls.
    */
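
The comment hints at the intended pattern: a projection can be done by swapping a precomputed vector into the batch rather than copying data row by row. A hedged sketch combining setColumn with the populate helper above (names here are illustrative, not from the diff):

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.execution.vectorized.{ColumnVector, ColumnVectorUtils, ColumnarBatch}

    // Attach a constant (e.g. partition-value) column at `ordinal` by overwriting the
    // existing slot. Off-heap replacements currently throw NotImplementedException
    // until reference counting is in place, so `constCol` must be on-heap.
    def attachConstantColumn(
        batch: ColumnarBatch,
        ordinal: Int,
        constCol: ColumnVector,
        value: InternalRow,
        fieldIdx: Int): Unit = {
      ColumnVectorUtils.populate(constCol, value, fieldIdx)
      batch.setColumn(ordinal, constCol)
    }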

sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java

Lines changed: 0 additions & 3 deletions
@@ -62,9 +62,6 @@ public final long nullsNativeAddress() {

   @Override
   public final void close() {
-    nulls = null;
-    intData = null;
-    doubleData = null;
   }

   //

sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 59 additions & 6 deletions
@@ -139,23 +139,76 @@ private[sql] case class PhysicalRDD(
   // Support codegen so that we can avoid the UnsafeRow conversion in all cases. Codegen
   // never requires UnsafeRow as input.
   override protected def doProduce(ctx: CodegenContext): String = {
+    val columnarBatchClz = "org.apache.spark.sql.execution.vectorized.ColumnarBatch"
     val input = ctx.freshName("input")
+    val idx = ctx.freshName("batchIdx")
+    val batch = ctx.freshName("batch")
     // PhysicalRDD always just has one input
     ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];")
+    ctx.addMutableState(columnarBatchClz, batch, s"$batch = null;")
+    ctx.addMutableState("int", idx, s"$idx = 0;")

     val exprs = output.zipWithIndex.map(x => new BoundReference(x._2, x._1.dataType, true))
     val row = ctx.freshName("row")
     val numOutputRows = metricTerm(ctx, "numOutputRows")
     ctx.INPUT_ROW = row
     ctx.currentVars = null
     val columns = exprs.map(_.gen(ctx))
+
+    // The input RDD can either return (all) ColumnarBatches or InternalRows. We determine this
+    // by looking at the first value of the RDD and then calling the function which will process
+    // the remaining. It is faster to return batches.
+    // TODO: The abstractions between this class and SqlNewHadoopRDD makes it difficult to know
+    // here which path to use. Fix this.
+
+    val scanBatches = ctx.freshName("processBatches")
+    ctx.addNewFunction(scanBatches,
+      s"""
+       | private void $scanBatches() throws java.io.IOException {
+       |   while (true) {
+       |     int numRows = $batch.numRows();
+       |     if ($idx == 0) $numOutputRows.add(numRows);
+       |
+       |     while ($idx < numRows) {
+       |       InternalRow $row = $batch.getRow($idx++);
+       |       ${consume(ctx, columns).trim}
+       |       if (shouldStop()) return;
+       |     }
+       |
+       |     if (!$input.hasNext()) {
+       |       $batch = null;
+       |       break;
+       |     }
+       |     $batch = ($columnarBatchClz)$input.next();
+       |     $idx = 0;
+       |   }
+       | }""".stripMargin)
+
+    val scanRows = ctx.freshName("processRows")
+    ctx.addNewFunction(scanRows,
+      s"""
+       | private void $scanRows(InternalRow $row) throws java.io.IOException {
+       |   while (true) {
+       |     $numOutputRows.add(1);
+       |     ${consume(ctx, columns).trim}
+       |     if (shouldStop()) return;
+       |     if (!$input.hasNext()) break;
+       |     $row = (InternalRow)$input.next();
+       |   }
+       | }""".stripMargin)
+
+    val value = ctx.freshName("value")
     s"""
-     | while ($input.hasNext()) {
-     |   InternalRow $row = (InternalRow) $input.next();
-     |   $numOutputRows.add(1);
-     |   ${consume(ctx, columns).trim}
-     |   if (shouldStop()) {
-     |     return;
+     | if ($batch != null) {
+     |   $scanBatches();
+     | } else if ($input.hasNext()) {
+     |   Object $value = $input.next();
+     |   if ($value instanceof $columnarBatchClz) {
+     |     $batch = ($columnarBatchClz)$value;
+     |     $scanBatches();
+     |   } else {
+     |     $scanRows((InternalRow) $value);
     |   }
     | }
   """.stripMargin
