marmbrus
diff --git a/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala‎
Lines changed: 5 additions & 3 deletions b/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Row.scala‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala‎
Lines changed: 22 additions & 23 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala‎
Lines changed: 22 additions & 23 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala‎
Lines changed: 77 additions & 24 deletions b/‎sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableOperations.scala‎
Lines changed: 77 additions & 24 deletions
@@ -63,7 +63,8 @@ trait MutableRow extends Row {
   def setBoolean(ordinal: Int, value: Boolean)
   def setShort(ordinal: Int, value: Short)
   def setByte(ordinal: Int, value: Byte)
-  def setFloat(ordinal: Int, value: Byte)
+  def setFloat(ordinal: Int, value: Float)
+  def setString(ordinal: Int, value: String)
 
   /**
    * EXPERIMENTAL
@@ -152,7 +153,7 @@ class GenericRow(protected[catalyst] val values: Array[Any]) extends Row {
   }
 
   def getString(i: Int): String = {
-    if (values(i) == null) sys.error("Failed to check null bit for primitive byte value.")
+    if (values(i) == null) sys.error("Failed to check null bit for primitive String value.")
     values(i).asInstanceOf[String]
   }
 
@@ -168,9 +169,10 @@ class GenericMutableRow(size: Int) extends GenericRow(size) with MutableRow {
   override def setBoolean(ordinal: Int,value: Boolean): Unit = { values(ordinal) = value }
   override def setByte(ordinal: Int,value: Byte): Unit = { values(ordinal) = value }
   override def setDouble(ordinal: Int,value: Double): Unit = { values(ordinal) = value }
-  override def setFloat(ordinal: Int,value: Byte): Unit = { values(ordinal) = value }
+  override def setFloat(ordinal: Int,value: Float): Unit = { values(ordinal) = value }
   override def setInt(ordinal: Int,value: Int): Unit = { values(ordinal) = value }
   override def setLong(ordinal: Int,value: Long): Unit = { values(ordinal) = value }
+  override def setString(ordinal: Int,value: String): Unit = { values(ordinal) = value }
 
   override def setNullAt(i: Int): Unit = { values(i) = null }
 
 
@@ -77,7 +77,7 @@ import scala.collection.JavaConversions._
  *
  * scala> val query = TestHive.parseSql(query_string).transform {
  *    | case relation @ UnresolvedRelation(databaseName, name, alias) =>
- *    | if(name == "psrc") ParquetRelation(name, filename)
+ *    | if (name == "psrc") ParquetRelation(name, filename)
  *    | else relation
  *    | }
  * query: org.apache.spark.sql.catalyst.plans.logical.LogicalPlan =
@@ -119,7 +119,7 @@ case class ParquetRelation(val tableName: String, val path: String) extends Base
 object ParquetRelation {
 
   // The element type for the RDDs that this relation maps to.
-  type RowType = org.apache.spark.sql.catalyst.expressions.GenericRow
+  type RowType = org.apache.spark.sql.catalyst.expressions.GenericMutableRow
 
   /**
    * Creates a new ParquetRelation and underlying Parquetfile for the given
@@ -138,7 +138,7 @@ object ParquetRelation {
              child: LogicalPlan,
              conf: Configuration,
              tableName: Option[String]): ParquetRelation = {
-    if(!child.resolved) {
+    if (!child.resolved) {
       throw new UnresolvedException[LogicalPlan](
         child,
         "Attempt to create Parquet table from unresolved child (when schema is not available)")
@@ -153,13 +153,13 @@ object ParquetRelation {
   private def checkPath(pathStr: String, conf: Configuration): Path = {
     val path = new Path(pathStr)
     val fs = path.getFileSystem(conf)
-    if(fs.exists(path) &&
+    if (fs.exists(path) &&
         !fs.getFileStatus(path)
         .getPermission
         .getUserAction
         .implies(FsAction.READ_WRITE)) {
       throw new IOException(
-        s"Unable to create ParquetRelation: path ${path.toString} not read-writable")
+        s"Unable to create ParquetRelation: path $path not read-writable")
     }
     path
   }
@@ -176,9 +176,13 @@ object ParquetTypesConverter {
     case ParquetPrimitiveTypeName.FLOAT => FloatType
     case ParquetPrimitiveTypeName.INT32 => IntegerType
     case ParquetPrimitiveTypeName.INT64 => LongType
-    case ParquetPrimitiveTypeName.INT96 => LongType // TODO: is there an equivalent?
+    case ParquetPrimitiveTypeName.INT96 => {
+      // TODO: add BigInteger type? TODO(andre) use DecimalType instead????
+      // sys.error always throws, so INT96 is rejected here rather than narrowed to LongType.
+      sys.error("Potential loss of precision: cannot convert INT96")
+    }
     case _ => sys.error(
-      s"Unsupported parquet datatype ${parquetType.asInstanceOf[Enum[String]].toString()}")
+      s"Unsupported parquet datatype $parquetType")
   }
 
   def fromDataType(ctype: DataType): ParquetPrimitiveTypeName = ctype match {
@@ -189,7 +193,7 @@ object ParquetTypesConverter {
     case FloatType => ParquetPrimitiveTypeName.FLOAT
     case IntegerType => ParquetPrimitiveTypeName.INT32
     case LongType => ParquetPrimitiveTypeName.INT64
-    case _ => sys.error(s"Unsupported datatype ${ctype.toString}")
+    case _ => sys.error(s"Unsupported datatype $ctype")
   }
 
   def consumeType(consumer: RecordConsumer, ctype: DataType, record: Row, index: Int): Unit = {
@@ -204,7 +208,7 @@ object ParquetTypesConverter {
       case DoubleType => consumer.addDouble(record.getDouble(index))
       case FloatType => consumer.addFloat(record.getFloat(index))
       case BooleanType => consumer.addBoolean(record.getBoolean(index))
-      case _ => sys.error(s"Unsupported datatype ${ctype.toString}, cannot write to consumer")
+      case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer")
     }
   }
 
@@ -232,18 +236,18 @@ object ParquetTypesConverter {
   def writeMetaData(attributes: Seq[Attribute], path: Path, conf: Configuration) {
     val fileSystem = FileSystem.get(conf)
 
-    if(fileSystem.exists(path) && !fileSystem.getFileStatus(path).isDir) {
-      throw new IOException(s"Expected to write to directory ${path.toString} but found file")
+    if (fileSystem.exists(path) && !fileSystem.getFileStatus(path).isDir) {
+      throw new IOException(s"Expected to write to directory $path but found file")
     }
 
     val metadataPath = new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE)
 
-    if(fileSystem.exists(metadataPath)) {
+    if (fileSystem.exists(metadataPath)) {
       try {
         fileSystem.delete(metadataPath, true)
       } catch {
         case e: IOException =>
-          throw new IOException(s"Unable to delete previous PARQUET_METADATA_FILE:\n${e.toString}")
+          // Keep the original failure as the cause so the reason for the failed delete is not lost.
+          throw new IOException(
+            s"Unable to delete previous PARQUET_METADATA_FILE at $metadataPath", e)
       }
     }
 
@@ -255,18 +259,13 @@ object ParquetTypesConverter {
       ParquetTypesConverter.convertFromAttributes(attributes)
     val metaData: FileMetaData = new FileMetaData(
       parquetSchema,
-      new java.util.HashMap[String, String](),
-      "Shark")
+      extraMetadata,
+      "Spark")
 
     ParquetFileWriter.writeMetadataFile(
       conf,
       path,
-      new Footer(
-        path,
-        new ParquetMetadata(
-          metaData,
-          Nil)
-      ) :: Nil)
+      new Footer(path, new ParquetMetadata(metaData, Nil)) :: Nil)
   }
 
   /**
@@ -283,11 +282,11 @@ object ParquetTypesConverter {
 
     val metadataPath = new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE)
 
-    if(fs.exists(metadataPath) && fs.isFile(metadataPath)) {
+    if (fs.exists(metadataPath) && fs.isFile(metadataPath)) {
       // TODO: improve exception handling, etc.
       ParquetFileReader.readFooter(conf, metadataPath)
     } else {
-      if(!fs.exists(path) || !fs.isFile(path)) {
+      if (!fs.exists(path) || !fs.isFile(path)) {
         throw new FileNotFoundException(
           s"Could not find file ${path.toString} when trying to read metadata")
       }
 
@@ -15,34 +15,35 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql
-package parquet
+package org.apache.spark.sql.parquet
 
-import _root_.parquet.io.InvalidRecordException
-import _root_.parquet.schema.MessageType
-import _root_.parquet.hadoop.{ParquetOutputFormat, ParquetInputFormat}
-import _root_.parquet.hadoop.util.ContextUtil
+import parquet.io.InvalidRecordException
+import parquet.schema.MessageType
+import parquet.hadoop.{ParquetOutputFormat, ParquetInputFormat}
+import parquet.hadoop.util.ContextUtil
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.api.java.JavaPairRDD
-import org.apache.spark.SparkContext
+import org.apache.spark.{TaskContext, SerializableWritable, SparkContext}
 import org.apache.spark.sql.catalyst.expressions.{Row, Attribute, Expression}
 import org.apache.spark.sql.execution.{SparkPlan, UnaryNode, LeafNode}
 
-import org.apache.hadoop.mapreduce.Job
+import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat => NewFileOutputFormat}
+import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
 import java.io.IOException
+import java.text.SimpleDateFormat
+import java.util.Date
 
 /**
  * Parquet table scan operator. Imports the file that backs the given
  * [[ParquetRelation]] as a RDD[Row].
  */
 case class ParquetTableScan(
-    output: Seq[Attribute],
-    relation: ParquetRelation,
-    columnPruningPred: Option[Expression])(
+    @transient output: Seq[Attribute],
+    @transient relation: ParquetRelation,
+    @transient columnPruningPred: Option[Expression])(
     @transient val sc: SparkContext)
   extends LeafNode {
 
@@ -56,12 +57,18 @@ case class ParquetTableScan(
         RowReadSupport.PARQUET_ROW_REQUESTED_SCHEMA,
         ParquetTypesConverter.convertFromAttributes(output).toString)
     // TODO: think about adding record filters
+    /* Comments regarding record filters: it would be nice to push down as much filtering
+      to Parquet as possible. However, currently it seems we cannot pass enough information
+      to materialize an (arbitrary) Catalyst [[Predicate]] inside Parquet's
+      ``FilteredRecordReader`` (via Configuration, for example). Simple
+      filter-rows-by-column-values however should be supported.
+    */
     sc.newAPIHadoopFile(
       relation.path,
       classOf[ParquetInputFormat[Row]],
       classOf[Void], classOf[Row],
       conf)
-      .map(_._2)
+    .map(_._2)
   }
 
   /**
@@ -72,7 +79,7 @@ case class ParquetTableScan(
    */
   def pruneColumns(prunedAttributes: Seq[Attribute]): ParquetTableScan = {
     val success = validateProjection(prunedAttributes)
-    if(success) {
+    if (success) {
       ParquetTableScan(prunedAttributes, relation, columnPruningPred)(sc)
     } else {
       sys.error("Warning: Could not validate Parquet schema projection in pruneColumns")
@@ -102,10 +109,10 @@ case class ParquetTableScan(
 }
 
 case class InsertIntoParquetTable(
-    relation: ParquetRelation,
-    child: SparkPlan)(
+    @transient relation: ParquetRelation,
+    @transient child: SparkPlan)(
     @transient val sc: SparkContext)
-  extends UnaryNode {
+  extends UnaryNode with SparkHadoopMapReduceUtil {
 
   /**
    * Inserts all the rows in the Parquet file. Note that OVERWRITE is implicit, since
@@ -142,18 +149,64 @@ case class InsertIntoParquetTable(
           s"Unable to clear output directory ${fspath.toString} prior"
           + s" to InsertIntoParquetTable:\n${e.toString}")
     }
-
-    JavaPairRDD.fromRDD(childRdd.map(Tuple2(null, _))).saveAsNewAPIHadoopFile(
-      relation.path.toString,
-      classOf[Void],
-      classOf[ParquetRelation.RowType],
-      classOf[_root_.parquet.hadoop.ParquetOutputFormat[ParquetRelation.RowType]],
-      conf)
+    saveAsHadoopFile(childRdd, relation.path.toString, conf)
 
     // We return the child RDD to allow chaining (alternatively, one could return nothing).
     childRdd
   }
 
   override def output = child.output
+
+  /**
+   * Writes the rows of `rdd` to `path` through Parquet's output format, running one
+   * Hadoop "reduce" task attempt per partition.
+   *
+   * Based on ``saveAsNewAPIHadoopFile`` in [[PairRDDFunctions]].
+   * TODO: Maybe PairRDDFunctions should use Product2 instead of Tuple2?
+   * .. then we could use the default one and could use [[MutablePair]]
+   * instead of ``Tuple2``
+   *
+   * @param rdd rows to write; every row is written with a null key
+   * @param path output path registered with the Hadoop file output format
+   * @param conf configuration the Hadoop job is created from
+   */
+  private def saveAsHadoopFile(
+      rdd: RDD[Row],
+      path: String,
+      conf: Configuration): Unit = {
+    val job = new Job(conf)
+    val keyType = classOf[Void]
+    val outputFormatType = classOf[parquet.hadoop.ParquetOutputFormat[Row]]
+    job.setOutputKeyClass(keyType)
+    job.setOutputValueClass(classOf[Row])
+    val wrappedConf = new SerializableWritable(job.getConfiguration)
+    NewFileOutputFormat.setOutputPath(job, new Path(path))
+    val formatter = new SimpleDateFormat("yyyyMMddHHmm")
+    val jobtrackerID = formatter.format(new Date())
+    val stageId = sc.newRddId()
+
+    // Writes one partition and returns 1 as the per-task result handed back to runJob.
+    def writeShard(context: TaskContext, iter: Iterator[Row]): Int = {
+      // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
+      // around by taking a mod. We expect that no task will be attempted 2 billion times.
+      val attemptNumber = (context.attemptId % Int.MaxValue).toInt
+      /* "reduce task" <split #> <attempt # = spark task #> */
+      val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
+        attemptNumber)
+      val hadoopContext = newTaskAttemptContext(wrappedConf.value, attemptId)
+      val format = outputFormatType.newInstance
+      val committer = format.getOutputCommitter(hadoopContext)
+      committer.setupTask(hadoopContext)
+      val writer = format.getRecordWriter(hadoopContext)
+      try {
+        while (iter.hasNext) {
+          val row = iter.next()
+          writer.write(null, row)
+        }
+      } finally {
+        // Close the writer even when a write fails; the task is then not committed because
+        // the exception propagates before commitTask is reached.
+        writer.close(hadoopContext)
+      }
+      committer.commitTask(hadoopContext)
+      1
+    }
+    val jobFormat = outputFormatType.newInstance
+    /* apparently we need a TaskAttemptID to construct an OutputCommitter;
+     * however we're only going to use this local OutputCommitter for
+     * setupJob/commitJob, so we just use a dummy "map" task.
+     */
+    val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0)
+    val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId)
+    val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext)
+    jobCommitter.setupJob(jobTaskContext)
+    sc.runJob(rdd, writeShard _)
+    jobCommitter.commitJob(jobTaskContext)
+  }
 }