Commit b7fcc35

Documenting conversions, bugfix, wrappers of Rows
1 parent ee70125 commit b7fcc35

3 files changed: +152 −38 lines changed

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala

Lines changed: 19 additions & 7 deletions
@@ -115,7 +115,7 @@ class CatalystGroupConverter(
     protected[parquet] val index: Int,
     protected[parquet] val parent: CatalystConverter,
     protected[parquet] var current: ArrayBuffer[Any],
-    protected[parquet] var buffer: ArrayBuffer[ArrayBuffer[Any]])
+    protected[parquet] var buffer: ArrayBuffer[Row])
   extends GroupConverter with CatalystConverter {
 
   def this(schema: Seq[FieldType], index: Int, parent: CatalystConverter) =
@@ -124,7 +124,7 @@ class CatalystGroupConverter(
       index,
       parent,
       current=null,
-      buffer=new ArrayBuffer[ArrayBuffer[Any]](
+      buffer=new ArrayBuffer[Row](
         CatalystArrayConverter.INITIAL_ARRAY_SIZE))
 
   // This constructor is used for the root converter only
@@ -141,6 +141,7 @@ class CatalystGroupConverter(
   // Should be only called in root group converter!
   def getCurrentRecord: Row = {
     assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
+    // TODO: use iterators if possible
     new GenericRow {
       override val values: Array[Any] = current.toArray
     }
@@ -155,7 +156,7 @@ class CatalystGroupConverter(
 
   override protected[parquet] def clearBuffer(): Unit = {
     // TODO: reuse buffer?
-    buffer = new ArrayBuffer[ArrayBuffer[Any]](CatalystArrayConverter.INITIAL_ARRAY_SIZE)
+    buffer = new ArrayBuffer[Row](CatalystArrayConverter.INITIAL_ARRAY_SIZE)
   }
 
   override def start(): Unit = {
@@ -173,8 +174,13 @@ class CatalystGroupConverter(
   override def end(): Unit = {
     if (!isRootConverter) {
       assert(current!=null) // there should be no empty groups
-      buffer.append(current)
-      parent.updateField(index, buffer)
+      buffer.append(new GenericRow {
+        override val values: Array[Any] = current.toArray
+      })
+      // TODO: use iterators if possible, avoid Row wrapping
+      parent.updateField(index, new GenericRow {
+        override val values: Array[Any] = buffer.toArray
+      })
     }
   }
 }
@@ -276,7 +282,10 @@ class CatalystArrayConverter(
   // TODO: think about reusing the buffer
   override def end(): Unit = {
     assert(parent != null)
-    parent.updateField(index, buffer)
+    // TODO: use iterators if possible, avoid Row wrapping
+    parent.updateField(index, new GenericRow {
+      override val values: Array[Any] = buffer.toArray
+    })
     clearBuffer()
   }
 }
@@ -294,7 +303,10 @@ class CatalystStructConverter(
   // TODO: think about reusing the buffer
   override def end(): Unit = {
     assert(!isRootConverter)
-    parent.updateField(index, current)
+    // TODO: use iterators if possible, avoid Row wrapping!
+    parent.updateField(index, new GenericRow {
+      override val values: Array[Any] = current.toArray
+    })
   }
 }
 
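The net effect of these converter changes: completed groups now flow upward as Rows instead of raw ArrayBuffers. A minimal self-contained sketch of the wrapping pattern (not part of the diff); Row and GenericRow here are stand-ins for Spark's catalyst classes, mirroring the anonymous-subclass shape used above where GenericRow exposes an overridable `values` member:

import scala.collection.mutable.ArrayBuffer

trait Row { def apply(i: Int): Any; def size: Int }

class GenericRow extends Row {
  val values: Array[Any] = Array.empty
  def apply(i: Int): Any = values(i)
  def size: Int = values.length
}

object RowWrapping extends App {
  val current = ArrayBuffer[Any]("Julien Le Dem", "555 123 4567")
  val buffer = new ArrayBuffer[Row]()

  // A completed group is wrapped into a Row before being handed to the
  // parent converter, so nested fields are uniformly Row-valued:
  buffer.append(new GenericRow {
    override val values: Array[Any] = current.toArray
  })

  assert(buffer(0)(0) == "Julien Le Dem")
}
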
sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetRelation.scala

Lines changed: 78 additions & 6 deletions
@@ -193,6 +193,31 @@ private[parquet] object ParquetTypesConverter {
       s"Unsupported parquet datatype $parquetType")
   }
 
+  /**
+   * Converts a given Parquet `Type` into the corresponding
+   * [[org.apache.spark.sql.catalyst.types.DataType]].
+   *
+   * Note that we apply the following conversion rules:
+   * <ul>
+   *   <li> Primitive types are converted to the corresponding primitive type.</li>
+   *   <li> Group types that have a single field with repetition `REPEATED` or themselves
+   *        have repetition level `REPEATED` are converted to an [[ArrayType]] with the
+   *        corresponding field type (possibly primitive) as element type.</li>
+   *   <li> Other group types are converted as follows:<ul>
+   *     <li> If they have a single field, they are converted into a [[StructType]] with
+   *          the corresponding field type.</li>
+   *     <li> If they have more than one field and repetition level `REPEATED` they are
+   *          converted into an [[ArrayType]] with the corresponding [[StructType]] as complex
+   *          element type.</li>
+   *     <li> Otherwise they are converted into a [[StructType]] with the corresponding
+   *          field types.</li></ul></li>
+   * </ul>
+   * Note that fields are determined to be `nullable` if and only if their Parquet repetition
+   * level is not `REQUIRED`.
+   *
+   * @param parquetType The type to convert.
+   * @return The corresponding Catalyst type.
+   */
   def toDataType(parquetType: ParquetType): DataType = {
     if (parquetType.isPrimitive) {
       toPrimitiveDataType(parquetType.asPrimitiveType.getPrimitiveTypeName)
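
To make the documented rules concrete, a minimal sketch (not part of the diff) exercising the single-repeated-field rule. It assumes the pre-Apache parquet-mr package layout this file uses (parquet.schema.*) and that the caller sits inside org.apache.spark.sql.parquet, since ParquetTypesConverter is private[parquet]:

import parquet.schema.MessageTypeParser

val phoneNumbers = MessageTypeParser.parseMessageType(
  """message m {
    |  optional group phoneNumbers {
    |    repeated binary phone;
    |  }
    |}""".stripMargin).getFields.get(0)

// A group with a single REPEATED field becomes an ArrayType whose element
// type is the converted field type, here ArrayType(StringType):
println(ParquetTypesConverter.toDataType(phoneNumbers))
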
@@ -215,7 +240,9 @@ private[parquet] object ParquetTypesConverter {
       case _ => { // everything else nested becomes a Struct, unless it has a single repeated field
         // in which case it becomes an array (this should correspond to the inverse operation of
         // parquet.schema.ConversionPatterns.listType)
-        if (groupType.getFieldCount == 1 && groupType.getFields.apply(0).getRepetition == Repetition.REPEATED) {
+        if (groupType.getFieldCount == 1 &&
+          (groupType.getFields.apply(0).getRepetition == Repetition.REPEATED ||
+            groupType.getRepetition == Repetition.REPEATED)) {
           val elementType = toDataType(groupType.getFields.apply(0))
           new ArrayType(elementType)
         } else {
@@ -225,9 +252,10 @@ private[parquet] object ParquetTypesConverter {
           ptype.getName,
           toDataType(ptype),
           ptype.getRepetition != Repetition.REQUIRED))
-      if (groupType.getFieldCount == 1) { // single field, either optional or required
+
+      if (groupType.getFieldCount == 1) {
         new StructType(fields)
-      } else { // multi field repeated group, which we map into an array of structs
+      } else {
         if (parquetType.getRepetition == Repetition.REPEATED) {
           new ArrayType(StructType(fields))
         } else {
@@ -240,6 +268,14 @@ private[parquet] object ParquetTypesConverter {
       }
     }
   }
 
+  /**
+   * For a given Catalyst [[org.apache.spark.sql.catalyst.types.DataType]] return
+   * the name of the corresponding Parquet primitive type or None if the given type
+   * is not primitive.
+   *
+   * @param ctype The type to convert.
+   * @return The name of the corresponding Parquet primitive type.
+   */
   def fromPrimitiveDataType(ctype: DataType): Option[ParquetPrimitiveTypeName] = ctype match {
     case StringType => Some(ParquetPrimitiveTypeName.BINARY)
     case BooleanType => Some(ParquetPrimitiveTypeName.BOOLEAN)
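
For illustration (again not part of the diff), the mapping in action for the two cases shown plus the fallback, e.g. in a REPL opened inside org.apache.spark.sql.parquet so the private[parquet] object is in scope:

import org.apache.spark.sql.catalyst.types._

ParquetTypesConverter.fromPrimitiveDataType(StringType)       // Some(BINARY)
ParquetTypesConverter.fromPrimitiveDataType(BooleanType)      // Some(BOOLEAN)
ParquetTypesConverter.fromPrimitiveDataType(StructType(Nil))  // None: not a primitive type
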
@@ -251,6 +287,41 @@ private[parquet] object ParquetTypesConverter {
     case _ => None
   }
 
+  /**
+   * Converts a given Catalyst [[org.apache.spark.sql.catalyst.types.DataType]] into
+   * the corresponding Parquet `Type`.
+   *
+   * The conversion follows the rules below:
+   * <ul>
+   *   <li> Primitive types are converted into Parquet's primitive types.</li>
+   *   <li> [[org.apache.spark.sql.catalyst.types.StructType]]s are converted
+   *        into Parquet's `GroupType` with the corresponding field types.</li>
+   *   <li> [[org.apache.spark.sql.catalyst.types.ArrayType]]s are handled as follows:<ul>
+   *     <li> If their element is complex, that is of type
+   *          [[org.apache.spark.sql.catalyst.types.StructType]], they are converted
+   *          into a `GroupType` with the corresponding field types of the struct and
+   *          the original type of the `GroupType` is set to `LIST`.</li>
+   *     <li> Otherwise, that is if they contain a primitive type, they are converted into
+   *          a `GroupType` that is also a list but has only a single field of the type
+   *          corresponding to the element type.</li></ul></li>
+   * </ul>
+   * Parquet's repetition level is set according to the following rule:
+   * <ul>
+   *   <li> If the call to `fromDataType` is recursive inside an enclosing `ArrayType`, then
+   *        the repetition level is set to `REPEATED`.</li>
+   *   <li> Otherwise, if the attribute whose type is converted is `nullable`, the Parquet
+   *        type gets repetition level `OPTIONAL` and otherwise `REQUIRED`.</li>
+   * </ul>
+   * The single exception to this rule is an [[org.apache.spark.sql.catalyst.types.ArrayType]]
+   * that contains a [[org.apache.spark.sql.catalyst.types.StructType]], whose repetition level
+   * is always set to `REPEATED`.
+   *
+   * @param ctype The type to convert.
+   * @param name The name of the [[org.apache.spark.sql.catalyst.expressions.Attribute]] whose type is converted.
+   * @param nullable When true indicates that the attribute is nullable.
+   * @param inArray When true indicates that this is a nested attribute inside an array.
+   * @return The corresponding Parquet type.
+   */
   def fromDataType(
       ctype: DataType,
       name: String,
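
A hedged sketch (not part of the diff) of the array-of-structs exception described above: even for a nullable attribute, the resulting group is REPEATED with original type LIST. It assumes the same in-package visibility as the earlier sketches:

import org.apache.spark.sql.catalyst.types._

val contacts = ArrayType(StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("phoneNumber", StringType, nullable = true))))

ParquetTypesConverter.fromDataType(contacts, "contacts", nullable = true, inArray = false)
// -> e.g. repeated group contacts (LIST) { optional binary name; optional binary phoneNumber; }
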
@@ -271,8 +342,9 @@ private[parquet] object ParquetTypesConverter {
       elementType match {
         case StructType(fields) => { // first case: array of structs
           val parquetFieldTypes = fields.map(
-            f => fromDataType(f.dataType, f.name, f.nullable, false))
-          new ParquetGroupType(repetition, name, ParquetOriginalType.LIST, parquetFieldTypes)
+            f => fromDataType(f.dataType, f.name, f.nullable, inArray = false))
+          assert(fields.size > 1, "Found struct inside array with a single field: error parsing Catalyst schema")
+          new ParquetGroupType(Repetition.REPEATED, name, ParquetOriginalType.LIST, parquetFieldTypes)
           //ConversionPatterns.listType(Repetition.REPEATED, name, parquetFieldTypes)
         }
         case _ => { // second case: array of primitive types
@@ -288,7 +360,7 @@ private[parquet] object ParquetTypesConverter {
       // TODO: test structs inside arrays
       case StructType(structFields) => {
         val fields = structFields.map {
-          field => fromDataType(field.dataType, field.name, field.nullable)
+          field => fromDataType(field.dataType, field.name, field.nullable, inArray = false)
         }
         new ParquetGroupType(repetition, name, fields)
       }
 
sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala

Lines changed: 55 additions & 25 deletions
@@ -54,6 +54,10 @@ case class OptionalReflectData(
     doubleField: Option[Double],
     booleanField: Option[Boolean])
 
+case class Nested(i: Int, s: String)
+
+case class Data(array: Seq[Int], nested: Nested)
+
 class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterAll {
   import TestData._
   TestData // Load test data tables.
@@ -366,6 +370,7 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
   }
 
   test("Importing nested Parquet file (Addressbook)") {
+    implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
     ParquetTestData.readNestedFile(
       ParquetTestData.testNestedFile1,
       ParquetTestData.testNestedSchema1)
@@ -374,22 +379,23 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
     assert(result.size === 2)
     val first_record = result(0)
     val second_record = result(1)
-    val first_owner_numbers = result(0).apply(1).asInstanceOf[ArrayBuffer[Any]]
-    val first_contacts = result(0).apply(2).asInstanceOf[ArrayBuffer[ArrayBuffer[Any]]]
+    val first_owner_numbers = result(0)(1)
+    val first_contacts = result(0)(2)
     assert(first_record.size === 3)
-    assert(second_record.apply(1) === null)
-    assert(second_record.apply(2) === null)
-    assert(second_record.apply(0) === "A. Nonymous")
-    assert(first_record.apply(0) === "Julien Le Dem")
-    assert(first_owner_numbers.apply(0) === "555 123 4567")
-    assert(first_owner_numbers.apply(2) === "XXX XXX XXXX")
-    assert(first_contacts.apply(0).size === 2)
-    assert(first_contacts.apply(0).apply(0) === "Dmitriy Ryaboy")
-    assert(first_contacts.apply(0).apply(1) === "555 987 6543")
-    assert(first_contacts.apply(1).apply(0) === "Chris Aniszczyk")
+    assert(second_record(1) === null)
+    assert(second_record(2) === null)
+    assert(second_record(0) === "A. Nonymous")
+    assert(first_record(0) === "Julien Le Dem")
+    assert(first_owner_numbers(0) === "555 123 4567")
+    assert(first_owner_numbers(2) === "XXX XXX XXXX")
+    assert(first_contacts(0).size === 2)
+    assert(first_contacts(0)(0) === "Dmitriy Ryaboy")
+    assert(first_contacts(0)(1) === "555 987 6543")
+    assert(first_contacts(1)(0) === "Chris Aniszczyk")
   }
 
   test("Importing nested Parquet file (nested numbers)") {
+    implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
     ParquetTestData.readNestedFile(
       ParquetTestData.testNestedFile2,
       ParquetTestData.testNestedSchema2)
@@ -398,19 +404,43 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
     assert(result(0).size === 5, "number of fields in row incorrect")
     assert(result(0)(0) === 1)
     assert(result(0)(1) === 7)
-    assert(result(0)(2).asInstanceOf[ArrayBuffer[Any]].size === 3)
-    assert(result(0)(2).asInstanceOf[ArrayBuffer[Any]].apply(0) === (1.toLong << 32))
-    assert(result(0)(2).asInstanceOf[ArrayBuffer[Any]].apply(1) === (1.toLong << 33))
-    assert(result(0)(2).asInstanceOf[ArrayBuffer[Any]].apply(2) === (1.toLong << 34))
-    assert(result(0)(3).asInstanceOf[ArrayBuffer[Any]].size === 2)
-    assert(result(0)(3).asInstanceOf[ArrayBuffer[Any]].apply(0) === 2.5)
-    assert(result(0)(3).asInstanceOf[ArrayBuffer[Any]].apply(1) === false)
-    assert(result(0)(4).asInstanceOf[ArrayBuffer[Any]].size === 2)
-    assert(result(0)(4).asInstanceOf[ArrayBuffer[Any]].apply(0).asInstanceOf[ArrayBuffer[Any]].size === 2)
-    assert(result(0)(4).asInstanceOf[ArrayBuffer[Any]].apply(1).asInstanceOf[ArrayBuffer[Any]].size === 1)
-    assert(result(0)(4).asInstanceOf[ArrayBuffer[Any]].apply(0).asInstanceOf[ArrayBuffer[ArrayBuffer[Any]]].apply(0).apply(0) === 7)
-    assert(result(0)(4).asInstanceOf[ArrayBuffer[Any]].apply(0).asInstanceOf[ArrayBuffer[ArrayBuffer[Any]]].apply(1).apply(0) === 8)
-    assert(result(0)(4).asInstanceOf[ArrayBuffer[Any]].apply(1).asInstanceOf[ArrayBuffer[ArrayBuffer[Any]]].apply(0).apply(0) === 9)
+    assert(result(0)(2).size === 3)
+    assert(result(0)(2)(0) === (1.toLong << 32))
+    assert(result(0)(2)(1) === (1.toLong << 33))
+    assert(result(0)(2)(2) === (1.toLong << 34))
+    assert(result(0)(3).size === 2)
+    assert(result(0)(3)(0) === 2.5)
+    assert(result(0)(3)(1) === false)
+    assert(result(0)(4).size === 2)
+    assert(result(0)(4)(0).size === 2)
+    assert(result(0)(4)(1).size === 1)
+    assert(result(0)(4)(0)(0)(0) === 7)
+    assert(result(0)(4)(0)(1)(0) === 8)
+    assert(result(0)(4)(1)(0)(0) === 9)
+  }
+
+  test("Simple query on addressbook") {
+    val data = TestSQLContext.parquetFile(ParquetTestData.testNestedFile1.toString).toSchemaRDD
+    val tmp = data.where('owner === "Julien Le Dem").select('owner as 'a, 'contacts as 'c).collect()
+    assert(tmp.size === 1)
+    assert(tmp(0)(0) === "Julien Le Dem")
+  }
+
+  test("Simple query on nested int data") {
+    implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
+    val data = TestSQLContext.parquetFile(ParquetTestData.testNestedFile2.toString).toSchemaRDD
+    data.registerAsTable("data")
+    val tmp = sql("SELECT booleanNumberPairs.value, booleanNumberPairs.truth FROM data").collect()
+    assert(tmp(0)(0) === 2.5)
+    assert(tmp(0)(1) === false)
+    val result = sql("SELECT outerouter FROM data").collect()
+    // TODO: why does this not work?
+    // val result = sql("SELECT outerouter.values FROM data").collect()
+    // TODO: ... or this:
+    // val result = sql("SELECT outerouter[0] FROM data").collect()
+    assert(result(0)(0)(0)(0)(0) === 7)
+    assert(result(0)(0)(0)(1)(0) === 8)
+    assert(result(0)(0)(1)(0)(0) === 9)
   }
 
   /**
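
The tests above each declare a local implicit anyToRow. The reason: Row.apply returns Any, so chained indexing such as result(0)(4)(0)(0)(0) needs every intermediate value viewed as a Row again. A self-contained sketch (not part of the diff), with the same stand-in Row types as in the earlier sketch rather than Spark's actual classes:

import scala.language.implicitConversions

trait Row { def apply(i: Int): Any }

class GenericRow extends Row {
  val values: Array[Any] = Array.empty
  def apply(i: Int): Any = values(i)
}

object NestedAccess extends App {
  implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]

  val inner = new GenericRow { override val values: Array[Any] = Array(7) }
  val outer = new GenericRow { override val values: Array[Any] = Array(inner) }

  // outer(0) is typed Any; the implicit view casts it back to Row so the
  // second application compiles, mirroring the chained lookups in the tests.
  assert(outer(0)(0) == 7)
}
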
