@@ -19,9 +19,13 @@ package org.apache.spark.sql.parquet
 
 import org.scalatest.{BeforeAndAfterAll, FunSuiteLike}
 
+import org.apache.avro.{SchemaBuilder, Schema}
+import org.apache.avro.generic.{GenericData, GenericRecord}
+
 import org.apache.hadoop.fs.{Path, FileSystem}
 import org.apache.hadoop.mapreduce.Job
 
+import parquet.avro.AvroParquetWriter
 import parquet.hadoop.ParquetFileWriter
 import parquet.hadoop.util.ContextUtil
 import parquet.schema.MessageTypeParser
@@ -34,11 +38,12 @@ import org.apache.spark.sql.SchemaRDD
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.types.IntegerType
 import org.apache.spark.util.Utils
+import org.apache.spark.sql.SchemaRDD
+import org.apache.spark.sql.catalyst.util.getTempFilePath
+import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.catalyst.types.{StringType, IntegerType, DataType}
-import org.apache.spark.sql.{parquet, SchemaRDD}
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
-import scala.Tuple2
-import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
+import org.apache.spark.util.Utils
 
 // Implicits
 import org.apache.spark.sql.test.TestSQLContext._
@@ -398,9 +403,6 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
 
   test("Importing nested Parquet file (Addressbook)") {
     implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
-    ParquetTestData.readNestedFile(
-      ParquetTestData.testNestedDir1,
-      ParquetTestData.testNestedSchema1)
     val result = TestSQLContext
       .parquetFile(ParquetTestData.testNestedDir1.toString)
       .toSchemaRDD
@@ -426,9 +428,6 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
 
   test("Importing nested Parquet file (nested numbers)") {
     implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
-    ParquetTestData.readNestedFile(
-      ParquetTestData.testNestedDir2,
-      ParquetTestData.testNestedSchema2)
     val result = TestSQLContext
       .parquetFile(ParquetTestData.testNestedDir2.toString)
       .toSchemaRDD
@@ -602,6 +601,145 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
     Utils.deleteRecursively(tmpdir)
   }
 
+  test("Importing data generated with Avro") {
+    val tmpdir = Utils.createTempDir()
+    val file: File = new File(tmpdir, "test.avro")
+
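+    // Build an Avro schema with three primitive fields plus nested array and map types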
+    val primitiveArrayType: Schema = SchemaBuilder.array.items.intType
+    val complexArrayType: Schema = SchemaBuilder.array.items.map.values.stringType
+    val primitiveMapType: Schema = SchemaBuilder.map.values.booleanType
+    val complexMapType: Schema = SchemaBuilder.map.values.array.items.floatType
+    val schema: Schema = SchemaBuilder
+      .record("TestRecord")
+      .namespace("")
+      .fields
+      .name("testInt")
+      .`type`
+      .intType
+      .noDefault
+      .name("testDouble")
+      .`type`
+      .doubleType
+      .noDefault
+      .name("testString")
+      .`type`
+      .nullable
+      .stringType
+      .stringDefault("")
+      .name("testPrimitiveArray")
+      .`type`(primitiveArrayType)
+      .noDefault
+      .name("testComplexArray")
+      .`type`(complexArrayType)
+      .noDefault
+      .name("testPrimitiveMap")
+      .`type`(primitiveMapType)
+      .noDefault
+      .name("testComplexMap")
+      .`type`(complexMapType)
+      .noDefault
+      .endRecord
+
+    val record1: GenericRecord = new GenericData.Record(schema)
+
+    // primitive fields
+    record1.put("testInt", 256)
+    record1.put("testDouble", 0.5)
+    record1.put("testString", "foo")
+
+    val primitiveArrayData = new GenericData.Array[Integer](10, primitiveArrayType)
+    val complexArrayData: GenericData.Array[java.util.Map[String, String]] =
+      new GenericData.Array[java.util.Map[String, String]](10, SchemaBuilder.array.items.map.values.stringType)
+
+    // two arrays: one primitive (array of ints), one complex (array of string->string maps)
+    primitiveArrayData.add(1)
+    primitiveArrayData.add(2)
+    primitiveArrayData.add(3)
+    val map1 = new java.util.HashMap[String, String]
+    map1.put("key11", "data11")
+    map1.put("key12", "data12")
+    val map2 = new java.util.HashMap[String, String]
+    map2.put("key21", "data21")
+    map2.put("key22", "data22")
+    complexArrayData.add(0, map1)
+    complexArrayData.add(1, map2)
+
+    record1.put("testPrimitiveArray", primitiveArrayData)
+    record1.put("testComplexArray", complexArrayData)
+
+    // two maps: one primitive (string->boolean), one complex (string->array of floats)
+    val primitiveMap = new java.util.HashMap[String, Boolean](10)
+    primitiveMap.put("key1", true)
+    primitiveMap.put("key2", false)
+    val complexMap = new java.util.HashMap[String, GenericData.Array[Float]](10)
+    val value1: GenericData.Array[Float] = new GenericData.Array[Float](10, SchemaBuilder.array.items.floatType)
+    value1.add(0.1f)
+    value1.add(0.2f)
+    value1.add(0.3f)
+    complexMap.put("compKey1", value1)
+    val value2: GenericData.Array[Float] = new GenericData.Array[Float](10, SchemaBuilder.array.items.floatType)
+    value2.add(1.1f)
+    value2.add(1.2f)
+    value2.add(1.3f)
+    complexMap.put("compKey2", value2)
+
+    record1.put("testPrimitiveMap", primitiveMap)
+    record1.put("testComplexMap", complexMap)
+
+    // TODO: test array or map with value type Avro record
+
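+    // Write the record to the Parquet file via parquet-avro's AvroParquetWriter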
+    val writer = new AvroParquetWriter[GenericRecord](new Path(file.toString), schema)
+    writer.write(record1)
+    writer.close()
+
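+    // Read the generated file back through Spark SQL and register it as a temporary table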
+    val data = TestSQLContext
+      .parquetFile(tmpdir.toString)
+      .toSchemaRDD
+    data.registerAsTable("avroTable")
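+    // Check primitive columns first, then nested collections: arrays come back as Rows, maps as Scala Maps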
+    val resultPrimitives = sql("SELECT testInt, testDouble, testString FROM avroTable").collect()
+    assert(resultPrimitives(0)(0) === 256)
+    assert(resultPrimitives(0)(1) === 0.5)
+    assert(resultPrimitives(0)(2) === "foo")
+    val resultPrimitiveArray = sql("SELECT testPrimitiveArray FROM avroTable").collect()
+    assert(resultPrimitiveArray(0)(0).asInstanceOf[Row](0) === 1)
+    assert(resultPrimitiveArray(0)(0).asInstanceOf[Row](1) === 2)
+    assert(resultPrimitiveArray(0)(0).asInstanceOf[Row](2) === 3)
+    val resultComplexArray = sql("SELECT testComplexArray FROM avroTable").collect()
+    assert(resultComplexArray(0)(0).asInstanceOf[Row].size === 2)
+    assert(
+      resultComplexArray(0)(0)
+        .asInstanceOf[Row]
+        .apply(0)
+        .asInstanceOf[Map[String, String]].get("key11").get.equals("data11"))
+    assert(
+      resultComplexArray(0)(0)
+        .asInstanceOf[Row]
+        .apply(1)
+        .asInstanceOf[Map[String, String]].get("key22").get.equals("data22"))
+    val resultPrimitiveMap = sql("SELECT testPrimitiveMap FROM avroTable").collect()
+    assert(
+      resultPrimitiveMap(0)(0)
+        .asInstanceOf[Map[String, Boolean]].get("key1").get === true)
+    assert(
+      resultPrimitiveMap(0)(0)
+        .asInstanceOf[Map[String, Boolean]].get("key2").get === false)
+    val resultComplexMap = sql("SELECT testComplexMap FROM avroTable").collect()
+    val mapResult1 =
+      resultComplexMap(0)(0)
+        .asInstanceOf[Map[String, Row]]
+        .get("compKey1")
+        .get
+    val mapResult2 =
+      resultComplexMap(0)(0)
+        .asInstanceOf[Map[String, Row]]
+        .get("compKey2")
+        .get
+    assert(mapResult1(0) === 0.1f)
+    assert(mapResult1(2) === 0.3f)
+    assert(mapResult2(0) === 1.1f)
+    assert(mapResult2(2) === 1.3f)
+  }
+
   /**
    * Creates an empty SchemaRDD backed by a ParquetRelation.
    *
@@ -613,6 +751,6 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
     val attributes = schema.map(t => new AttributeReference(t._1, t._2)())
     new SchemaRDD(
       TestSQLContext,
-      parquet.ParquetRelation.createEmpty(path, attributes, sparkContext.hadoopConfiguration))
+      ParquetRelation.createEmpty(path, attributes, sparkContext.hadoopConfiguration))
   }
 }