@@ -173,6 +173,8 @@ public InternalRow copy() {
row.setInt(i, getInt(i));
} else if (dt instanceof TimestampType) {
row.setLong(i, getLong(i));
} else if (dt instanceof CalendarIntervalType) {
row.update(i, getInterval(i));
} else {
throw new RuntimeException("Not implemented. " + dt);
}
@@ -535,8 +535,10 @@ case class DataSource(
* Returns a logical plan to write the given [[LogicalPlan]] out to this [[DataSource]].
*/
def planForWriting(mode: SaveMode, data: LogicalPlan): LogicalPlan = {
if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
throw new AnalysisException("Cannot save interval data type into external storage.")
if (providingClass != classOf[ParquetFileFormat]) {
Member: I suppose there's no cleaner way to do this than an 'instanceof'-style check? It's done a few other places here, so maybe.

if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
throw new AnalysisException("Cannot save interval data type into external storage.")
}
}

providingInstance() match {
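
The net effect of the planForWriting change above is that only non-Parquet sources keep rejecting interval columns at write time. A minimal usage sketch of the new behaviour, assuming an existing SparkSession named spark and placeholder output paths:

val df = spark.sql("select interval 1 days as i")
df.write.mode("overwrite").parquet("/tmp/interval_parquet")  // now allowed
df.write.mode("overwrite").orc("/tmp/interval_orc")          // still rejected:
// org.apache.spark.sql.AnalysisException: Cannot save interval data type into external storage.
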
@@ -371,6 +371,8 @@ class ParquetFileFormat

case udt: UserDefinedType[_] => supportDataType(udt.sqlType)

case _: CalendarIntervalType => true

case _ => false
}
}
@@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData}
import org.apache.spark.sql.catalyst.util.DateTimeUtils.SQLTimestamp
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

/**
* A [[ParentContainerUpdater]] is used by a Parquet converter to set converted values to some
@@ -325,6 +325,26 @@ private[parquet] class ParquetRowConverter(
override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy())
})

case CalendarIntervalType
if parquetType.asPrimitiveType().getPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY =>
new ParquetPrimitiveConverter(updater) {
override def addBinary(value: Binary): Unit = {
assert(
value.length() == 12,
"Intervals are expected to be stored in 12-byte fixed len byte array, " +
s"but got a ${value.length()}-byte array.")

val buf = value.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN)
val milliseconds = buf.getInt
var microseconds = milliseconds * DateTimeUtils.MICROS_PER_MILLIS
val days = buf.getInt
val daysInUs = Math.multiplyExact(days, DateTimeUtils.MICROS_PER_DAY)
Contributor: Parquet stores the number of days as a separate field because one logical day interval can be 23, 24, or 25 hours in the case of daylight saving. If we convert a Parquet interval to a Spark interval, it's not a truncation but a loss of information.

Member Author: This can be fixed only if we change the structure of CalendarInterval, but such modifications are almost orthogonal to this PR.

Member: If we don't change CalendarInterval, hm, how can we handle the different structure of a Parquet interval without getting it wrong in some cases?
As in the other PR, another option is to refuse to read/write intervals that are longer than a day, I guess?

Member Author:
  1. According to the SQL standard, hours must be in the range of 0-23.
  2. We already lose the information while converting an interval string to a CalendarInterval value:
spark-sql> select interval 1 day 25 hours;
interval 2 days 1 hours

Member: I don't think that's quite the issue. If a Parquet INTERVAL of 1 day is stored as "1 day", then adding it to a date will always produce the same time the next day. If we don't represent days separately in CalendarInterval, the 1 day is stored as "86400000000 µs" (right?). Adding that will usually, but not always, produce the same time the next day.

Member Author: I don't want to defend the other side :-) but the consequence of storing days separately is that hours are unbounded. In this way, interval 1 day 25 hours and interval 2 days 1 hours are represented differently in Parquet - (0, 1, 90000000) and (0, 2, 3600000). As @cloud-fan wrote above, this can lead to different results when adding those intervals to 2 November 2019: 2019-11-02 + interval 1 day 25 hours = 2019-11-04 00:00:00, but 2019-11-02 + interval 2 days 1 hour = 2019-11-04 01:00:00.

Member: Yeah, it's complicated. Those are actually semantically different intervals, so I don't think it's a problem if they produce different results or are represented differently.

microseconds = Math.addExact(microseconds, daysInUs)
val months = buf.getInt
updater.set(new CalendarInterval(months, microseconds))
}
}

case t =>
throw new RuntimeException(
s"Unable to create Parquet converter for data type ${t.json} " +
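
To make the thread above concrete: with the current two-field CalendarInterval (months, microseconds), interval 1 day 25 hours and interval 2 days 1 hour both flatten to 49 hours, while Parquet's three-field layout keeps days separate, which matters across a DST transition. A minimal java.time sketch of the reviewer's 2019-11-02 example; the choice of America/Los_Angeles is an assumption, any zone that falls back on 2019-11-03 behaves the same:

import java.time.{Duration, LocalDate, ZoneId}

val zone  = ZoneId.of("America/Los_Angeles")          // assumed zone; DST ends on 2019-11-03
val start = LocalDate.parse("2019-11-02").atStartOfDay(zone)

// Days kept separate (Parquet-style): 1 calendar day, then 25 physical hours.
val a = start.plusDays(1).plus(Duration.ofHours(25))  // 2019-11-04T00:00
// 2 calendar days, then 1 physical hour.
val b = start.plusDays(2).plus(Duration.ofHours(1))   // 2019-11-04T01:00
// Flattened to microseconds (current CalendarInterval): both intervals are 49 hours.
val c = start.plus(Duration.ofHours(49))              // 2019-11-04T00:00, same as a
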
@@ -171,7 +171,7 @@ class ParquetToSparkSchemaConverter(
case FIXED_LEN_BYTE_ARRAY =>
originalType match {
case DECIMAL => makeDecimalType(Decimal.maxPrecisionForBytes(field.getTypeLength))
case INTERVAL => typeNotImplemented()
case INTERVAL => CalendarIntervalType
case _ => illegalType()
}

@@ -553,6 +553,11 @@ class SparkToParquetSchemaConverter(
case udt: UserDefinedType[_] =>
convertField(field.copy(dataType = udt.sqlType))

case i: CalendarIntervalType =>
Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition).length(12)
.as(INTERVAL)
.named(field.name)

case _ =>
throw new AnalysisException(s"Unsupported data type ${field.dataType.catalogString}")
}
@@ -73,6 +73,9 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging {
// Reusable byte array used to write timestamps as Parquet INT96 values
private val timestampBuffer = new Array[Byte](12)

// Reusable byte array used to write intervals as Parquet FIXED_LEN_BYTE_ARRAY values
private val intervalBuffer = new Array[Byte](12)

// Reusable byte array used to write decimal values
private val decimalBuffer =
new Array[Byte](Decimal.minBytesForPrecision(DecimalType.MAX_PRECISION))
@@ -207,7 +210,19 @@ class ParquetWriteSupport extends WriteSupport[InternalRow] with Logging {

case t: UserDefinedType[_] => makeWriter(t.sqlType)

// TODO Adds IntervalType support
case CalendarIntervalType =>
(row: SpecializedGetters, ordinal: Int) =>
val interval = row.getInterval(ordinal)
val microseconds = interval.microseconds % DateTimeUtils.MICROS_PER_DAY
val milliseconds: Int = (microseconds / DateTimeUtils.MICROS_PER_MILLIS).toInt
val days: Int = Math.toIntExact(interval.microseconds / DateTimeUtils.MICROS_PER_DAY)
val buf = ByteBuffer.wrap(intervalBuffer)
buf.order(ByteOrder.LITTLE_ENDIAN)
.putInt(milliseconds)
.putInt(days)
.putInt(interval.months)
recordConsumer.addBinary(Binary.fromReusedByteArray(intervalBuffer))

case _ => sys.error(s"Unsupported data type $dataType.")
}
}
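
For reference, the 12-byte value produced by the writer above and consumed by the ParquetRowConverter change is three little-endian 32-bit integers, in the field order this diff uses (milliseconds, days, months). A minimal round-trip sketch of that layout; the helper names are illustrative, not part of the PR:

import java.nio.{ByteBuffer, ByteOrder}

// Encode the three fields in the order used by the write path above.
def encodeInterval(milliseconds: Int, days: Int, months: Int): Array[Byte] = {
  val bytes = new Array[Byte](12)
  ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    .putInt(milliseconds)
    .putInt(days)
    .putInt(months)
  bytes
}

// Decode back, mirroring the read path in ParquetRowConverter.
def decodeInterval(bytes: Array[Byte]): (Int, Int, Int) = {
  require(bytes.length == 12, s"expected 12 bytes, got ${bytes.length}")
  val buf = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
  (buf.getInt, buf.getInt, buf.getInt)                // (milliseconds, days, months)
}

// "interval 1 day 25 hours" with days kept separate: 90000000 ms, 1 day, 0 months.
val (ms, days, months) = decodeInterval(encodeInterval(90000000, 1, 0))
assert(ms == 90000000 && days == 1 && months == 0)
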
@@ -58,6 +58,8 @@ case class ParquetTable(

case udt: UserDefinedType[_] => supportsDataType(udt.sqlType)

case _: CalendarIntervalType => true

case _ => false
}

@@ -330,13 +330,13 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession {
}
}

test("SPARK-24204 error handling for unsupported Interval data types - csv, json, parquet, orc") {
test("SPARK-24204 error handling for unsupported Interval data types - csv, json, orc") {
withTempDir { dir =>
val tempDir = new File(dir, "files").getCanonicalPath
// TODO: test file source V2 after write path is fixed.
Seq(true).foreach { useV1 =>
val useV1List = if (useV1) {
"csv,json,orc,parquet"
"csv,json,orc"
} else {
""
}
@@ -349,15 +349,15 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession {

withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> useV1List) {
// write path
Seq("csv", "json", "parquet", "orc").foreach { format =>
Seq("csv", "json", "orc").foreach { format =>
val msg = intercept[AnalysisException] {
sql("select interval 1 days").write.format(format).mode("overwrite").save(tempDir)
}.getMessage
validateErrorMessage(msg)
}

// read path
Seq("parquet", "csv").foreach { format =>
Seq("csv").foreach { format =>
var msg = intercept[AnalysisException] {
val schema = StructType(StructField("a", CalendarIntervalType, true) :: Nil)
spark.range(1).write.format(format).mode("overwrite").save(tempDir)
@@ -49,7 +49,7 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

// Write support class for nested groups: ParquetWriter initializes GroupWriteSupport
// with an empty configuration (it is after all not intended to be used in this way?)
@@ -114,12 +114,13 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
| required fixed_len_byte_array(32) i(DECIMAL(32,0));
| required int64 j(TIMESTAMP_MILLIS);
| required int64 k(TIMESTAMP_MICROS);
| required fixed_len_byte_array(12) l(INTERVAL);
|}
""".stripMargin)

val expectedSparkTypes = Seq(ByteType, ShortType, DateType, DecimalType(1, 0),
DecimalType(10, 0), StringType, StringType, DecimalType(32, 0), DecimalType(32, 0),
TimestampType, TimestampType)
TimestampType, TimestampType, CalendarIntervalType)

withTempPath { location =>
val path = new Path(location.getCanonicalPath)
@@ -735,7 +736,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession

val dataTypes =
Seq(StringType, BooleanType, ByteType, ShortType, IntegerType, LongType,
FloatType, DoubleType, DecimalType(25, 5), DateType, TimestampType)
FloatType, DoubleType, DecimalType(25, 5), DateType, TimestampType, CalendarIntervalType)

val constantValues =
Seq(
@@ -749,7 +750,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
0.75D,
Decimal("1234.23456"),
DateTimeUtils.fromJavaDate(java.sql.Date.valueOf("2015-01-01")),
DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123")))
DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123")),
CalendarInterval.fromString("interval 1 month 2 microsecond"))

dataTypes.zip(constantValues).foreach { case (dt, v) =>
val schema = StructType(StructField("pcol", dt) :: Nil)
@@ -903,6 +903,20 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSparkSession
testMigration(fromTsType = "INT96", toTsType = "TIMESTAMP_MICROS")
testMigration(fromTsType = "TIMESTAMP_MICROS", toTsType = "INT96")
}

test("interval written and read as Parquet INTERVAL") {
withTempPath { file =>
val df = spark.range(10)
.selectExpr("interval 100 years 1 month 10 second 1 millisecond as i")
df.write.parquet(file.getCanonicalPath)
("true" :: "false" :: Nil).foreach { vectorized =>
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) {
val df2 = spark.read.parquet(file.getCanonicalPath)
checkAnswer(df2, df.collect().toSeq)
}
}
}
}
}

class ParquetV1QuerySuite extends ParquetQuerySuite {
@@ -1010,6 +1010,17 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
writeLegacyParquetFormat = true,
outputTimestampType = SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS)

testSchema(
"Interval written and read as fixed_len_byte_array(12) with INTERVAL",
StructType(Seq(StructField("f1", CalendarIntervalType))),
"""message root {
| optional fixed_len_byte_array(12) f1 (INTERVAL);
|}
""".stripMargin,
binaryAsString = true,
int96AsTimestamp = false,
writeLegacyParquetFormat = true)

private def testSchemaClipping(
testName: String,
parquetSchema: String,