[SPARK-12297][SQL] Hive compatibility for Parquet Timestamps #16781
**`HiveMetastoreCatalog.scala`**
```diff
@@ -176,7 +176,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
           // We don't support hive bucketed tables, only ones we write out.
           bucketSpec = None,
           fileFormat = fileFormat,
-          options = options)(sparkSession = sparkSession)
+          options = options ++ getStorageTzOptions(relation))(sparkSession = sparkSession)
         val created = LogicalRelation(fsRelation, updatedTable)
         tableRelationCache.put(tableIdentifier, created)
         created
```
```diff
@@ -194,15 +194,6 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         fileFormatClass,
         None)
       val logicalRelation = cached.getOrElse {
-        // We add the timezone to the relation options, which automatically gets injected into
-        // the hadoopConf for the Parquet Converters
-        val storageTzKey = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
-        val storageTz = relation.tableMeta.properties.getOrElse(storageTzKey, "")
-        val sessionTz = sparkSession.sessionState.conf.sessionLocalTimeZone
-        val extraTzOptions = Map(
-          storageTzKey -> storageTz,
-          SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionTz
-        )
         val (dataSchema, updatedTable) = inferIfNeeded(relation, options, fileFormat)
         val created =
           LogicalRelation(
```
```diff
@@ -212,7 +203,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
             userSpecifiedSchema = Option(dataSchema),
             // We don't support hive bucketed tables, only ones we write out.
             bucketSpec = None,
-            options = options ++ extraTzOptions,
+            options = options ++ getStorageTzOptions(relation),
             className = fileType).resolveRelation(),
           table = updatedTable)
```
```diff
@@ -233,6 +224,17 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     result.copy(output = newOutput)
   }

+  private def getStorageTzOptions(relation: CatalogRelation): Map[String, String] = {
+    // We add the table timezone to the relation options, which automatically gets injected
+    // into the hadoopConf for the Parquet Converters
+    val storageTzKey = ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY
+    val storageTz = relation.tableMeta.properties.getOrElse(storageTzKey, "")
+    val sessionTz = sparkSession.sessionState.conf.sessionLocalTimeZone
+    Map(
+      storageTzKey -> storageTz
+    )
+  }
+
   private def inferIfNeeded(
       relation: CatalogRelation,
       options: Map[String, String],
```
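The comment in the new helper describes relation options being injected into the `hadoopConf` used by the Parquet converters. Below is a minimal sketch of that flow, assuming options are copied verbatim into a Hadoop `Configuration`; the key name and table properties here are hypothetical stand-ins, not Spark's actual constants:

```scala
import org.apache.hadoop.conf.Configuration

object StorageTzOptionSketch {
  // Hypothetical stand-in for ParquetFileFormat.PARQUET_TIMEZONE_TABLE_PROPERTY.
  val storageTzKey = "parquet.mr.int96.write.zone"

  // Mirrors getStorageTzOptions: read the table property, defaulting to "".
  def storageTzOptions(tableProperties: Map[String, String]): Map[String, String] =
    Map(storageTzKey -> tableProperties.getOrElse(storageTzKey, ""))

  def main(args: Array[String]): Unit = {
    val tableProps = Map(storageTzKey -> "America/Los_Angeles")
    // Spark merges these options into the relation; they eventually land in the
    // hadoopConf consulted when reading or writing the table's parquet files.
    val hadoopConf = new Configuration()
    storageTzOptions(tableProps).foreach { case (k, v) => hadoopConf.set(k, v) }
    println(hadoopConf.get(storageTzKey)) // prints America/Los_Angeles
  }
}
```

Centralizing the lookup in one helper is what keeps the two call sites in `HiveMetastoreCatalog` above in sync.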
**`ParquetHiveCompatibilitySuite.scala`**
```diff
@@ -259,8 +259,8 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
   }

   val desiredTimestampStrings = Seq(
-    "2015-12-31 23:50:59.123",
+    "2015-12-31 22:49:59.123",
+    "2015-12-31 23:50:59.123",
     "2016-01-01 00:39:59.123",
     "2016-01-01 01:29:59.123"
   )
```
```diff
@@ -286,23 +286,15 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
   }

   private def createRawData(spark: SparkSession): Dataset[(String, Timestamp)] = {
-    val originalTsStrings = Seq(
-      "2015-12-31 22:49:59.123",
-      "2015-12-31 23:50:59.123",
-      "2016-01-01 00:39:59.123",
-      "2016-01-01 01:29:59.123"
-    )
-    val rowRdd = spark.sparkContext.parallelize(originalTsStrings, 1).map(Row(_))
+    val rowRdd = spark.sparkContext.parallelize(desiredTimestampStrings, 1).map(Row(_))
     val schema = StructType(Seq(
       StructField("display", StringType, true)
     ))
     val df = spark.createDataFrame(rowRdd, schema)
     // this will get the millis corresponding to the display time given the current *session*
     // timezone.
-    import spark.implicits._
-    df.withColumn("ts", expr("cast(display as timestamp)")).map { row =>
-      (row.getAs[String](0), row.getAs[Timestamp](1))
-    }
+    df.withColumn("ts", expr("cast(display as timestamp)")).as[(String, Timestamp)]
   }

   private def testWriteTablesWithTimezone(
```
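The refactor leans on two Spark idioms: `cast(display as timestamp)` parses the string in the current session timezone, and `.as[(String, Timestamp)]` converts the DataFrame into a typed Dataset without an explicit `.map` over `Row`s. A self-contained sketch (local session purely for illustration):

```scala
import java.sql.Timestamp
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.expr

object CreateRawDataSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("sketch").getOrCreate()
    import spark.implicits._
    // Cast parses the display string in the *session* timezone; `.as` attaches
    // tuple encoders so the result is a Dataset[(String, Timestamp)].
    val ds = Seq("2015-12-31 23:50:59.123").toDF("display")
      .withColumn("ts", expr("cast(display as timestamp)"))
      .as[(String, Timestamp)]
    ds.collect().foreach { case (display, ts) =>
      println(s"$display -> epoch millis ${ts.getTime}")
    }
    spark.stop()
  }
}
```

The behavior matches the removed `.map` version, with less boilerplate and the column schema preserved.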
```diff
@@ -349,15 +341,20 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
       // values in the parquet file.
       val onDiskLocation = spark.sessionState.catalog
         .getTableMetadata(TableIdentifier(s"insert_$baseTable")).location.getPath
-      val readFromDisk = spark.read.parquet(onDiskLocation).collect()
-      val storageTzId = explicitTz.getOrElse(sessionTzId)
-      readFromDisk.foreach { row =>
-        val displayTime = row.getAs[String](0)
-        val millis = row.getAs[Timestamp](1).getTime()
-        val expectedMillis = timestampTimezoneToMillis((displayTime, storageTzId))
-        assert(expectedMillis === millis, s"Display time '$displayTime' was stored incorrectly " +
-          s"with sessionTz = ${sessionTzOpt}; Got $millis, expected $expectedMillis " +
-          s"(delta = ${millis - expectedMillis})")
+      // we test reading the data back with and without the vectorized reader, to make sure we
+      // haven't broken reading parquet from non-hive tables, with both readers.
+      Seq(false, true).foreach { vectorized =>
+        spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized)
+        val readFromDisk = spark.read.parquet(onDiskLocation).collect()
+        val storageTzId = explicitTz.getOrElse(sessionTzId)
+        readFromDisk.foreach { row =>
+          val displayTime = row.getAs[String](0)
+          val millis = row.getAs[Timestamp](1).getTime()
+          val expectedMillis = timestampTimezoneToMillis((displayTime, storageTzId))
+          assert(expectedMillis === millis, s"Display time '$displayTime' was stored " +
+            s"incorrectly with sessionTz = ${sessionTzOpt}; Got $millis, expected " +
+            s"$expectedMillis (delta = ${millis - expectedMillis})")
+        }
       }
     }
   }
```
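The assertion compares the millis stored on disk against `timestampTimezoneToMillis`, which maps a display string plus a timezone id to epoch millis. A worked example of that computation, assuming the suite's helper interprets the string with `java.time` semantics:

```scala
import java.time.{LocalDateTime, ZoneId}
import java.time.format.DateTimeFormatter

object ExpectedMillisSketch {
  private val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")

  // Interpret a display string as wall-clock time in the given timezone.
  def millisIn(display: String, tzId: String): Long =
    LocalDateTime.parse(display, fmt).atZone(ZoneId.of(tzId)).toInstant.toEpochMilli

  def main(args: Array[String]): Unit = {
    val display = "2015-12-31 23:50:59.123"
    val utcMillis = millisIn(display, "UTC")
    val laMillis = millisIn(display, "America/Los_Angeles")
    // Los Angeles is UTC-8 at this date, so the same wall-clock string is
    // 8 hours later in absolute time; whole-hour deltas like this are what
    // the assertion's deltaHours surfaces on failure.
    println((laMillis - utcMillis) / (1000L * 60 * 60)) // prints 8
  }
}
```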
```diff
@@ -401,65 +398,69 @@ class ParquetHiveCompatibilitySuite extends ParquetCompatibilityTest with TestHi
         options = options
       )
       Seq(false, true).foreach { vectorized =>
-        withSQLConf((SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized.toString)) {
-          withClue(s"vectorized = $vectorized;") {
-            val sessionTz = sessionTzOpt.getOrElse(TimeZone.getDefault().getID())
-            val collectedFromExternal =
-              spark.sql(s"select display, ts from external_$baseTable").collect()
-            collectedFromExternal.foreach { row =>
-              val displayTime = row.getAs[String](0)
-              val millis = row.getAs[Timestamp](1).getTime()
-              val expectedMillis = timestampTimezoneToMillis((displayTime, sessionTz))
-              val delta = millis - expectedMillis
-              val deltaHours = delta / (1000L * 60 * 60)
-              assert(millis === expectedMillis, s"Display time '$displayTime' did not have " +
-                s"correct millis: was $millis, expected $expectedMillis; delta = $delta " +
-                s"($deltaHours hours)")
-            }
-
-            // Now test that the behavior is still correct even with a filter which could get
-            // pushed down into parquet. We don't need extra handling for pushed down
-            // predicates because (a) in ParquetFilters, we ignore TimestampType and (b) parquet
-            // does not read statistics from int96 fields, as they are unsigned. See
-            // scalastyle:off line.size.limit
-            // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L419
-            // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L348
-            // scalastyle:on line.size.limit
-            //
-            // Just to be defensive in case anything ever changes in parquet, this test checks
-            // the assumption on column stats, and also the end-to-end behavior.
-
-            val hadoopConf = sparkContext.hadoopConfiguration
-            val fs = FileSystem.get(hadoopConf)
-            val parts = fs.listStatus(new Path(path.getCanonicalPath))
-              .filter(_.getPath().getName().endsWith(".parquet"))
-            // grab the meta data from the parquet file. The next section of asserts just make
-            // sure the test is configured correctly.
-            assert(parts.size == 1)
-            val oneFooter = ParquetFileReader.readFooter(hadoopConf, parts.head.getPath)
-            assert(oneFooter.getFileMetaData.getSchema.getColumns.size === 2)
-            assert(oneFooter.getFileMetaData.getSchema.getColumns.get(1).getType() ===
-              PrimitiveTypeName.INT96)
-            val oneBlockMeta = oneFooter.getBlocks().get(0)
-            val oneBlockColumnMeta = oneBlockMeta.getColumns().get(1)
-            val columnStats = oneBlockColumnMeta.getStatistics
-            // This is the important assert. Column stats are written, but they are ignored
-            // when the data is read back as mentioned above, b/c int96 is unsigned. This
-            // assert makes sure this holds even if we change parquet versions (if eg. there
-            // were ever statistics even on unsigned columns).
-            assert(columnStats.isEmpty)
-
-            // These queries should return the entire dataset, but if the predicates were
-            // applied to the raw values in parquet, they would incorrectly filter data out.
-            Seq(
-              ">" -> "2015-12-31 22:00:00",
-              "<" -> "2016-01-01 02:00:00"
-            ).foreach { case (comparison, value) =>
-              val query =
-                s"select ts from external_$baseTable where ts $comparison '$value'"
-              val countWithFilter = spark.sql(query).count()
-              assert(countWithFilter === 4, query)
-            }
-          }
+        withClue(s"vectorized = $vectorized;") {
+          spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized)
+          val sessionTz = sessionTzOpt.getOrElse(TimeZone.getDefault().getID())
+          val query = s"select display, cast(ts as string) as ts_as_string, ts " +
+            s"from external_$baseTable"
+          val collectedFromExternal = spark.sql(query).collect()
+          collectedFromExternal.foreach { row =>
+            val displayTime = row.getAs[String](0)
+            // the timestamp should still display the same, despite the changes in timezones
+            assert(displayTime === row.getAs[String](1).toString())
+            // we'll also check that the millis behind the timestamp has the appropriate
+            // adjustments.
+            val millis = row.getAs[Timestamp](2).getTime()
+            val expectedMillis = timestampTimezoneToMillis((displayTime, sessionTz))
+            val delta = millis - expectedMillis
+            val deltaHours = delta / (1000L * 60 * 60)
+            assert(millis === expectedMillis, s"Display time '$displayTime' did not have " +
+              s"correct millis: was $millis, expected $expectedMillis; delta = $delta " +
+              s"($deltaHours hours)")
+          }
+
+          // Now test that the behavior is still correct even with a filter which could get
+          // pushed down into parquet. We don't need extra handling for pushed down
+          // predicates because (a) in ParquetFilters, we ignore TimestampType and (b) parquet
+          // does not read statistics from int96 fields, as they are unsigned. See
+          // scalastyle:off line.size.limit
+          // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L419
+          // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L348
+          // scalastyle:on line.size.limit
+          //
+          // Just to be defensive in case anything ever changes in parquet, this test checks
+          // the assumption on column stats, and also the end-to-end behavior.
+
+          val hadoopConf = sparkContext.hadoopConfiguration
+          val fs = FileSystem.get(hadoopConf)
+          val parts = fs.listStatus(new Path(path.getCanonicalPath))
+            .filter(_.getPath().getName().endsWith(".parquet"))
+          // grab the meta data from the parquet file. The next section of asserts just make
+          // sure the test is configured correctly.
+          assert(parts.size == 1)
+          val oneFooter = ParquetFileReader.readFooter(hadoopConf, parts.head.getPath)
+          assert(oneFooter.getFileMetaData.getSchema.getColumns.size === 2)
+          assert(oneFooter.getFileMetaData.getSchema.getColumns.get(1).getType() ===
+            PrimitiveTypeName.INT96)
+          val oneBlockMeta = oneFooter.getBlocks().get(0)
+          val oneBlockColumnMeta = oneBlockMeta.getColumns().get(1)
+          val columnStats = oneBlockColumnMeta.getStatistics
+          // This is the important assert. Column stats are written, but they are ignored
+          // when the data is read back as mentioned above, b/c int96 is unsigned. This
+          // assert makes sure this holds even if we change parquet versions (if eg. there
+          // were ever statistics even on unsigned columns).
+          assert(columnStats.isEmpty)
+
+          // These queries should return the entire dataset, but if the predicates were
+          // applied to the raw values in parquet, they would incorrectly filter data out.
+          Seq(
+            ">" -> "2015-12-31 22:00:00",
+            "<" -> "2016-01-01 02:00:00"
+          ).foreach { case (comparison, value) =>
+            val query =
+              s"select ts from external_$baseTable where ts $comparison '$value'"
+            val countWithFilter = spark.sql(query).count()
+            assert(countWithFilter === 4, query)
+          }
+        }
       }
```

> **Author's comment** on the `spark.conf.set` line: I was initially using `withSQLConf`. Since I am creating new sessions, I don't think this has any risk of a failed test not cleaning up and triggering failures in other tests outside of this suite. But it still seems like I might be doing something wrong ...
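For context on that comment: `withSQLConf` follows a set-run-restore pattern. A simplified, self-contained sketch of that pattern (the real helper lives in Spark's test utilities; the names below are illustrative):

```scala
import org.apache.spark.sql.SparkSession

object WithConfSketch {
  // Simplified version of the set-run-restore behavior that withSQLConf
  // provides. The diff above drops it in favor of a bare spark.conf.set,
  // relying on each test case creating a fresh session.
  def withConf[T](spark: SparkSession, key: String, value: String)(body: => T): T = {
    val original = spark.conf.getOption(key) // remember the previous value, if any
    spark.conf.set(key, value)
    try body
    finally original match {
      case Some(v) => spark.conf.set(key, v) // restore the prior value
      case None => spark.conf.unset(key)     // or remove the key we introduced
    }
  }
}
```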
> **Review comment** on `getStorageTzOptions`: `sessionTz` isn't used.
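Finally, the column-statistics assumption the test pins down can be checked standalone. A sketch, assuming a single parquet part file at a hypothetical path whose second column is an INT96 timestamp (the same parquet-mr calls the test uses):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName

object Int96StatsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val file = new Path("/tmp/table/part-00000.parquet") // hypothetical part file
    val footer = ParquetFileReader.readFooter(conf, file)
    val columns = footer.getFileMetaData.getSchema.getColumns
    assert(columns.get(1).getType == PrimitiveTypeName.INT96)
    // parquet-mr ignores int96 statistics on read (the values are unsigned),
    // so the test asserts they stay empty across parquet upgrades.
    val stats = footer.getBlocks.get(0).getColumns.get(1).getStatistics
    assert(stats.isEmpty, "int96 column statistics expected to be empty")
  }
}
```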