docs/sql-data-sources-parquet.md: 12 changes (0 additions, 12 deletions)
@@ -295,18 +295,6 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession`
</p>
</td>
</tr>
<tr>
Comment (Member Author): The configuration is not only about Parquet. Since it is disabled, and to avoid confusing users, I think we can just remove the doc here.

<td><code>spark.sql.optimizer.metadataOnly</code></td>
<td>true</td>
<td>
<p>
When true, enable the metadata-only query optimization that use the table's metadata to
produce the partition columns instead of table scans. It applies when all the columns scanned
are partition columns and the query has an aggregate operator that satisfies distinct
semantics.
</p>
</td>
</tr>
<tr>
<td><code>spark.sql.parquet.writeLegacyFormat</code></td>
<td>false</td>
@@ -585,9 +585,10 @@ object SQLConf {
.doc("When true, enable the metadata-only query optimization that use the table's metadata " +
"to produce the partition columns instead of table scans. It applies when all the columns " +
"scanned are partition columns and the query has an aggregate operator that satisfies " +
"distinct semantics.")
"distinct semantics. By default the optimization is disabled, since it may return " +
"incorrect results with empty tables.")
Comment (Member): may return incorrect results when the files are empty.

.booleanConf
.createWithDefault(true)
.createWithDefault(false)
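The `.doc` string above states the rule's applicability condition. A minimal, hypothetical sketch of that predicate, with invented type names and no Spark dependency (Spark's actual rule inspects the logical plan, not strings): the rewrite may fire only when every scanned column is a partition column and each aggregate's result is unchanged by deduplicating its input.

```scala
// Hypothetical, simplified model of the condition in the .doc string:
// all scanned columns are partition columns, and the aggregate satisfies
// distinct semantics (e.g. MAX/MIN, or an explicitly DISTINCT input).
final case class ScanInfo(scanned: Set[String], partitionCols: Set[String])

object MetadataOnlyApplicability {
  // Aggregates whose result is unchanged by deduplicated input.
  private val distinctSemantics = Set("max", "min")

  def applicable(scan: ScanInfo, aggregates: Seq[String], isDistinct: Boolean): Boolean =
    scan.scanned.subsetOf(scan.partitionCols) &&
      (isDistinct || aggregates.forall(a => distinctSemantics(a.toLowerCase)))
}
```

Under this model, `MAX(p1)` over only partition columns qualifies, while any query touching a data column such as `col1` does not.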
Comment (Member): How about making this conf internal and always printing a warning message when this flag is enabled and the rule applied?

val COLUMN_NAME_OF_CORRUPT_RECORD = buildConf("spark.sql.columnNameOfCorruptRecord")
.doc("The name of internal column for storing raw/un-parsed JSON and CSV records that fail " +
sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala: 13 changes (12 additions, 1 deletion)
@@ -2422,7 +2422,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
Row(s"$expected") :: Nil)
}

test("SPARK-15752 optimize metadata only query for datasource table") {
ignore("SPARK-15752 optimize metadata only query for datasource table") {
Comment (Member Author): This is kind of aggressive. I can revert this one if anyone objects.

Comment (Contributor): I object. You are removing integration testing for whoever is using the feature.

Comment (Contributor): We should either fix the rule, or remove it completely if it has fundamental issues. I don't think the Hive approach is the right thing to do: we should not leave a problematic rule that users can enable.

Comment (Member Author): @hvanhovell @cloud-fan I see. As per the comments, the most appropriate solution is to delete the whole rule/configuration. I will update this PR.

withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
withTable("srcpart_15752") {
val data = (1 to 10).map(i => (i, s"data-$i", i % 2, if ((i % 2) == 0) "a" else "b"))
@@ -2966,6 +2966,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
}
}
}

test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") {
Comment (Contributor): Also add a case that illustrates that OptimizeMetadataOnlyQuery can produce wrong results.
withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
withTable("t") {
sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)")
sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)")
checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null))
checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null))
}
}
}
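The wrong answer the reviewers describe can be illustrated without a Spark cluster. In the test above, `range(1, 1)` is empty, so partition `p1 = 5` exists in the catalog but its data file holds zero rows; a metadata-only plan that answers `MAX(p1)` from the partition listing still sees the value 5. A dependency-free sketch with invented names, contrasting the two plans:

```scala
// Hypothetical model of SPARK-26709: a partition registered in the catalog
// whose data file contains zero rows.
final case class Partition(p1: Int, col1Values: Seq[Int])

object MetadataOnlyDemo {
  val table: Seq[Partition] = Seq(Partition(p1 = 5, col1Values = Seq.empty))

  // Correct plan: scan the rows; MAX over zero rows is NULL (None here).
  def maxP1ByScan: Option[Int] =
    table.flatMap(p => p.col1Values.map(_ => p.p1)).reduceOption(_ max _)

  // Metadata-only plan: answer from the partition listing without touching
  // the files, so the empty partition still contributes its value.
  def maxP1ByMetadata: Option[Int] =
    table.map(_.p1).reduceOption(_ max _)
}

// maxP1ByScan     returns None    -- matches checkAnswer(..., Row(null))
// maxP1ByMetadata returns Some(5) -- the incorrect answer
```

This is the case the Contributor asked for: the two plans disagree exactly when a partition's files are empty.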
}

case class Foo(bar: Option[String])
@@ -58,7 +58,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext {
}

private def testMetadataOnly(name: String, sqls: String*): Unit = {
test(name) {
ignore(name) {
withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
sqls.foreach { case q => assertMetadataOnlyQuery(sql(q)) }
}
@@ -69,7 +69,7 @@ class OptimizeMetadataOnlyQuerySuite extends QueryTest with SharedSQLContext {
}

private def testNotMetadataOnly(name: String, sqls: String*): Unit = {
test(name) {
ignore(name) {
withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
sqls.foreach { case q => assertNotMetadataOnlyQuery(sql(q)) }
}
@@ -86,6 +86,17 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
assert(message.contains("Table or view not found"))
}

test("SPARK-26709: OptimizeMetadataOnlyQuery does not handle empty records correctly") {
withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "false") {
withTable("t") {
sql("CREATE TABLE t (col1 INT, p1 INT) USING PARQUET PARTITIONED BY (p1)")
sql("INSERT INTO TABLE t PARTITION (p1 = 5) SELECT ID FROM range(1, 1)")
checkAnswer(sql("SELECT MAX(p1) FROM t"), Row(null))
checkAnswer(sql("SELECT MAX(col1) FROM t"), Row(null))
}
}
}

test("script") {
assume(TestUtils.testCommandAvailable("/bin/bash"))
assume(TestUtils.testCommandAvailable("echo | sed"))
@@ -1770,7 +1781,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
}
}

test("SPARK-15752 optimize metadata only query for hive table") {
ignore("SPARK-15752 optimize metadata only query for hive table") {
withSQLConf(SQLConf.OPTIMIZER_METADATA_ONLY.key -> "true") {
withTable("data_15752", "srcpart_15752", "srctext_15752") {
val df = Seq((1, "2"), (3, "4")).toDF("key", "value")