| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | + * contributor license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright ownership. |
| 5 | + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | + * (the "License"); you may not use this file except in compliance with |
| 7 | + * the License. You may obtain a copy of the License at |
| 8 | + * |
| 9 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | + * |
| 11 | + * Unless required by applicable law or agreed to in writing, software |
| 12 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | + * See the License for the specific language governing permissions and |
| 15 | + * limitations under the License. |
| 16 | + */ |
| 17 | + |
| 18 | +package org.apache.hudi.functional |
| 19 | + |
| 20 | +import org.apache.hadoop.fs.FileSystem |
| 21 | +import org.apache.hudi.HoodieConversionUtils.toJavaOption |
| 22 | +import org.apache.hudi.common.model.HoodieRecord |
| 23 | +import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver} |
| 24 | +import org.apache.hudi.common.util |
| 25 | +import org.apache.hudi.config.HoodieWriteConfig |
| 26 | +import org.apache.hudi.exception.{HoodieUpsertException, SchemaCompatibilityException} |
| 27 | +import org.apache.hudi.functional.TestBasicSchemaEvolution.{dropColumn, injectColumnAt} |
| 28 | +import org.apache.hudi.testutils.HoodieClientTestBase |
| 29 | +import org.apache.hudi.util.JFunction |
| 30 | +import org.apache.hudi.{AvroConversionUtils, DataSourceWriteOptions, ScalaAssertionSupport} |
| 31 | +import org.apache.spark.sql.hudi.HoodieSparkSessionExtension |
| 32 | +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} |
| 33 | +import org.apache.spark.sql.{HoodieUnsafeUtils, Row, SaveMode, SparkSession, SparkSessionExtensions} |
| 34 | +import org.junit.jupiter.api.Assertions.assertEquals |
| 35 | +import org.junit.jupiter.api.{AfterEach, BeforeEach} |
| 36 | +import org.junit.jupiter.params.ParameterizedTest |
| 37 | +import org.junit.jupiter.params.provider.CsvSource |
| 38 | + |
| 39 | +import java.util.function.Consumer |
| 40 | +import scala.collection.JavaConverters._ |
| 41 | +import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` |
| 42 | + |
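| | +/** |
| | + * Verifies basic schema evolution on write: successive batches with evolving schemas are appended to the |
| | + * table, and the resulting table schema and contents are asserted with schema reconciliation both enabled |
| | + * and disabled. |
| | + */ |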
| 43 | +class TestBasicSchemaEvolution extends HoodieClientTestBase with ScalaAssertionSupport { |
| 44 | + |
| 45 | + var spark: SparkSession = null |
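| | +  // Write options shared by every batch: parallelism, record key, partition path, precombine field and table name |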
| 46 | + val commonOpts = Map( |
| 47 | + "hoodie.insert.shuffle.parallelism" -> "4", |
| 48 | + "hoodie.upsert.shuffle.parallelism" -> "4", |
| 49 | + "hoodie.bulkinsert.shuffle.parallelism" -> "2", |
| 50 | + "hoodie.delete.shuffle.parallelism" -> "1", |
| 51 | + HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key() -> "true", |
| 52 | + DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key", |
| 53 | + DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition", |
| 54 | + DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp", |
| 55 | + HoodieWriteConfig.TBL_NAME.key -> "hoodie_test" |
| 56 | + ) |
| 57 | + |
| 58 | + val verificationCol: String = "driver" |
| 59 | + val updatedVerificationVal: String = "driver_update" |
| 60 | + |
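| | +  // Supplies the HoodieSparkSessionExtension to be injected into the test's SparkSession |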
| 61 | + override def getSparkSessionExtensionsInjector: util.Option[Consumer[SparkSessionExtensions]] = |
| 62 | + toJavaOption( |
| 63 | + Some( |
| 64 | + JFunction.toJava((receiver: SparkSessionExtensions) => new HoodieSparkSessionExtension().apply(receiver))) |
| 65 | + ) |
| 66 | + |
| 67 | + @BeforeEach override def setUp() { |
| 68 | + initPath() |
| 69 | + initSparkContexts() |
| 70 | + spark = sqlContext.sparkSession |
| 71 | + initTestDataGenerator() |
| 72 | + initFileSystem() |
| 73 | + } |
| 74 | + |
| 75 | + @AfterEach override def tearDown() = { |
| 76 | + cleanupSparkContexts() |
| 77 | + cleanupTestDataGenerator() |
| 78 | + cleanupFileSystem() |
| 79 | + FileSystem.closeAll() |
| 80 | + System.gc() |
| 81 | + } |
| 82 | + |
| 83 | + @ParameterizedTest |
| 84 | + @CsvSource(value = Array( |
| 85 | + "bulk_insert,true", "bulk_insert,false", |
| 86 | + "insert,true", "insert,false", |
| 87 | + "upsert,true", "upsert,false" |
| 88 | + )) |
| 89 | + def testBasicSchemaEvolution(opType: String, shouldReconcileSchema: Boolean): Unit = { |
| 90 | +    // enable schema validation on write |
| 91 | + val opts = commonOpts ++ |
| 92 | + Map( |
| 93 | + HoodieWriteConfig.AVRO_SCHEMA_VALIDATE_ENABLE.key -> "true", |
| 94 | + DataSourceWriteOptions.RECONCILE_SCHEMA.key -> shouldReconcileSchema.toString, |
| 95 | + DataSourceWriteOptions.OPERATION.key -> opType |
| 96 | + ) |
| 97 | + |
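| | +    // Appends the given batch of rows to the table at `basePath` using the configured write options |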
| 98 | + def appendData(schema: StructType, batch: Seq[Row]): Unit = { |
| 99 | + HoodieUnsafeUtils.createDataFrameFromRows(spark, batch, schema) |
| 100 | + .write |
| 101 | + .format("org.apache.hudi") |
| 102 | + .options(opts) |
| 103 | + .mode(SaveMode.Append) |
| 104 | + .save(basePath) |
| 105 | + } |
| 106 | + |
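| | +    // Resolves the table's latest schema and reads back all records, with Hudi meta columns dropped and rows ordered by `_row_key` |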
| 107 | + def loadTable: (StructType, Seq[Row]) = { |
| 108 | + val tableMetaClient = HoodieTableMetaClient.builder() |
| 109 | + .setConf(spark.sparkContext.hadoopConfiguration) |
| 110 | + .setBasePath(basePath) |
| 111 | + .build() |
| 112 | + |
| 113 | + tableMetaClient.reloadActiveTimeline() |
| 114 | + |
| 115 | + val resolver = new TableSchemaResolver(tableMetaClient) |
| 116 | + val latestTableSchema = AvroConversionUtils.convertAvroSchemaToStructType(resolver.getTableAvroSchema(false)) |
| 117 | + |
| 118 | + val df = |
| 119 | + spark.read.format("org.apache.hudi") |
| 120 | + .load(basePath + "/*/*") |
| 121 | + .drop(HoodieRecord.HOODIE_META_COLUMNS.asScala: _*) |
| 122 | + .orderBy("_row_key") |
| 123 | + |
| 124 | + (latestTableSchema, df.collectAsList().toSeq) |
| 125 | + } |
| 126 | + |
| 127 | + // |
| 128 | + // 1. Write 1st batch with schema A |
| 129 | + // |
| 130 | + |
| 131 | + val firstSchema = StructType( |
| 132 | + StructField("_row_key", StringType, nullable = true) :: |
| 133 | + StructField("first_name", StringType, nullable = false) :: |
| 134 | + StructField("last_name", StringType, nullable = true) :: |
| 135 | + StructField("timestamp", IntegerType, nullable = true) :: |
| 136 | + StructField("partition", IntegerType, nullable = true) :: Nil) |
| 137 | + |
| 138 | + val firstBatch = Seq( |
| 139 | + Row("1", "Andy", "Cooper", 1, 1), |
| 140 | + Row("2", "Lisi", "Wallace", 1, 1), |
| 141 | + Row("3", "Zhangsan", "Shu", 1, 1)) |
| 142 | + |
| 143 | + HoodieUnsafeUtils.createDataFrameFromRows(spark, firstBatch, firstSchema) |
| 144 | + .write |
| 145 | + .format("org.apache.hudi") |
| 146 | + .options(opts) |
| 147 | + .mode(SaveMode.Overwrite) |
| 148 | + .save(basePath) |
| 149 | + |
| 150 | + // |
| 151 | +    // 2. Write 2nd batch with another schema (added column `age`) |
| 152 | + // |
| 153 | + |
| 154 | + val secondSchema = StructType( |
| 155 | + StructField("_row_key", StringType, nullable = true) :: |
| 156 | + StructField("first_name", StringType, nullable = false) :: |
| 157 | + StructField("last_name", StringType, nullable = true) :: |
| 158 | + StructField("age", StringType, nullable = true) :: |
| 159 | + StructField("timestamp", IntegerType, nullable = true) :: |
| 160 | + StructField("partition", IntegerType, nullable = true) :: Nil) |
| 161 | + |
| 162 | + val secondBatch = Seq( |
| 163 | + Row("4", "John", "Green", "10", 1, 1), |
| 164 | + Row("5", "Jack", "Sparrow", "13", 1, 1), |
| 165 | + Row("6", "Jill", "Fiorella", "12", 1, 1)) |
| 166 | + |
| 167 | + appendData(secondSchema, secondBatch) |
| 168 | + val (tableSchemaAfterSecondBatch, rowsAfterSecondBatch) = loadTable |
| 169 | + |
| 170 | +    // NOTE: When schema reconciliation is ENABLED, Hudi prefers the table's schema over the incoming batch's |
| 171 | +    // schema, so the table's schema stays the same after the commit, and the (newly added) columns that are |
| 172 | +    // present in the batch's schema but not in the table's are shed from the incoming records. |
| 173 | +    // |
| 174 | +    // When schema reconciliation is DISABLED, the table is overwritten with the batch's schema, |
| 175 | +    // meaning the added columns are populated with nulls for the table's existing records, provided the new |
| 176 | +    // column is nullable; the write fails otherwise. |
| 177 | + if (shouldReconcileSchema) { |
| 178 | + assertEquals(firstSchema, tableSchemaAfterSecondBatch) |
| 179 | + |
| 180 | + val ageColOrd = secondSchema.indexWhere(_.name == "age") |
| 181 | + val expectedRows = firstBatch ++ dropColumn(secondBatch, ageColOrd) |
| 182 | + |
| 183 | + assertEquals(expectedRows, rowsAfterSecondBatch) |
| 184 | + } else { |
| 185 | + assertEquals(secondSchema, tableSchemaAfterSecondBatch) |
| 186 | + |
| 187 | + val ageColOrd = secondSchema.indexWhere(_.name == "age") |
| 188 | + val expectedRows = injectColumnAt(firstBatch, ageColOrd, null) ++ secondBatch |
| 189 | + |
| 190 | + assertEquals(expectedRows, rowsAfterSecondBatch) |
| 191 | + } |
| 192 | + |
| 193 | + // |
| 194 | +    // 3. Write 3rd batch with another schema (with the _nullable_ column `last_name` omitted, expected to succeed) |
| 195 | + // |
| 196 | + |
| 197 | + val thirdSchema = StructType( |
| 198 | + StructField("_row_key", StringType, nullable = true) :: |
| 199 | + StructField("first_name", StringType, nullable = false) :: |
| 200 | + StructField("age", StringType, nullable = true) :: |
| 201 | + StructField("timestamp", IntegerType, nullable = true) :: |
| 202 | + StructField("partition", IntegerType, nullable = true) :: Nil) |
| 203 | + |
| 204 | + val thirdBatch = Seq( |
| 205 | + Row("7", "Harry", "15", 1, 1), |
| 206 | + Row("8", "Ron", "14", 1, 1), |
| 207 | + Row("9", "Germiona", "16", 1, 1)) |
| 208 | + |
| 209 | + appendData(thirdSchema, thirdBatch) |
| 210 | + val (tableSchemaAfterThirdBatch, rowsAfterThirdBatch) = loadTable |
| 211 | + |
| 212 | +    // NOTE: When schema reconciliation is ENABLED, Hudi prefers the table's schema over the incoming batch's |
| 213 | +    // schema, so the table's schema stays the same after the commit, and the (omitted) columns are added back |
| 214 | +    // to the records in the batch (set to null). |
| 215 | +    // |
| 216 | +    // When schema reconciliation is DISABLED, the table is overwritten with the batch's schema, |
| 217 | +    // meaning the data in the omitted columns is dropped for the table's existing records. |
| 218 | + if (shouldReconcileSchema) { |
| 219 | + assertEquals(firstSchema, tableSchemaAfterThirdBatch) |
| 220 | + |
| 221 | + val ageColOrd = secondSchema.indexWhere(_.name == "age") |
| 222 | + val lastNameColOrd = firstSchema.indexWhere(_.name == "last_name") |
| 223 | + |
| 224 | + val expectedRows = rowsAfterSecondBatch ++ dropColumn(injectColumnAt(thirdBatch, lastNameColOrd, null), ageColOrd) |
| 225 | + |
| 226 | + assertEquals(expectedRows, rowsAfterThirdBatch) |
| 227 | + } else { |
| 228 | + assertEquals(thirdSchema, tableSchemaAfterThirdBatch) |
| 229 | + |
| 230 | + val lastNameColOrd = secondSchema.indexWhere(_.name == "last_name") |
| 231 | + |
| 232 | + val expectedRows = |
| 233 | + dropColumn(rowsAfterSecondBatch, lastNameColOrd) ++ thirdBatch |
| 234 | + |
| 235 | + assertEquals(expectedRows, rowsAfterThirdBatch) |
| 236 | + } |
| 237 | + |
| 238 | + // |
| 239 | +    // 4. Write 4th batch with another schema (with the _non-nullable_ column `first_name` omitted, expected to fail |
| 240 | +    // when schema reconciliation is enabled and to succeed otherwise) |
| 241 | + // |
| 242 | + |
| 243 | + val fourthSchema = StructType( |
| 244 | + StructField("_row_key", StringType, nullable = true) :: |
| 245 | + StructField("age", StringType, nullable = true) :: |
| 246 | + StructField("timestamp", IntegerType, nullable = true) :: |
| 247 | + StructField("partition", IntegerType, nullable = true) :: Nil) |
| 248 | + |
| 249 | + val fourthBatch = Seq( |
| 250 | + Row("10", "15", 1, 1), |
| 251 | + Row("11", "14", 1, 1), |
| 252 | + Row("12", "16", 1, 1)) |
| 253 | + |
| 254 | +    // NOTE: When schema reconciliation is ENABLED, Hudi prefers the table's schema over the incoming batch's |
| 255 | +    // schema, so the table's schema stays the same after the commit, and the (omitted) columns are added back |
| 256 | +    // to the records in the batch. Since the batch omits a column designated as non-nullable, the write is |
| 257 | +    // expected to fail (being unable to set the missing column's values to null). |
| 258 | +    // |
| 259 | +    // When schema reconciliation is DISABLED, the table is overwritten with the batch's schema, |
| 260 | +    // meaning the data in the omitted columns is dropped for the table's existing records. |
| 261 | + if (shouldReconcileSchema) { |
| 262 | + assertThrows(classOf[SchemaCompatibilityException]) { |
| 263 | + appendData(fourthSchema, fourthBatch) |
| 264 | + } |
| 265 | + } else { |
| 266 | +      appendData(fourthSchema, fourthBatch) |
| 267 | + val (latestTableSchema, rows) = loadTable |
| 268 | + |
| 269 | +      assertEquals(fourthSchema, latestTableSchema) |
| 270 | + |
| 271 | + val firstNameColOrd = thirdSchema.indexWhere(_.name == "first_name") |
| 272 | + |
| 273 | + val expectedRecords = |
| 274 | + dropColumn(rowsAfterThirdBatch, firstNameColOrd) ++ fourthBatch |
| 275 | + |
| 276 | + assertEquals(expectedRecords, rows) |
| 277 | + } |
| 278 | + |
| 279 | + |
| 280 | + // |
| 281 | +    // 5. Write 5th batch with another schema (with the data type of column `timestamp` changed, expected to fail) |
| 282 | + // |
| 283 | + |
| 284 | + val fifthSchema = StructType( |
| 285 | + StructField("_row_key", StringType, nullable = true) :: |
| 286 | + StructField("age", StringType, nullable = true) :: |
| 287 | + StructField("timestamp", StringType, nullable = true) :: |
| 288 | + StructField("partition", IntegerType, nullable = true) :: Nil) |
| 289 | + |
| 290 | + val fifthBatch = Seq( |
| 291 | + Row("10", "15", "1", 1), |
| 292 | + Row("11", "14", "1", 1), |
| 293 | + Row("12", "16", "1", 1)) |
| 294 | + |
| 295 | +    // NOTE: Expected to fail in both cases, as such a data-type change is not permitted |
| 296 | + assertThrows(classOf[SchemaCompatibilityException]) { |
| 297 | + appendData(fifthSchema, fifthBatch) |
| 298 | + } |
| 299 | + } |
| 300 | +} |
| 301 | + |
| 302 | +object TestBasicSchemaEvolution { |
| 303 | + |
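| | +  // Drops the value at position `idx` from every row |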
| 304 | + def dropColumn(rows: Seq[Row], idx: Int): Seq[Row] = |
| 305 | + rows.map { r => |
| 306 | + val values = r.toSeq.zipWithIndex |
| 307 | + .filterNot { case (_, cidx) => cidx == idx } |
| 308 | + .map { case (c, _) => c } |
| 309 | + Row(values: _*) |
| 310 | + } |
| 311 | + |
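| | +  // Inserts `value` at position `idx` into every row |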
| 312 | + def injectColumnAt(rows: Seq[Row], idx: Int, value: Any): Seq[Row] = |
| 313 | + rows.map { r => |
| 314 | + val (left, right) = r.toSeq.splitAt(idx) |
| 315 | + val values = (left :+ value) ++ right |
| 316 | + Row(values: _*) |
| 317 | + } |
| 318 | + |
| 319 | +} |