Skip to content

Commit aeb98cf

Browse files
author
Alexey Kudinkin
committed
Fixed handling of Schema Evolution case when actual table's schema has to be derived from InternalSchema representation
1 parent 5e773a6 commit aeb98cf

1 file changed

Lines changed: 55 additions & 45 deletions

File tree

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala

Lines changed: 55 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -26,19 +26,20 @@ import org.apache.hadoop.mapred.JobConf
2626
import org.apache.hudi.HoodieBaseRelation.{convertToAvroSchema, createHFileReader, generateUnsafeProjection, getPartitionPath, projectSchema}
2727
import org.apache.hudi.HoodieConversionUtils.toScalaOption
2828
import org.apache.hudi.avro.HoodieAvroUtils
29+
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
2930
import org.apache.hudi.common.config.{HoodieMetadataConfig, SerializableConfiguration}
3031
import org.apache.hudi.common.fs.FSUtils
3132
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
3233
import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline}
3334
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
3435
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
3536
import org.apache.hudi.common.util.StringUtils
37+
import org.apache.hudi.common.util.StringUtils.isNullOrEmpty
3638
import org.apache.hudi.common.util.ValidationUtils.checkState
37-
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter
38-
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils
3939
import org.apache.hudi.internal.schema.{HoodieSchemaException, InternalSchema}
40+
import org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter
41+
import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper}
4042
import org.apache.hudi.io.storage.HoodieHFileReader
41-
import org.apache.spark.SerializableWritable
4243
import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
4344
import org.apache.spark.internal.Logging
4445
import org.apache.spark.rdd.RDD
@@ -62,7 +63,7 @@ import scala.util.{Failure, Success, Try}
6263

6364
trait HoodieFileSplit {}
6465

65-
case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: InternalSchema = InternalSchema.getEmptyInternalSchema)
66+
case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String, internalSchema: Option[InternalSchema] = None)
6667

6768
case class HoodieTableState(tablePath: String,
6869
latestCommitTimestamp: String,
@@ -132,33 +133,36 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
132133
* NOTE: Initialization of the following members is coupled on purpose to minimize the amount of I/O
133134
* required to fetch table's Avro and Internal schemas
134135
*/
135-
protected lazy val (tableAvroSchema: Schema, internalSchema: InternalSchema) = {
136+
protected lazy val (tableAvroSchema: Schema, internalSchemaOpt: Option[InternalSchema]) = {
136137
val schemaResolver = new TableSchemaResolver(metaClient)
137-
val avroSchema: Schema = schemaSpec.map(convertToAvroSchema).getOrElse {
138-
Try(schemaResolver.getTableAvroSchema) match {
139-
case Success(schema) => schema
138+
val internalSchemaOpt = if (!isSchemaEvolutionEnabled) {
139+
None
140+
} else {
141+
Try(schemaResolver.getTableInternalSchemaFromCommitMetadata) match {
142+
case Success(internalSchemaOpt) => toScalaOption(internalSchemaOpt)
140143
case Failure(e) =>
141-
logError("Failed to fetch schema from the table", e)
142-
throw new HoodieSchemaException("Failed to fetch schema from the table")
144+
logWarning("Failed to fetch internal-schema from the table", e)
145+
None
143146
}
144147
}
145148

146-
val internalSchema: InternalSchema = if (!isSchemaEvolutionEnabled) {
147-
InternalSchema.getEmptyInternalSchema
148-
} else {
149-
Try(schemaResolver.getTableInternalSchemaFromCommitMetadata) match {
150-
case Success(internalSchemaOpt) =>
151-
toScalaOption(internalSchemaOpt).getOrElse(InternalSchema.getEmptyInternalSchema)
149+
val avroSchema = internalSchemaOpt.map { is =>
150+
AvroInternalSchemaConverter.convert(is, "schema")
151+
} orElse {
152+
schemaSpec.map(convertToAvroSchema)
153+
} getOrElse {
154+
Try(schemaResolver.getTableAvroSchema) match {
155+
case Success(schema) => schema
152156
case Failure(e) =>
153-
logWarning("Failed to fetch internal-schema from the table", e)
154-
InternalSchema.getEmptyInternalSchema
157+
logError("Failed to fetch schema from the table", e)
158+
throw new HoodieSchemaException("Failed to fetch schema from the table")
155159
}
156160
}
157161

158-
(avroSchema, internalSchema)
162+
(avroSchema, internalSchemaOpt)
159163
}
160164

161-
protected lazy val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
165+
protected val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
162166

163167
protected val partitionColumns: Array[String] = tableConfig.getPartitionFields.orElse(Array.empty)
164168

@@ -230,7 +234,7 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
230234
* meaning that regardless of whether these columns are being requested by the query, they will be fetched
231235
so that the relation is able to combine records properly (if necessary)
232236
*
233-
* @VisibleForTesting
237+
* @VisibleInTests
234238
*/
235239
val mandatoryFields: Seq[String]
236240

@@ -247,7 +251,7 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
247251
/**
248252
* Returns true in case the table supports Schema on Read (Schema Evolution)
249253
*/
250-
def hasSchemaOnRead: Boolean = !internalSchema.isEmptySchema
254+
def hasSchemaOnRead: Boolean = internalSchemaOpt.isDefined
251255

252256
/**
253257
* Data schema is determined as the actual schema of the Table's Data Files (for ex, parquet/orc/etc);
@@ -270,9 +274,8 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
270274
def canPruneRelationSchema: Boolean =
271275
(fileFormat.isInstanceOf[ParquetFileFormat] || fileFormat.isInstanceOf[OrcFileFormat]) &&
272276
// NOTE: Some relations might be disabling sophisticated schema pruning techniques (for ex, nested schema pruning)
273-
canPruneSchema &&
274277
// TODO(HUDI-XXX) internal schema doesn't support nested schema pruning currently
275-
internalSchema.isEmptySchema
278+
canPruneSchema && !hasSchemaOnRead
276279

277280
override def schema: StructType = {
278281
// NOTE: Optimizer could prune the schema (applying for ex, [[NestedSchemaPruning]] rule) setting new updated
@@ -319,19 +322,17 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
319322
// w/ more than 2 types are involved)
320323
val sourceSchema = optimizerPrunedDataSchema.map(convertToAvroSchema).getOrElse(tableAvroSchema)
321324
val (requiredAvroSchema, requiredStructSchema, requiredInternalSchema) =
322-
projectSchema(sourceSchema, targetColumns, Some(internalSchema))
325+
projectSchema(Either.cond(internalSchemaOpt.isDefined, internalSchemaOpt.get, sourceSchema), targetColumns)
323326

324327
val filterExpressions = convertToExpressions(filters)
325328
val (partitionFilters, dataFilters) = filterExpressions.partition(isPartitionPredicate)
326329

327330
val fileSplits = collectFileSplits(partitionFilters, dataFilters)
328331

329-
val tableAvroSchemaStr =
330-
if (internalSchema.isEmptySchema) tableAvroSchema.toString
331-
else AvroInternalSchemaConverter.convert(internalSchema, tableAvroSchema.getName).toString
332+
val tableAvroSchemaStr = tableAvroSchema.toString
332333

333-
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchemaStr, internalSchema)
334-
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString, requiredInternalSchema)
334+
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchemaStr, internalSchemaOpt)
335+
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString, Some(requiredInternalSchema))
335336

336337
// Since schema requested by the caller might contain partition columns, we might need to
337338
// prune it, removing all partition columns from it in case these columns are not persisted
@@ -559,6 +560,17 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
559560
}
560561
}
561562

563+
protected def embedInternalSchema(conf: Configuration, internalSchemaOpt: Option[InternalSchema]): Configuration = {
564+
val internalSchema = internalSchemaOpt.getOrElse(InternalSchema.getEmptyInternalSchema)
565+
val querySchemaString = SerDeHelper.toJson(internalSchema)
566+
if (!isNullOrEmpty(querySchemaString)) {
567+
conf.set(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, SerDeHelper.toJson(internalSchema))
568+
conf.set(SparkInternalSchemaConverter.HOODIE_TABLE_PATH, metaClient.getBasePath)
569+
conf.set(SparkInternalSchemaConverter.HOODIE_VALID_COMMITS_LIST, validCommits)
570+
}
571+
conf
572+
}
573+
562574
private def tryPrunePartitionColumns(tableSchema: HoodieTableSchema,
563575
requiredSchema: HoodieTableSchema): (StructType, HoodieTableSchema, HoodieTableSchema) = {
564576
if (shouldExtractPartitionValuesFromPartitionPath) {
@@ -601,33 +613,30 @@ object HoodieBaseRelation extends SparkAdapterSupport {
601613
/**
602614
* Projects provided schema by picking only required (projected) top-level columns from it
603615
*
604-
* @param tableAvroSchema schema to project
616+
* @param tableSchema schema to project (either of [[InternalSchema]] or Avro's [[Schema]])
605617
* @param requiredColumns required top-level columns to be projected
606-
* @param internalSchemaOpt optional internal schema, providing for appropriate handling of schema evolution
607618
*/
608-
// TODO revisit internal schema handling
609-
def projectSchema(tableAvroSchema: Schema,
610-
requiredColumns: Array[String],
611-
internalSchemaOpt: Option[InternalSchema] = None): (Schema, StructType, InternalSchema) = {
612-
internalSchemaOpt match {
613-
case Some(internalSchema) if !internalSchema.isEmptySchema =>
619+
def projectSchema(tableSchema: Either[Schema, InternalSchema], requiredColumns: Array[String]): (Schema, StructType, InternalSchema) = {
620+
tableSchema match {
621+
case Right(internalSchema) =>
622+
checkState(!internalSchema.isEmptySchema)
614623
// TODO extend pruning to leverage optimizer pruned schema
615624
val prunedInternalSchema = InternalSchemaUtils.pruneInternalSchema(internalSchema, requiredColumns.toList.asJava)
616-
val requiredAvroSchema = AvroInternalSchemaConverter.convert(prunedInternalSchema, tableAvroSchema.getName)
625+
val requiredAvroSchema = AvroInternalSchemaConverter.convert(prunedInternalSchema, "schema")
617626
val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema)
618627

619628
(requiredAvroSchema, requiredStructSchema, prunedInternalSchema)
620629

621-
case _ =>
622-
val fieldMap = tableAvroSchema.getFields.asScala.map(f => f.name() -> f).toMap
630+
case Left(avroSchema) =>
631+
val fieldMap = avroSchema.getFields.asScala.map(f => f.name() -> f).toMap
623632
val requiredFields = requiredColumns.map { col =>
624633
val f = fieldMap(col)
625634
// We have to create a new [[Schema.Field]] since Avro schemas can't share field
626635
// instances (and will throw "org.apache.avro.AvroRuntimeException: Field already used")
627636
new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal(), f.order())
628637
}.toList
629-
val requiredAvroSchema = Schema.createRecord(tableAvroSchema.getName, tableAvroSchema.getDoc,
630-
tableAvroSchema.getNamespace, tableAvroSchema.isError, requiredFields.asJava)
638+
val requiredAvroSchema = Schema.createRecord(avroSchema.getName, avroSchema.getDoc,
639+
avroSchema.getNamespace, avroSchema.isError, requiredFields.asJava)
631640
val requiredStructSchema = AvroConversionUtils.convertAvroSchemaToStructType(requiredAvroSchema)
632641

633642
(requiredAvroSchema, requiredStructSchema, InternalSchema.getEmptyInternalSchema)
@@ -640,10 +649,11 @@ object HoodieBaseRelation extends SparkAdapterSupport {
640649
filters: Seq[Filter],
641650
options: Map[String, String],
642651
hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
643-
val hadoopConfBroadcast = spark.sparkContext.broadcast(new SerializableWritable(hadoopConf))
652+
val hadoopConfBroadcast =
653+
spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
644654

645655
partitionedFile => {
646-
val hadoopConf = hadoopConfBroadcast.value.value
656+
val hadoopConf = hadoopConfBroadcast.value.get()
647657
val reader = new HoodieHFileReader[GenericRecord](hadoopConf, new Path(partitionedFile.filePath),
648658
new CacheConfig(hadoopConf))
649659

0 commit comments

Comments
 (0)