
Commit 43e0dd4

prashantwason and claude committed
fix(spark): catch HoodieSchemaNotFoundException in 3-arg DefaultSource.createRelation
The 2-arg `createRelation(sqlContext, parameters)` overload wraps its body in a try/catch that converts `HoodieSchemaNotFoundException` to `EmptyRelation` (added in HUDI-7147 / #10689). The 3-arg `createRelation(sqlContext, optParams, schema)` overload, which Spark's `DataSource.resolveRelation()` invokes directly via the `SchemaRelationProvider` path whenever a user-supplied schema is present (e.g. `spark.read.schema(s).format("hudi").load(path)`, or HMS-catalog resolution that already knows the schema), has no such catch, so the exception propagates and breaks query analysis.

Mirror the 2-arg catch on the 3-arg overload so behavior is symmetric: schema-less Hudi tables resolve to an empty relation regardless of which overload Spark invokes.

Also adds `TestCOWDataSource.testReadOfAnEmptyTableWithUserSuppliedSchema`, a sibling of the existing `testReadOfAnEmptyTable` that exercises the 3-arg path.

Closes #18668

Co-Authored-By: Claude Opus 4.7 <[email protected]>
1 parent 38db5ed commit 43e0dd4

2 files changed

Lines changed: 47 additions & 1 deletion


hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala

Lines changed: 13 additions & 1 deletion
```diff
@@ -134,7 +134,19 @@ class DefaultSource extends RelationProvider
       parameters
     }

-    val relation = DefaultSource.createRelation(sqlContext, metaClient, schema, options.toMap)
+    // Spark's DataSource.resolveRelation() invokes this 3-arg overload directly via the
+    // SchemaRelationProvider path when a user-supplied schema is present (e.g.
+    // spark.read.schema(...).load(path)). The 2-arg overload catches
+    // HoodieSchemaNotFoundException and returns an EmptyRelation, but that catch is bypassed
+    // on this path, so we mirror the same handling here. Preserve the caller-supplied schema
+    // so subsequent query analysis (e.g. column resolution in WHERE clauses) sees the
+    // HMS-known columns even though the on-disk table is schemaless.
+    val relation = try {
+      DefaultSource.createRelation(sqlContext, metaClient, schema, options.toMap)
+    } catch {
+      case _: HoodieSchemaNotFoundException =>
+        new EmptyRelation(sqlContext, Option(schema).getOrElse(new StructType()))
+    }
     log.info(s"Created relation ${relation.getClass.getSimpleName} with ${options.size} resolved options")
     relation
   }
```
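The change above boils down to a "catch schema-not-found, fall back to an empty relation, preserve the caller's schema" pattern. Below is a minimal, self-contained Scala sketch of that pattern; the names here (`SchemaNotFound`, `Relation`, `resolveFromStorage`, `EmptyTableFallback`) are illustrative stand-ins, not Hudi's or Spark's actual classes.

```scala
// Hypothetical sketch of the fallback pattern the fix applies; all names
// below are illustrative stand-ins, not Hudi's actual classes.
object EmptyTableFallback {
  final class SchemaNotFound(msg: String) extends RuntimeException(msg)

  // A "relation" reduced to its column names, for illustration only.
  final case class Relation(columns: Seq[String], empty: Boolean)

  // Stand-in for resolving the table schema from completed commits:
  // throws when the table has no resolvable schema (e.g. no commits yet).
  def resolveFromStorage(stored: Option[Seq[String]]): Relation =
    stored.map(cols => Relation(cols, empty = false))
      .getOrElse(throw new SchemaNotFound("no completed commits"))

  // Mirrors the fix: convert the schema-not-found failure into an empty
  // relation, preserving the user-supplied schema when one was given so
  // later column resolution still sees the expected columns.
  def resolve(userSchema: Option[Seq[String]], stored: Option[Seq[String]]): Relation =
    try resolveFromStorage(stored)
    catch {
      case _: SchemaNotFound => Relation(userSchema.getOrElse(Seq.empty), empty = true)
    }

  def main(args: Array[String]): Unit = {
    // Schemaless table + user-supplied schema: empty relation, schema kept.
    val r = resolve(Some(Seq("uuid", "ts")), stored = None)
    assert(r.empty && r.columns == Seq("uuid", "ts"))
    // Schemaless table, no user schema: empty relation with empty schema.
    assert(resolve(None, None).columns.isEmpty)
  }
}
```

The design point, in both the sketch and the real patch, is that the fallback is applied in *every* resolution path that can reach the failing schema lookup, so the observable behavior no longer depends on which overload Spark happens to invoke.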

hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala

Lines changed: 34 additions & 0 deletions
```diff
@@ -2214,6 +2214,40 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup
     assertEquals(count, 0)
   }

+  @Test
+  def testReadOfAnEmptyTableWithUserSuppliedSchema(): Unit = {
+    val (writeOpts, _) = getWriterReaderOpts(HoodieRecordType.AVRO)
+
+    // Insert + then delete the only completed commit so the table has no resolvable schema.
+    val records = recordsToStrings(dataGen.generateInserts("000", 100)).asScala.toList
+    val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2))
+    inputDF.write.format("hudi")
+      .options(writeOpts)
+      .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    val fileStatuses = storage.listDirectEntries(
+      new StoragePath(basePath + StoragePath.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME
+        + StoragePath.SEPARATOR + HoodieTableMetaClient.TIMELINEFOLDER_NAME),
+      new StoragePathFilter {
+        override def accept(path: StoragePath): Boolean = {
+          path.getName.endsWith(HoodieTimeline.COMMIT_ACTION)
+        }
+      })
+    storage.deleteFile(fileStatuses.get(0).getPath)
+
+    // spark.read.schema(...) triggers Spark's SchemaRelationProvider path which calls the
+    // 3-arg DefaultSource.createRelation overload directly. Without the catch on that
+    // overload, this would fail with HoodieSchemaNotFoundException.
+    val userSchema = inputDF.schema
+    val df = spark.read.schema(userSchema).format("hudi").load(basePath)
+    assertEquals(0, df.count())
+    // The caller-supplied schema must be preserved on the EmptyRelation so subsequent query
+    // analysis (e.g. column resolution) sees the user-known columns.
+    assertEquals(userSchema, df.schema)
+  }
+
   /**
    * Test incremental queries and time travel queries with event time ordering.
    *
```
