
Commit 8c5b113

prashantwason and claude committed
fix(spark): catch HoodieSchemaNotFoundException in 3-arg DefaultSource.createRelation
The 2-arg `createRelation(sqlContext, parameters)` overload wraps its body in a try/catch that converts `HoodieSchemaNotFoundException` to `EmptyRelation` (added in HUDI-7147 / #10689).

The 3-arg `createRelation(sqlContext, optParams, schema)` overload — which Spark's `DataSource.resolveRelation()` invokes directly via the `SchemaRelationProvider` path whenever a user-supplied schema is present (e.g. `spark.read.schema(s).format("hudi").load(path)`, or HMS-catalog resolution that already knows the schema) — has no such catch, so the exception propagates and breaks query analysis.

Mirror the 2-arg catch on the 3-arg overload so behavior is symmetric: schema-less Hudi tables resolve to an empty relation regardless of which overload Spark invokes.

Also adds `TestCOWDataSource.testReadOfAnEmptyTableWithUserSuppliedSchema`, a sibling of the existing `testReadOfAnEmptyTable` that exercises the 3-arg path.

Closes #18668

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
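To make the dispatch asymmetry concrete, here is a minimal, self-contained Scala sketch of the pre-fix behavior. The names (`HudiLikeSource`, `SchemaNotFoundException`, `resolveRelation`) are simplified stand-ins for illustration, not actual Spark or Hudi APIs: they model how a user-supplied schema routes resolution to the schema-aware overload, so a catch that lives only in the schema-less overload is bypassed.

```scala
// Stand-in for HoodieSchemaNotFoundException (assumption: plain types used
// here instead of sqlContext/StructType to keep the sketch runnable).
class SchemaNotFoundException extends RuntimeException

sealed trait Relation
case object EmptyRelation extends Relation
case object TableRelation extends Relation

class HudiLikeSource(tableHasSchema: Boolean) {
  // Schema-less overload (models the 2-arg path): catches the exception
  // and degrades to an empty relation.
  def createRelation(params: Map[String, String]): Relation =
    try createRelation(params, "inferred-schema")
    catch { case _: SchemaNotFoundException => EmptyRelation }

  // Schema-aware overload (models the 3-arg path): before the fix,
  // the exception escaped from here with no catch.
  def createRelation(params: Map[String, String], userSchema: String): Relation =
    if (tableHasSchema) TableRelation
    else throw new SchemaNotFoundException
}

// Models Spark's DataSource.resolveRelation(): a user-supplied schema
// selects the schema-aware overload directly.
def resolveRelation(src: HudiLikeSource, userSchema: Option[String]): Relation =
  userSchema match {
    case Some(s) => src.createRelation(Map.empty, s) // SchemaRelationProvider path
    case None    => src.createRelation(Map.empty)    // RelationProvider path
  }
```

With a table that has no resolvable schema, `resolveRelation(src, None)` returns `EmptyRelation`, while `resolveRelation(src, Some(...))` throws — which is exactly the asymmetry the commit removes by mirroring the catch onto the schema-aware overload.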
1 parent 38db5ed commit 8c5b113

2 files changed

Lines changed: 41 additions & 1 deletion

File tree

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala

Lines changed: 10 additions & 1 deletion
@@ -134,7 +134,16 @@ class DefaultSource extends RelationProvider
       parameters
     }

-    val relation = DefaultSource.createRelation(sqlContext, metaClient, schema, options.toMap)
+    // Spark's DataSource.resolveRelation() invokes this 3-arg overload directly via the
+    // SchemaRelationProvider path when a user-supplied schema is present (e.g.
+    // spark.read.schema(...).load(path)). The 2-arg overload catches
+    // HoodieSchemaNotFoundException and returns an EmptyRelation, but that catch is bypassed
+    // on this path, so we mirror the same handling here.
+    val relation = try {
+      DefaultSource.createRelation(sqlContext, metaClient, schema, options.toMap)
+    } catch {
+      case _: HoodieSchemaNotFoundException => new EmptyRelation(sqlContext, new StructType())
+    }
     log.info(s"Created relation ${relation.getClass.getSimpleName} with ${options.size} resolved options")
     relation
   }

hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala

Lines changed: 31 additions & 0 deletions
@@ -2214,6 +2214,37 @@ class TestCOWDataSource extends HoodieSparkClientTestBase with ScalaAssertionSup
     assertEquals(count, 0)
   }

+  @Test
+  def testReadOfAnEmptyTableWithUserSuppliedSchema(): Unit = {
+    val (writeOpts, _) = getWriterReaderOpts(HoodieRecordType.AVRO)
+
+    // Insert + then delete the only completed commit so the table has no resolvable schema.
+    val records = recordsToStrings(dataGen.generateInserts("000", 100)).asScala.toList
+    val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2))
+    inputDF.write.format("hudi")
+      .options(writeOpts)
+      .option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
+      .mode(SaveMode.Overwrite)
+      .save(basePath)
+
+    val fileStatuses = storage.listDirectEntries(
+      new StoragePath(basePath + StoragePath.SEPARATOR + HoodieTableMetaClient.METAFOLDER_NAME
+        + StoragePath.SEPARATOR + HoodieTableMetaClient.TIMELINEFOLDER_NAME),
+      new StoragePathFilter {
+        override def accept(path: StoragePath): Boolean = {
+          path.getName.endsWith(HoodieTimeline.COMMIT_ACTION)
+        }
+      })
+    storage.deleteFile(fileStatuses.get(0).getPath)
+
+    // spark.read.schema(...) triggers Spark's SchemaRelationProvider path which calls the
+    // 3-arg DefaultSource.createRelation overload directly. Without the catch on that
+    // overload, this would fail with HoodieSchemaNotFoundException.
+    val userSchema = inputDF.schema
+    val count = spark.read.schema(userSchema).format("hudi").load(basePath).count()
+    assertEquals(count, 0)
+  }
+
   /**
    * Test incremental queries and time travel queries with event time ordering.
    *