|
24 | 24 | import org.apache.hudi.common.model.HoodieColumnRangeMetadata; |
25 | 25 | import org.apache.hudi.common.model.HoodieFileFormat; |
26 | 26 | import org.apache.hudi.common.util.BaseFileUtils; |
| 27 | +import org.apache.hudi.common.util.Option; |
27 | 28 | import org.apache.hudi.common.util.ParquetUtils; |
28 | 29 | import org.apache.hudi.common.util.collection.Pair; |
29 | 30 | import org.apache.hudi.exception.HoodieException; |
@@ -304,21 +305,28 @@ public static void updateColumnStatsIndexFor( |
304 | 305 | if (validIndexTables.isEmpty()) { |
305 | 306 | finalColStatsIndexDf = newColStatsIndexDf; |
306 | 307 | } else { |
307 | | - // NOTE: That Parquet schema might deviate from the original table schema (for ex, |
308 | | - // by upcasting "short" to "integer" types, etc), and hence we need to re-adjust it |
309 | | - // prior to merging, since merging might fail otherwise due to schemas incompatibility |
310 | | - finalColStatsIndexDf = |
311 | | - tryMergeMostRecentIndexTableInto( |
312 | | - sparkSession, |
313 | | - newColStatsIndexDf, |
314 | | - // Load current most recent col-stats-index table |
315 | | - sparkSession.read().load( |
316 | | - new Path(indexFolderPath, validIndexTables.get(validIndexTables.size() - 1)).toString() |
317 | | - ) |
318 | | - ); |
319 | | - |
320 | | - // Clean up all index tables (after creation of the new index) |
321 | | - tablesToCleanup.addAll(validIndexTables); |
| 308 | + Path latestIndexTablePath = new Path(indexFolderPath, validIndexTables.get(validIndexTables.size() - 1)); |
| 309 | + |
| 310 | + Option<Dataset<Row>> existingIndexTableOpt = |
| 311 | + tryLoadExistingIndexTable(sparkSession, latestIndexTablePath); |
| 312 | + |
| 313 | + if (!existingIndexTableOpt.isPresent()) { |
| 314 | + finalColStatsIndexDf = newColStatsIndexDf; |
| 315 | + } else { |
| 316 | + // NOTE: That Parquet schema might deviate from the original table schema (for ex, |
| 317 | + // by upcasting "short" to "integer" types, etc), and hence we need to re-adjust it |
| 318 | + // prior to merging, since merging might fail otherwise due to schemas incompatibility |
| 319 | + finalColStatsIndexDf = |
| 320 | + tryMergeMostRecentIndexTableInto( |
| 321 | + sparkSession, |
| 322 | + newColStatsIndexDf, |
| 323 | + // Load current most recent col-stats-index table |
| 324 | + existingIndexTableOpt.get() |
| 325 | + ); |
| 326 | + |
| 327 | + // Clean up all index tables (after creation of the new index) |
| 328 | + tablesToCleanup.addAll(validIndexTables); |
| 329 | + } |
322 | 330 | } |
323 | 331 |
|
324 | 332 | // Persist new col-stats-index table |
@@ -349,6 +357,17 @@ public static void updateColumnStatsIndexFor( |
349 | 357 | } |
350 | 358 | } |
351 | 359 |
|
| 360 | + @Nonnull |
| 361 | + private static Option<Dataset<Row>> tryLoadExistingIndexTable(@Nonnull SparkSession sparkSession, @Nonnull Path indexTablePath) { |
| 362 | + try { |
| 363 | + Dataset<Row> indexTableDataset = sparkSession.read().load(indexTablePath.toUri().toString()); |
| 364 | + return Option.of(indexTableDataset); |
| 365 | + } catch (Exception e) { |
| 366 | + LOG.error(String.format("Failed to load existing Column Stats index table from (%s)", indexTablePath), e); |
| 367 | + return Option.empty(); |
| 368 | + } |
| 369 | + } |
| 370 | + |
352 | 371 | @Nonnull |
353 | 372 | private static Dataset<Row> tryMergeMostRecentIndexTableInto( |
354 | 373 | @Nonnull SparkSession sparkSession, |
|
0 commit comments