[HUDI-4078][HUDI-FLINK]BootstrapOperator contains the pending compaction files

cuibo01 · cuibo01 · commit 8e9e250a4e6f · 2022-05-11T09:25:39.000+08:00
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/PriorityBasedFileSystemView.java
@@ -29,6 +29,7 @@
 import org.apache.hudi.common.util.Functions.Function1;
 import org.apache.hudi.common.util.Functions.Function2;
 import org.apache.hudi.common.util.Functions.Function3;
+import org.apache.hudi.common.util.Functions.Function4;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 
@@ -106,7 +107,7 @@ private <T1, T2, R> R execute(T1 val, T2 val2, Function2<T1, T2, R> preferredFun
   }
 
   private <T1, T2, T3, R> R execute(T1 val, T2 val2, T3 val3, Function3<T1, T2, T3, R> preferredFunction,
-      Function3<T1, T2, T3, R> secondaryFunction) {
+                                    Function3<T1, T2, T3, R> secondaryFunction) {
     if (errorOnPreferredView) {
       LOG.warn("Routing request to secondary file-system view");
       return secondaryFunction.apply(val, val2, val3);
@@ -121,6 +122,22 @@ private <T1, T2, T3, R> R execute(T1 val, T2 val2, T3 val3, Function3<T1, T2, T3
     }
   }
 
+  /**
+   * Four-argument variant of {@code execute}: applies the preferred function unless the preferred view
+   * has already been marked as failed, in which case the secondary function is applied directly.
+   *
+   * <p>If the preferred function throws a {@link RuntimeException}, the exception is handed to
+   * {@code handleRuntimeException}; when that call returns normally, the preferred view is marked as
+   * failed and the secondary function is applied with the same arguments.
+   *
+   * @param val               first argument forwarded to the chosen function
+   * @param val2              second argument forwarded to the chosen function
+   * @param val3              third argument forwarded to the chosen function
+   * @param val4              fourth argument forwarded to the chosen function
+   * @param preferredFunction function tried first while the preferred view is not marked as failed
+   * @param secondaryFunction fallback function
+   * @return the result of whichever function was applied
+   */
+  private <T1, T2, T3, T4, R> R execute(T1 val, T2 val2, T3 val3, T4 val4, Function4<T1, T2, T3, T4, R> preferredFunction,
+      Function4<T1, T2, T3, T4, R> secondaryFunction) {
+    if (!errorOnPreferredView) {
+      try {
+        return preferredFunction.apply(val, val2, val3, val4);
+      } catch (RuntimeException re) {
+        // handleRuntimeException runs before the flag is flipped, exactly as in the sibling overloads
+        handleRuntimeException(re);
+        errorOnPreferredView = true;
+        return secondaryFunction.apply(val, val2, val3, val4);
+      }
+    }
+    // the preferred view failed on an earlier call: go straight to the secondary view
+    LOG.warn("Routing request to secondary file-system view");
+    return secondaryFunction.apply(val, val2, val3, val4);
+  }
+
   private void handleRuntimeException(RuntimeException re) {
     if (re.getCause() instanceof HttpResponseException && ((HttpResponseException)re.getCause()).getStatusCode() == HttpStatus.SC_BAD_REQUEST) {
       LOG.warn("Got error running preferred function. Likely due to another concurrent writer in progress. Trying secondary");
@@ -179,7 +196,7 @@ public Stream<FileSlice> getLatestUnCompactedFileSlices(String partitionPath) {
   @Override
   public Stream<FileSlice> getLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime,
       boolean includeFileSlicesInPendingCompaction, boolean includeFilesInPendingCompaction) {
-    return execute(partitionPath, maxCommitTime, includeFileSlicesInPendingCompaction,
+    return execute(partitionPath, maxCommitTime, includeFileSlicesInPendingCompaction, includeFilesInPendingCompaction,
         preferredView::getLatestFileSlicesBeforeOrOn, secondaryView::getLatestFileSlicesBeforeOrOn);
   }
 
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/RemoteHoodieTableFileSystemView.java
@@ -117,8 +117,8 @@ public class RemoteHoodieTableFileSystemView implements SyncableFileSystemView,
   public static final String LAST_INSTANT_TS = "lastinstantts";
   public static final String TIMELINE_HASH = "timelinehash";
   public static final String REFRESH_OFF = "refreshoff";
-  public static final String INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM = "includependingcompaction";
-
+  public static final String INCLUDE_IN_PENDING_COMPACTION_PARAM = "includependingcompaction";
+  public static final String INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM = "includefilespendingcompaction";
 
   private static final Logger LOG = LogManager.getLogger(RemoteHoodieTableFileSystemView.class);
 
@@ -314,8 +314,8 @@ public Stream<FileSlice> getLatestUnCompactedFileSlices(String partitionPath) {
   public Stream<FileSlice> getLatestFileSlicesBeforeOrOn(String partitionPath, String maxCommitTime,
       boolean includeFileSlicesInPendingCompaction, boolean includeFilesInPendingCompaction) {
     Map<String, String> paramsMap = getParamsWithAdditionalParams(partitionPath,
-        new String[] {MAX_INSTANT_PARAM, INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM},
-        new String[] {maxCommitTime, String.valueOf(includeFileSlicesInPendingCompaction)});
+        new String[] {MAX_INSTANT_PARAM, INCLUDE_IN_PENDING_COMPACTION_PARAM, INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM},
+        new String[] {maxCommitTime, String.valueOf(includeFileSlicesInPendingCompaction), String.valueOf(includeFilesInPendingCompaction)});
     try {
       List<FileSliceDTO> dataFiles = executeRequest(LATEST_SLICES_BEFORE_ON_INSTANT_URL, paramsMap,
           new TypeReference<List<FileSliceDTO>>() {}, RequestMethod.GET);
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Functions.java
@@ -33,28 +33,35 @@ static Runnable noop() {
   /**
    * A function which has not any parameter.
    */
-  public interface Function0<R> extends Serializable {
+  interface Function0<R> extends Serializable {
     R apply();
   }
 
   /**
    * A function which contains only one parameter.
    */
-  public interface Function1<T1, R> extends Serializable {
+  interface Function1<T1, R> extends Serializable {
     R apply(T1 val1);
   }
 
   /**
    * A function which contains two parameters.
    */
-  public interface Function2<T1, T2, R> extends Serializable {
+  interface Function2<T1, T2, R> extends Serializable {
     R apply(T1 val1, T2 val2);
   }
 
   /**
    * A function which contains three parameters.
    */
-  public interface Function3<T1, T2, T3, R> extends Serializable {
+  interface Function3<T1, T2, T3, R> extends Serializable {
     R apply(T1 val1, T2 val2, T3 val3);
   }
+
+  /**
+   * A function which contains four parameters.
+   *
+   * @param <T1> type of the first argument
+   * @param <T2> type of the second argument
+   * @param <T3> type of the third argument
+   * @param <T4> type of the fourth argument
+   * @param <R>  type of the result
+   */
+  interface Function4<T1, T2, T3, T4, R> extends Serializable {
+    R apply(T1 val1, T2 val2, T3 val3, T4 val4);
+  }
 }
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java b/hudi-common/src/test/java/org/apache/hudi/common/table/view/TestPriorityBasedFileSystemView.java
@@ -375,30 +375,30 @@ public void testGetLatestFileSlicesBeforeOrOn() {
     String partitionPath = "/table2";
     String maxCommitTime = "2020-01-01";
 
-    when(primary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false))
+    when(primary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false))
         .thenReturn(testFileSliceStream);
-    actual = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false);
+    actual = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false);
     assertEquals(expected, actual);
 
     resetMocks();
-    when(primary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false))
+    when(primary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false))
         .thenThrow(new RuntimeException());
-    when(secondary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false))
+    when(secondary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false))
         .thenReturn(testFileSliceStream);
-    actual = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false);
+    actual = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false);
     assertEquals(expected, actual);
 
     resetMocks();
-    when(secondary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false))
+    when(secondary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false))
         .thenReturn(testFileSliceStream);
-    actual = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false);
+    actual = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false);
     assertEquals(expected, actual);
 
     resetMocks();
-    when(secondary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false))
+    when(secondary.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false))
         .thenThrow(new RuntimeException());
     assertThrows(RuntimeException.class, () -> {
-      fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false);
+      fsView.getLatestFileSlicesBeforeOrOn(partitionPath, maxCommitTime, false, false);
     });
   }
 
diff --git a/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestBoostrapOperator.java b/hudi-flink-datasource/hudi-flink/src/test/java/org/apache/hudi/sink/TestBoostrapOperator.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sink;
+
+import org.apache.hudi.client.common.HoodieFlinkEngineContext;
+import org.apache.hudi.common.model.FileSlice;
+import org.apache.hudi.common.model.HoodieKey;
+import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
+import org.apache.hudi.common.table.timeline.HoodieInstant;
+import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.BaseFileUtils;
+import org.apache.hudi.common.util.ClosableIterator;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.configuration.FlinkOptions;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.sink.transform.Transformer;
+import org.apache.hudi.sink.utils.Pipelines;
+import org.apache.hudi.table.HoodieFlinkTable;
+import org.apache.hudi.table.format.FormatUtils;
+import org.apache.hudi.util.AvroSchemaConverter;
+import org.apache.hudi.util.FlinkClientUtil;
+import org.apache.hudi.util.StreamerUtil;
+import org.apache.hudi.utils.TestConfigurations;
+import org.apache.hudi.utils.source.ContinuousFileSource;
+
+import org.apache.avro.Schema;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.core.execution.JobClient;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.formats.common.TimestampFormat;
+import org.apache.flink.formats.json.JsonRowDataDeserializationSchema;
+import org.apache.flink.streaming.api.CheckpointingMode;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.TestLogger;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static java.util.stream.Collectors.toList;
+import static org.apache.hudi.util.StreamerUtil.isValidFile;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Integration test for BootstrapOperator.
+ */
+public class TestBoostrapOperator extends TestLogger {
+  @TempDir
+  File tempFile;
+
+  /**
+   * Runs the merge-on-read write pipeline with compaction enabled, removes the last {@code .commit}
+   * file, and then counts the record keys readable from the latest file slices of partition
+   * {@code par1} (base files plus log files, requested with both pending-compaction flags set).
+   * The test expects 8 keys in total.
+   */
+  @Test
+  public void testLoadRecords() throws Exception {
+    Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());
+    conf.setString(FlinkOptions.TABLE_TYPE, HoodieTableType.MERGE_ON_READ.name());
+    conf.setString(FlinkOptions.INDEX_KEY_FIELD, "id");
+    conf.setInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS, 4);
+    conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1);
+
+    testWriteToHoodie(conf, Option.empty(), "mor_write_with_compact", 5);
+    deleteLastCompactionCommit();
+
+    HoodieWriteConfig writeConfig = StreamerUtil.getHoodieClientConfig(conf, true);
+    HoodieFlinkTable hoodieTable = HoodieFlinkTable.create(writeConfig, HoodieFlinkEngineContext.DEFAULT);
+    HoodieTimeline commitsTimeline = hoodieTable.getMetaClient().getCommitsTimeline();
+    Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant();
+    BaseFileUtils fileUtils = BaseFileUtils.getInstance(hoodieTable.getBaseFileFormat());
+    Schema schema = new TableSchemaResolver(hoodieTable.getMetaClient()).getTableAvroSchema();
+    // incremented from inside lambdas, hence an atomic holder instead of a plain int
+    AtomicInteger count = new AtomicInteger();
+
+    if (latestCommitTime.isPresent()) {
+      String latestInstant = latestCommitTime.get().getTimestamp();
+      List<FileSlice> fileSlices = hoodieTable.getSliceView()
+          .getLatestFileSlicesBeforeOrOn("par1", latestInstant, true, true)
+          .collect(toList());
+      for (FileSlice fileSlice : fileSlices) {
+        // count the keys stored in the base file, when the slice has one
+        fileSlice.getBaseFile().ifPresent(baseFile -> {
+          // only read base files accepted by isValidFile
+          if (isValidFile(baseFile.getFileStatus())) {
+            try (ClosableIterator<HoodieKey> keys = fileUtils.getHoodieKeyIterator(FlinkClientUtil.getHadoopConf(), new org.apache.hadoop.fs.Path(baseFile.getPath()))) {
+              keys.forEachRemaining(hoodieKey -> count.getAndIncrement());
+            }
+          }
+        });
+
+        // count the keys found in the avro log files accepted by isValidFile
+        List<String> logPaths = fileSlice.getLogFiles()
+            .filter(logFile -> isValidFile(logFile.getFileStatus()))
+            .map(logFile -> logFile.getPath().toString())
+            .collect(toList());
+        HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestInstant,
+            writeConfig, FlinkClientUtil.getHadoopConf());
+
+        try {
+          scanner.getRecords().keySet().forEach(recordKey -> count.getAndIncrement());
+        } catch (Exception e) {
+          throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
+        } finally {
+          scanner.close();
+        }
+      }
+    }
+    assertEquals(8, count.get());
+  }
+
+  private void deleteLastCompactionCommit() {
+    File allCommits = new File(tempFile.getPath(), ".hoodie");
+    final File[] files = allCommits.listFiles(new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.endsWith(".commit");
+      }
+    });
+    if (files.length > 0) {
+      files[files.length - 1].delete();
+    }
+  }
+
+  private void testWriteToHoodie(
+      Configuration conf,
+      Option<Transformer> transformer,
+      String jobName,
+      int checkpoints) throws Exception {
+
+    StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment();
+    execEnv.getConfig().disableObjectReuse();
+    execEnv.setParallelism(4);
+    // set up checkpoint interval
+    execEnv.enableCheckpointing(4000, CheckpointingMode.EXACTLY_ONCE);
+    execEnv.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
+
+    // Read from file source
+    RowType rowType =
+        (RowType) AvroSchemaConverter.convertToDataType(StreamerUtil.getSourceSchema(conf))
+            .getLogicalType();
+
+    JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema(
+        rowType,
+        InternalTypeInfo.of(rowType),
+        false,
+        true,
+        TimestampFormat.ISO_8601
+    );
+    String sourcePath = Objects.requireNonNull(Thread.currentThread()
+        .getContextClassLoader().getResource("test_source6.data")).toString();
+
+    DataStream<RowData> dataStream;
+
+    dataStream = execEnv
+        // use continuous file source to trigger checkpoint
+        .addSource(new ContinuousFileSource.BoundedSourceFunction(new Path(sourcePath), checkpoints))
+        .name("continuous_file_source")
+        .setParallelism(1)
+        .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)))
+        .setParallelism(1);
+
+    if (transformer.isPresent()) {
+      dataStream = transformer.get().apply(dataStream);
+    }
+
+    int parallelism = execEnv.getParallelism();
+    DataStream<HoodieRecord> hoodieRecordDataStream = Pipelines.bootstrap(conf, rowType, parallelism, dataStream);
+    DataStream<Object> pipeline = Pipelines.hoodieStreamWrite(conf, parallelism, hoodieRecordDataStream);
+    execEnv.addOperator(pipeline.getTransformation());
+
+    Pipelines.clean(conf, pipeline);
+    Pipelines.compact(conf, pipeline);
+
+    JobClient client = execEnv.executeAsync(jobName);
+    // wait for the streaming job to finish
+    client.getJobExecutionResult().get();
+  }
+}
diff --git a/hudi-flink-datasource/hudi-flink/src/test/resources/test_source6.data b/hudi-flink-datasource/hudi-flink/src/test/resources/test_source6.data
@@ -0,0 +1,8 @@
+{"uuid": "id1", "name": "Danny", "age": 23, "ts": "1970-01-01T00:00:01", "partition": "par1"}
+{"uuid": "id2", "name": "Stephen", "age": 33, "ts": "1970-01-01T00:00:02", "partition": "par1"}
+{"uuid": "id3", "name": "Julian", "age": 53, "ts": "1970-01-01T00:00:03", "partition": "par1"}
+{"uuid": "id4", "name": "Fabian", "age": 31, "ts": "1970-01-01T00:00:04", "partition": "par1"}
+{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par1"}
+{"uuid": "id6", "name": "Emma", "age": 20, "ts": "1970-01-01T00:00:06", "partition": "par1"}
+{"uuid": "id7", "name": "Bob", "age": 44, "ts": "1970-01-01T00:00:07", "partition": "par1"}
+{"uuid": "id8", "name": "Han", "age": 56, "ts": "1970-01-01T00:00:08", "partition": "par1"}
diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/RequestHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/RequestHandler.java
@@ -364,7 +364,10 @@ private void registerFileSlicesAPI() {
           ctx.queryParam(RemoteHoodieTableFileSystemView.PARTITION_PARAM,""),
           ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.MAX_INSTANT_PARAM).getOrThrow(),
           Boolean.parseBoolean(
-              ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM)
+              ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INCLUDE_IN_PENDING_COMPACTION_PARAM)
+                  .getOrThrow()),
+          Boolean.parseBoolean(
+              ctx.validatedQueryParam(RemoteHoodieTableFileSystemView.INCLUDE_FILES_IN_PENDING_COMPACTION_PARAM, "false")
                   .getOrThrow()));
       writeValueAsString(ctx, dtos);
     }, true));
diff --git a/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java b/hudi-timeline-service/src/main/java/org/apache/hudi/timeline/service/handlers/FileSliceHandler.java
@@ -61,9 +61,9 @@ public List<FileSliceDTO> getLatestMergedFileSlicesBeforeOrOn(String basePath, S
   }
 
   public List<FileSliceDTO> getLatestFileSlicesBeforeOrOn(String basePath, String partitionPath, String maxInstantTime,
-      boolean includeFileSlicesInPendingCompaction) {
+      boolean includeFileSlicesInPendingCompaction, boolean includeFilesInPendingCompaction) {
     return viewManager.getFileSystemView(basePath)
-        .getLatestFileSlicesBeforeOrOn(partitionPath, maxInstantTime, includeFileSlicesInPendingCompaction)
+        .getLatestFileSlicesBeforeOrOn(partitionPath, maxInstantTime, includeFileSlicesInPendingCompaction, includeFilesInPendingCompaction)
         .map(FileSliceDTO::fromFileSlice).collect(Collectors.toList());
   }
 

Original file line number	Diff line number	Diff line change
`@@ -61,9 +61,9 @@ public List<FileSliceDTO> getLatestMergedFileSlicesBeforeOrOn(String basePath, S`
`61`	`61`	`}`
`62`	`62`
`63`	`63`	`public List<FileSliceDTO> getLatestFileSlicesBeforeOrOn(String basePath, String partitionPath, String maxInstantTime,`
`64`		`- boolean includeFileSlicesInPendingCompaction) {`
	`64`	`+ boolean includeFileSlicesInPendingCompaction, boolean includeFilesInPendingCompaction) {`
`65`	`65`	`return viewManager.getFileSystemView(basePath)`
`66`		`- .getLatestFileSlicesBeforeOrOn(partitionPath, maxInstantTime, includeFileSlicesInPendingCompaction)`
	`66`	`+ .getLatestFileSlicesBeforeOrOn(partitionPath, maxInstantTime, includeFileSlicesInPendingCompaction, includeFilesInPendingCompaction)`
`67`	`67`	`.map(FileSliceDTO::fromFileSlice).collect(Collectors.toList());`
`68`	`68`	`}`
`69`	`69`