Commit 146f2f9
Support call procedure for build action
1 parent 4e70dff commit 146f2f9

8 files changed: 881 additions & 29 deletions


hudi-spark-datasource/hudi-spark/src/main/java/org/apache/hudi/HoodieDataSourceHelpers.java

Lines changed: 26 additions & 0 deletions
@@ -18,12 +18,14 @@
 
 package org.apache.hudi;
 
+import org.apache.hudi.avro.model.HoodieBuildPlan;
 import org.apache.hudi.avro.model.HoodieClusteringPlan;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
+import org.apache.hudi.common.util.BuildUtils;
 import org.apache.hudi.common.util.ClusteringUtils;
 import org.apache.hudi.common.util.CollectionUtils;
 import org.apache.hudi.common.util.Option;
@@ -97,4 +99,28 @@ public static Option<HoodieClusteringPlan> getClusteringPlan(FileSystem fs, Stri
       return Option.empty();
     }
   }
+
+  @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
+  public static HoodieTimeline allBuildCommits(FileSystem fs, String basePath) {
+    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
+        .setConf(fs.getConf())
+        .setBasePath(basePath)
+        .setLoadActiveTimelineOnLoad(true)
+        .build();
+    return metaClient.getActiveTimeline()
+        .getTimelineOfActions(CollectionUtils.createSet(HoodieActiveTimeline.BUILD_ACTION));
+  }
+
+  @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
+  public static Option<HoodieBuildPlan> getBuildPlan(FileSystem fs, String basePath, String instantTime) {
+    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf())
+        .setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
+    HoodieInstant hoodieInstant = HoodieTimeline.getBuildRequestedInstant(instantTime);
+    Option<Pair<HoodieInstant, HoodieBuildPlan>> buildPlan = BuildUtils.getBuildPlan(metaClient, hoodieInstant);
+    if (buildPlan.isPresent()) {
+      return Option.of(buildPlan.get().getValue());
+    } else {
+      return Option.empty();
+    }
+  }
 }
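For orientation, a minimal sketch of how the two new helpers might be called from a Spark shell. The table path and instant time below are hypothetical; `getTasks` on the build plan is taken from its use in RunBuildProcedure further down.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hudi.HoodieDataSourceHelpers

val basePath = "/tmp/hudi/trips" // hypothetical table location
val fs = FileSystem.get(new Path(basePath).toUri, new Configuration())

// All instants on the timeline whose action is BUILD_ACTION.
val buildTimeline = HoodieDataSourceHelpers.allBuildCommits(fs, basePath)
println(s"build instants: ${buildTimeline.countInstants()}")

// The build plan behind one requested instant, if present (instant time hypothetical).
val plan = HoodieDataSourceHelpers.getBuildPlan(fs, basePath, "20230101000000000")
if (plan.isPresent) println(s"build tasks: ${plan.get().getTasks.size()}")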

hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/HoodieProcedures.scala

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ object HoodieProcedures {
       ,(RunCleanProcedure.NAME, RunCleanProcedure.builder)
       ,(ValidateHoodieSyncProcedure.NAME, ValidateHoodieSyncProcedure.builder)
       ,(ShowInvalidParquetProcedure.NAME, ShowInvalidParquetProcedure.builder)
+      ,(RunBuildProcedure.NAME, RunBuildProcedure.builder)
+      ,(ShowBuildProcedure.NAME, ShowBuildProcedure.builder)
     )
   }
 }
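Once registered here, both procedures become reachable through Spark SQL's CALL syntax (assuming the Hudi Spark SQL extensions are enabled). A hedged sketch: the table name is hypothetical, and the arguments of show_build are assumed, since ShowBuildProcedure's source is not part of the excerpt on this page.

// run_build's NAME is defined in RunBuildProcedure below; arguments align by name.
spark.sql("CALL run_build(table => 'hudi_tbl', show_involved_partition => true)").show(false)
// Assumed invocation of the companion procedure added in this commit.
spark.sql("CALL show_build(table => 'hudi_tbl')").show(false)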
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ProcedurePredicateHelper.scala (new file; path inferred from package and trait name)

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.hudi.command.procedures
+
+import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_SNAPSHOT_OPT_VAL}
+import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
+import org.apache.hudi.common.util.ValidationUtils.checkArgument
+import org.apache.hudi.{AvroConversionUtils, HoodieFileIndex}
+import org.apache.spark.sql.HoodieCatalystExpressionUtils.{resolveExpr, splitPartitionAndDataPredicates}
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.expressions.PredicateHelper
+import org.apache.spark.sql.execution.datasources.FileStatusCache
+
+trait ProcedurePredicateHelper extends PredicateHelper {
+
+  def prunePartition(
+      sparkSession: SparkSession,
+      metaClient: HoodieTableMetaClient,
+      predicate: String): Seq[String] = {
+    val options = Map(QUERY_TYPE.key() -> QUERY_TYPE_SNAPSHOT_OPT_VAL, "path" -> metaClient.getBasePath)
+    val hoodieFileIndex = HoodieFileIndex(sparkSession, metaClient, None, options,
+      FileStatusCache.getOrCreate(sparkSession))
+
+    // Resolve partition predicates
+    val schemaResolver = new TableSchemaResolver(metaClient)
+    val tableSchema = AvroConversionUtils.convertAvroSchemaToStructType(schemaResolver.getTableAvroSchema)
+    val condition = resolveExpr(sparkSession, predicate, tableSchema)
+    val partitionColumns = metaClient.getTableConfig.getPartitionFields.orElse(Array[String]())
+    val (partitionPredicates, dataPredicates) = splitPartitionAndDataPredicates(
+      sparkSession, splitConjunctivePredicates(condition).toArray, partitionColumns)
+    checkArgument(dataPredicates.isEmpty, "Only partition predicates are allowed")
+
+    // Get all partitions and prune partition by predicates
+    val prunedPartitions = hoodieFileIndex.getPartitionPaths(partitionPredicates)
+    prunedPartitions.map(partitionPath => partitionPath.getPath)
+  }
+}
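A usage sketch for the shared trait (table path and partition column are hypothetical): the checkArgument guard accepts only predicates over partition columns, so a predicate touching a data column fails fast.

// Inside a procedure that mixes in ProcedurePredicateHelper:
val metaClient = HoodieTableMetaClient.builder()
  .setConf(spark.sessionState.newHadoopConf())
  .setBasePath("/tmp/hudi/trips") // hypothetical table location
  .build()
// Accepted: predicate over the partition column dt.
val partitions: Seq[String] = prunePartition(spark, metaClient, "dt >= '2023-01-01'")
// Rejected with "Only partition predicates are allowed" (fare is a data column):
// prunePartition(spark, metaClient, "fare > 10.0")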
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunBuildProcedure.scala (new file; path inferred from package and class name)

Lines changed: 151 additions & 0 deletions

@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.hudi.command.procedures
+
+import org.apache.hudi.HoodieCLIUtils
+import org.apache.hudi.common.config.HoodieBuildConfig
+import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
+import org.apache.hudi.common.util.{BuildUtils, HoodieTimer, Option => HOption}
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
+
+import java.util.function.Supplier
+import scala.collection.JavaConverters.{asScalaBufferConverter, asScalaIteratorConverter}
+
+class RunBuildProcedure extends BaseProcedure
+  with ProcedureBuilder
+  with ProcedurePredicateHelper
+  with Logging {
+
+  private val PARAMETERS = Array[ProcedureParameter](
+    ProcedureParameter.optional(0, "table", DataTypes.StringType, None),
+    ProcedureParameter.optional(1, "path", DataTypes.StringType, None),
+    ProcedureParameter.optional(2, "predicate", DataTypes.StringType, None),
+    ProcedureParameter.optional(3, "show_involved_partition", DataTypes.BooleanType, false)
+  )
+
+  private val OUTPUT_TYPE = new StructType(Array[StructField](
+    StructField("timestamp", DataTypes.StringType, nullable = true, Metadata.empty),
+    StructField("task_num", DataTypes.IntegerType, nullable = true, Metadata.empty),
+    StructField("state", DataTypes.StringType, nullable = true, Metadata.empty),
+    StructField("involved_partitions", DataTypes.StringType, nullable = true, Metadata.empty)
+  ))
+
+  /**
+   * Returns the input parameters of this procedure.
+   */
+  override def parameters: Array[ProcedureParameter] = PARAMETERS
+
+  /**
+   * Returns the type of rows produced by this procedure.
+   */
+  override def outputType: StructType = OUTPUT_TYPE
+
+  /**
+   * Executes this procedure.
+   * <p>
+   * Spark will align the provided arguments according to the input parameters
+   * defined in {@link #parameters()} either by position or by name before execution.
+   * <p>
+   * Implementations may provide a summary of execution by returning one or many rows
+   * as a result. The schema of output rows must match the defined output type
+   * in {@link #outputType()}.
+   *
+   * @param args input arguments
+   * @return the result of executing this procedure with the given arguments
+   */
+  override def call(args: ProcedureArgs): Seq[Row] = {
+    super.checkArgs(PARAMETERS, args)
+
+    val tableName = getArgValueOrDefault(args, PARAMETERS(0))
+    val tablePath = getArgValueOrDefault(args, PARAMETERS(1))
+    val predicate = getArgValueOrDefault(args, PARAMETERS(2))
+    val showInvolvedPartitions = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[Boolean]
+
+    val basePath: String = getBasePath(tableName, tablePath)
+    val metaClient = HoodieTableMetaClient.builder().setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build()
+    var conf: Map[String, String] = Map.empty
+    predicate match {
+      case Some(p) =>
+        val prunedPartitions = prunePartition(spark, metaClient, p.asInstanceOf[String])
+        conf = conf ++ Map(
+          HoodieBuildConfig.PARTITION_SELECTED.key() -> prunedPartitions.mkString(",")
+        )
+        logInfo(s"Partition predicates: ${p}, partition selected: ${prunedPartitions}")
+      case _ =>
+        logInfo("No partition predicates")
+    }
+
+    var pendingBuild = BuildUtils.getAllPendingBuildPlans(metaClient)
+      .iterator().asScala.map(_.getLeft.getTimestamp).toSeq.sortBy(f => f)
+    logInfo(s"Pending build instants: ${pendingBuild.mkString(",")}")
+
+    val client = HoodieCLIUtils.createHoodieClientFromPath(sparkSession, basePath, conf)
+    val instantTime = HoodieActiveTimeline.createNewInstantTime()
+    if (client.scheduleBuildAtInstant(instantTime, HOption.empty())) {
+      pendingBuild ++= Seq(instantTime)
+    }
+    logInfo(s"Build instants to run: ${pendingBuild.mkString(",")}")
+
+    val timer = new HoodieTimer
+    timer.startTimer()
+    pendingBuild.foreach(instant => {
+      timer.startTimer()
+      client.build(instant, true)
+      logInfo(s"Finish build for instant: $instant, time cost: ${timer.endTimer()}ms")
+    })
+    client.close()
+    logInfo(s"Finish build all instants: ${pendingBuild.mkString(",")}, time cost: ${timer.endTimer()}ms")
+
+    val buildInstants = metaClient.reloadActiveTimeline().getInstants.iterator().asScala
+      .filter(p => p.getAction == HoodieTimeline.BUILD_ACTION && pendingBuild.contains(p.getTimestamp))
+      .toSeq
+      .sortBy(f => f.getTimestamp)
+      .reverse
+
+    val buildPlans = buildInstants.map(instant =>
+      BuildUtils.getBuildPlan(metaClient, instant)
+    )
+
+    if (showInvolvedPartitions) {
+      buildPlans.map { p =>
+        Row(p.get().getLeft.getTimestamp, p.get().getRight.getTasks.size(),
+          p.get().getLeft.getState.name(),
+          BuildUtils.extractPartitions(p.get().getRight.getTasks).asScala.mkString(","))
+      }
+    } else {
+      buildPlans.map { p =>
+        Row(p.get().getLeft.getTimestamp, p.get().getRight.getTasks.size(), p.get().getLeft.getState.name(), "*")
+      }
+    }
+  }
+
+  override def build: Procedure = new RunBuildProcedure
+}
+
+
+object RunBuildProcedure {
+  val NAME = "run_build"
+
+  def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
+    override def get() = new RunBuildProcedure
+  }
+}
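The rows returned by call mirror OUTPUT_TYPE above: one row per executed build instant with its timestamp, task count, state, and involved partitions. A hedged invocation sketch, addressing the table by path rather than name (path and partition values hypothetical):

// Runs any pending build plans plus one newly scheduled instant.
spark.sql(
  """CALL run_build(path => '/tmp/hudi/trips', predicate => 'dt = "2023-01-01"', show_involved_partition => true)"""
).show(false)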

hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunClusteringProcedure.scala

Lines changed: 6 additions & 29 deletions
@@ -17,27 +17,23 @@
 
 package org.apache.spark.sql.hudi.command.procedures
 
-import org.apache.hudi.DataSourceReadOptions.{QUERY_TYPE, QUERY_TYPE_SNAPSHOT_OPT_VAL}
+import org.apache.hudi.HoodieCLIUtils
 import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
 import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
-import org.apache.hudi.common.util.ValidationUtils.checkArgument
 import org.apache.hudi.common.util.{ClusteringUtils, Option => HOption}
 import org.apache.hudi.config.HoodieClusteringConfig
 import org.apache.hudi.exception.HoodieClusteringException
-import org.apache.hudi.{AvroConversionUtils, HoodieCLIUtils, HoodieFileIndex}
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.HoodieCatalystExpressionUtils.{resolveExpr, splitPartitionAndDataPredicates}
 import org.apache.spark.sql.Row
-import org.apache.spark.sql.catalyst.expressions.PredicateHelper
-import org.apache.spark.sql.execution.datasources.FileStatusCache
 import org.apache.spark.sql.types._
 
 import java.util.function.Supplier
+
 import scala.collection.JavaConverters._
 
 class RunClusteringProcedure extends BaseProcedure
   with ProcedureBuilder
-  with PredicateHelper
+  with ProcedurePredicateHelper
   with Logging {
 
   /**
@@ -77,10 +73,10 @@ class RunClusteringProcedure extends BaseProcedure
     var conf: Map[String, String] = Map.empty
     predicate match {
      case Some(p) =>
-        val prunedPartitions = prunePartition(metaClient, p.asInstanceOf[String])
+        val prunedPartitions = prunePartition(spark, metaClient, p.asInstanceOf[String])
         conf = conf ++ Map(
           HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key() -> "SELECTED_PARTITIONS",
-          HoodieClusteringConfig.PARTITION_SELECTED.key() -> prunedPartitions
+          HoodieClusteringConfig.PARTITION_SELECTED.key() -> prunedPartitions.mkString(",")
         )
         logInfo(s"Partition predicates: $p, partition selected: $prunedPartitions")
       case _ =>
@@ -113,6 +109,7 @@ class RunClusteringProcedure extends BaseProcedure
 
     val startTs = System.currentTimeMillis()
     pendingClustering.foreach(client.cluster(_, true))
+    client.close()
     logInfo(s"Finish clustering all the instants: ${pendingClustering.mkString(",")}," +
       s" time cost: ${System.currentTimeMillis() - startTs}ms.")
 
@@ -140,25 +137,6 @@ class RunClusteringProcedure extends BaseProcedure
 
   override def build: Procedure = new RunClusteringProcedure()
 
-  def prunePartition(metaClient: HoodieTableMetaClient, predicate: String): String = {
-    val options = Map(QUERY_TYPE.key() -> QUERY_TYPE_SNAPSHOT_OPT_VAL, "path" -> metaClient.getBasePath)
-    val hoodieFileIndex = HoodieFileIndex(sparkSession, metaClient, None, options,
-      FileStatusCache.getOrCreate(sparkSession))
-
-    // Resolve partition predicates
-    val schemaResolver = new TableSchemaResolver(metaClient)
-    val tableSchema = AvroConversionUtils.convertAvroSchemaToStructType(schemaResolver.getTableAvroSchema)
-    val condition = resolveExpr(sparkSession, predicate, tableSchema)
-    val partitionColumns = metaClient.getTableConfig.getPartitionFields.orElse(Array[String]())
-    val (partitionPredicates, dataPredicates) = splitPartitionAndDataPredicates(
-      sparkSession, splitConjunctivePredicates(condition).toArray, partitionColumns)
-    checkArgument(dataPredicates.isEmpty, "Only partition predicates are allowed")
-
-    // Get all partitions and prune partition by predicates
-    val prunedPartitions = hoodieFileIndex.getPartitionPaths(partitionPredicates)
-    prunedPartitions.map(partitionPath => partitionPath.getPath).toSet.mkString(",")
-  }
-
   private def validateOrderColumns(orderColumns: String, metaClient: HoodieTableMetaClient): Unit = {
     if (orderColumns == null) {
       throw new HoodieClusteringException("Order columns is null")
@@ -173,7 +151,6 @@ class RunClusteringProcedure extends BaseProcedure
       }
     })
   }
-
 }
 
 object RunClusteringProcedure {
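For reference, after this refactor the pruned partitions reach the clustering planner through two writer configs; a sketch with illustrative values only:

val conf = Map(
  HoodieClusteringConfig.PLAN_PARTITION_FILTER_MODE_NAME.key() -> "SELECTED_PARTITIONS",
  HoodieClusteringConfig.PARTITION_SELECTED.key() -> "dt=2023-01-01,dt=2023-01-02" // illustrative
)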

hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/RunCompactionProcedure.scala

Lines changed: 1 addition & 0 deletions
@@ -119,6 +119,7 @@ class RunCompactionProcedure extends BaseProcedure with ProcedureBuilder with Sp
         }
       case _ => throw new UnsupportedOperationException(s"Unsupported compaction operation: $operation")
     }
+    client.close()
 
     val compactionInstants = metaClient.reloadActiveTimeline().getInstants.iterator().asScala
       .filter(instant => willCompactionInstants.contains(instant.getTimestamp))
