apache · jonvex · Mar 23, 2023 · Mar 23, 2023 · Mar 24, 2023
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/...client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java b/...client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieBootstrapConfig.java
@@ -118,6 +118,11 @@ public class HoodieBootstrapConfig extends HoodieConfig {
       .sinceVersion("0.6.0")
       .withDocumentation("Implementation to use, for mapping a skeleton base file to a bootstrap base file.");
 
+  /**
+   * Bootstrap option {@code hoodie.bootstrap.data.queries.only}; enabled ("true") by default.
+   * Per its documentation string, it improves query performance at the cost of queries not
+   * being able to use Hudi metadata fields.
+   *
+   * NOTE(review): unlike the property declared just above, no sinceVersion(...) is set here —
+   * confirm the release this first ships in and whether "true" is the intended default.
+   */
+  public static final ConfigProperty<String> DATA_QUERIES_ONLY = ConfigProperty
+      .key("hoodie.bootstrap.data.queries.only")
+      .defaultValue("true")
+      .withDocumentation("Improves query performance, but queries cannot use hudi metadata fields");
+
   /**
    * @deprecated Use {@link #BASE_PATH} and its methods instead
    */

diff --git a/...ava/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java b/...ava/org/apache/hudi/client/clustering/run/strategy/MultipleSparkJobExecutionStrategy.java
@@ -18,6 +18,8 @@
 
 package org.apache.hudi.client.clustering.run.strategy;
 
+import org.apache.hudi.AvroConversionUtils;
+import org.apache.hudi.HoodieSparkUtils;
 import org.apache.hudi.SparkAdapterSupport$;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieClusteringGroup;
@@ -52,9 +54,12 @@
 import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner;
 import org.apache.hudi.execution.bulkinsert.RowCustomColumnsSortPartitioner;
 import org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner;
+import org.apache.hudi.hadoop.CachingPath;
 import org.apache.hudi.io.IOUtils;
 import org.apache.hudi.io.storage.HoodieFileReader;
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
+import org.apache.hudi.io.storage.HoodieSparkBootstrapFileReader;
+import org.apache.hudi.io.storage.HoodieSparkFileReader;
 import org.apache.hudi.keygen.BaseKeyGenerator;
 import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.table.BulkInsertPartitioner;
@@ -71,6 +76,8 @@
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.execution.datasources.SparkParsePartitionUtil;
+import org.apache.spark.sql.internal.SQLConf;
 import org.apache.spark.sql.sources.BaseRelation;
 
 import java.io.IOException;
@@ -88,6 +95,7 @@
 import static org.apache.hudi.common.config.HoodieCommonConfig.TIMESTAMP_AS_OF;
 import static org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader;
 import static org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS;
+import static org.apache.spark.sql.catalyst.util.DateTimeUtils.TIMEZONE_OPTION;
 
 /**
  * Clustering strategy to submit multiple spark jobs and union the results.
@@ -324,6 +332,12 @@ private HoodieData<HoodieRecord<T>> readRecordsForGroupBaseFiles(JavaSparkContex
                                                                    List<ClusteringOperation> clusteringOps) {
     SerializableConfiguration hadoopConf = new SerializableConfiguration(getHoodieTable().getHadoopConf());
     HoodieWriteConfig writeConfig = getWriteConfig();
+    Option<String[]> partitionFields = getHoodieTable().getMetaClient().getTableConfig().getPartitionFields();
+    String bootstrapBasePath = getHoodieTable().getMetaClient().getTableConfig().getBootstrapBasePath().get();
+    CachingPath bbp = new CachingPath(bootstrapBasePath);
+    String timeZoneId = jsc.getConf().get("timeZone", SQLConf.get().sessionLocalTimeZone());
+    SparkParsePartitionUtil sparkParsePartitionUtil = SparkAdapterSupport$.MODULE$.sparkAdapter().getSparkParsePartitionUtil();
+    Boolean shouldValidateColumns = jsc.getConf().getBoolean("spark.sql.sources.validatePartitionColumns", true);
 
     // NOTE: It's crucial to make sure that we don't capture whole "this" object into the
     //       closure, as this might lead to issues attempting to serialize its nested fields
@@ -334,6 +348,15 @@ private HoodieData<HoodieRecord<T>> readRecordsForGroupBaseFiles(JavaSparkContex
             try {
               Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(writeConfig.getSchema()));
               HoodieFileReader baseFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(hadoopConf.get(), new Path(clusteringOp.getDataFilePath()));
+              if (!clusteringOp.getBootstrapFilePath().isEmpty()) {
+                String bootstrapFilePath = clusteringOp.getBootstrapFilePath();
+                int startOfPartitionPath = bootstrapFilePath.indexOf(bootstrapBasePath) + bootstrapBasePath.length() + 1;
+                String partitionFilePath = bootstrapFilePath.substring(startOfPartitionPath, clusteringOp.getBootstrapFilePath().lastIndexOf("/"));
+                HoodieFileReader dataFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).getFileReader(hadoopConf.get(), new Path(clusteringOp.getBootstrapFilePath()));
+                Object[] partitionValues = HoodieSparkUtils.ParsePartitionColumnValues(partitionFields.get(), partitionFilePath, bbp,
+                    AvroConversionUtils.convertAvroSchemaToStructType(baseFileReader.getSchema()), timeZoneId, sparkParsePartitionUtil, shouldValidateColumns);
+                baseFileReader = HoodieFileReaderFactory.getReaderFactory(recordType).newBootstrapFileReader(baseFileReader, dataFileReader, partitionFields, partitionValues);
+              }
               Option<BaseKeyGenerator> keyGeneratorOp =
                   writeConfig.populateMetaFields() ? Option.empty() : Option.of((BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(writeConfig.getProps()));
               // NOTE: Record have to be cloned here to make sure if it holds low-level engine-specific
@@ -368,9 +391,6 @@ private Dataset<Row> readRecordsForGroupAsRow(JavaSparkContext jsc,
         .stream()
         .map(op -> {
           ArrayList<String> readPaths = new ArrayList<>();
-          if (op.getBootstrapFilePath() != null) {
-            readPaths.add(op.getBootstrapFilePath());
-          }
           if (op.getDataFilePath() != null) {
             readPaths.add(op.getDataFilePath());
           }

diff --git a/...spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkBootstrapFileReader.java b/...spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkBootstrapFileReader.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.spark.sql.catalyst.InternalRow;
+
+/**
+ * {@link HoodieBootstrapFileReader} specialization for Spark {@link InternalRow} records.
+ *
+ * <p>The skeleton reader, data reader, partition fields and partition values are handed to the
+ * parent constructor unchanged; this class only supplies the row-level partition field setter.
+ */
+public class HoodieSparkBootstrapFileReader extends HoodieBootstrapFileReader<InternalRow> {
+
+  public HoodieSparkBootstrapFileReader(HoodieFileReader<InternalRow> skeletonFileReader, HoodieFileReader<InternalRow> dataFileReader, Option<String[]> partitionFields,
+                                        Object[] partitionValues) {
+    super(skeletonFileReader, dataFileReader, partitionFields, partitionValues);
+  }
+
+  /**
+   * Writes {@code fieldValue} into {@code row} at {@code position}, but only when that position
+   * is currently null; a non-null value already in the row is left untouched.
+   *
+   * @param position   ordinal of the field inside {@code row}
+   * @param fieldValue value to store when the field is null
+   * @param row        row that is updated in place
+   */
+  @Override
+  protected void setPartitionField(int position, Object fieldValue, InternalRow row) {
+    if (row.isNullAt(position)) {
+      row.update(position, fieldValue);
+    }
+  }
+}
diff --git a/...i-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java b/...i-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkFileReaderFactory.java
@@ -18,8 +18,11 @@
 
 package org.apache.hudi.io.storage;
 
+import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.HoodieIOException;
 
 import java.io.IOException;
@@ -43,4 +46,11 @@ protected HoodieFileReader newHFileFileReader(Configuration conf, Path path) thr
   protected HoodieFileReader newOrcFileReader(Configuration conf, Path path) {
     throw new HoodieIOException("Not support read orc file");
   }
+
+  /**
+   * Builds a {@link HoodieSparkBootstrapFileReader} from the given skeleton-file and data-file
+   * readers, passing the partition fields and partition values straight through.
+   */
+  @Override
+  public HoodieFileReader newBootstrapFileReader(
+      HoodieFileReader skeletonFileReader,
+      HoodieFileReader dataFileReader,
+      Option<String[]> partitionFields,
+      Object[] partitionValues) {
+    return new HoodieSparkBootstrapFileReader(skeletonFileReader, dataFileReader, partitionFields, partitionValues);
+  }
+
 }
diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieSparkUtils.scala
@@ -20,17 +20,24 @@ package org.apache.hudi
 
 import org.apache.avro.Schema
 import org.apache.avro.generic.GenericRecord
+import org.apache.hadoop.fs.Path
 import org.apache.hudi.HoodieConversionUtils.toScalaOption
 import org.apache.hudi.avro.{AvroSchemaUtils, HoodieAvroUtils}
 import org.apache.hudi.client.utils.SparkRowSerDe
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.hadoop.CachingPath
+import org.apache.hudi.hadoop.CachingPath.createRelativePathUnsafe
 import org.apache.spark.SPARK_VERSION
+import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame}
+import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.SQLConfInjectingRDD
+import org.apache.spark.sql.execution.datasources.SparkParsePartitionUtil
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import org.apache.spark.unsafe.types.UTF8String
 
 import scala.collection.JavaConverters._
 import scala.reflect.ClassTag
@@ -54,7 +61,7 @@ private[hudi] trait SparkVersionsSupport {
   def gteqSpark3_3: Boolean = getSparkVersion >= "3.3"
 }
 
-object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport {
+object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport with Logging {
 
   override def getSparkVersion: String = SPARK_VERSION
 
@@ -191,4 +198,77 @@ object HoodieSparkUtils extends SparkAdapterSupport with SparkVersionsSupport {
   def getCatalystRowSerDe(structType: StructType): SparkRowSerDe = {
     sparkAdapter.createSparkRowSerDe(structType)
   }
+
+  /**
+   * Extracts the partition column values encoded in a relative partition path.
+   *
+   * The path is split on "/" and compared against the number of partition columns:
+   *  - no partition columns: an empty array is returned;
+   *  - fragment count differs and there is exactly one partition column: the whole path
+   *    (minus a leading "column=" prefix, if present) is returned as a single UTF8String;
+   *  - fragment count differs and there are several partition columns: a warning is logged
+   *    and an empty array is returned;
+   *  - fragment count matches: fragments lacking "=" are prefixed with their column name and
+   *    the resulting path is parsed through `sparkParsePartitionUtil`.
+   *
+   * In the last case the partition schema is built by filtering `schema.fields`, so values come
+   * back in the order those columns appear in `schema`, and partition columns absent from
+   * `schema` are not part of the result.
+   * NOTE(review): callers presumably expect values aligned with `partitionColumns` — confirm.
+   *
+   * @param partitionColumns            names of the partition columns
+   * @param partitionPath               partition path, split on "/" into fragments
+   * @param basePath                    base path the partition path is resolved against
+   * @param schema                      schema the partition column types are looked up in
+   * @param timeZoneId                  id of the time zone handed to the partition parser
+   * @param sparkParsePartitionUtil     parser used for the aligned, multi-fragment case
+   * @param shouldValidatePartitionCols forwarded to the parser as `validatePartitionValues`
+   * @return the parsed partition values, or an empty array as described above
+   */
+  def ParsePartitionColumnValues(partitionColumns: Array[String], partitionPath: String, basePath: Path,
+                                 schema: StructType, timeZoneId: String, sparkParsePartitionUtil: SparkParsePartitionUtil,
+                                 shouldValidatePartitionCols: Boolean): Array[Object] = {
+    if (partitionColumns.length == 0) {
+      // This is a non-partitioned table
+      Array.empty
+    } else {
+      val partitionFragments = partitionPath.split("/")
+      if (partitionFragments.length != partitionColumns.length) {
+        if (partitionColumns.length == 1) {
+          // The number of path fragments does not match the number of partition columns,
+          // but there is only one partition column: map the whole partition path onto
+          // that single column.
+          val prefix = s"${partitionColumns.head}="
+          val partitionValue = if (partitionPath.startsWith(prefix)) {
+            // Hive-style partition path: strip the leading "column=" prefix
+            partitionPath.substring(prefix.length)
+          } else {
+            partitionPath
+          }
+          Array(UTF8String.fromString(partitionValue))
+        } else {
+          // The number of path fragments does not match the number of partition columns
+          // and there is more than one partition column: there is no way to map fragments
+          // to columns, so an empty array is returned. We deliberately do not fail outright,
+          // so that in some cases the table can still be read as a non-partitioned one.
+          logWarning(s"Failed to parse partition values: found partition fragments" +
+            s" (${partitionFragments.mkString(",")}) are not aligned with expected partition columns" +
+            s" (${partitionColumns.mkString(",")})")
+          Array.empty
+        }
+      } else {
+        // Here partitionFragments.length == partitionColumns.length.
+        // Prepend the column name to every fragment that is not already in
+        // "name=value" form (i.e. the path is not hive-style partitioned),
+        // e.g. convert "/xx/xx/2021/02" to "/xx/xx/year=2021/month=02"
+        val partitionWithName =
+        partitionFragments.zip(partitionColumns).map {
+          case (partition, columnName) =>
+            if (partition.indexOf("=") == -1) {
+              s"${columnName}=$partition"
+            } else {
+              partition
+            }
+        }.mkString("/")
+
+        val pathWithPartitionName = new CachingPath(basePath, createRelativePathUnsafe(partitionWithName))
+        // Only fields of `schema` whose name is a partition column, kept in `schema` order
+        val partitionSchema = StructType(schema.fields.filter(f => partitionColumns.contains(f.name)))
+        val partitionValues = parsePartitionPath(pathWithPartitionName, partitionSchema, timeZoneId,
+          sparkParsePartitionUtil, basePath, shouldValidatePartitionCols)
+
+        // Seq[Any] -> Array[Object], the declared return type
+        partitionValues.map(_.asInstanceOf[Object]).toArray
+      }
+    }
+  }
+
+  /**
+   * Parses `partitionPath` with `sparkParsePartitionUtil` and returns the parsed values
+   * laid out according to `partitionSchema`.
+   *
+   * Type inference is turned off; the column types passed to the parser are the ones
+   * declared in `partitionSchema`, and `basePath` is supplied as the only base path.
+   */
+  private def parsePartitionPath(partitionPath: Path, partitionSchema: StructType, timeZoneId: String,
+                                 sparkParsePartitionUtil: SparkParsePartitionUtil, basePath: Path,
+                                 shouldValidatePartitionCols: Boolean): Seq[Any] = {
+    // Column name -> declared data type, for every field of the partition schema
+    val typesByColumn = partitionSchema.map(field => (field.name, field.dataType)).toMap
+    val timeZone = DateTimeUtils.getTimeZone(timeZoneId)
+
+    val parsed = sparkParsePartitionUtil.parsePartition(
+      partitionPath,
+      typeInference = false,
+      Set(basePath),
+      typesByColumn,
+      timeZone,
+      validatePartitionValues = shouldValidatePartitionCols
+    )
+
+    parsed.toSeq(partitionSchema)
+  }
 }
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java b/hudi-common/src/main/java/org/apache/hudi/common/table/view/AbstractTableFileSystemView.java
@@ -143,7 +143,7 @@ public List<HoodieFileGroup> addFilesToView(FileStatus[] statuses) {
     // Group by partition for efficient updates for both InMemory and DiskBased structures.
     fileGroups.stream().collect(Collectors.groupingBy(HoodieFileGroup::getPartitionPath)).forEach((partition, value) -> {
       if (!isPartitionAvailableInStore(partition)) {
-        if (bootstrapIndex.useIndex()) {
+        if (!partition.isEmpty() && bootstrapIndex.useIndex()) {
           try (BootstrapIndex.IndexReader reader = bootstrapIndex.createReader()) {
             LOG.info("Bootstrap Index available for partition " + partition);
             List<BootstrapFileMapping> sourceFileMappings =

diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroBootstrapFileReader.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroBootstrapFileReader.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+
+import java.util.Objects;
+
+/**
+ * {@link HoodieBootstrapFileReader} specialization for Avro {@link IndexedRecord} records.
+ *
+ * <p>All constructor arguments are passed to the parent unchanged; this class only supplies
+ * the record-level partition field setter.
+ */
+public class HoodieAvroBootstrapFileReader extends HoodieBootstrapFileReader<IndexedRecord> {
+
+  public HoodieAvroBootstrapFileReader(HoodieFileReader<IndexedRecord> skeletonFileReader,
+                                       HoodieFileReader<IndexedRecord> dataFileReader,
+                                       Option<String[]> partitionFields,
+                                       Object[] partitionValues) {
+    super(skeletonFileReader, dataFileReader, partitionFields, partitionValues);
+  }
+
+  /**
+   * Stores {@code fieldValue} at {@code position} of {@code row} unless a non-null value is
+   * already present there.
+   */
+  @Override
+  protected void setPartitionField(int position, Object fieldValue, IndexedRecord row) {
+    // A value already present at this position wins over the supplied one.
+    if (Objects.nonNull(row.get(position))) {
+      return;
+    }
+    row.put(position, fieldValue);
+  }
+}
diff --git a/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java b/hudi-common/src/main/java/org/apache/hudi/io/storage/HoodieAvroFileReaderFactory.java
@@ -18,6 +18,9 @@
 
 package org.apache.hudi.io.storage;
 
+import org.apache.hudi.common.util.Option;
+
+import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
@@ -39,4 +42,10 @@ protected HoodieFileReader newHFileFileReader(Configuration conf, Path path) thr
   protected HoodieFileReader newOrcFileReader(Configuration conf, Path path) {
     return new HoodieAvroOrcReader(conf, path);
   }
+
+  /**
+   * Builds a {@link HoodieAvroBootstrapFileReader} from the given skeleton-file and data-file
+   * readers, passing the partition fields and partition values straight through.
+   */
+  @Override
+  public HoodieFileReader newBootstrapFileReader(
+      HoodieFileReader skeletonFileReader,
+      HoodieFileReader dataFileReader,
+      Option<String[]> partitionFields,
+      Object[] partitionValues) {
+    return new HoodieAvroBootstrapFileReader(skeletonFileReader, dataFileReader, partitionFields, partitionValues);
+  }
 }