|
29 | 29 | import org.apache.hudi.common.model.HoodieWriteStat; |
30 | 30 | import org.apache.hudi.common.table.timeline.HoodieInstant; |
31 | 31 | import org.apache.hudi.common.table.timeline.HoodieTimeline; |
| 32 | +import org.apache.hudi.common.util.CollectionUtils; |
32 | 33 | import org.apache.hudi.common.util.NumericUtils; |
33 | 34 | import org.apache.hudi.common.util.Option; |
34 | 35 | import org.apache.hudi.common.util.collection.Pair; |
35 | 36 | import org.apache.hudi.config.HoodieWriteConfig; |
36 | 37 | import org.apache.hudi.table.HoodieTable; |
37 | 38 | import org.apache.hudi.table.WorkloadProfile; |
38 | 39 | import org.apache.hudi.table.WorkloadStat; |
| 40 | + |
39 | 41 | import org.apache.log4j.LogManager; |
40 | 42 | import org.apache.log4j.Logger; |
41 | 43 | import org.apache.spark.api.java.JavaRDD; |
|
54 | 56 |
|
55 | 57 | import scala.Tuple2; |
56 | 58 |
|
| 59 | +import static org.apache.hudi.common.table.timeline.HoodieTimeline.COMMIT_ACTION; |
| 60 | + |
57 | 61 | /** |
58 | 62 | * Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition). |
59 | 63 | */ |
@@ -158,13 +162,17 @@ private List<SmallFile> filterSmallFilesInClustering(final Set<String> pendingCl |
158 | 162 | private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) { |
159 | 163 | // for new inserts, compute buckets depending on how many records we have for each partition |
160 | 164 | Set<String> partitionPaths = profile.getPartitionPaths(); |
161 | | - long averageRecordSize = |
162 | | - averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), |
163 | | - config); |
| 165 | + /* |
| 166 | + * NOTE: we only use commit instants to calculate average record size because replacecommit can be |
| 167 | + * created by clustering, which has smaller average record size, which affects assigning inserts and |
| 168 | + * may result in OOM by making spark underestimate the actual input record sizes. |
| 169 | + */ |
| 170 | + long averageRecordSize = averageBytesPerRecord(table.getMetaClient().getActiveTimeline() |
| 171 | + .getTimelineOfActions(CollectionUtils.createSet(COMMIT_ACTION)).filterCompletedInstants(), config); |
164 | 172 | LOG.info("AvgRecordSize => " + averageRecordSize); |
165 | 173 |
|
166 | 174 | Map<String, List<SmallFile>> partitionSmallFilesMap = |
167 | | - getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), context); |
| 175 | + getSmallFilesForPartitions(new ArrayList<>(partitionPaths), context); |
168 | 176 |
|
169 | 177 | Map<String, Set<String>> partitionPathToPendingClusteringFileGroupsId = getPartitionPathToPendingClusteringFileGroupsId(); |
170 | 178 |
|
|
0 commit comments