@@ -63,7 +63,8 @@ public class TestAverageRecordSizeEstimator {
6363 private static final String PARTITION1 = "partition1" ;
6464 private static final String TEST_WRITE_TOKEN = "1-0-1" ;
6565 private static final Integer DEFAULT_MAX_COMMITS = 2 ;
66- private static final Integer DEFAULT_MAX_PARQUET_METADATA_SIZE = 1000 ;
66+ // needs to be big enough to skew the estimate
67+ private static final Integer DEFAULT_AVERAGE_PARQUET_METADATA_SIZE = 10000000 ;
6768 private static final Double DEFAULT_RECORD_SIZE_ESTIMATE_THRESHOLD = 0.1 ;
6869
6970 @ Test
@@ -102,7 +103,7 @@ public void testAverageRecordSizeWithNonEmptyCommitTimeline(List<Pair<HoodieInst
102103 HoodieWriteConfig writeConfig = HoodieWriteConfig .newBuilder ().withPath ("/tmp" )
103104 .withRecordSizeEstimator (AverageRecordSizeEstimator .class .getName ())
104105 .withRecordSizeEstimatorMaxCommits (DEFAULT_MAX_COMMITS )
105- .withRecordSizeEstimatorAverageMetadataSize (DEFAULT_MAX_PARQUET_METADATA_SIZE )
106+ .withRecordSizeEstimatorAverageMetadataSize (DEFAULT_AVERAGE_PARQUET_METADATA_SIZE )
106107 .withCompactionConfig (HoodieCompactionConfig .newBuilder ()
107108 .compactionRecordSizeEstimateThreshold (DEFAULT_RECORD_SIZE_ESTIMATE_THRESHOLD )
108109 .build ())
@@ -152,85 +153,85 @@ private static String getLogFileName(String instantTime) {
152153
153154 private static Stream <Arguments > testCases () {
154155 Long baseInstant = 20231204194919610L ;
156+ Long standardCount = 10000000L ;
155157 List <Arguments > arguments = new ArrayList <>();
156158 // Note the avg record estimate is based on an average parquet metadata size of DEFAULT_AVERAGE_PARQUET_METADATA_SIZE bytes per file.
157159 // 1. straight forward. just 1 instant.
158160 arguments .add (Arguments .of (
159161 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
160- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 100L )))), 99L ));
162+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L )))), 99L ));
161163
162- // 2. two instants. avg of both the instants
164+ // 2. two instants. latest instant should be honored
163165 arguments .add (Arguments .of (
164166 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
165- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 100L ))),
167+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L ))),
166168 Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant + 100 )),
167- Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 1000000L , 200L )))), 109L ));
169+ Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , standardCount , 200L )))), 199L ));
168170
169- // 3. two instants, latest commit has a small file thats just above threshold, while earliest commit is fully ignored,
170- // since it below the threshold size limit
171+ // 3. two instants; the 2nd one is too small to meet the threshold, so the 1st one should be honored
171172 arguments .add (Arguments .of (
172173 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
173- Collections .singletonList (generateBaseWriteStat (baseInstant , 9000L , 1000L ))),
174+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L ))),
174175 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 100 )),
175- Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 110000 , 100L )))), 99L ));
176+ Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 1000L , 200L )))), 99L ));
176177
177- // 4. 2nd instance is replace commit, it shld be excluded and should be avg of both commits.
178+ // 4. 2nd instant is a replace commit; it should be excluded
178179 arguments .add (Arguments .of (
179180 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
180- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 100L ))),
181+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 200L ))),
181182 Pair .of (generateCompletedInstant (HoodieTimeline .REPLACE_COMMIT_ACTION , Long .toString (baseInstant + 100 )),
182- Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 10000000L , 200L )))), 99L ));
183+ Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , standardCount , 100L )))), 199L ));
183184
184185 // 5. for delta commits, only parquet files should be accounted for.
185186 arguments .add (Arguments .of (
186187 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
187- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 100L ))),
188+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L ))),
188189 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 100 )),
189- Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 10000000L , 200L )))), 149L ));
190+ Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , standardCount , 200L )))), 199L ));
190191
191192 // 6. delta commit has a mix of parquet and log files. only parquet files should be accounted for.
192193 arguments .add (Arguments .of (
193194 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant )),
194- Collections .singletonList (generateBaseWriteStat (baseInstant , 1000000L , 100L ))),
195+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L ))),
195196 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 100 )),
196- Arrays .asList (generateBaseWriteStat (baseInstant + 100 , 10000000L , 200L ),
197- generateLogWriteStat (baseInstant + 100 , 10000000L , 300L )))), 190L ));
197+ Arrays .asList (generateBaseWriteStat (baseInstant + 100 , standardCount , 200L ),
198+ generateLogWriteStat (baseInstant + 100 , standardCount , 300L )))), 199L ));
198199
199200 // 7. 2nd delta commit only has log files. and so we honor 1st delta commit size.
200201 arguments .add (Arguments .of (
201202 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant )),
202- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 100L ))),
203+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L ))),
203204 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 100 )),
204- Arrays .asList (generateLogWriteStat (baseInstant + 100 , 1000000L , 200L ),
205- generateLogWriteStat (baseInstant + 100 , 10000000L , 300L )))), 99L ));
205+ Arrays .asList (generateLogWriteStat (baseInstant + 100 , standardCount , 200L ),
206+ generateLogWriteStat (baseInstant + 100 , standardCount , 300L )))), 99L ));
206207
207208 // 8. since default max commits is overridden to 2 commits, the earliest commit is ignored here since there are 3 commits in total; the expected value is the default record size estimate
208209 arguments .add (Arguments .of (
209210 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
210- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 1000L ))),
211+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 200L ))),
211212 Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant + 100 )),
212- Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 10000000L , 50L ))),
213+ Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 1L , 50L ))),
213214 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 200 )),
214- Collections .singletonList (generateBaseWriteStat (baseInstant + 200 , 10000000L , 100L )))), 74L ));
215+ Collections .singletonList (generateBaseWriteStat (baseInstant + 200 , 1L , 100L )))), Long . valueOf ( HoodieCompactionConfig . COPY_ON_WRITE_RECORD_SIZE_ESTIMATE . defaultValue ()) ));
215216
216217 // 9. replace commits should be ignored despite being the latest commits.
217218 arguments .add (Arguments .of (
218219 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant )),
219- Collections .singletonList (generateBaseWriteStat (baseInstant , 1000000L , 100L ))),
220+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 100L ))),
220221 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 100 )),
221- Arrays .asList (generateLogWriteStat (baseInstant + 100 , 1000000L , 200L ),
222+ Arrays .asList (generateLogWriteStat (baseInstant + 100 , standardCount , 200L ),
222223 generateLogWriteStat (baseInstant + 100 , 1000000L , 300L ))),
223224 Pair .of (generateCompletedInstant (HoodieTimeline .REPLACE_COMMIT_ACTION , Long .toString (baseInstant )),
224- Collections .singletonList (generateBaseWriteStat (baseInstant + 200 , 1000000L , 2000L ))),
225+ Collections .singletonList (generateBaseWriteStat (baseInstant + 200 , standardCount , 2000L ))),
225226 Pair .of (generateCompletedInstant (HoodieTimeline .REPLACE_COMMIT_ACTION , Long .toString (baseInstant )),
226- Collections .singletonList (generateBaseWriteStat (baseInstant + 300 , 1000000L , 3000L )))), 99L ));
227+ Collections .singletonList (generateBaseWriteStat (baseInstant + 300 , standardCount , 3000L )))), 99L ));
227228
228229 // 10. Ignore commit stat with 0 records
229230 arguments .add (Arguments .of (
230231 Arrays .asList (Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant )),
231- Collections .singletonList (generateBaseWriteStat (baseInstant , 10000000L , 1000L ))),
232+ Collections .singletonList (generateBaseWriteStat (baseInstant , standardCount , 1000L ))),
232233 Pair .of (generateCompletedInstant (HoodieTimeline .COMMIT_ACTION , Long .toString (baseInstant + 100 )),
233- Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , 10000000L , 50L ))),
234+ Collections .singletonList (generateBaseWriteStat (baseInstant + 100 , standardCount , 50L ))),
234235 Pair .of (generateCompletedInstant (HoodieTimeline .DELTA_COMMIT_ACTION , Long .toString (baseInstant + 200 )),
235236 Collections .singletonList (generateBaseWriteStat (baseInstant + 200 , 0L , 1000L )))), 49L ));
236237
0 commit comments