@@ -50,6 +50,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
@@ -316,15 +317,20 @@ public void testReaderGetRecordIteratorByKeyPrefixes() throws Exception {
     assertEquals(expectedKey50and0s, recordsByPrefix);
 
     // filter for "key1" and "key0" : entries from 'key10 to key19' and 'key00 to key09' should be matched.
-    List<GenericRecord> expectedKey1sand0s = expectedKey1s;
-    expectedKey1sand0s.addAll(allRecords.stream()
-        .filter(entry -> (entry.get("_row_key").toString()).contains("key0"))
-        .collect(Collectors.toList()));
+    List<GenericRecord> expectedKey1sand0s = allRecords.stream()
+        .filter(entry -> (entry.get("_row_key").toString()).contains("key1") || (entry.get("_row_key").toString()).contains("key0"))
+        .collect(Collectors.toList());
     iterator =
         hfileReader.getRecordsByKeyPrefixIterator(Arrays.asList("key1", "key0"), avroSchema);
     recordsByPrefix =
         StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED), false)
             .collect(Collectors.toList());
+    Collections.sort(recordsByPrefix, new Comparator<GenericRecord>() {
+      @Override
+      public int compare(GenericRecord o1, GenericRecord o2) {
+        return o1.get("_row_key").toString().compareTo(o2.get("_row_key").toString());
+      }
+    });
     assertEquals(expectedKey1sand0s, recordsByPrefix);
   }
 
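The anonymous Comparator added in the test above can also be expressed with Comparator.comparing; the following is a minimal, self-contained sketch of that alternative, using a plain stand-in class rather than Avro's GenericRecord (nothing below is part of the patch):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class SortByRowKeyExample {
  // Plain stand-in for a record that exposes a row-key field.
  static class Rec {
    final String rowKey;
    Rec(String rowKey) { this.rowKey = rowKey; }
    @Override
    public String toString() { return rowKey; }
  }

  public static void main(String[] args) {
    List<Rec> records = new ArrayList<>(Arrays.asList(new Rec("key12"), new Rec("key03"), new Rec("key10")));
    // Same intent as the sort in the test: order records lexicographically by row key.
    records.sort(Comparator.comparing(r -> r.rowKey));
    System.out.println(records); // prints [key03, key10, key12]
  }
}
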
@@ -259,11 +259,8 @@ private static Iterator<GenericRecord> getRecordByKeyPrefixIteratorInternal(HFil
        return Collections.emptyIterator();
      }
    } else if (val == -1) {
-      // If scanner is aleady on the top of hfile. avoid trigger seekTo again.
-      Option<Cell> headerCell = Option.fromJavaOptional(scanner.getReader().getFirstKey());
-      if (headerCell.isPresent() && !headerCell.get().equals(scanner.getCell())) {
-        scanner.seekTo();
-      }
+      // seek to beginning. anyways, its key prefix search.
+      scanner.seekTo();
    }
 
    class KeyPrefixIterator implements Iterator<GenericRecord> {
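
For context on the change above: per the old and new comments, the val == -1 branch covers the case where the sought prefix sorts before the first key in the file, so the scan should simply start at the beginning. The seek-then-scan pattern behind a prefix lookup can be sketched independently of the HFile API with a sorted map; TreeMap and all names below are illustrative stand-ins, not the reader's actual API:

import java.util.Map;
import java.util.TreeMap;

public class PrefixScanSketch {
  public static void main(String[] args) {
    // Sorted key space standing in for the keys of a file.
    TreeMap<String, String> records = new TreeMap<>();
    records.put("key03", "v03");
    records.put("key10", "v10");
    records.put("key12", "v12");

    String prefix = "key0";
    // Position at the first key >= prefix, then scan forward while keys still match the prefix.
    for (Map.Entry<String, String> e : records.tailMap(prefix, true).entrySet()) {
      if (!e.getKey().startsWith(prefix)) {
        break; // past the prefix range, stop scanning
      }
      System.out.println(e.getKey() + " -> " + e.getValue());
    }
  }
}
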
@@ -142,8 +142,13 @@ protected Option<HoodieRecord<HoodieMetadataPayload>> getRecordByKey(String key,
   }
 
   @Override
-  public HoodieData<HoodieRecord<HoodieMetadataPayload>> getRecordsByKeyPrefixes(List<String> keyPrefixes,
+  public HoodieData<HoodieRecord<HoodieMetadataPayload>> getRecordsByKeyPrefixes(List<String> keyPrefixesUnsorted,
                                                                                  String partitionName) {
+    // Sort the columns so that keys are looked up in order
+    List<String> keyPrefixes = new ArrayList<>();
+    keyPrefixes.addAll(keyPrefixesUnsorted);
+    Collections.sort(keyPrefixes);
+
     // NOTE: Since we partition records to a particular file-group by full key, we will have
     // to scan all file-groups for all key-prefixes as each of these might contain some
     // records matching the key-prefix
@@ -192,8 +197,12 @@ public HoodieData<HoodieRecord<HoodieMetadataPayload>> getRecordsByKeyPrefixes(L
   }
 
   @Override
-  public List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> getRecordsByKeys(List<String> keys,
+  public List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> getRecordsByKeys(List<String> keysUnsorted,
                                                                                           String partitionName) {
+    // Sort the columns so that keys are looked up in order
+    List<String> keys = new ArrayList<>();
+    keys.addAll(keysUnsorted);
+    Collections.sort(keys);
     Map<Pair<String, FileSlice>, List<String>> partitionFileSliceToKeysMap = getPartitionFileSliceToKeysMapping(partitionName, keys);
     List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> result = new ArrayList<>();
     AtomicInteger fileSlicesKeysCount = new AtomicInteger();
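
The two hunks above apply the same idiom: copy the incoming key list, then sort the copy, so the caller's list stays unmodified and lookups proceed in key order. A minimal, self-contained sketch of that idiom follows; the class and method names are illustrative, not from the patch:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class SortedKeyLookupSketch {
  // Copy-then-sort: leaves the caller's list untouched and yields keys in lookup order.
  static List<String> sortedCopy(List<String> keysUnsorted) {
    List<String> keys = new ArrayList<>(keysUnsorted);
    Collections.sort(keys);
    return keys;
  }

  public static void main(String[] args) {
    List<String> input = Arrays.asList("key7", "key1", "key4");
    System.out.println(sortedCopy(input)); // [key1, key4, key7]
    System.out.println(input);             // original order preserved: [key7, key1, key4]
  }
}
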
@@ -1,4 +1,4 @@
{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"valueCount":9}
{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"valueCount":8}
{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"valueCount":10}
{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"valueCount":13}
{"c1_maxValue":769,"c1_minValue":309,"c1_nullCount":0,"c2_maxValue":" 769sdc","c2_minValue":" 309sdc","c2_nullCount":0,"valueCount":9}
{"c1_maxValue":932,"c1_minValue":0,"c1_nullCount":0,"c2_maxValue":" 932sdc","c2_minValue":" 0sdc","c2_nullCount":0,"valueCount":8}
{"c1_maxValue":943,"c1_minValue":89,"c1_nullCount":0,"c2_maxValue":" 943sdc","c2_minValue":" 200sdc","c2_nullCount":0,"valueCount":10}
{"c1_maxValue":959,"c1_minValue":74,"c1_nullCount":0,"c2_maxValue":" 959sdc","c2_minValue":" 181sdc","c2_nullCount":0,"valueCount":13}
@@ -250,7 +250,7 @@ class TestColumnStatsIndex extends HoodieClientTestBase with ColumnStatsIndexSup
 
     {
       // We have to include "c1", since we sort the expected outputs by this column
-      val requestedColumns = Seq("c1", "c4")
+      val requestedColumns = Seq("c2", "c1", "c4")
Reviewer comment (Contributor): nit: Flipping the order, we could have avoided the need to change the fixture.


       val partialColStatsDF = readColumnStatsIndex(spark, basePath, metadataConfig, requestedColumns)
       val partialTransposedColStatsDF = transposeColumnStatsIndex(spark, partialColStatsDF, requestedColumns, sourceTableSchema)