diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3cfc437107a4..d08a8521c932 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -241,6 +241,8 @@ Optimizations * GITHUB#15024: Improve prefix sum computation in Lucene99HnswVectorsReader for faster neighbor decoding. (Luis Negrin) +* GITHUB#15779: Improve BytesRefHash.add performance by optimizing the rehash operation. (tyronecai) + Bug Fixes --------------------- * GITHUB#15754: Fix HTMLStripCharFilter to prevent tags from incorrectly consuming subsequent diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java index 2e1850344dc9..f71c59f768e0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java @@ -433,40 +433,37 @@ public int addByPoolOffset(int offset) { private void rehash(final int newSize, boolean hashOnData) { final int newMask = newSize - 1; final int newHighMask = ~newMask; - bytesUsed.addAndGet(Integer.BYTES * (long) newSize); - final int[] newHash = new int[newSize]; - Arrays.fill(newHash, -1); - for (int i = 0; i < hashSize; i++) { - int e0 = ids[i]; - if (e0 != -1) { - e0 &= hashMask; - final int hashcode; - int code; - if (hashOnData) { - hashcode = code = pool.hash(bytesStart[e0]); - } else { - code = bytesStart[e0]; - hashcode = 0; - } - - int hashPos = code & newMask; - assert hashPos >= 0; - - // Conflict; use linear probe to find an open slot - // (see LUCENE-5604): - while (newHash[hashPos] != -1) { - code++; - hashPos = code & newMask; - } - - newHash[hashPos] = e0 | (hashcode & newHighMask); + bytesUsed.addAndGet(Integer.BYTES * (long) (newSize - ids.length)); + + ids = new int[newSize]; + Arrays.fill(ids, -1); + + // rebuild ids from terms in pool pointed by bytesStart + for (int id = 0; id < count; id++) { + final int hashcode; + int code; + if (hashOnData) { + hashcode = code = pool.hash(bytesStart[id]); + } else 
{ + code = bytesStart[id]; + hashcode = 0; } + + int hashPos = code & newMask; + assert hashPos >= 0; + + // Conflict; use linear probe to find an open slot + // (see LUCENE-5604): + while (ids[hashPos] != -1) { + code++; + hashPos = code & newMask; + } + + ids[hashPos] = id | (hashcode & newHighMask); } hashMask = newMask; highMask = newHighMask; - bytesUsed.addAndGet(Integer.BYTES * (long) -ids.length); - ids = newHash; hashSize = newSize; hashHalfSize = newSize / 2; }