Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ Optimizations

Bug Fixes
---------------------
* GITHUB#15812: CJKBigramFilter now produces consistent token positions regardless of the
outputUnigrams setting. Previously, when outputUnigrams=false, bigrams did not account for
spanning two character positions, causing position mismatches after word breaks that broke
phrase queries in combined unigram+bigram indexing strategies. (Sarah Syarofina)

* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
when all values are identical. (Mike Sokolov)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ public final class CJKBigramFilter extends TokenFilter {
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram

// When outputUnigrams=false, a bigram spans two character positions but only advances the
// position counter by 1. After a CJK segment boundary (word break or non-CJK token), the next
// token must account for the "missing" trailing unigram position. These fields track this.
private boolean hadBigrams; // true if bigrams were emitted from the current CJK segment
private int deferredPosInc; // extra position increment to apply to the next emitted token

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
Expand Down Expand Up @@ -191,6 +197,10 @@ public boolean incrementToken() throws IOException {
// otherwise, we clear the buffer and start over.

if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
if (hadBigrams) {
deferredPosInc++;
hadBigrams = false;
}
if (hasBufferedUnigram()) {

// we have a buffered unigram, and we peeked ahead to see if we could form
Expand All @@ -209,6 +219,11 @@ public boolean incrementToken() throws IOException {

// not a CJK type: we just return these as-is.

if (hadBigrams) {
deferredPosInc++;
hadBigrams = false;
}

if (hasBufferedUnigram()) {

// we have a buffered unigram, and we peeked ahead to see if we could form
Expand All @@ -219,6 +234,10 @@ public boolean incrementToken() throws IOException {
flushUnigram();
return true;
}
if (deferredPosInc > 0) {
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + deferredPosInc);
deferredPosInc = 0;
}
return true;
}
} else {
Expand Down Expand Up @@ -317,6 +336,13 @@ private void flushBigram() {
if (outputUnigrams) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(2);
} else {
// Apply any deferred position increment from a previous CJK segment boundary.
if (deferredPosInc > 0) {
posIncAtt.setPositionIncrement(1 + deferredPosInc);
deferredPosInc = 0;
}
hadBigrams = true;
}
index++;
}
Expand All @@ -333,6 +359,10 @@ private void flushUnigram() {
termAtt.setLength(len);
offsetAtt.setOffset(startOffset[index], endOffset[index]);
typeAtt.setType(SINGLE_TYPE);
if (deferredPosInc > 0) {
posIncAtt.setPositionIncrement(1 + deferredPosInc);
deferredPosInc = 0;
}
index++;
}

Expand Down Expand Up @@ -364,5 +394,7 @@ public void reset() throws IOException {
loneState = null;
exhausted = false;
ngramState = false;
hadBigrams = false;
deferredPosInc = 0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public void testJa2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 2, 1, 1, 1, 2});
}

public void testC() throws IOException {
Expand Down Expand Up @@ -147,7 +147,7 @@ public void testFinalOffset() throws IOException {
new int[] {0, 2},
new int[] {2, 6},
new String[] {"<DOUBLE>", "<ALPHANUM>"},
new int[] {1, 1});
new int[] {1, 2});

assertAnalyzesTo(
analyzer,
Expand Down Expand Up @@ -177,7 +177,7 @@ public void testMix() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});
}

public void testMix2() throws IOException {
Expand All @@ -200,7 +200,7 @@ public void testMix2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

/** Non-english text (outside of CJK) is treated normally, according to unicode rules */
Expand Down Expand Up @@ -256,7 +256,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});

assertAnalyzesTo(
analyzer,
Expand All @@ -277,7 +277,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

public void testSingleChar() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
"<HIRAGANA>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
new int[] {1, 1, 1, 1, 2, 1, 2, 1, 1, 1},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
a.close();
}
Expand Down Expand Up @@ -217,6 +217,79 @@ public void testUnigramsAndBigramsHuge() throws Exception {
});
}

/**
 * Verifies that token positions agree between the bigram-only and the unigram+bigram output modes
 * when a CJK run is interrupted by a word break (punctuation) and followed by another token. See
 * GITHUB#15812.
 */
public void testBigramPositionsConsistentAcrossWordBreak() throws Exception {
  // Input "一二、三": the bigram 一二 spans positions 0-1, the punctuation breaks the segment,
  // and the lone 三 must therefore land on position 2 in BOTH modes.
  String input = "一二、三";

  // Bigram-only mode: 一二(pos 0) then 三(pos 2) — the posInc of 2 skips the position that the
  // trailing unigram 二 would have occupied.
  String[] bigramTerms = {"一二", "三"};
  int[] bigramStarts = {0, 3};
  int[] bigramEnds = {2, 4};
  String[] bigramTypes = {"<DOUBLE>", "<SINGLE>"};
  int[] bigramPosIncs = {1, 2};
  int[] bigramPosLens = {1, 1};
  assertAnalyzesTo(
      analyzer, input, bigramTerms, bigramStarts, bigramEnds, bigramTypes, bigramPosIncs,
      bigramPosLens);

  // Unigram+bigram mode for comparison: 一(pos 0), 一二(pos 0, length 2), 二(pos 1), 三(pos 2).
  String[] unibiTerms = {"一", "一二", "二", "三"};
  int[] unibiStarts = {0, 0, 1, 3};
  int[] unibiEnds = {1, 2, 2, 4};
  String[] unibiTypes = {"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<SINGLE>"};
  int[] unibiPosIncs = {1, 0, 1, 1};
  int[] unibiPosLens = {1, 2, 1, 1};
  assertAnalyzesTo(
      unibiAnalyzer, input, unibiTerms, unibiStarts, unibiEnds, unibiTypes, unibiPosIncs,
      unibiPosLens);
}

/**
 * Verifies positions across several CJK segments separated by word breaks. Each segment's final
 * bigram spans two positions, so every token that follows a break must advance by 2 to stay
 * aligned with the unigram position space.
 */
public void testBigramPositionsMultipleSegments() throws Exception {
  // "一二、三四、五" splits into the segments [一二], [三四], [五]; the second and third tokens
  // each carry a posInc of 2 to cover the position consumed by the preceding bigram's tail.
  String[] terms = {"一二", "三四", "五"};
  int[] startOffsets = {0, 3, 6};
  int[] endOffsets = {2, 5, 7};
  String[] types = {"<DOUBLE>", "<DOUBLE>", "<SINGLE>"};
  int[] posIncrements = {1, 2, 2};
  int[] posLengths = {1, 1, 1};
  assertAnalyzesTo(
      analyzer, "一二、三四、五", terms, startOffsets, endOffsets, types, posIncrements,
      posLengths);
}

/**
 * Verifies positions when a CJK bigram segment is immediately followed by non-CJK text: the
 * non-CJK token must account for the bigram spanning two positions (posInc of 2).
 *
 * <p>Uses try-with-resources so the locally-built analyzer is closed even when an assertion
 * fails, instead of leaking it on the failure path.
 */
public void testBigramPositionsBeforeNonCJK() throws Exception {
  try (Analyzer hanOnlyAnalyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer t = new StandardTokenizer();
          // HAN-only flags: no unigram output, so the bigram is the sole CJK token.
          return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
        }
      }) {
    // "学生abc" → HAN bigram "学生" (positions 0-1) then non-CJK "abc" at position 2.
    assertAnalyzesTo(
        hanOnlyAnalyzer,
        "学生abc",
        new String[] {"学生", "abc"},
        new int[] {0, 2},
        new int[] {2, 5},
        new String[] {"<DOUBLE>", "<ALPHANUM>"},
        new int[] {1, 2},
        new int[] {1, 1});
  }
}

/** blast some random strings through the analyzer */
public void testRandomUnibiStrings() throws Exception {
checkRandomData(random(), unibiAnalyzer, 200 * RANDOM_MULTIPLIER);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public void testJa2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 2, 1, 1, 1, 2});
}

public void testC() throws IOException {
Expand Down Expand Up @@ -179,7 +179,7 @@ public void testFinalOffset() throws IOException {
new int[] {0, 2},
new int[] {2, 6},
new String[] {"<DOUBLE>", "<ALPHANUM>"},
new int[] {1, 1});
new int[] {1, 2});

assertAnalyzesTo(
analyzer,
Expand Down Expand Up @@ -209,7 +209,7 @@ public void testMix() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});
}

public void testMix2() throws IOException {
Expand All @@ -232,7 +232,7 @@ public void testMix2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

/** Non-english text (outside of CJK) is treated normally, according to unicode rules */
Expand Down Expand Up @@ -288,7 +288,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});

assertAnalyzesTo(
analyzer,
Expand All @@ -309,7 +309,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

public void testSingleChar() throws IOException {
Expand Down
Loading