Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ Optimizations

Bug Fixes
---------------------
* GITHUB#15812: CJKBigramFilter now produces consistent token positions regardless of the
outputUnigrams setting. Previously, when outputUnigrams=false, bigrams did not account for
spanning two character positions, causing position mismatches after word breaks that broke
phrase queries in combined unigram+bigram indexing strategies. (Sarah Syarofina)

* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
when all values are identical. (Mike Sokolov)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ public final class CJKBigramFilter extends TokenFilter {
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram

// When outputUnigrams=false, a bigram spans two character positions but only advances the
// position counter by 1. After a CJK segment boundary (word break or non-CJK token), the next
// token must account for the "missing" trailing unigram position. These fields track this.
private boolean hadBigrams; // true if bigrams were emitted from the current CJK segment
private int deferredPosInc; // extra position increment to apply to the next emitted token

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
Expand Down Expand Up @@ -191,6 +197,10 @@ public boolean incrementToken() throws IOException {
// otherwise, we clear the buffer and start over.

if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
if (hadBigrams) {
deferredPosInc++;
hadBigrams = false;
}
if (hasBufferedUnigram()) {

// we have a buffered unigram, and we peeked ahead to see if we could form
Expand All @@ -209,6 +219,11 @@ public boolean incrementToken() throws IOException {

// not a CJK type: we just return these as-is.

if (hadBigrams) {
deferredPosInc++;
hadBigrams = false;
}

if (hasBufferedUnigram()) {

// we have a buffered unigram, and we peeked ahead to see if we could form
Expand All @@ -219,6 +234,10 @@ public boolean incrementToken() throws IOException {
flushUnigram();
return true;
}
if (deferredPosInc > 0) {
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + deferredPosInc);
deferredPosInc = 0;
}
return true;
}
} else {
Expand Down Expand Up @@ -317,6 +336,13 @@ private void flushBigram() {
if (outputUnigrams) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(2);
} else {
// Apply any deferred position increment from a previous CJK segment boundary.
if (deferredPosInc > 0) {
posIncAtt.setPositionIncrement(1 + deferredPosInc);
deferredPosInc = 0;
}
hadBigrams = true;
}
index++;
}
Expand All @@ -333,6 +359,10 @@ private void flushUnigram() {
termAtt.setLength(len);
offsetAtt.setOffset(startOffset[index], endOffset[index]);
typeAtt.setType(SINGLE_TYPE);
if (deferredPosInc > 0) {
posIncAtt.setPositionIncrement(1 + deferredPosInc);
deferredPosInc = 0;
}
index++;
}

Expand Down Expand Up @@ -364,5 +394,7 @@ public void reset() throws IOException {
loneState = null;
exhausted = false;
ngramState = false;
hadBigrams = false;
deferredPosInc = 0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public void testJa2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 2, 1, 1, 1, 2});
}

public void testC() throws IOException {
Expand Down Expand Up @@ -147,7 +147,7 @@ public void testFinalOffset() throws IOException {
new int[] {0, 2},
new int[] {2, 6},
new String[] {"<DOUBLE>", "<ALPHANUM>"},
new int[] {1, 1});
new int[] {1, 2});

assertAnalyzesTo(
analyzer,
Expand Down Expand Up @@ -177,7 +177,7 @@ public void testMix() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});
}

public void testMix2() throws IOException {
Expand All @@ -200,7 +200,7 @@ public void testMix2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

/** Non-english text (outside of CJK) is treated normally, according to unicode rules */
Expand Down Expand Up @@ -256,7 +256,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});

assertAnalyzesTo(
analyzer,
Expand All @@ -277,7 +277,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

public void testSingleChar() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
"<HIRAGANA>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
new int[] {1, 1, 1, 1, 2, 1, 2, 1, 1, 1},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
a.close();
}
Expand Down Expand Up @@ -217,6 +217,79 @@ public void testUnigramsAndBigramsHuge() throws Exception {
});
}

/**
 * Verifies that token positions agree between the bigram-only and the unigram+bigram output modes
 * when a CJK run is interrupted by a word break (punctuation) and followed by another token. See
 * GITHUB#15812.
 */
public void testBigramPositionsConsistentAcrossWordBreak() throws Exception {
  // Input "一二、三": the bigram 一二 spans positions 0-1, the punctuation breaks the segment,
  // and the lone 三 must therefore land on position 2 in BOTH modes.
  String input = "一二、三";

  // Bigram-only mode: 一二(pos 0) then 三(pos 2) — the posInc of 2 skips the position that the
  // trailing unigram 二 would have occupied.
  String[] bigramTerms = {"一二", "三"};
  int[] bigramStarts = {0, 3};
  int[] bigramEnds = {2, 4};
  String[] bigramTypes = {"<DOUBLE>", "<SINGLE>"};
  int[] bigramPosIncs = {1, 2};
  int[] bigramPosLens = {1, 1};
  assertAnalyzesTo(
      analyzer, input, bigramTerms, bigramStarts, bigramEnds, bigramTypes, bigramPosIncs,
      bigramPosLens);

  // Unigram+bigram mode for comparison: 一(pos 0), 一二(pos 0, length 2), 二(pos 1), 三(pos 2).
  String[] unibiTerms = {"一", "一二", "二", "三"};
  int[] unibiStarts = {0, 0, 1, 3};
  int[] unibiEnds = {1, 2, 2, 4};
  String[] unibiTypes = {"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<SINGLE>"};
  int[] unibiPosIncs = {1, 0, 1, 1};
  int[] unibiPosLens = {1, 2, 1, 1};
  assertAnalyzesTo(
      unibiAnalyzer, input, unibiTerms, unibiStarts, unibiEnds, unibiTypes, unibiPosIncs,
      unibiPosLens);
}

/**
 * Verifies positions across several CJK segments separated by word breaks. Each segment's final
 * bigram spans two positions, so every token that follows a break must advance by 2 to stay
 * aligned with the unigram position space.
 */
public void testBigramPositionsMultipleSegments() throws Exception {
  // "一二、三四、五" splits into the segments [一二], [三四], [五]; the second and third tokens
  // each carry a posInc of 2 to cover the position consumed by the preceding bigram's tail.
  String[] terms = {"一二", "三四", "五"};
  int[] startOffsets = {0, 3, 6};
  int[] endOffsets = {2, 5, 7};
  String[] types = {"<DOUBLE>", "<DOUBLE>", "<SINGLE>"};
  int[] posIncrements = {1, 2, 2};
  int[] posLengths = {1, 1, 1};
  assertAnalyzesTo(
      analyzer, "一二、三四、五", terms, startOffsets, endOffsets, types, posIncrements,
      posLengths);
}

/**
 * Verifies positions when a CJK bigram segment is immediately followed by non-CJK text: the
 * non-CJK token must account for the bigram spanning two positions (posInc of 2).
 *
 * <p>Uses try-with-resources so the locally-built analyzer is closed even when an assertion
 * fails, instead of leaking it on the failure path.
 */
public void testBigramPositionsBeforeNonCJK() throws Exception {
  try (Analyzer hanOnlyAnalyzer =
      new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer t = new StandardTokenizer();
          // HAN-only flags: no unigram output, so the bigram is the sole CJK token.
          return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
        }
      }) {
    // "学生abc" → HAN bigram "学生" (positions 0-1) then non-CJK "abc" at position 2.
    assertAnalyzesTo(
        hanOnlyAnalyzer,
        "学生abc",
        new String[] {"学生", "abc"},
        new int[] {0, 2},
        new int[] {2, 5},
        new String[] {"<DOUBLE>", "<ALPHANUM>"},
        new int[] {1, 2},
        new int[] {1, 1});
  }
}

/** blast some random strings through the analyzer */
public void testRandomUnibiStrings() throws Exception {
checkRandomData(random(), unibiAnalyzer, 200 * RANDOM_MULTIPLIER);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public void testJa2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 2, 1, 1, 1, 2});
}

public void testC() throws IOException {
Expand Down Expand Up @@ -179,7 +179,7 @@ public void testFinalOffset() throws IOException {
new int[] {0, 2},
new int[] {2, 6},
new String[] {"<DOUBLE>", "<ALPHANUM>"},
new int[] {1, 1});
new int[] {1, 2});

assertAnalyzesTo(
analyzer,
Expand Down Expand Up @@ -209,7 +209,7 @@ public void testMix() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});
}

public void testMix2() throws IOException {
Expand All @@ -232,7 +232,7 @@ public void testMix2() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

/** Non-english text (outside of CJK) is treated normally, according to unicode rules */
Expand Down Expand Up @@ -288,7 +288,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<DOUBLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1});

assertAnalyzesTo(
analyzer,
Expand All @@ -309,7 +309,7 @@ public void testReusableTokenStream() throws IOException {
"<DOUBLE>",
"<SINGLE>"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2});
}

public void testSingleChar() throws IOException {
Expand Down
Loading