Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ New Features

* GITHUB#15508: Use native vectorization in Lucene. (Ankur Goel, Shubham Chaudhary, Dawid Weiss)

* GITHUB#15818: Add BM25 k3 query-term frequency saturation to classic query parser. (Sagar Upadhyaya)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@

import java.io.StringReader;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -102,6 +106,14 @@ public abstract class QueryParserBase extends QueryBuilder
boolean autoGeneratePhraseQueries;
int determinizeWorkLimit = DEFAULT_DETERMINIZE_WORK_LIMIT;

/**
 * BM25's k3 parameter for query-side term frequency saturation. When duplicate terms appear in
 * the query string, their boost is computed as {@code ((k3 + 1) * qtf) / (k3 + qtf)} instead of a
 * linear sum of qtf, where qtf is the number of times the term occurs in the query. A negative
 * value (the default) disables saturation and preserves the existing linear boost behavior for
 * backward compatibility.
 */
float k3 = -1f;

// So the generated QueryParser(CharStream) won't error out
protected QueryParserBase() {
super(null);
Expand Down Expand Up @@ -363,6 +375,33 @@ public int getDeterminizeWorkLimit() {
return determinizeWorkLimit;
}

/**
 * Configures the BM25 k3 parameter that controls query-side term frequency saturation.
 *
 * <p>With a non-negative k3, a term that occurs {@code qtf} times in the query string receives a
 * single combined boost of {@code ((k3 + 1) * qtf) / (k3 + qtf)} rather than having the boosts of
 * its duplicate occurrences summed linearly.
 *
 * <p>Common values from IR literature are 7 or 8. Any negative value (the default is {@code -1})
 * switches saturation off and keeps the existing linear behavior.
 *
 * @param k3 the k3 saturation parameter, or a negative value to disable saturation
 */
@Override
public void setK3(float k3) {
  this.k3 = k3;
}

/**
 * Returns the BM25 k3 parameter currently used for query-side term frequency saturation.
 *
 * @return the configured k3 value; a negative value means saturation is disabled and duplicate
 *     terms keep their linearly summed boosts
 * @see #setK3(float)
 */
@Override
public float getK3() {
  return this.k3;
}

protected void addClause(List<BooleanClause> clauses, int conj, int mods, Query q) {
boolean required, prohibited;

Expand Down Expand Up @@ -660,13 +699,74 @@ protected Query getBooleanQuery(List<BooleanClause> clauses) throws ParseExcepti
if (clauses.isEmpty()) {
return null; // all clause words were filtered away by the analyzer.
}

if (k3 >= 0) {
clauses = applySaturation(clauses);
}

BooleanQuery.Builder query = newBooleanQuery();
for (final BooleanClause clause : clauses) {
query.add(clause);
}
return query.build();
}

/**
 * Applies BM25 k3 saturation to duplicate scoring clauses in the clause list. Clauses that share
 * both the same query and the same {@link Occur} (SHOULD or MUST) are collapsed into a single
 * clause whose boost is the saturated weight ((k3 + 1) * qtf) / (k3 + qtf), where qtf is the
 * number of occurrences of that clause.
 *
 * <p>A SHOULD and a MUST occurrence of the same query are deliberately <em>not</em> merged:
 * collapsing them would change the boolean semantics of the query (a required clause could
 * become optional, or vice versa), not just its scoring.
 *
 * <p>Non-scoring clauses (FILTER, MUST_NOT) are passed through unchanged, and the relative order
 * of the surviving clauses is preserved.
 *
 * @param clauses the parsed clauses, in query order
 * @return {@code clauses} itself when it contains no duplicates, otherwise a new list in which
 *     each group of duplicates is replaced by one saturated clause at the position of its first
 *     occurrence
 */
private List<BooleanClause> applySaturation(List<BooleanClause> clauses) {
  // Count occurrences per (query, occur) pair. BooleanClause equality covers both components,
  // so using the clause itself as the key keeps e.g. "a +a" as two distinct entries.
  Map<BooleanClause, Integer> counts = new LinkedHashMap<>();
  boolean hasDuplicates = false;

  for (BooleanClause clause : clauses) {
    Occur occur = clause.occur();
    if (occur == Occur.SHOULD || occur == Occur.MUST) {
      if (counts.merge(clause, 1, Integer::sum) > 1) {
        hasDuplicates = true;
      }
    }
  }

  if (!hasDuplicates) {
    // Nothing to collapse: hand back the caller's list untouched.
    return clauses;
  }

  // Rebuild the list: the first occurrence of each scoring clause carries the saturated boost,
  // later occurrences of the same clause are dropped.
  List<BooleanClause> result = new ArrayList<>(clauses.size());
  Set<BooleanClause> seen = new HashSet<>();

  for (BooleanClause clause : clauses) {
    Occur occur = clause.occur();
    if (occur == Occur.SHOULD || occur == Occur.MUST) {
      if (seen.add(clause)) {
        result.add(saturateClause(clause, counts.get(clause)));
      }
    } else {
      // FILTER, MUST_NOT: they do not contribute to the score, pass through unchanged
      result.add(clause);
    }
  }

  return result;
}

/**
 * Returns a clause with BM25 k3 saturated boost if qtf &gt; 1, or the original clause if qtf
 * &lt;= 1.
 *
 * <p>The weight is ((k3 + 1) * qtf) / (k3 + qtf). It is evaluated in double precision so that a
 * very large finite k3 cannot overflow the intermediate product, and an infinite k3 is mapped to
 * the mathematical limit of the formula (the linear weight qtf) instead of producing NaN, which
 * would not be a valid boost.
 *
 * <p>This method is only reached when k3 &gt;= 0 (see the guard in getBooleanQuery), so the
 * denominator is strictly positive.
 *
 * @param clause the first occurrence of a scoring clause
 * @param qtf how many times the clause occurs in the query
 * @return {@code clause} itself when qtf &lt;= 1, otherwise a new clause with the same occur
 *     wrapping the original query in a {@link BoostQuery} with the saturated weight
 */
private BooleanClause saturateClause(BooleanClause clause, int qtf) {
  if (qtf <= 1) {
    return clause;
  }
  final double k = k3;
  final float weight;
  if (Double.isInfinite(k)) {
    // Limit of ((k + 1) * qtf) / (k + qtf) as k grows without bound: no saturation at all.
    weight = qtf;
  } else {
    weight = (float) (((k + 1d) * qtf) / (k + qtf));
  }
  Query boosted = new BoostQuery(clause.query(), weight);
  return new BooleanClause(boosted, clause.occur());
}

/**
* Factory method for generating a query. Called when parser parses an input term token that
* contains one or more wildcard characters (? and *), but is not a prefix term token (one that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,24 @@ public interface CommonQueryParserConfiguration {
* @param dateResolution the default {@link Resolution}
*/
public void setDateResolution(DateTools.Resolution dateResolution);

/**
 * Sets BM25's k3 parameter for query-side term frequency saturation. When duplicate terms appear
 * in the query string, their combined boost is computed using the BM25 saturation formula:
 * {@code ((k3 + 1) * qtf) / (k3 + qtf)}, where qtf is the number of times the term appears in
 * the query.
 *
 * <p>Common values from IR literature are 7 or 8. A negative value (the default) disables
 * saturation and preserves the existing linear boost behavior.
 *
 * @param k3 the k3 saturation parameter, or a negative value to disable saturation
 */
public void setK3(float k3);

/**
 * Returns BM25's k3 parameter for query-side term frequency saturation. A negative value means
 * saturation is disabled.
 *
 * @return the k3 saturation parameter, negative when saturation is disabled
 * @see #setK3(float)
 */
public float getK3();
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,7 @@ public void testWildCardEscapes() throws ParseException, IOException {
Analyzer a = new ASCIIAnalyzer();
QueryParser parser = new QueryParser(FIELD, a);
assertTrue(isAHit(parser.parse("mö*tley"), "moatley", a));

// need to have at least one genuine wildcard to trigger the wildcard analysis
// hence the * before the y
assertTrue(isAHit(parser.parse("mö\\*tl*y"), "mo*tley", a));
Expand Down Expand Up @@ -1038,4 +1039,94 @@ private boolean isAHit(Query q, String content, Analyzer analyzer) throws IOExce
return false;
}
}

/** With the default (negative) k3 the parser must not collapse duplicate terms. */
public void testK3DefaultDisabled() throws Exception {
  QueryParser parser = getParser(new MockAnalyzer(random()));
  // Saturation is opt-in: a fresh parser reports the disabled sentinel value.
  assertEquals(-1f, parser.getK3(), 0f);

  // Three copies of 'a' plus one 'b'; without k3 any boost summing is left to
  // BooleanQuery.rewrite, so the parsed query still holds every clause.
  Query parsed = parser.parse("a a a b");
  assertTrue(parsed instanceof BooleanQuery);
  int clauseCount = ((BooleanQuery) parsed).clauses().size();
  assertEquals(4, clauseCount);
}

/** Duplicate SHOULD terms collapse into one clause carrying the k3-saturated boost. */
public void testK3SaturationDuplicateShould() throws Exception {
  QueryParser parser = getParser(new MockAnalyzer(random()));
  parser.setK3(8f);

  // Default OR operator: 'a' occurs three times, 'b' once.
  Query parsed = parser.parse("a a a b");
  assertTrue(parsed instanceof BooleanQuery);
  BooleanQuery booleanQuery = (BooleanQuery) parsed;
  // One boosted clause for 'a' and one untouched clause for 'b'.
  assertEquals(2, booleanQuery.clauses().size());

  boolean sawSaturatedA = false;
  boolean sawPlainB = false;
  for (BooleanClause clause : booleanQuery.clauses()) {
    assertEquals(BooleanClause.Occur.SHOULD, clause.occur());
    Query inner = clause.query();
    if (inner instanceof BoostQuery saturated) {
      // qtf = 3 with k3 = 8: ((8+1)*3)/(8+3) = 27/11, roughly 2.4545
      assertTrue(saturated.getQuery() instanceof TermQuery);
      TermQuery wrapped = (TermQuery) saturated.getQuery();
      assertEquals("a", wrapped.getTerm().text());
      float expectedWeight = ((8f + 1f) * 3f) / (8f + 3f);
      assertEquals(expectedWeight, saturated.getBoost(), 0.001f);
      sawSaturatedA = true;
    } else if (inner instanceof TermQuery plain) {
      // A term that occurs once stays an unboosted TermQuery.
      assertEquals("b", plain.getTerm().text());
      sawPlainB = true;
    }
  }
  assertTrue("Expected saturated 'a' clause", sawSaturatedA);
  assertTrue("Expected 'b' clause", sawPlainB);
}

/**
 * Duplicate MUST terms collapse into one clause carrying the k3-saturated boost, while a term
 * that occurs once is left as a plain, unboosted clause.
 */
public void testK3SaturationDuplicateMust() throws Exception {
  QueryParser qp = getParser(new MockAnalyzer(random()));
  qp.setDefaultOperator(Operator.AND);
  qp.setK3(7f);
  // "a a b" with AND operator: 'a' appears 2 times, 'b' once
  Query q = qp.parse("a a b");
  assertTrue(q instanceof BooleanQuery);
  BooleanQuery bq = (BooleanQuery) q;
  // Should be deduplicated to 2 clauses
  assertEquals(2, bq.clauses().size());

  // Track what was seen so the test cannot pass vacuously when no clause is boosted.
  boolean foundA = false;
  boolean foundB = false;
  for (BooleanClause clause : bq.clauses()) {
    assertEquals(BooleanClause.Occur.MUST, clause.occur());
    if (clause.query() instanceof BoostQuery boostQuery) {
      // 'a' with qtf=2: ((7+1)*2)/(7+2) = 16/9 ≈ 1.7778
      assertTrue(boostQuery.getQuery() instanceof TermQuery);
      assertEquals("a", ((TermQuery) boostQuery.getQuery()).getTerm().text());
      float expectedWeight = ((7f + 1f) * 2f) / (7f + 2f);
      assertEquals(expectedWeight, boostQuery.getBoost(), 0.001f);
      foundA = true;
    } else if (clause.query() instanceof TermQuery termQuery) {
      // 'b' appears once, so it must not be wrapped in a boost
      assertEquals("b", termQuery.getTerm().text());
      foundB = true;
    }
  }
  assertTrue("Expected saturated 'a' clause", foundA);
  assertTrue("Expected 'b' clause", foundB);
}

/** With k3 enabled but no repeated terms, every clause stays a plain TermQuery. */
public void testK3NoDuplicatesNoChange() throws Exception {
  QueryParser parser = getParser(new MockAnalyzer(random()));
  parser.setK3(8f);

  // Three distinct terms: saturation has nothing to collapse or boost.
  Query parsed = parser.parse("a b c");
  assertTrue(parsed instanceof BooleanQuery);
  BooleanQuery booleanQuery = (BooleanQuery) parsed;
  assertEquals(3, booleanQuery.clauses().size());

  for (BooleanClause clause : booleanQuery.clauses()) {
    Query inner = clause.query();
    assertTrue("Expected plain TermQuery, got " + inner.getClass(), inner instanceof TermQuery);
  }
}

/** The k3 accessor reports the default first and then whatever value was last set. */
public void testK3SetterGetter() throws Exception {
  QueryParser parser = getParser(new MockAnalyzer(random()));
  // Default: saturation disabled.
  assertEquals(-1f, parser.getK3(), 0f);

  // A typical value, the zero boundary, and back to the disabled sentinel.
  float[] values = {8f, 0f, -1f};
  for (float value : values) {
    parser.setK3(value);
    assertEquals(value, parser.getK3(), 0f);
  }
}
}
Loading