Skip to content

Commit 922ecaa

Browse files
authored
Add a Faiss KnnVectorsFormat (Codec component) for KNN searches (#14178)
Faiss is Meta's open source library for approximate KNN search. It implements multiple algorithms and quantization choices. This change simply wraps its C API into an experimental KnnVectorsFormat using Java's Foreign Function & Memory (FFM) API. This means you can make a custom Lucene Codec which uses this for KNN indexing and searching. Note that this is native (C) code, so it's possible it has bugs that trigger SIGSEGV and then the OS rapidly tears down the JVM, or a subtle/slow memory leak outside of the JVM's heap visibility. This new format is experimental, might have exciting bugs, and has no promise of backwards compatibility. PRs welcome! Co-authored-by: Kaival Parikh <[email protected]>
1 parent 3a57b73 commit 922ecaa

File tree

11 files changed

+1375
-0
lines changed

11 files changed

+1375
-0
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Runs the tests of the experimental Faiss-backed KnnVectorsFormat in lucene/sandbox.
name: "Run special checks: module lucene/sandbox"

on:
  # Manual trigger.
  workflow_dispatch:

  pull_request:
    branches:
      - "*"

  push:
    branches:
      - "main"
      - "branch_10x"

jobs:
  faiss-tests:
    name: tests for the Faiss codec (v${{ matrix.faiss-version }} with JDK ${{ matrix.java }} on ${{ matrix.os }})
    timeout-minutes: 15

    strategy:
      matrix:
        os:
          - ubuntu-latest
        java:
          - "24"
        faiss-version:
          - "1.11.0"

    runs-on: ${{ matrix.os }}

    defaults:
      run:
        # Login shell (-l) that exits on the first error (-e, -o pipefail).
        # NOTE(review): presumably the login shell is what activates the conda
        # environment created by setup-miniconda -- confirm against its docs.
        shell: bash -leo pipefail {0}

    steps:
      - name: Install Mamba
        uses: conda-incubator/setup-miniconda@835234971496cad1653abb28a638a281cf32541f #v3.2.0
        with:
          miniforge-version: "latest"
          auto-activate-base: "false"
          activate-environment: "faiss-env"
          # TODO: Use only conda-forge if possible, see https://github.com/conda-forge/faiss-split-feedstock/pull/88
          channels: "pytorch,conda-forge"
          conda-remove-defaults: "true"

      - name: Install Faiss
        run: mamba install faiss-cpu=${{ matrix.faiss-version }}

      - name: Checkout Lucene
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Prepare Lucene workspace
        uses: ./.github/actions/prepare-for-build

      # LD_LIBRARY_PATH points at the lib directory of the active conda environment;
      # -Dtests.faiss.run=true sets the opt-in "tests.faiss.run" build option.
      - name: Run tests for Faiss codec
        run: >
          LD_LIBRARY_PATH=$CONDA_PREFIX/lib
          ./gradlew -p lucene/sandbox
          -Dtests.faiss.run=true
          test
          --tests "org.apache.lucene.sandbox.codecs.faiss.*"

build-tools/build-infra/src/main/groovy/lucene.java.tests-and-randomization.gradle

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,9 @@ buildOptions.addOption("tests.file.encoding", "Sets the default file.encoding on
147147
])
148148
})
149149

150+
buildOptions.addBooleanOption("tests.faiss.run", "Explicitly run tests for the Faiss codec.", false)
151+
optionsInheritedAsProperties += ["tests.faiss.run"]
152+
150153
// TODO: do we still use these?
151154
// Test data file used.
152155
// [propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."],

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ New Features
3030
---------------------
3131
* GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov)
3232

33+
* GITHUB#14178: Add a Faiss-based vector format in the sandbox module. (Kaival Parikh)
34+
3335
Improvements
3436
---------------------
3537

lucene/sandbox/src/java/module-info.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
requires org.apache.lucene.facet;
2323

2424
exports org.apache.lucene.payloads;
25+
exports org.apache.lucene.sandbox.codecs.faiss;
2526
exports org.apache.lucene.sandbox.codecs.idversion;
2627
exports org.apache.lucene.sandbox.codecs.quantization;
2728
exports org.apache.lucene.sandbox.document;
@@ -39,4 +40,6 @@
3940

4041
provides org.apache.lucene.codecs.PostingsFormat with
4142
org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat;
43+
provides org.apache.lucene.codecs.KnnVectorsFormat with
44+
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat;
4245
}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.sandbox.codecs.faiss;
18+
19+
import static org.apache.lucene.util.hnsw.HnswGraphBuilder.DEFAULT_BEAM_WIDTH;
20+
import static org.apache.lucene.util.hnsw.HnswGraphBuilder.DEFAULT_MAX_CONN;
21+
22+
import java.io.IOException;
23+
import java.util.Locale;
24+
import org.apache.lucene.codecs.KnnVectorsFormat;
25+
import org.apache.lucene.codecs.KnnVectorsReader;
26+
import org.apache.lucene.codecs.KnnVectorsWriter;
27+
import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
28+
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
29+
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
30+
import org.apache.lucene.index.SegmentReadState;
31+
import org.apache.lucene.index.SegmentWriteState;
32+
33+
/**
34+
* A Faiss-based format to create and search vector indexes, using {@link LibFaissC} to interact
35+
* with the native library.
36+
*
37+
* <p>The Faiss index is configured using its flexible <a
38+
* href="https://github.com/facebookresearch/faiss/wiki/The-index-factory">index factory</a>, which
39+
* allows creating arbitrary indexes by "describing" them. These indexes can be tuned by <a
40+
* href="https://github.com/facebookresearch/faiss/wiki/Index-IO,-cloning-and-hyper-parameter-tuning">setting
41+
* relevant parameters</a>.
42+
*
43+
* <p>A separate Faiss index is created per-segment, and uses the following files:
44+
*
45+
* <ul>
46+
* <li><code>.faissm</code> (metadata file): stores field number, offset and length of actual
47+
* Faiss index in data file.
48+
* <li><code>.faissd</code> (data file): stores concatenated Faiss indexes for all fields.
49+
* <li>All files required by {@link Lucene99FlatVectorsFormat} for storing raw vectors.
50+
* </ul>
51+
*
52+
* <p>Note: Set the {@code $OMP_NUM_THREADS} environment variable to control <a
53+
* href="https://github.com/facebookresearch/faiss/wiki/Threads-and-asynchronous-calls">internal
54+
* threading</a>.
55+
*
56+
* <p>TODO: There is no guarantee of backwards compatibility!
57+
*
58+
* @lucene.experimental
59+
*/
60+
public final class FaissKnnVectorsFormat extends KnnVectorsFormat {
61+
public static final String NAME = FaissKnnVectorsFormat.class.getSimpleName();
62+
static final int VERSION_START = 0;
63+
static final int VERSION_CURRENT = VERSION_START;
64+
static final String META_CODEC_NAME = NAME + "Meta";
65+
static final String DATA_CODEC_NAME = NAME + "Data";
66+
static final String META_EXTENSION = "faissm";
67+
static final String DATA_EXTENSION = "faissd";
68+
69+
private final String description;
70+
private final String indexParams;
71+
private final FlatVectorsFormat rawVectorsFormat;
72+
73+
/**
74+
* Constructs an HNSW-based format using default {@code maxConn}={@value
75+
* org.apache.lucene.util.hnsw.HnswGraphBuilder#DEFAULT_MAX_CONN} and {@code beamWidth}={@value
76+
* org.apache.lucene.util.hnsw.HnswGraphBuilder#DEFAULT_BEAM_WIDTH}.
77+
*/
78+
public FaissKnnVectorsFormat() {
79+
this(
80+
String.format(Locale.ROOT, "IDMap,HNSW%d", DEFAULT_MAX_CONN),
81+
String.format(Locale.ROOT, "efConstruction=%d", DEFAULT_BEAM_WIDTH));
82+
}
83+
84+
/**
85+
* Constructs a format using the specified index factory string and index parameters (see class
86+
* docs for more information).
87+
*
88+
* @param description the index factory string to initialize Faiss indexes.
89+
* @param indexParams the index params to set on Faiss indexes.
90+
*/
91+
public FaissKnnVectorsFormat(String description, String indexParams) {
92+
super(NAME);
93+
this.description = description;
94+
this.indexParams = indexParams;
95+
this.rawVectorsFormat =
96+
new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer());
97+
}
98+
99+
@Override
100+
public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
101+
return new FaissKnnVectorsWriter(
102+
description, indexParams, state, rawVectorsFormat.fieldsWriter(state));
103+
}
104+
105+
@Override
106+
public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException {
107+
return new FaissKnnVectorsReader(state, rawVectorsFormat.fieldsReader(state));
108+
}
109+
110+
@Override
111+
public int getMaxDimensions(String fieldName) {
112+
return DEFAULT_MAX_DIMENSIONS;
113+
}
114+
115+
@Override
116+
public String toString() {
117+
return String.format(
118+
Locale.ROOT, "%s(description=%s indexParams=%s)", NAME, description, indexParams);
119+
}
120+
}

0 commit comments

Comments
 (0)