Skip to content

Commit 83252aa

Browse files
authored
[ENH] Add BM25 embedding function to JS (#5756)
## Description of changes

_Summarize the changes made by this PR._

- Improvements & Bug fixes
  - This PR adds a JS embedding function that implements BM25, based on the existing Rust BM25 implementation. It also adds tests that validate the embedding function and ensure it matches the Rust tests 1:1.
- New functionality
  - ...

## Test plan

_How are these changes tested?_

Tested manually and added unit tests.

- [x] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust

## Migration plan

_Are there any migrations, or any forwards/backwards compatibility changes needed in order to make sure this change deploys reliably?_

## Observability plan

_What is the plan to instrument and monitor this change?_

## Documentation Changes

_Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent ff1285d commit 83252aa

File tree

14 files changed

+802
-0
lines changed

14 files changed

+802
-0
lines changed

clients/new-js/packages/ai-embeddings/all/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
"@chroma-core/openai": "workspace:^",
4949
"@chroma-core/together-ai": "workspace:^",
5050
"@chroma-core/voyageai": "workspace:^",
51+
"@chroma-core/chroma-bm25": "workspace:^",
5152
"@chroma-core/chroma-cloud-qwen": "workspace:^",
5253
"@chroma-core/chroma-cloud-splade": "workspace:^"
5354
},

clients/new-js/packages/ai-embeddings/all/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ export * from "@chroma-core/together-ai";
1212
export * from "@chroma-core/voyageai";
1313
export * from "@chroma-core/chroma-cloud-qwen";
1414
export * from "@chroma-core/chroma-cloud-splade";
15+
export * from "@chroma-core/chroma-bm25";
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import type { Config } from "jest";
2+
3+
/**
 * Jest configuration for this package's TypeScript tests.
 *
 * Reconstructed from the diff rendering with the interleaved gutter markers
 * removed; every setting is unchanged.
 */
const config: Config = {
  preset: "ts-jest",
  testEnvironment: "node",
  // Only files ending in `.test.ts` are collected as tests.
  testMatch: ["**/*.test.ts"],
  transform: {
    // Route .ts/.tsx files through ts-jest with its ESM mode switched on.
    "^.+\\.tsx?$": [
      "ts-jest",
      {
        useESM: true,
      },
    ],
  },
  extensionsToTreatAsEsm: [".ts"],
  moduleNameMapper: {
    // Map relative specifiers written with a `.js` suffix to the same path
    // without the suffix.
    "^(\\.{1,2}/.*)\\.js$": "$1",
  },
  // Executed before the test files (see jest.setup.ts).
  setupFiles: ["./jest.setup.ts"],
};
21+
22+
export default config;
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import * as dotenv from "dotenv";
2+
3+
// Load environment variables for the test run from the `.env` file three
// directory levels up.
// NOTE(review): the relative path is presumably resolved against the process
// working directory rather than this file's location — confirm.
dotenv.config({ path: "../../../.env" });
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"name": "@chroma-core/chroma-bm25",
3+
"version": "0.1.7",
4+
"private": false,
5+
"description": "Chroma BM25 sparse embedding function implemented in TypeScript",
6+
"main": "dist/cjs/chroma-bm25.cjs",
7+
"types": "dist/chroma-bm25.d.ts",
8+
"module": "dist/chroma-bm25.legacy-esm.js",
9+
"type": "module",
10+
"exports": {
11+
".": {
12+
"import": {
13+
"types": "./dist/chroma-bm25.d.ts",
14+
"default": "./dist/chroma-bm25.mjs"
15+
},
16+
"require": {
17+
"types": "./dist/cjs/chroma-bm25.d.cts",
18+
"default": "./dist/cjs/chroma-bm25.cjs"
19+
}
20+
}
21+
},
22+
"files": [
23+
"src",
24+
"dist"
25+
],
26+
"scripts": {
27+
"clean": "rimraf dist",
28+
"prebuild": "rimraf dist",
29+
"build": "tsup",
30+
"watch": "tsup --watch",
31+
"test": "jest"
32+
},
33+
"devDependencies": {
34+
"@jest/globals": "^29.7.0",
35+
"dotenv": "^16.3.1",
36+
"jest": "^29.7.0",
37+
"rimraf": "^5.0.0",
38+
"ts-jest": "^29.1.2",
39+
"ts-node": "^10.9.2",
40+
"tsup": "^8.3.5"
41+
},
42+
"peerDependencies": {
43+
"chromadb": "workspace:^"
44+
},
45+
"dependencies": {
46+
"@chroma-core/ai-embeddings-common": "workspace:^",
47+
"snowball-stemmers": "^0.6.0"
48+
},
49+
"engines": {
50+
"node": ">=20"
51+
},
52+
"publishConfig": {
53+
"access": "public"
54+
}
55+
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import { describe, expect, test } from "@jest/globals";
2+
import {
3+
DEFAULT_CHROMA_BM25_STOPWORDS,
4+
ChromaBm25EmbeddingFunction,
5+
type ChromaBm25Config,
6+
} from "./index";
7+
8+
/**
 * Reports whether `arr` is in non-decreasing order.
 *
 * Equal neighbours are allowed; empty and single-element arrays count as
 * sorted. The array is only read, never mutated.
 *
 * @param arr - Numbers to inspect.
 * @returns `false` as soon as an element is strictly smaller than the one
 *   before it, otherwise `true`.
 */
const isSorted = (arr: readonly number[]): boolean => {
  // Track the previous element instead of indexing `arr[i - 1]`, so the check
  // also type-checks under `noUncheckedIndexedAccess`.
  let previous = Number.NEGATIVE_INFINITY;
  for (const current of arr) {
    if (current < previous) {
      return false;
    }
    previous = current;
  }
  return true;
};
16+
17+
/**
 * Test suite for `ChromaBm25EmbeddingFunction`.
 *
 * Covers fixed-fixture sparse embeddings (indices and values), batch
 * generation invariants, query/document parity, and config round-tripping
 * through `getConfig` / `buildFromConfig`.
 */
describe("ChromaBm25EmbeddingFunction", () => {
  // A single default-configured instance is shared by all tests below.
  const embedder = new ChromaBm25EmbeddingFunction();

  test("matches comprehensive tokenization expectations", async () => {
    const [embedding] = await embedder.generate([
      "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",
    ]);

    const expectedIndices = [
      230246813, 395514983, 458027949, 488165615, 729632045, 734978415,
      997512866, 1114505193, 1381820790, 1501587190, 1649421877,
      1837285388,
    ];
    const expectedValue = 1.6391153;

    expect(embedding.indices).toEqual(expectedIndices);
    // Guard against the per-value loop passing vacuously on an empty array.
    expect(embedding.values).toHaveLength(expectedIndices.length);
    embedding.values.forEach((value) => {
      expect(value).toBeCloseTo(expectedValue, 5);
    });
  });

  // mirrors rust test `test_bm25_stopwords_and_punctuation` to guarantee compatibility
  test("ensure Rust impl compatibility", async () => {
    const [embedding] = await embedder.generate([
      "The space-time continuum WARPS near massive objects...",
    ]);

    const expectedIndices = [
      90097469, 519064992, 737893654, 1110755108, 1950894484, 2031641008,
      2058513491,
    ];
    const expectedValue = 1.660867;

    expect(embedding.indices).toEqual(expectedIndices);
    // Guard against the per-value loop passing vacuously on an empty array.
    expect(embedding.values).toHaveLength(expectedIndices.length);
    embedding.values.forEach((value) => {
      expect(value).toBeCloseTo(expectedValue, 5);
    });
  });

  test("generates consistent embeddings for multiple documents", async () => {
    const texts = [
      "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",
      "The space-time continuum WARPS near massive objects...",
      "BM25 is great for sparse retrieval tasks",
    ];

    const embeddings = await embedder.generate(texts);

    // One embedding per input; each is non-empty, has matching index/value
    // counts, ascending indices, and strictly positive finite values.
    expect(embeddings).toHaveLength(texts.length);
    embeddings.forEach((embedding) => {
      expect(embedding.indices.length).toBeGreaterThan(0);
      expect(embedding.values.length).toBe(embedding.indices.length);
      expect(isSorted(embedding.indices)).toBe(true);

      embedding.values.forEach((value) => {
        expect(value).toBeGreaterThan(0);
        expect(Number.isFinite(value)).toBe(true);
      });
    });
  });

  test("generateForQueries mirrors generate", async () => {
    const query = "retrieve BM25 docs";
    const [queryEmbedding] = await embedder.generateForQueries([query]);
    const [docEmbedding] = await embedder.generate([query]);

    expect(queryEmbedding.indices).toEqual(docEmbedding.indices);
    expect(queryEmbedding.values).toEqual(docEmbedding.values);
  });

  test("config round trip maintains settings", () => {
    // NOTE(review): the `Required<...>` cast claims every field is present,
    // yet `stopwords` is asserted to be undefined below — confirm the cast is
    // only there to simplify the numeric comparisons.
    const config = embedder.getConfig() as Required<ChromaBm25Config>;

    expect(config).toMatchObject({
      k: 1.2,
      b: 0.75,
      avg_doc_length: 256,
      token_max_length: 40,
    });
    expect(config.stopwords).toBeUndefined();

    // Rebuild from the default config with a custom stopword list.
    const customStopwords = DEFAULT_CHROMA_BM25_STOPWORDS.slice(0, 10);
    const custom = ChromaBm25EmbeddingFunction.buildFromConfig({
      ...config,
      stopwords: customStopwords,
    });

    const rebuiltConfig =
      custom.getConfig() as Required<ChromaBm25Config>;
    expect(rebuiltConfig.k).toBeCloseTo(config.k);
    expect(rebuiltConfig.b).toBeCloseTo(config.b);
    expect(rebuiltConfig.avg_doc_length).toBeCloseTo(config.avg_doc_length);
    expect(rebuiltConfig.token_max_length).toBe(config.token_max_length);
    expect(rebuiltConfig.stopwords).toEqual(customStopwords);
  });
});

0 commit comments

Comments
 (0)