| 
 | 1 | +import { describe, expect, test } from "@jest/globals";  | 
 | 2 | +import {  | 
 | 3 | +    DEFAULT_CHROMA_BM25_STOPWORDS,  | 
 | 4 | +    ChromaBm25EmbeddingFunction,  | 
 | 5 | +    type ChromaBm25Config,  | 
 | 6 | +} from "./index";  | 
 | 7 | + | 
 | 8 | +const isSorted = (arr: number[]): boolean => {  | 
 | 9 | +    for (let i = 1; i < arr.length; i += 1) {  | 
 | 10 | +        if (arr[i] < arr[i - 1]) {  | 
 | 11 | +            return false;  | 
 | 12 | +        }  | 
 | 13 | +    }  | 
 | 14 | +    return true;  | 
 | 15 | +};  | 
 | 16 | + | 
 | 17 | +describe("ChromaBm25EmbeddingFunction", () => {  | 
 | 18 | +    const embedder = new ChromaBm25EmbeddingFunction();  | 
 | 19 | + | 
 | 20 | +    test("matches comprehensive tokenization expectations", async () => {  | 
 | 21 | +        const [embedding] = await embedder.generate([  | 
 | 22 | +            "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",  | 
 | 23 | +        ]);  | 
 | 24 | + | 
 | 25 | +        const expectedIndices = [  | 
 | 26 | +            230246813, 395514983, 458027949, 488165615, 729632045, 734978415,  | 
 | 27 | +            997512866, 1114505193, 1381820790, 1501587190, 1649421877,  | 
 | 28 | +            1837285388,  | 
 | 29 | +        ];  | 
 | 30 | +        const expectedValue = 1.6391153;  | 
 | 31 | + | 
 | 32 | +        expect(embedding.indices).toEqual(expectedIndices);  | 
 | 33 | +        embedding.values.forEach((value) => {  | 
 | 34 | +            expect(value).toBeCloseTo(expectedValue, 5);  | 
 | 35 | +        });  | 
 | 36 | +    });  | 
 | 37 | + | 
 | 38 | +    // mirrors rust test `test_bm25_stopwords_and_punctuation` to guarantee compatibility  | 
 | 39 | +    test("ensure Rust impl compatibilty", async () => {  | 
 | 40 | +        const [embedding] = await embedder.generate([  | 
 | 41 | +            "The   space-time   continuum   WARPS   near   massive   objects...",  | 
 | 42 | +        ]);  | 
 | 43 | + | 
 | 44 | +        const expectedIndices = [  | 
 | 45 | +            90097469, 519064992, 737893654, 1110755108, 1950894484, 2031641008,  | 
 | 46 | +            2058513491,  | 
 | 47 | +        ];  | 
 | 48 | +        const expectedValue = 1.660867;  | 
 | 49 | + | 
 | 50 | +        expect(embedding.indices).toEqual(expectedIndices);  | 
 | 51 | +        embedding.values.forEach((value) => {  | 
 | 52 | +            expect(value).toBeCloseTo(expectedValue, 5);  | 
 | 53 | +        });  | 
 | 54 | +    });  | 
 | 55 | + | 
 | 56 | +    test("generates consistent embeddings for multiple documents", async () => {  | 
 | 57 | +        const texts = [  | 
 | 58 | +            "Usain Bolt's top speed reached ~27.8 mph (44.72 km/h)",  | 
 | 59 | +            "The   space-time   continuum   WARPS   near   massive   objects...",  | 
 | 60 | +            "BM25 is great for sparse retrieval tasks",  | 
 | 61 | +        ];  | 
 | 62 | + | 
 | 63 | +        const embeddings = await embedder.generate(texts);  | 
 | 64 | + | 
 | 65 | +        expect(embeddings).toHaveLength(texts.length);  | 
 | 66 | +        embeddings.forEach((embedding, index) => {  | 
 | 67 | +            expect(embedding.indices.length).toBeGreaterThan(0);  | 
 | 68 | +            expect(embedding.values.length).toBe(embedding.indices.length);  | 
 | 69 | +            expect(isSorted(embedding.indices)).toBe(true);  | 
 | 70 | + | 
 | 71 | +            embedding.values.forEach((value) => {  | 
 | 72 | +                expect(value).toBeGreaterThan(0);  | 
 | 73 | +                expect(Number.isFinite(value)).toBe(true);  | 
 | 74 | +            });  | 
 | 75 | +        });  | 
 | 76 | +    });  | 
 | 77 | + | 
 | 78 | +    test("generateForQueries mirrors generate", async () => {  | 
 | 79 | +        const query = "retrieve BM25 docs";  | 
 | 80 | +        const [queryEmbedding] = await embedder.generateForQueries([query]);  | 
 | 81 | +        const [docEmbedding] = await embedder.generate([query]);  | 
 | 82 | + | 
 | 83 | +        expect(queryEmbedding.indices).toEqual(docEmbedding.indices);  | 
 | 84 | +        expect(queryEmbedding.values).toEqual(docEmbedding.values);  | 
 | 85 | +    });  | 
 | 86 | + | 
 | 87 | +    test("config round trip maintains settings", () => {  | 
 | 88 | +        const config = embedder.getConfig() as Required<ChromaBm25Config>;  | 
 | 89 | + | 
 | 90 | +        expect(config).toMatchObject({  | 
 | 91 | +            k: 1.2,  | 
 | 92 | +            b: 0.75,  | 
 | 93 | +            avg_doc_length: 256,  | 
 | 94 | +            token_max_length: 40,  | 
 | 95 | +        });  | 
 | 96 | +        expect(config.stopwords).toBeUndefined();  | 
 | 97 | + | 
 | 98 | +        const custom = ChromaBm25EmbeddingFunction.buildFromConfig({  | 
 | 99 | +            ...config,  | 
 | 100 | +            stopwords: DEFAULT_CHROMA_BM25_STOPWORDS.slice(0, 10),  | 
 | 101 | +        });  | 
 | 102 | + | 
 | 103 | +        const rebuiltConfig =  | 
 | 104 | +            custom.getConfig() as Required<ChromaBm25Config>;  | 
 | 105 | +        expect(rebuiltConfig.k).toBeCloseTo(config.k);  | 
 | 106 | +        expect(rebuiltConfig.b).toBeCloseTo(config.b);  | 
 | 107 | +        expect(rebuiltConfig.avg_doc_length).toBeCloseTo(config.avg_doc_length);  | 
 | 108 | +        expect(rebuiltConfig.token_max_length).toBe(config.token_max_length);  | 
 | 109 | +        expect(rebuiltConfig.stopwords).toEqual(  | 
 | 110 | +            DEFAULT_CHROMA_BM25_STOPWORDS.slice(0, 10),  | 
 | 111 | +        );  | 
 | 112 | +    });  | 
 | 113 | +});  | 
0 commit comments