Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/major-lamps-walk.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@langchain/community": patch
---

BM25Retriever: escape regex metacharacters in getTermFrequency to prevent crashes
18 changes: 18 additions & 0 deletions libs/langchain-community/src/retrievers/tests/bm25.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { expect, test } from "@jest/globals";
import { Document } from "@langchain/core/documents";
import { BM25Retriever } from "../bm25.js";
import { getTermFrequency } from "../../utils/@furkantoprak/bm25/BM25.js";

test("BM25Retriever", async () => {
const docs = [
Expand All @@ -25,3 +26,20 @@ test("BM25Retriever", async () => {
"The quick brown fox jumps over the lazy dog"
);
});

test("getTermFrequency escapes regex metacharacters", () => {
  // Regression coverage: terms made of RegExp metacharacters must be counted
  // as literal text. Each [term, corpus] pair below must neither throw nor
  // report zero matches.
  const versionCorpus =
    "**Version 1:** What is the country of origin for the person in question?";
  const mixedCorpus = "Does this match (maybe)? [yes] *stars* +plus+";

  const cases: Array<[string, string]> = [
    // Markdown-style bold marker, as produced by model-generated queries.
    ["**Version 1:**", versionCorpus],
    // Groups, quantifiers and character classes.
    ["(maybe)?", mixedCorpus],
    ["[yes]", mixedCorpus],
    ["*stars*", mixedCorpus],
    ["+plus+", mixedCorpus],
  ];

  for (const [needle, haystack] of cases) {
    expect(getTermFrequency(needle, haystack)).toBeGreaterThanOrEqual(1);
  }
});
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ export const getWordCount = (corpus: string) => {

/**
 * Number of non-overlapping occurrences of a term in a string.
 *
 * The term is matched literally: any RegExp metacharacters it contains are
 * escaped first, so user-provided or model-generated queries such as
 * `**Version 1:**` neither throw nor get interpreted as a pattern.
 *
 * @param term - Literal text to look for.
 * @param corpus - Text to search in.
 * @returns The match count; 0 when either `term` or `corpus` is empty.
 */
export const getTermFrequency = (term: string, corpus: string): number => {
  // An empty term would compile to an empty pattern that matches at every
  // position (corpus.length + 1 hits), which is meaningless as a frequency.
  if (!term || !corpus) {
    return 0;
  }
  // Escape RegExp metacharacters so the term is treated as plain text.
  const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  return (corpus.match(new RegExp(escaped, "g")) || []).length;
};

/** Inverse document frequency. */
Expand Down
Loading