File tree Expand file tree Collapse file tree 3 files changed +27
-1
lines changed Expand file tree Collapse file tree 3 files changed +27
-1
lines changed Original file line number Diff line number Diff line change 1+ ---
2+ "@langchain/community": patch
3+ ---
4+
5+ BM25Retriever: escape regex metacharacters in getTermFrequency to prevent crashes
Original file line number Diff line number Diff line change 11import { expect , test } from "@jest/globals" ;
22import { Document } from "@langchain/core/documents" ;
33import { BM25Retriever } from "../bm25.js" ;
4+ import { getTermFrequency } from "../../utils/@furkantoprak/bm25/BM25.js" ;
45
56test ( "BM25Retriever" , async ( ) => {
67 const docs = [
@@ -25,3 +26,20 @@ test("BM25Retriever", async () => {
2526 "The quick brown fox jumps over the lazy dog"
2627 ) ;
2728} ) ;
29+
30+ test ( "getTermFrequency escapes regex metacharacters" , ( ) => {
31+ const corpus =
32+ "**Version 1:** What is the country of origin for the person in question?" ;
33+ const term = "**Version 1:**" ;
34+
35+ // Should not throw and should find at least one match
36+ const freq = getTermFrequency ( term , corpus ) ;
37+ expect ( freq ) . toBeGreaterThanOrEqual ( 1 ) ;
38+
39+ // Also test other metacharacters
40+ const corpus2 = "Does this match (maybe)? [yes] *stars* +plus+" ;
41+ expect ( getTermFrequency ( "(maybe)?" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
42+ expect ( getTermFrequency ( "[yes]" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
43+ expect ( getTermFrequency ( "*stars*" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
44+ expect ( getTermFrequency ( "+plus+" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
45+ } ) ;
Original file line number Diff line number Diff line change @@ -12,7 +12,10 @@ export const getWordCount = (corpus: string) => {
1212
1313/** Number of occurrences of a word in a string. */
1414export const getTermFrequency = ( term : string , corpus : string ) => {
15- return ( ( corpus || "" ) . match ( new RegExp ( term , "g" ) ) || [ ] ) . length ;
15+ // Escape any RegExp metacharacters in the term so constructing a RegExp
16+ // from user-provided or model-generated queries does not throw an error
17+ const escaped = ( term || "" ) . replace ( /[.*+?^${}()|[\]\\]/g, "\\$&" ) ;
18+ return ( ( corpus || "" ) . match ( new RegExp ( escaped , "g" ) ) || [ ] ) . length ;
1619} ;
1720
1821/** Inverse document frequency. */
You can’t perform that action at this time.
0 commit comments