@@ -11,12 +11,13 @@ class OllamaEmbedder {
     this.className = "OllamaEmbedder";
     this.basePath = process.env.EMBEDDING_BASE_PATH;
     this.model = process.env.EMBEDDING_MODEL_PREF;
-    // Limit of how many strings we can process in a single pass to stay with resource or network limits
-    this.maxConcurrentChunks = 1;
+    this.maxConcurrentChunks = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
+      ? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
+      : 1;
     this.embeddingMaxChunkLength = maximumChunkLength();
     this.client = new Ollama({ host: this.basePath });
     this.log(
-      `initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
+      `initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
     );
   }

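The batch size now comes from a new `OLLAMA_EMBEDDING_BATCH_SIZE` environment variable, falling back to the old sequential behavior when unset. A minimal sketch of how the ternary above coerces a few representative values (the values themselves are hypothetical):

```js
// Mirrors the constructor logic above; inputs are hypothetical examples.
const parseBatchSize = (raw) => (raw ? Number(raw) : 1);

parseBatchSize(undefined); // 1   (unset -> old sequential behavior)
parseBatchSize("");        // 1   ("" is falsy, so the fallback applies)
parseBatchSize("10");      // 10  (Number("10") === 10)
parseBatchSize("abc");     // NaN (Number("abc") is NaN; the patch adds no guard for this)
```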
@@ -46,14 +47,14 @@ class OllamaEmbedder {

   /**
    * This function takes an array of text chunks and embeds them using the Ollama API.
-   * chunks are processed sequentially to avoid overwhelming the API with too many requests
-   * or running out of resources on the endpoint running the ollama instance.
+   * Chunks are processed in batches based on the maxConcurrentChunks setting to balance
+   * resource usage on the Ollama endpoint.
    *
    * We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
    * so that the maximum context window is used and content is not truncated.
    *
    * We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
-   * on load memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
+   * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
    * constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
    * @param {string[]} textChunks - An array of text chunks to embed.
    * @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
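A usage sketch of the contract this JSDoc documents (string[] in, number[][] out). The method name `embedChunks` is an assumption here, based on how AnythingLLM's other embedder engines expose this; adjust to the actual method name:

```js
// Hypothetical usage, assuming this JSDoc sits on an embedChunks(textChunks)
// method as in AnythingLLM's other embedder engines.
const embedder = new OllamaEmbedder();
const vectors = await embedder.embedChunks([
  "First chunk of a document...",
  "Second chunk of a document...",
]);
// vectors[i] is the number[] embedding for textChunks[i]
console.log(vectors.length); // 2
```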
@@ -64,29 +65,45 @@ class OllamaEmbedder {
         `Ollama service could not be reached. Is Ollama running?`
       );
     this.log(
-      `Embedding ${textChunks.length} chunks of text with ${this.model}.`
+      `Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${this.maxConcurrentChunks}.`
     );

     let data = [];
     let error = null;

-    for (const chunk of textChunks) {
+    // Process chunks in batches based on maxConcurrentChunks
+    const totalBatches = Math.ceil(
+      textChunks.length / this.maxConcurrentChunks
+    );
+    let currentBatch = 0;
+
+    for (let i = 0; i < textChunks.length; i += this.maxConcurrentChunks) {
+      const batch = textChunks.slice(i, i + this.maxConcurrentChunks);
+      currentBatch++;
+
       try {
-        const res = await this.client.embeddings({
+        // Use input param instead of prompt param to support batch processing
+        const res = await this.client.embed({
           model: this.model,
-          prompt: chunk,
+          input: batch,
           options: {
             // Always set the num_ctx to the max chunk length defined by the user in the settings
             // so that the maximum context window is used and content is not truncated.
             num_ctx: this.embeddingMaxChunkLength,
           },
         });

-        const { embedding } = res;
-        if (!Array.isArray(embedding) || embedding.length === 0)
-          throw new Error("Ollama returned an empty embedding for chunk!");
+        const { embeddings } = res;
+        if (!Array.isArray(embeddings) || embeddings.length === 0)
+          throw new Error("Ollama returned empty embeddings for batch!");

-        data.push(embedding);
+        // Using prompt param in embed() would return a single embedding (number[])
+        // but input param returns an array of embeddings (number[][]) for batch processing.
+        // This is why we spread the embeddings array into the data array.
+        data.push(...embeddings);
+        this.log(
+          `Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
+        );
       } catch (err) {
         this.log(err.message);
         error = err.message;
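The core API change in this hunk: the old `client.embeddings()` call takes a single `prompt` and returns `{ embedding: number[] }`, while the new `client.embed()` call accepts an `input` array and returns `{ embeddings: number[][] }`, one vector per input. A minimal sketch of the difference using the ollama JS client (model name and host are placeholders):

```js
import { Ollama } from "ollama";

const client = new Ollama({ host: "http://localhost:11434" }); // placeholder host

// Old endpoint: one string per request, one vector back.
const single = await client.embeddings({
  model: "nomic-embed-text", // placeholder model
  prompt: "hello world",
});
// single.embedding -> number[]

// New endpoint used by this patch: a batch of strings, one vector per string.
const batched = await client.embed({
  model: "nomic-embed-text",
  input: ["hello world", "goodbye world"],
});
// batched.embeddings -> number[][] (one embedding per input, in order)
```

This is why the patch spreads the result (`data.push(...embeddings)`) instead of pushing a single vector per iteration.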