@@ -11,12 +11,13 @@ class OllamaEmbedder {
     this.className = "OllamaEmbedder";
     this.basePath = process.env.EMBEDDING_BASE_PATH;
     this.model = process.env.EMBEDDING_MODEL_PREF;
-    // Limit of how many strings we can process in a single pass to stay with resource or network limits
-    this.maxConcurrentChunks = 1;
+    this.maxConcurrentChunks = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
+      ? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
+      : 1;
     this.embeddingMaxChunkLength = maximumChunkLength();
     this.client = new Ollama({ host: this.basePath });
     this.log(
-      `initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
+      `initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
     );
   }

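The batch size now comes from a new `OLLAMA_EMBEDDING_BATCH_SIZE` environment variable, falling back to the old sequential behavior when unset. A minimal sketch of how the ternary above coerces a few representative values (the values themselves are hypothetical):

```js
// Mirrors the constructor logic above; inputs are hypothetical examples.
const parseBatchSize = (raw) => (raw ? Number(raw) : 1);

parseBatchSize(undefined); // 1   (unset -> old sequential behavior)
parseBatchSize("");        // 1   ("" is falsy, so the fallback applies)
parseBatchSize("10");      // 10  (Number("10") === 10)
parseBatchSize("abc");     // NaN (Number("abc") is NaN; the patch adds no guard for this)
```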
@@ -46,14 +47,14 @@ class OllamaEmbedder {

   /**
    * This function takes an array of text chunks and embeds them using the Ollama API.
-   * chunks are processed sequentially to avoid overwhelming the API with too many requests
-   * or running out of resources on the endpoint running the ollama instance.
+   * Chunks are processed in batches based on the maxConcurrentChunks setting to balance
+   * resource usage on the Ollama endpoint.
    *
    * We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
    * so that the maximum context window is used and content is not truncated.
    *
    * We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
-   * on load memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
+   * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
    * constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
    * @param {string[]} textChunks - An array of text chunks to embed.
    * @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
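A usage sketch of the contract this JSDoc documents (string[] in, number[][] out). The method name `embedChunks` is an assumption here, based on how AnythingLLM's other embedder engines expose this; adjust to the actual method name:

```js
// Hypothetical usage, assuming this JSDoc sits on an embedChunks(textChunks)
// method as in AnythingLLM's other embedder engines.
const embedder = new OllamaEmbedder();
const vectors = await embedder.embedChunks([
  "First chunk of a document...",
  "Second chunk of a document...",
]);
// vectors[i] is the number[] embedding for textChunks[i]
console.log(vectors.length); // 2
```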
@@ -64,29 +65,45 @@ class OllamaEmbedder {
         `Ollama service could not be reached. Is Ollama running?`
       );
     this.log(
-      `Embedding ${textChunks.length} chunks of text with ${this.model}.`
+      `Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${this.maxConcurrentChunks}.`
     );

     let data = [];
     let error = null;

-    for (const chunk of textChunks) {
+    // Process chunks in batches based on maxConcurrentChunks
+    const totalBatches = Math.ceil(
+      textChunks.length / this.maxConcurrentChunks
+    );
+    let currentBatch = 0;
+
+    for (let i = 0; i < textChunks.length; i += this.maxConcurrentChunks) {
+      const batch = textChunks.slice(i, i + this.maxConcurrentChunks);
+      currentBatch++;
+
       try {
-        const res = await this.client.embeddings({
+        // Use input param instead of prompt param to support batch processing
+        const res = await this.client.embed({
           model: this.model,
-          prompt: chunk,
+          input: batch,
           options: {
             // Always set the num_ctx to the max chunk length defined by the user in the settings
             // so that the maximum context window is used and content is not truncated.
             num_ctx: this.embeddingMaxChunkLength,
           },
         });

-        const { embedding } = res;
-        if (!Array.isArray(embedding) || embedding.length === 0)
-          throw new Error("Ollama returned an empty embedding for chunk!");
+        const { embeddings } = res;
+        if (!Array.isArray(embeddings) || embeddings.length === 0)
+          throw new Error("Ollama returned empty embeddings for batch!");

-        data.push(embedding);
+        // Using prompt param in embed() would return a single embedding (number[])
+        // but input param returns an array of embeddings (number[][]) for batch processing.
+        // This is why we spread the embeddings array into the data array.
+        data.push(...embeddings);
+        this.log(
+          `Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
+        );
       } catch (err) {
         this.log(err.message);
         error = err.message;
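The core API change in this hunk: the old `client.embeddings()` call takes a single `prompt` and returns `{ embedding: number[] }`, while the new `client.embed()` call accepts an `input` array and returns `{ embeddings: number[][] }`, one vector per input. A minimal sketch of the difference using the ollama JS client (model name and host are placeholders):

```js
import { Ollama } from "ollama";

const client = new Ollama({ host: "http://localhost:11434" }); // placeholder host

// Old endpoint: one string per request, one vector back.
const single = await client.embeddings({
  model: "nomic-embed-text", // placeholder model
  prompt: "hello world",
});
// single.embedding -> number[]

// New endpoint used by this patch: a batch of strings, one vector per string.
const batched = await client.embed({
  model: "nomic-embed-text",
  input: ["hello world", "goodbye world"],
});
// batched.embeddings -> number[][] (one embedding per input, in order)
```

This is why the patch spreads the result (`data.push(...embeddings)`) instead of pushing a single vector per iteration.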