Commit 66e44f6
Add batch embedding support for Ollama embedding provider (#4553)
* add batch embedding support for ollama embedding provider
* lint
* simplify ollama embedder input
1 parent 6a72ac2 commit 66e44f6
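
The change wires one new setting through four layers: the options UI posts an `OllamaEmbeddingBatchSize` field, `updateENV.js` persists it as the `OLLAMA_EMBEDDING_BATCH_SIZE` env var, `systemSettings.js` exposes it back to the UI, and the embedder reads it as `maxConcurrentChunks`. A minimal sketch of the resolution logic, matching the embedder diff below (the default of 1 preserves the previous sequential behavior):

```js
// How the embedder resolves its batch size from the environment.
// An unset or empty OLLAMA_EMBEDDING_BATCH_SIZE falls back to 1,
// i.e. the old one-chunk-at-a-time behavior.
const batchSize = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
  ? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
  : 1;
```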

File tree

4 files changed: +79 −15 lines changed

- frontend/src/components/EmbeddingSelection/OllamaOptions/index.jsx
- server/models/systemSettings.js
- server/utils/EmbeddingEngines/ollama/index.js
- server/utils/helpers/updateENV.js

frontend/src/components/EmbeddingSelection/OllamaOptions/index.jsx
Lines changed: 43 additions & 1 deletion

@@ -23,11 +23,18 @@ export default function OllamaEmbeddingOptions({ settings }) {
   const [maxChunkLength, setMaxChunkLength] = useState(
     settings?.EmbeddingModelMaxChunkLength || 8192
   );
+  const [batchSize, setBatchSize] = useState(
+    settings?.OllamaEmbeddingBatchSize || 1
+  );
 
   const handleMaxChunkLengthChange = (e) => {
     setMaxChunkLength(Number(e.target.value));
   };
 
+  const handleBatchSizeChange = (e) => {
+    setBatchSize(Number(e.target.value));
+  };
+
   return (
     <div className="w-full flex flex-col gap-y-7">
       <div className="w-full flex items-start gap-[36px] mt-1.5">
@@ -74,7 +81,7 @@ export default function OllamaEmbeddingOptions({ settings }) {
           }}
           className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
         >
-          {showAdvancedControls ? "Hide" : "Show"} Manual Endpoint Input
+          {showAdvancedControls ? "Hide" : "Show"} Advanced Settings
           {showAdvancedControls ? (
             <CaretUp size={14} className="ml-1" />
           ) : (
@@ -121,6 +128,41 @@ export default function OllamaEmbeddingOptions({ settings }) {
               Enter the URL where Ollama is running.
             </p>
           </div>
+          <div className="flex flex-col w-60">
+            <div
+              data-tooltip-place="top"
+              data-tooltip-id="ollama-batch-size-tooltip"
+              className="flex gap-x-1 items-center mb-3"
+            >
+              <Info
+                size={16}
+                className="text-theme-text-secondary cursor-pointer"
+              />
+              <label className="text-white text-sm font-semibold block">
+                Embedding batch size
+              </label>
+              <Tooltip id="ollama-batch-size-tooltip">
+                Number of text chunks to embed in parallel. Higher values
+                improve speed but use more memory. Default is 1.
+              </Tooltip>
+            </div>
+            <input
+              type="number"
+              name="OllamaEmbeddingBatchSize"
+              className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+              placeholder="1"
+              min={1}
+              value={batchSize}
+              onChange={handleBatchSizeChange}
+              onScroll={(e) => e.target.blur()}
+              required={true}
+              autoComplete="off"
+            />
+            <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+              Increase this value to process multiple chunks simultaneously for
+              faster embedding.
+            </p>
+          </div>
         </div>
       </div>
     </div>
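
The new field follows the same controlled-input pattern as the existing max-chunk-length field. A self-contained sketch of that pattern (the component and prop names here are illustrative, not part of the diff):

```jsx
import { useState } from "react";

// Minimal controlled number input mirroring the field above: state holds
// the value, onChange coerces the string event value with Number(), and
// onScroll blurs the field so a stray mouse wheel cannot silently change
// the batch size while the page scrolls.
function BatchSizeField({ initial = 1 }) {
  const [batchSize, setBatchSize] = useState(initial);
  return (
    <input
      type="number"
      min={1}
      value={batchSize}
      onChange={(e) => setBatchSize(Number(e.target.value))}
      onScroll={(e) => e.target.blur()}
    />
  );
}
```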

server/models/systemSettings.js
Lines changed: 1 addition & 0 deletions

@@ -232,6 +232,7 @@ const SystemSettings = {
         : process.env.EMBEDDING_MODEL_PREF,
       EmbeddingModelMaxChunkLength:
         process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
+      OllamaEmbeddingBatchSize: process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1,
       VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
       GenericOpenAiEmbeddingApiKey:
         !!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY,
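
One subtlety worth noting: `process.env` values are always strings, so the exposed setting is a string like `"10"` when the variable is set, and the number `1` only when it is unset. A quick illustration, runnable in Node:

```js
// process.env coerces everything to strings, so the `|| 1` fallback
// produces a mixed type depending on whether the variable is set.
process.env.OLLAMA_EMBEDDING_BATCH_SIZE = "10";
let exposed = process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1;
console.log(exposed, typeof exposed); // 10 string

delete process.env.OLLAMA_EMBEDDING_BATCH_SIZE;
exposed = process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1;
console.log(exposed, typeof exposed); // 1 number
```

Both consumers normalize this: the embedder casts with `Number(...)` before using the value in arithmetic, and the frontend's onChange handler coerces edits back to a number.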

server/utils/EmbeddingEngines/ollama/index.js
Lines changed: 31 additions & 14 deletions

@@ -11,12 +11,13 @@ class OllamaEmbedder {
     this.className = "OllamaEmbedder";
     this.basePath = process.env.EMBEDDING_BASE_PATH;
     this.model = process.env.EMBEDDING_MODEL_PREF;
-    // Limit of how many strings we can process in a single pass to stay with resource or network limits
-    this.maxConcurrentChunks = 1;
+    this.maxConcurrentChunks = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
+      ? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
+      : 1;
     this.embeddingMaxChunkLength = maximumChunkLength();
     this.client = new Ollama({ host: this.basePath });
     this.log(
-      `initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
+      `initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
     );
   }
 
@@ -46,14 +47,14 @@ class OllamaEmbedder {
 
   /**
    * This function takes an array of text chunks and embeds them using the Ollama API.
-   * chunks are processed sequentially to avoid overwhelming the API with too many requests
-   * or running out of resources on the endpoint running the ollama instance.
+   * Chunks are processed in batches based on the maxConcurrentChunks setting to balance
+   * resource usage on the Ollama endpoint.
    *
    * We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
    * so that the maximum context window is used and content is not truncated.
    *
    * We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
-   * on load memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
+   * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
    * constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
    * @param {string[]} textChunks - An array of text chunks to embed.
    * @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
@@ -64,29 +65,45 @@ class OllamaEmbedder {
       `Ollama service could not be reached. Is Ollama running?`
     );
     this.log(
-      `Embedding ${textChunks.length} chunks of text with ${this.model}.`
+      `Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${this.maxConcurrentChunks}.`
     );
 
     let data = [];
     let error = null;
 
-    for (const chunk of textChunks) {
+    // Process chunks in batches based on maxConcurrentChunks
+    const totalBatches = Math.ceil(
+      textChunks.length / this.maxConcurrentChunks
+    );
+    let currentBatch = 0;
+
+    for (let i = 0; i < textChunks.length; i += this.maxConcurrentChunks) {
+      const batch = textChunks.slice(i, i + this.maxConcurrentChunks);
+      currentBatch++;
+
       try {
-        const res = await this.client.embeddings({
+        // Use input param instead of prompt param to support batch processing
+        const res = await this.client.embed({
           model: this.model,
-          prompt: chunk,
+          input: batch,
           options: {
             // Always set the num_ctx to the max chunk length defined by the user in the settings
             // so that the maximum context window is used and content is not truncated.
             num_ctx: this.embeddingMaxChunkLength,
           },
         });
 
-        const { embedding } = res;
-        if (!Array.isArray(embedding) || embedding.length === 0)
-          throw new Error("Ollama returned an empty embedding for chunk!");
+        const { embeddings } = res;
+        if (!Array.isArray(embeddings) || embeddings.length === 0)
+          throw new Error("Ollama returned empty embeddings for batch!");
 
-        data.push(embedding);
+        // Using prompt param in embed() would return a single embedding (number[])
+        // but input param returns an array of embeddings (number[][]) for batch processing.
+        // This is why we spread the embeddings array into the data array.
+        data.push(...embeddings);
+        this.log(
+          `Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
+        );
       } catch (err) {
         this.log(err.message);
         error = err.message;
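
The key API change is moving from the older `embeddings()` call, which accepts a single `prompt` string and returns `{ embedding: number[] }`, to `embed()`, which accepts an `input` array and returns `{ embeddings: number[][] }` — one embedding per string sent. A standalone sketch of the batched loop using the `ollama` npm client (the host and model name are placeholders, not values from the diff):

```js
import { Ollama } from "ollama";

const client = new Ollama({ host: "http://127.0.0.1:11434" });

// Embed textChunks in fixed-size batches; each embed() call sends up to
// batchSize strings in one request and returns one embedding per string,
// which we flatten into a single number[][] result.
async function embedInBatches(textChunks, model, batchSize = 1) {
  const data = [];
  for (let i = 0; i < textChunks.length; i += batchSize) {
    const batch = textChunks.slice(i, i + batchSize);
    const res = await client.embed({ model, input: batch });
    data.push(...res.embeddings); // spread: number[][] -> appended rows
  }
  return data;
}

// Usage: four chunks with batchSize 2 -> two requests instead of four.
// await embedInBatches(["a", "b", "c", "d"], "nomic-embed-text", 2);
```

With `batchSize = 1` the loop degenerates to the old sequential behavior, which is why the default is safe for low-memory Ollama hosts.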

server/utils/helpers/updateENV.js
Lines changed: 4 additions & 0 deletions

@@ -307,6 +307,10 @@ const KEY_MAPPING = {
     envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
     checks: [nonZero],
   },
+  OllamaEmbeddingBatchSize: {
+    envKey: "OLLAMA_EMBEDDING_BATCH_SIZE",
+    checks: [nonZero],
+  },
 
   // Gemini Embedding Settings
   GeminiEmbeddingApiKey: {
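
`checks: [nonZero]` reuses the same validator applied to the max-chunk-length setting, rejecting zero or negative batch sizes before they are written to the env. The real implementation lives elsewhere in `updateENV.js`; the following is only a hypothetical sketch of such a check, assuming validators in KEY_MAPPING return an error string on failure and `null` on success:

```js
// Hypothetical validator sketch -- not the repo's actual code.
// Returns an error message string when invalid, null when valid.
function nonZero(input = "") {
  if (isNaN(Number(input)) || Number(input) <= 0)
    return "Value must be a number greater than zero.";
  return null;
}
```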
