Skip to content
Open
7 changes: 7 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Attach by Process ID",
"processId": "${command:PickProcess}",
"request": "attach",
"skipFiles": ["<node_internals>/**"],
"type": "node"
},
{
"type": "node",
"request": "launch",
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,6 @@ npx -y mongodb-mcp-server@latest --logPath=/path/to/logs --readOnly --indexCheck
"args": [
"-y",
"mongodb-mcp-server",
"--connectionString",
"mongodb+srv://username:[email protected]/myDatabase",
"--readOnly"
]
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"dist"
],
"scripts": {
"start": "node dist/index.js --transport http --loggers stderr mcp",
"start": "node dist/index.js --transport http --loggers stderr mcp --previewFeatures vectorSearch",
"start:stdio": "node dist/index.js --transport stdio --loggers stderr mcp",
"prepare": "husky && npm run build",
"build:clean": "rm -rf dist",
Expand Down
16 changes: 16 additions & 0 deletions src/tools/args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,17 @@ export const ALLOWED_CLUSTER_NAME_CHARACTERS_ERROR =
const ALLOWED_PROJECT_NAME_CHARACTERS_REGEX = /^[a-zA-Z0-9\s()@&+:._',-]+$/;
export const ALLOWED_PROJECT_NAME_CHARACTERS_ERROR =
"Project names can't be longer than 64 characters and can only contain letters, numbers, spaces, and the following symbols: ( ) @ & + : . _ - ' ,";

/**
 * Unwraps JS boxed numbers (like `Int32`) into primitive numbers.
 *
 * Zod does not understand boxed numbers as numeric literals, so this function is
 * used as a `z.preprocess` step to unwrap them before validation.
 *
 * @param v - The raw, unvalidated value.
 * @returns The numeric value of `v` when `v` is a non-array object whose `valueOf()`
 * result coerces to a number; otherwise `v` itself, unchanged, so that the schema
 * that runs afterwards can accept or reject it.
 */
function unboxNumber(v: unknown): number {
    // Arrays are objects too, and `Number([])` is 0 while `Number([4])` is 4. Without the
    // `Array.isArray` guard, array input would be silently accepted as a number.
    if (v && typeof v === "object" && !Array.isArray(v) && typeof v.valueOf === "function") {
        const n = Number(v.valueOf());
        if (!Number.isNaN(n)) return n;
    }
    // Not a boxed number: hand the value through untouched (the cast only satisfies the signature).
    return v as number;
}

export const CommonArgs = {
string: (): ZodString => z.string().regex(NO_UNICODE_REGEX, NO_UNICODE_ERROR),

Expand All @@ -27,6 +38,11 @@ export const CommonArgs = {
.min(1, `${fieldName} is required`)
.length(24, `${fieldName} must be exactly 24 characters`)
.regex(/^[0-9a-fA-F]+$/, `${fieldName} must contain only hexadecimal characters`),
/**
 * Builds a schema that accepts one of the given Zod schemas (typically numeric literals),
 * running the input through `unboxNumber` before the union is checked.
 *
 * @param values - At least two Zod schemas describing the allowed values.
 * @returns A preprocessing schema wrapping a union of `values`.
 */
numberEnum: <Options extends Readonly<[z.ZodTypeAny, z.ZodTypeAny, ...z.ZodTypeAny[]]>>(
    values: Options
): z.ZodEffects<z.ZodUnion<Options>> => {
    const allowedValues = z.union(values);
    return z.preprocess(unboxNumber, allowedValues);
},
};

export const AtlasArgs = {
Expand Down
212 changes: 151 additions & 61 deletions src/tools/mongodb/create/createIndex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,64 +5,130 @@ import { type ToolArgs, type OperationType } from "../../tool.js";
import type { IndexDirection } from "mongodb";
import { quantizationEnum } from "../../../common/search/vectorSearchEmbeddingsManager.js";
import { similarityValues } from "../../../common/schemas.js";
import { CommonArgs } from "../../args.js";

export class CreateIndexTool extends MongoDBToolBase {
private vectorSearchIndexDefinition = z.object({
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is best viewed with the "Hide whitespace" option - it's just prettier reformatting the indents.

type: z.literal("vectorSearch"),
fields: z
.array(
z.discriminatedUnion("type", [
z
.object({
type: z.literal("filter"),
path: z
.string()
.describe(
"Name of the field to index. For nested fields, use dot notation to specify path to embedded fields"
),
})
.strict()
.describe("Definition for a field that will be used for pre-filtering results."),
z
.object({
type: z.literal("vector"),
path: z
.string()
.describe(
"Name of the field to index. For nested fields, use dot notation to specify path to embedded fields"
),
numDimensions: z
.number()
.min(1)
.max(8192)
.default(this.config.vectorSearchDimensions)
.describe(
"Number of vector dimensions that MongoDB Vector Search enforces at index-time and query-time"
),
similarity: z
.enum(similarityValues)
.default(this.config.vectorSearchSimilarityFunction)
.describe(
"Vector similarity function to use to search for top K-nearest neighbors. You can set this field only for vector-type fields."
),
quantization: quantizationEnum
.default("none")
private vectorSearchIndexDefinition = z
.object({
type: z.literal("vectorSearch"),
fields: z
.array(
z.discriminatedUnion("type", [
z
.object({
type: z.literal("filter"),
path: z
.string()
.describe(
"Name of the field to index. For nested fields, use dot notation to specify path to embedded fields"
),
})
.strict()
.describe("Definition for a field that will be used for pre-filtering results."),
z
.object({
type: z.literal("vector"),
path: z
.string()
.describe(
"Name of the field to index. For nested fields, use dot notation to specify path to embedded fields"
),
numDimensions: z
.number()
.min(1)
.max(8192)
.default(this.config.vectorSearchDimensions)
.describe(
"Number of vector dimensions that MongoDB Vector Search enforces at index-time and query-time"
),
similarity: z
.enum(similarityValues)
.default(this.config.vectorSearchSimilarityFunction)
.describe(
"Vector similarity function to use to search for top K-nearest neighbors. You can set this field only for vector-type fields."
),
quantization: quantizationEnum
.default("none")
.describe(
"Type of automatic vector quantization for your vectors. Use this setting only if your embeddings are float or double vectors."
),
})
.strict()
.describe("Definition for a field that contains vector embeddings."),
])
)
.nonempty()
.refine((fields) => fields.some((f) => f.type === "vector"), {
message: "At least one vector field must be defined",
})
.describe(
"Definitions for the vector and filter fields to index, one definition per document. You must specify `vector` for fields that contain vector embeddings and `filter` for additional fields to filter on. At least one vector-type field definition is required."
),
})
.describe("Definition for a Vector Search index.");

private atlasSearchIndexDefinition = z
.object({
type: z.literal("search"),
analyzer: z
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably this should be an enum of the analyzers.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After discussing with the search team, searchAnalyzer and analyzers are not something we need to support based on current usage patterns. We can add support in the future if we see customer demand for it.

.string()
.optional()
.default("lucene.standard")
.describe(
"The analyzer to use for the index. Can be one of the built-in lucene analyzers (`lucene.standard`, `lucene.simple`, `lucene.whitespace`, `lucene.keyword`), a language-specific analyzer, such as `lucene.cjk` or `lucene.czech`, or a custom analyzer defined in the Atlas UI."
),
mappings: z
.object({
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lacks support for:

  • numPartitions
  • searchAnalyzer vs analyze
  • custom analyzers
  • storedSources
  • synonyms
  • typeSets

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://www.mongodb.com/docs/atlas/atlas-search/index-definitions/?deployment-type=atlas&interface=driver&language=nodejs#std-label-ref-index-definitions

We could say that custom analyzers are not that important, but storedSources is actually relevant most of the time.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The args shape is based on the POC the search team did for index support and I was going off of the assumption that they've selected the fields that they see the most value in exposing to LLMs. I realize there's a lot more configuration that's possible, I'm just not sure how much of that is stuff we expect agents to configure vs an actual human who wants to fine-tune the index.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A POC to assess the feasibility of creating search indexes and production code are likely to have different requirements.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • typeSets is still in preview, so I'm leaving it out for now.
  • numPartitions was added
  • searchAnalyzer and analyzers are out of scope per search team's recommendation

I'm leaving the following out of this PR for now as they have more complex schema and have been more error prone when testing:

  • synonyms
  • storedSources

We can add them as we move toward GA and build out a more comprehensive testing suite.

dynamic: z
.boolean()
.optional()
.default(false)
.describe(
"Enables or disables dynamic mapping of fields for this index. If set to true, Atlas Search recursively indexes all dynamically indexable fields. If set to false, you must specify individual fields to index using mappings.fields."
),
fields: z
.record(
z.string().describe("The field name"),
z
.object({
type: z
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Objects will require additional fields depending on the type. I know passthrough will keep them, but we should document them so the agent knows which ones to use and how. For example, autocomplete supports defining a custom analyzer, how to tokenize (which is really important) and similarity functions.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exact shape is extremely complex to represent in a json schema. I'm worried that being overly specific will result in this being more harmful than helpful, especially if we expect the majority of the use cases to revolve around just specifying the type.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, the schema is complicated; it has a lot of options, some of which are not even compatible with each other. We should have proper documentation of which ones we want to expose and which ones not, something that we haven't discussed yet because supporting the most used bits of Atlas Search is already a substantial effort.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leaving this out of scope for now and we can decide how much detail to provide as we build out a more comprehensive testing suite.

.enum([
"autocomplete",
"boolean",
"date",
"document",
"embeddedDocuments",
"geo",
"number",
"objectId",
"string",
"token",
"uuid",
])
.describe("The field type"),
})
.passthrough()
.describe(
"Type of automatic vector quantization for your vectors. Use this setting only if your embeddings are float or double vectors."
),
})
.strict()
.describe("Definition for a field that contains vector embeddings."),
])
)
.nonempty()
.refine((fields) => fields.some((f) => f.type === "vector"), {
message: "At least one vector field must be defined",
})
.describe(
"Definitions for the vector and filter fields to index, one definition per document. You must specify `vector` for fields that contain vector embeddings and `filter` for additional fields to filter on. At least one vector-type field definition is required."
),
});
"The field index definition. It must contain the field type, as well as any additional options for that field type."
)
)
.optional()
.describe("The field mapping definitions. If `dynamic` is set to `false`, this is required."),
})
.refine((data) => data.dynamic !== !!(data.fields && Object.keys(data.fields).length > 0), {
message:
"Either `dynamic` must be `true` and `fields` empty or `dynamic` must be `false` and at least one field must be defined in `fields`",
})
.describe(
"Document describing the index to create. Either `dynamic` must be `true` and `fields` empty or `dynamic` must be `false` and at least one field must be defined in the `fields` document."
),
numPartitions: CommonArgs.numberEnum([z.literal(1), z.literal(2), z.literal(4)])
.default(1)
.describe(
"Specifies the number of sub-indexes to create if the document count exceeds two billion. If omitted, defaults to 1."
),
})
.describe("Definition for an Atlas Search (lexical) index.");

public name = "create-index";
protected description = "Create an index for a collection";
Expand All @@ -72,15 +138,19 @@ export class CreateIndexTool extends MongoDBToolBase {
definition: z
.array(
z.discriminatedUnion("type", [
z.object({
type: z.literal("classic"),
keys: z.object({}).catchall(z.custom<IndexDirection>()).describe("The index definition"),
}),
...(this.isFeatureEnabled("vectorSearch") ? [this.vectorSearchIndexDefinition] : []),
z
.object({
type: z.literal("classic"),
keys: z.object({}).catchall(z.custom<IndexDirection>()).describe("The index definition"),
})
.describe("Definition for a MongoDB index (e.g. ascending/descending/geospatial)."),
...(this.isFeatureEnabled("vectorSearch")
? [this.vectorSearchIndexDefinition, this.atlasSearchIndexDefinition]
: []),
])
)
.describe(
`The index definition. Use 'classic' for standard indexes${this.isFeatureEnabled("vectorSearch") ? " and 'vectorSearch' for vector search indexes" : ""}.`
`The index definition. Use 'classic' for standard indexes${this.isFeatureEnabled("vectorSearch") ? ", 'vectorSearch' for vector search indexes, and 'search' for Atlas Search (lexical) indexes" : ""}.`
),
};

Expand Down Expand Up @@ -130,6 +200,26 @@ export class CreateIndexTool extends MongoDBToolBase {
this.session.vectorSearchEmbeddingsManager.cleanupEmbeddingsForNamespace({ database, collection });
}

break;
case "search":
{
await this.ensureSearchIsSupported();
indexes = await provider.createSearchIndexes(database, collection, [
{
name,
definition: {
mappings: definition.mappings,
analyzer: definition.analyzer,
numPartitions: definition.numPartitions,
},
type: "search",
},
]);

responseClarification =
" Since this is a search index, it may take a while for the index to build. Use the `list-indexes` tool to check the index status.";
}

break;
}

Expand Down
24 changes: 8 additions & 16 deletions src/tools/mongodb/mongodbSchemas.ts
Original file line number Diff line number Diff line change
@@ -1,26 +1,18 @@
import z from "zod";
import { zEJSON } from "../args.js";
import { CommonArgs, zEJSON } from "../args.js";

/**
 * Zod schema for the accepted Voyage model names.
 * Falls back to "voyage-3-large" when no value is provided.
 */
export const zVoyageModels = z
.enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"])
.default("voyage-3-large");

/**
 * Unwraps JS boxed numbers (like `Int32`) into primitive numbers.
 *
 * Zod does not understand boxed numbers as numeric literals, so this function is
 * used as a `z.preprocess` step to unwrap them before validation.
 *
 * @param v - The raw, unvalidated value.
 * @returns The numeric value of `v` when `v` is a non-array object whose `valueOf()`
 * result coerces to a number; otherwise `v` itself, unchanged, so that the schema
 * that runs afterwards can accept or reject it.
 */
function unboxNumber(v: unknown): number {
    // Arrays are objects too, and `Number([])` is 0 while `Number([4])` is 4. Without the
    // `Array.isArray` guard, array input would be silently accepted as a number.
    if (v && typeof v === "object" && !Array.isArray(v) && typeof v.valueOf === "function") {
        const n = Number(v.valueOf());
        if (!Number.isNaN(n)) return n;
    }
    // Not a boxed number: hand the value through untouched (the cast only satisfies the signature).
    return v as number;
}

export const zVoyageEmbeddingParameters = z.object({
outputDimension: z
.preprocess(
unboxNumber,
z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
)
outputDimension: CommonArgs.numberEnum([
z.literal(256),
z.literal(512),
z.literal(1024),
z.literal(2048),
z.literal(4096),
])
.optional()
.default(1024),
outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
Expand Down
10 changes: 7 additions & 3 deletions src/tools/mongodb/read/aggregate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ import {

const pipelineDescriptionWithVectorSearch = `\
An array of aggregation stages to execute.
\`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
If the user has asked for a vector search, \`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
If the user has asked for lexical/Atlas search, use \`$search\` instead of \`$text\`.
### Usage Rules for \`$vectorSearch\`
- **Unset embeddings:**
Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
Expand All @@ -29,9 +30,12 @@ If the user requests additional filtering, include filters in \`$vectorSearch.fi
NEVER include fields in $vectorSearch.filter that are not part of the vector index.
- **Post-filtering:**
For all remaining filters, add a $match stage after $vectorSearch.
### Note to LLM
- If unsure which fields are filterable, use the collection-indexes tool to determine valid prefilter fields.
- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.\
- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.

### Usage Rules for \`$search\`
- Include the index name, unless you know for a fact there's a default index. If unsure, use the collection-indexes tool to determine the index name.
- The \`$search\` stage supports multiple operators, such as 'autocomplete', 'text', 'geoWithin', and others. Choose the approprate operator based on the user's query. If unsure of the exact syntax, consult the MongoDB Atlas Search documentation, which can be found here: https://www.mongodb.com/docs/atlas/atlas-search/operators-and-collectors/
`;

const genericPipelineDescription = "An array of aggregation stages to execute.";
Expand Down
23 changes: 23 additions & 0 deletions tests/accuracy/aggregate.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -421,4 +421,27 @@ describeAccuracyTests([
},
},
},
{
prompt: "Run a $search query on mflix.movies to find all movies that mention 'space travel' in the plot or title. Use the default search index.",
expectedToolCalls: [
{
toolName: "aggregate",
parameters: {
database: "mflix",
collection: "movies",
pipeline: [
{
$search: {
index: Matcher.anyOf(Matcher.undefined, Matcher.value("default")),
text: {
query: "space travel",
path: ["plot", "title"],
},
},
},
],
},
},
],
},
]);
Loading
Loading