diff --git a/flags.json b/flags.json
index 0967ef424b..6003f4bc31 100644
--- a/flags.json
+++ b/flags.json
@@ -1 +1,3 @@
-{}
+{
+  "use-ai-search": true
+}
diff --git a/package-lock.json b/package-lock.json
index 63272b275e..ef0bde7fd0 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,9 +8,11 @@
       "name": "clerk-docs-2023",
       "version": "0.1.0",
       "devDependencies": {
+        "@ai-sdk/openai": "^2.0.65",
         "@parcel/watcher": "^2.5.1",
         "@sindresorhus/slugify": "^2.2.1",
         "@types/node": "^22.13.2",
+        "ai": "^5.0.93",
         "bun": "^1.2.20",
         "chokidar": "^4.0.3",
         "concurrently": "^8.2.2",
@@ -29,6 +31,7 @@
         "remark-mdx": "^3.0.1",
         "simple-git": "^3.27.0",
         "symlink-dir": "^6.0.5",
+        "tiktoken": "^1.0.22",
         "tsx": "^4.19.2",
         "typescript": "^5.7.3",
         "unist-builder": "^4.0.0",
@@ -43,6 +46,72 @@
         "zod-validation-error": "^3.4.0"
       }
     },
+    "node_modules/@ai-sdk/gateway": {
+      "version": "2.0.9",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/gateway/-/gateway-2.0.9.tgz",
+      "integrity": "sha512-E6x4h5CPPPJ0za1r5HsLtHbeI+Tp3H+YFtcH8G3dSSPFE6w+PZINzB4NxLZmg1QqSeA5HTP3ZEzzsohp0o2GEw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "2.0.0",
+        "@ai-sdk/provider-utils": "3.0.17",
+        "@vercel/oidc": "3.0.3"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.25.76 || ^4.1.8"
+      }
+    },
+    "node_modules/@ai-sdk/openai": {
+      "version": "2.0.65",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-2.0.65.tgz",
+      "integrity": "sha512-Wqo4iNCsxUYJFmEAzAHO0XDnnS76rqvPRX2AUhEAOX3cgL9UEfouFmiNQS2jM9AZhMdWj5GIG41hgr4YOr20tA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "2.0.0",
+        "@ai-sdk/provider-utils": "3.0.17"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.25.76 || ^4.1.8"
+      }
+    },
+    "node_modules/@ai-sdk/provider": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-2.0.0.tgz",
+      "integrity": "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "json-schema": "^0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@ai-sdk/provider-utils": {
+      "version": "3.0.17",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-3.0.17.tgz",
+      "integrity": "sha512-TR3Gs4I3Tym4Ll+EPdzRdvo/rc8Js6c4nVhFLuvGLX/Y4V9ZcQMa/HTiYsHEgmYrf1zVi6Q145UEZUfleOwOjw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/provider": "2.0.0",
+        "@standard-schema/spec": "^1.0.0",
+        "eventsource-parser": "^3.0.6"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.25.76 || ^4.1.8"
+      }
+    },
     "node_modules/@ampproject/remapping": {
       "version": "2.3.0",
       "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz",
@@ -146,6 +215,33 @@
         "node": ">=6.9.0"
       }
     },
+    "node_modules/@edge-runtime/primitives": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/@edge-runtime/primitives/-/primitives-4.1.0.tgz",
+      "integrity": "sha512-Vw0lbJ2lvRUqc7/soqygUX216Xb8T3WBZ987oywz6aJqRxcwSVWwr9e+Nqo2m9bxobA9mdbWNNoRY6S9eko1EQ==",
+      "dev": true,
+      "license": "MPL-2.0",
+      "optional": true,
+      "peer": true,
+      "engines": {
+        "node": ">=16"
+      }
+    },
+    "node_modules/@edge-runtime/vm": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/@edge-runtime/vm/-/vm-3.2.0.tgz",
+      "integrity": "sha512-0dEVyRLM/lG4gp1R/Ik5bfPl/1wX00xFwd5KcNH602tzBa09oF7pbTKETEhR1GjZ75K6OJnYFu8II2dyMhONMw==",
+      "dev": true,
+      "license": "MPL-2.0",
+      "optional": true,
+      "peer": true,
+      "dependencies": {
+        "@edge-runtime/primitives": "4.1.0"
+      },
+      "engines": {
+        "node": ">=16"
+      }
+    },
     "node_modules/@esbuild/aix-ppc64": {
       "version": "0.23.1",
       "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.23.1.tgz",
@@ -743,6 +839,16 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/@opentelemetry/api": {
+      "version": "1.9.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
+      "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
     "node_modules/@oven/bun-darwin-aarch64": {
       "version": "1.2.20",
       "resolved": "https://registry.npmjs.org/@oven/bun-darwin-aarch64/-/bun-darwin-aarch64-1.2.20.tgz",
@@ -1505,6 +1611,13 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/@standard-schema/spec": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.0.0.tgz",
+      "integrity": "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@types/acorn": {
       "version": "4.0.6",
       "resolved": "https://registry.npmjs.org/@types/acorn/-/acorn-4.0.6.tgz",
@@ -1576,6 +1689,16 @@
       "integrity": "sha512-dqId9J8K/vGi5Zr7oo212BGii5m3q5Hxlkwy3WpYuKPklmBEvsbMYYyLxAQpSffdLl/gdW0XUpKWFvYmyoWCoQ==",
       "dev": true
     },
+    "node_modules/@vercel/oidc": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/@vercel/oidc/-/oidc-3.0.3.tgz",
+      "integrity": "sha512-yNEQvPcVrK9sIe637+I0jD6leluPxzwJKx/Haw6F4H77CdDsszUn5V3o96LPziXkSNE2B83+Z3mjqGKBK/R6Gg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">= 20"
+      }
+    },
     "node_modules/@vitest/expect": {
       "version": "3.0.7",
       "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-3.0.7.tgz",
@@ -1720,6 +1843,25 @@
         "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0"
       }
     },
+    "node_modules/ai": {
+      "version": "5.0.93",
+      "resolved": "https://registry.npmjs.org/ai/-/ai-5.0.93.tgz",
+      "integrity": "sha512-9eGcu+1PJgPg4pRNV4L7tLjRR3wdJC9CXQoNMvtqvYNOLZHFCzjHtVIOr2SIkoJJeu2+sOy3hyiSuTmy2MA40g==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@ai-sdk/gateway": "2.0.9",
+        "@ai-sdk/provider": "2.0.0",
+        "@ai-sdk/provider-utils": "3.0.17",
+        "@opentelemetry/api": "1.9.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.25.76 || ^4.1.8"
+      }
+    },
     "node_modules/ansi-regex": {
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
@@ -2318,6 +2460,16 @@
         "@types/estree": "^1.0.0"
       }
     },
+    "node_modules/eventsource-parser": {
+      "version": "3.0.6",
+      "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz",
+      "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
     "node_modules/expect-type": {
       "version": "1.1.0",
       "resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.1.0.tgz",
@@ -2566,6 +2718,13 @@
         "url": "https://github.com/sponsors/isaacs"
       }
     },
+    "node_modules/json-schema": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
+      "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==",
+      "dev": true,
+      "license": "(AFL-2.1 OR BSD-3-Clause)"
+    },
     "node_modules/jsonc-parser": {
       "version": "3.3.1",
       "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.3.1.tgz",
@@ -4557,6 +4716,13 @@
         "node": ">=18.12"
       }
     },
+    "node_modules/tiktoken": {
+      "version": "1.0.22",
+      "resolved": "https://registry.npmjs.org/tiktoken/-/tiktoken-1.0.22.tgz",
+      "integrity": "sha512-PKvy1rVF1RibfF3JlXBSP0Jrcw2uq3yXdgcEXtKTYn3QJ/cBRBHDnrJ5jHky+MENZ6DIPwNUGWpkVx+7joCpNA==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/tinybench": {
       "version": "2.9.0",
       "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz",
@@ -5731,9 +5897,9 @@
       }
     },
     "node_modules/zod": {
-      "version": "3.24.2",
-      "resolved": "https://registry.npmjs.org/zod/-/zod-3.24.2.tgz",
-      "integrity": "sha512-lY7CDW43ECgW9u1TcT3IoXHflywfVqDYze4waEz812jR/bZ8FHDsl7pFQoSZTz5N+2NqRXs8GBwnAwo3ZNxqhQ==",
+      "version": "3.25.76",
+      "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
+      "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
       "dev": true,
       "license": "MIT",
       "funding": {
diff --git a/package.json b/package.json
index aca5cc8d7d..e565229810 100644
--- a/package.json
+++ b/package.json
@@ -22,12 +22,15 @@
     "move-doc": "node scripts/move-doc.mjs",
     "delete-doc": "node scripts/delete-doc.mjs",
     "migrate-sdk-scoping": "tsx scripts/migrate-sdk-scoping.ts",
-    "migrate-api-reference-links": "tsx scripts/migrate-api-reference-links.ts"
+    "migrate-api-reference-links": "tsx scripts/migrate-api-reference-links.ts",
+    "generate-embeddings": "bun scripts/generate-embeddings.ts"
   },
   "devDependencies": {
+    "@ai-sdk/openai": "^2.0.65",
     "@parcel/watcher": "^2.5.1",
     "@sindresorhus/slugify": "^2.2.1",
     "@types/node": "^22.13.2",
+    "ai": "^5.0.93",
     "bun": "^1.2.20",
     "chokidar": "^4.0.3",
     "concurrently": "^8.2.2",
@@ -46,6 +49,7 @@
     "remark-mdx": "^3.0.1",
     "simple-git": "^3.27.0",
     "symlink-dir": "^6.0.5",
+    "tiktoken": "^1.0.22",
     "tsx": "^4.19.2",
     "typescript": "^5.7.3",
     "unist-builder": "^4.0.0",
diff --git a/scripts/generate-embeddings.ts b/scripts/generate-embeddings.ts
new file mode 100644
index 0000000000..5489feab78
--- /dev/null
+++ b/scripts/generate-embeddings.ts
@@ -0,0 +1,373 @@
+const EMBEDDING_MODELS = {
+  small: {
+    model: 'text-embedding-3-small',
+    cost: 0.02 / 1_000_000,
+    batch_cost: 0.01 / 1_000_000,
+    max_tokens: 8_192,
+  },
+  large: {
+    model: 'text-embedding-3-large',
+    cost: 0.13 / 1_000_000,
+    batch_cost: 0.065 / 1_000_000,
+    max_tokens: 8_192,
+  },
+} as const satisfies Record<
+  string,
+  { model: OpenAIEmbeddingModelId; cost: number; batch_cost: number; max_tokens: number }
+>
+
+import 'dotenv/config'
+import { embedMany } from 'ai'
+import { createOpenAI, type openai } from '@ai-sdk/openai'
+import readdirp from 'readdirp'
+import fs from 'fs/promises'
+import yaml from 'yaml'
+import z from 'zod'
+import { encoding_for_model } from 'tiktoken'
+import { slugifyWithCounter } from '@sindresorhus/slugify'
+import { remark } from 'remark'
+import remarkMdx from 'remark-mdx'
+import { filter as mdastFilter } from 'unist-util-filter'
+import { map as mdastMap } from 'unist-util-map'
+
+const EMBEDDING_MODEL_SIZE = cliFlag('large') ? 'large' : 'small'
+const ESTIMATE_COST = cliFlag('estimate-cost')
+const EMBEDDING_MODEL = EMBEDDING_MODELS[EMBEDDING_MODEL_SIZE]
+const EMBEDDING_DIMENSIONS = cliFlag('dimensions', z.coerce.number().positive().optional()) ?? 1_536 / 3 // higher dimensions are more accurate but more expensive, and slower
+const OPENAI_EMBEDDINGS_API_KEY = env('OPENAI_EMBEDDINGS_API_KEY')
+// We want to use the dist folder as the markdown in there has the partials, tooltips, typedocs, etc. embedded in it.
+const DOCUMENTATION_FOLDER = cliFlag('docs', z.string().optional()) ?? './dist'
+const EMBEDDINGS_OUTPUT_PATH = cliFlag('output', z.string().optional()) ?? './dist/embeddings.json'
+
+type Chunk = {
+  type: 'page' | 'paragraph'
+  availableSdks?: string[]
+  activeSdk?: string
+  title: string
+  canonical: string
+  heading?: string
+  content: string
+  tokens: number
+  cost: number
+  searchRank?: number
+}
+type OpenAIEmbeddingModelId = Parameters<typeof openai.textEmbeddingModel>[0]
+
+async function main() {
+  console.info({
+    EMBEDDING_MODEL_SIZE,
+    EMBEDDING_MODEL,
+    EMBEDDING_DIMENSIONS,
+    ESTIMATE_COST,
+    DOCUMENTATION_FOLDER,
+    EMBEDDINGS_OUTPUT_PATH,
+  })
+
+  // List all the markdown files in the dist folder
+  const markdownFiles = (
+    await Promise.all(
+      (
+        await readdirp.promise(DOCUMENTATION_FOLDER, {
+          type: 'files',
+          fileFilter: '*.mdx',
+        })
+      ).map(async ({ fullPath }) => {
+        try {
+          const fileContents = await fs.readFile(fullPath, 'utf8')
+          const { frontmatter, content } = extractFrontmatter(fileContents)
+
+          const frontmatterTitle = frontmatter.title
+
+          if (frontmatterTitle === undefined) return null
+
+          const vfile = await remark()
+            .use(remarkMdx)
+            .use(() => (tree) => {
+              // Here we can filter out any nodes that we don't want to include in the search
+              return mdastFilter(tree, (node) => {
+                if (node.type === 'code') return false // remove code blocks from search
+                if (node.type === 'mdxJsxFlowElement') return false // remove mdx elements from search
+                if (node.type === 'mdxJsxTextElement') return false // remove mdx elements from search
+                if (node.type === 'mdxTextExpression') return false // remove `{ target: '_blank' }` tags
+                if (node.type === 'mdxFlowExpression') return false // remove `{/* ... */}` comments
+                if (node.type === 'image') return false // remove images from search
+
+                return true
+              })
+            })
+            .use(() => (tree) => {
+              return mdastMap(tree, (node) => {
+                // Remove the url by replacing the node with its children (the link text)
+                if (node.type === 'link' && 'children' in node && Array.isArray(node.children)) {
+                  return node.children[0]
+                }
+
+                // Remove bold, italic, underline, etc. tags
+                if (
+                  node.type === 'emphasis' ||
+                  node.type === 'strong' ||
+                  node.type === 'underline' ||
+                  node.type === 'strikethrough'
+                ) {
+                  if ('children' in node && Array.isArray(node.children)) {
+                    return node.children[0]
+                  }
+                }
+
+                // Remove blockquote style
+                if (node.type === 'blockquote') {
+                  if ('children' in node && Array.isArray(node.children)) {
+                    return node.children[0]
+                  }
+                }
+
+                return node
+              })
+            })
+            .process({
+              value: content,
+            })
+
+          return {
+            fullPath,
+            title: frontmatterTitle,
+            canonical: frontmatter.canonical,
+            availableSdks: frontmatter.availableSdks,
+            activeSdk: frontmatter.activeSdk,
+            searchRank: frontmatter.search?.rank,
+            content: String(vfile),
+          }
+        } catch (error) {
+          throw new Error(`Failed to parse ${fullPath}`, { cause: error })
+        }
+      }),
+    )
+  ).filter((file) => file !== null)
+  console.info(`✓ Loaded ${markdownFiles.length} markdown files from ${DOCUMENTATION_FOLDER}`)
+
+  // Chunk the markdown
+  const markdownChunks = markdownFiles.flatMap((file) => {
+    try {
+      // Include the title in the first chunk
+      let currentChunkContent: string[] | null = [`# ${file.title}`]
+      let currentHeading: string | undefined = undefined
+      let type: 'page' | 'paragraph' = 'page'
+      const slugify = slugifyWithCounter()
+
+      return file.content.split('\n').reduce((chunks, line, lineCount, lines) => {
+        const trimmedLine = line.trim()
+
+        // Detect if the current line is a h2 or h3 heading
+        const heading = isH2(trimmedLine) ?? isH3(trimmedLine)
+
+        if (currentChunkContent !== null && heading) {
+          // We have reached a new heading, so we need to add the current chunk to the chunks array
+          const content = currentChunkContent.join('\n')
+          const tokens = calcTokens(content)
+
+          if (tokens > EMBEDDING_MODEL.max_tokens) {
+            throw new Error(`Chunk content is too large, max tokens: ${EMBEDDING_MODEL.max_tokens}, tokens: ${tokens}`)
+          }
+
+          chunks.push({
+            title: file.title,
+            canonical: file.canonical,
+            availableSdks: file.availableSdks,
+            activeSdk: file.activeSdk,
+            heading: currentHeading ? slugify(currentHeading) : undefined,
+            content,
+            tokens,
+            cost: calcTokenCost({ tokens }),
+            type,
+            searchRank: file.searchRank,
+          })
+          // Reset the current chunk content
+          currentChunkContent = null
+
+          // Now switch to paragraphs for the rest of the file
+          type = 'paragraph'
+        }
+
+        if (heading) {
+          currentHeading = heading
+        }
+
+        if (currentChunkContent !== null && !heading) {
+          // Add the current line to the current chunk content
+          currentChunkContent.push(trimmedLine)
+
+          if (lineCount === lines.length - 1) {
+            // We have reached the end of the file, so we need to add the current chunk to the chunks array
+            const content = currentChunkContent.join('\n')
+            const tokens = calcTokens(content)
+
+            if (tokens > EMBEDDING_MODEL.max_tokens) {
+              throw new Error(
+                `Chunk content is too large, max tokens: ${EMBEDDING_MODEL.max_tokens}, tokens: ${tokens}`,
+              )
+            }
+
+            chunks.push({
+              title: file.title,
+              canonical: file.canonical,
+              availableSdks: file.availableSdks,
+              activeSdk: file.activeSdk,
+              heading: currentHeading ? slugify(currentHeading) : undefined,
+              content,
+              tokens,
+              cost: calcTokenCost({ tokens }),
+              type,
+              searchRank: file.searchRank,
+            })
+          }
+        }
+
+        if (currentChunkContent === null) {
+          // We are starting a new chunk, so just add in the first line
+          currentChunkContent = [trimmedLine]
+        }
+
+        return chunks
+      }, [] as Chunk[])
+    } catch (error) {
+      throw new Error(`Failed to chunk ${file.fullPath}`, { cause: error })
+    }
+  })
+  console.info(`✓ Converted ${markdownFiles.length} markdown files into ${markdownChunks.length} chunks`)
+
+  const totalCost = markdownChunks.reduce((acc, chunk) => acc + chunk.cost, 0)
+  const totalChunks = markdownChunks.reduce((acc, chunk) => acc + chunk.tokens, 0)
+  const largestChunk = markdownChunks.reduce((acc, chunk) => Math.max(acc, chunk.tokens), 0)
+  const smallestChunk = markdownChunks.reduce((acc, chunk) => Math.min(acc, chunk.tokens), 10_000)
+
+  console.info(`Total chunk Tokens: ${totalChunks}`)
+  console.info(`Largest chunk Tokens: ${largestChunk}`)
+  console.info(`Smallest chunk Tokens: ${smallestChunk}`)
+  console.info(`Estimated cost: $${totalCost.toFixed(6)}`)
+
+  if (ESTIMATE_COST) {
+    process.exit(0)
+  }
+
+  const openai = createOpenAI({ apiKey: OPENAI_EMBEDDINGS_API_KEY })
+
+  const { embeddings } = await embedMany({
+    model: openai.textEmbeddingModel(EMBEDDING_MODEL.model),
+    values: markdownChunks.map((chunk) => chunk.content),
+    providerOptions: {
+      openai: {
+        dimensions: EMBEDDING_DIMENSIONS,
+      },
+    },
+  })
+
+  const chunksWithEmbeddings = markdownChunks.map(({ cost, tokens, ...chunk }, index) => {
+    const embedding = embeddings[index]
+
+    if (!embedding) {
+      throw new Error(`No embedding found for chunk ${index}`)
+    }
+
+    return {
+      ...chunk,
+      id: index,
+      embedding: embedding,
+    }
+  })
+
+  console.info(`✓ Generated embeddings for ${chunksWithEmbeddings.length} chunks`)
+
+  await fs.writeFile(EMBEDDINGS_OUTPUT_PATH, JSON.stringify(chunksWithEmbeddings))
+  console.info(`✓ Wrote embeddings to ${EMBEDDINGS_OUTPUT_PATH}`)
+}
+
+// Only invoke main when the script is run directly, e.g. npm run generate-embeddings or bun run ./scripts/generate-embeddings.ts
+if (require.main === module) {
+  main()
+}
+
+function env(name: string): string
+function env(name: string, required: true): string
+function env(name: string, required: false): string | undefined
+function env(name: string, required: boolean = true): string | undefined {
+  const value = process.env[name]
+  if (required && !value) {
+    throw new Error(`Environment variable ${name} is required`)
+  }
+  return value
+}
+
+function cliFlag(name: string): boolean
+function cliFlag<T extends z.ZodType>(name: string, schema: T): z.infer<T> | undefined
+function cliFlag<T extends z.ZodType>(name: string, schema?: T): boolean | z.infer<T> | undefined {
+  if (schema) {
+    const arg = process.argv.find((f) => f.startsWith(`--${name}=`))
+    if (!arg) return undefined
+    const value = arg.split('=')[1]
+    try {
+      return schema.parse(value)
+    } catch (err) {
+      throw new Error(`Invalid value for flag --${name}: ${value}. Error: ${err}`)
+    }
+  } else {
+    return process.argv.includes(`--${name}`)
+  }
+}
+
+const frontmatterRegex = /---[\s\S]*?---/
+const frontmatterSchema = z.object({
+  canonical: z.string(),
+  title: z.string().optional(),
+  availableSdks: z
+    .string()
+    .optional()
+    .transform((value) => value?.split(',')),
+  activeSdk: z.string().optional(),
+  search: z
+    .object({
+      rank: z.number().optional(),
+    })
+    .optional(),
+})
+
+function extractFrontmatter(content: string) {
+  const frontmatterMatch = content.match(frontmatterRegex)
+  const frontmatter = frontmatterMatch?.[0]
+    ?.replace(/^---\s*\n?/, '')
+    .replace(/\n?---\s*$/, '')
+    .trim()
+
+  if (!frontmatter) {
+    throw new Error('No frontmatter found')
+  }
+
+  const parsedFrontmatter = yaml.parse(frontmatter)
+  const parsed = frontmatterSchema.parse(parsedFrontmatter)
+
+  // Remove the frontmatter from the content to return the rest
+  const contentWithoutFrontmatter = frontmatterMatch ? content.replace(frontmatterMatch[0], '').trimStart() : content
+
+  return {
+    frontmatter: parsed,
+    content: contentWithoutFrontmatter,
+  }
+}
+
+// Helper function to calculate the number of tokens in a string
+const encoder = encoding_for_model(EMBEDDING_MODEL.model)
+function calcTokens(text: string) {
+  return encoder.encode(text).length
+}
+function calcTokenCost(source: { tokens: number } | { text: string }) {
+  if ('text' in source) {
+    return calcTokens(source.text) * EMBEDDING_MODEL.cost
+  } else {
+    return source.tokens * EMBEDDING_MODEL.cost
+  }
+}
+
+const H2Regex = /^##\s+(.+)$/
+const isH2 = (line: string) => line.match(H2Regex)?.[1]
+
+const H3Regex = /^###\s+(.+)$/
+const isH3 = (line: string) => line.match(H3Regex)?.[1]
diff --git a/scripts/lib/api-errors.ts b/scripts/lib/api-errors.ts
index 9ea3b16fec..2b0f3f65c9 100644
--- a/scripts/lib/api-errors.ts
+++ b/scripts/lib/api-errors.ts
@@ -28,6 +28,8 @@ function parseApiErrors(errors: ApiError[], opts: ParseApiErrorsOpts): string {
 title: ${opts.title}
 description: ${opts.description}
 type: reference
+search:
+  rank: -10
 ---
 
 ${opts.description}
diff --git a/vercel.json b/vercel.json
index 79097c090e..a7faca6b89 100644
--- a/vercel.json
+++ b/vercel.json
@@ -1,6 +1,6 @@
 {
   "$schema": "https://openapi.vercel.sh/vercel.json",
-  "buildCommand": "npm run build",
+  "buildCommand": "npm run build && npm run generate-embeddings",
   "devCommand": "npm run dev",
   "installCommand": "npm install",
   "outputDirectory": "dist",