diff --git a/.env.example b/.env.example index e94d21e9..bcb14ecd 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,20 @@ +# Uncomment the crawler you are using: +CRAWLER="FIRECRAWL" +# CRAWLER="CRAWL4AI" + +# FIRECRAWL SETTINGS +# If you are using Firecrawl, add the following below: FIRECRAWL_KEY="YOUR_KEY" # If you want to use your self-hosted Firecrawl, add the following below: # FIRECRAWL_BASE_URL="http://localhost:3002" +# CRAWL4AI SETTINGS +# If you are using Crawl4AI, add the following below: +CRAWL4AI_API_TOKEN="YOUR_API_TOKEN" +# If your Crawl4AI is running on a different host, add the following below: +# CRAWL4AI_BASE_URL="http://localhost:11235" + +# If you are using OpenAI, add the following below: OPENAI_KEY="YOUR_KEY" CONTEXT_SIZE="128000" # If you want to use other OpenAI compatible API, add the following below: diff --git a/README.md b/README.md index 161455ae..8d27fed4 100644 --- a/README.md +++ b/README.md @@ -88,22 +88,23 @@ flowchart TB 1. Clone the repository 2. Install dependencies: -```bash -npm install -``` + ```bash + npm install + ``` 3. Set up environment variables in a `.env.local` file: -```bash -FIRECRAWL_KEY="your_firecrawl_key" -# If you want to use your self-hosted Firecrawl, add the following below: -# FIRECRAWL_BASE_URL="http://localhost:3002" + ```bash + FIRECRAWL_KEY="your_firecrawl_key" + # If you want to use your self-hosted Firecrawl, add the following below: + # FIRECRAWL_BASE_URL="http://localhost:3002" -OPENAI_KEY="your_openai_key" -``` + OPENAI_KEY="your_openai_key" + ``` To use local LLM, comment out `OPENAI_KEY` and instead uncomment `OPENAI_ENDPOINT` and `OPENAI_MODEL`: -- Set `OPENAI_ENDPOINT` to the address of your local server (eg."http://localhost:1234/v1") + +- Set `OPENAI_ENDPOINT` to the address of your local server (e.g. "`http://localhost:1234/v1`") - Set `OPENAI_MODEL` to the name of the model loaded in your local server. 
### Docker @@ -115,22 +116,23 @@ To use local LLM, comment out `OPENAI_KEY` and instead uncomment `OPENAI_ENDPOIN 4. Run the Docker image: -```bash -docker compose up -d -``` + ```bash + docker compose up -d + ``` 5. Execute `npm run docker` in the docker service: -```bash -docker exec -it deep-research npm run docker -``` + + ```bash + docker exec -it deep-research npm run docker + ``` ## Usage Run the research assistant: -```bash -npm start -``` + ```bash + npm start + ``` You'll be prompted to: @@ -158,10 +160,10 @@ If you have a free version, you may sometimes run into rate limit errors, you ca There are 2 other optional env vars that lets you tweak the endpoint (for other OpenAI compatible APIs like OpenRouter or Gemini) as well as the model string. -```bash -OPENAI_ENDPOINT="custom_endpoint" -OPENAI_MODEL="custom_model" -``` + ```bash + OPENAI_ENDPOINT="custom_endpoint" + OPENAI_MODEL="custom_model" + ``` ## How It Works diff --git a/src/deep-research.ts b/src/deep-research.ts index 1505c4af..c3281942 100644 --- a/src/deep-research.ts +++ b/src/deep-research.ts @@ -1,4 +1,3 @@ -import FirecrawlApp, { SearchResponse } from '@mendable/firecrawl-js'; import { generateObject } from 'ai'; import { compact } from 'lodash-es'; import pLimit from 'p-limit'; @@ -16,6 +15,88 @@ function log(...args: any[]) { output.log(...args); } +// --- NEW DYNAMIC IMPORT BLOCK --- +// Determine which crawler library to use based on the environment variable +const crawlerType = process.env.CRAWLER || 'FIRECRAWL'; +let searchFunction: (query: string, options: any) => Promise<{ data: Array<{ markdown: string; url?: string }> }>; + +// If the CRAWLER env var is set to "CRAWL4AI", use the HTTP API +if (crawlerType === 'CRAWL4AI') { + // Create a wrapper for the Crawl4AI HTTP API + const crawl4aiBaseUrl = process.env.CRAWL4AI_BASE_URL ?? 'http://localhost:11235'; + const crawl4aiToken = process.env.CRAWL4AI_API_TOKEN ?? 
''; + + searchFunction = async (query: string, options: any) => { + const headers = { + 'Authorization': `Bearer ${crawl4aiToken}`, + 'Content-Type': 'application/json', + }; + + // Submit crawl job + const response = await fetch(`${crawl4aiBaseUrl}/crawl`, { + method: 'POST', + headers, + body: JSON.stringify({ + urls: query, + priority: 10, + // Map any relevant options from Firecrawl format to Crawl4AI format + ...(options.timeout && { ttl: options.timeout }), + ...(options.limit && { max_results: options.limit }), + }) + }); + + if (!response.ok) { + throw new Error(`Crawl4AI API error: ${response.statusText}`); + } + + const { task_id } = await response.json(); + + // Poll for result with timeout + const startTime = Date.now(); + while (true) { + if (Date.now() - startTime > (options.timeout || 15000)) { + throw new Error('Timeout waiting for Crawl4AI result'); + } + + const statusResponse = await fetch(`${crawl4aiBaseUrl}/task/${task_id}`, { + headers + }); + + if (!statusResponse.ok) { + throw new Error(`Crawl4AI status check error: ${statusResponse.statusText}`); + } + + const status = await statusResponse.json(); + + if (status.status === 'completed') { + // Transform Crawl4AI response to match Firecrawl's format + return { + data: [{ + markdown: status.result.markdown, + url: query + }] + }; + } + + // Wait before polling again + await new Promise(resolve => setTimeout(resolve, 2000)); + } + }; +} else { + // Default to using FIRECRAWL + const { default: FirecrawlApp } = require('@mendable/firecrawl-js'); + // Instantiate Firecrawl with optional API keys/URLs + const firecrawl = new FirecrawlApp({ + apiKey: process.env.FIRECRAWL_KEY ?? 
'', + apiUrl: process.env.FIRECRAWL_BASE_URL, + }); + searchFunction = async (query: string, options: any) => { + return await firecrawl.search(query, options); + }; +} + +// --- END NEW DYNAMIC IMPORT BLOCK --- + export type ResearchProgress = { currentDepth: number; totalDepth: number; @@ -34,13 +115,6 @@ type ResearchResult = { // increase this if you have higher API rate limits const ConcurrencyLimit = 2; -// Initialize Firecrawl with optional API key and optional base url - -const firecrawl = new FirecrawlApp({ - apiKey: process.env.FIRECRAWL_KEY ?? '', - apiUrl: process.env.FIRECRAWL_BASE_URL, -}); - // take en user query, return a list of SERP queries async function generateSerpQueries({ query, @@ -197,7 +271,7 @@ export async function deepResearch({ reportProgress({ totalQueries: serpQueries.length, - currentQuery: serpQueries[0]?.query + currentQuery: serpQueries[0]?.query, }); const limit = pLimit(ConcurrencyLimit); @@ -206,7 +280,7 @@ export async function deepResearch({ serpQueries.map(serpQuery => limit(async () => { try { - const result = await firecrawl.search(serpQuery.query, { + const result = await searchFunction(serpQuery.query, { timeout: 15000, limit: 5, scrapeOptions: { formats: ['markdown'] },