Skip to content

Commit 97deaa5

Browse files
densumesh authored and cdxker committed
feature: add firecrawl to our docker compose
1 parent 7e42e78 commit 97deaa5

File tree

5 files changed

+200
-2
lines changed

5 files changed

+200
-2
lines changed

.env.example

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,17 @@ JINA_CODE_API_KEY=""
6565
VECTOR_SIZES="384,512,768,1024,1536,3072"
6666
RUST_LOG="INFO"
6767
BM25_ACTIVE="true"
68+
69+
##### Firecrawl #####
70+
NUM_WORKERS_PER_QUEUE=8
71+
PORT=3002
72+
HOST=0.0.0.0
73+
FIRECRAWL_REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
74+
FIRECRAWL_REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
75+
PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
76+
77+
## To turn on DB authentication, you need to set up supabase.
78+
USE_DB_AUTHENTICATION=false
79+
HDX_NODE_BETA_MODE=1
80+
BULL_AUTH_KEY=@
81+
LOGGING_LEVEL=INFO

.env.firecrawl

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# ===== Required ENVS ======
2+
NUM_WORKERS_PER_QUEUE=8
3+
PORT=3002
4+
HOST=0.0.0.0
5+
FIRECRAWL_REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
6+
FIRECRAWL_REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
7+
PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
8+
9+
## To turn on DB authentication, you need to set up supabase.
10+
USE_DB_AUTHENTICATION=false
11+
12+
# ===== Optional ENVS ======
13+
14+
# Supabase Setup (used to support DB authentication, advanced logging, etc.)
15+
SUPABASE_ANON_TOKEN=
16+
SUPABASE_URL=
17+
SUPABASE_SERVICE_TOKEN=
18+
19+
# Other Optionals
20+
# use if you've set up authentication and want to test with a real API key
21+
TEST_API_KEY=
22+
# set if you'd like to test the scraping rate limit
23+
RATE_LIMIT_TEST_API_KEY_SCRAPE=
24+
# set if you'd like to test the crawling rate limit
25+
RATE_LIMIT_TEST_API_KEY_CRAWL=
26+
# set if you'd like to use ScrapingBee to handle JS blocking
27+
SCRAPING_BEE_API_KEY=
28+
# add for LLM-dependent features (image alt generation, etc.)
29+
OPENAI_API_KEY=
30+
BULL_AUTH_KEY=@
31+
# use if you're configuring basic logging with logtail
32+
LOGTAIL_KEY=
33+
# set if you have a llamaparse key you'd like to use to parse pdfs
34+
LLAMAPARSE_API_KEY=
35+
# set if you'd like to send slack server health status messages
36+
SLACK_WEBHOOK_URL=
37+
# set if you'd like to send posthog events like job logs
38+
POSTHOG_API_KEY=
39+
# set if you'd like to send posthog events like job logs
40+
POSTHOG_HOST=
41+
42+
STRIPE_PRICE_ID_STANDARD=
43+
STRIPE_PRICE_ID_SCALE=
44+
STRIPE_PRICE_ID_STARTER=
45+
STRIPE_PRICE_ID_HOBBY=
46+
STRIPE_PRICE_ID_HOBBY_YEARLY=
47+
STRIPE_PRICE_ID_STANDARD_NEW=
48+
STRIPE_PRICE_ID_STANDARD_NEW_YEARLY=
49+
STRIPE_PRICE_ID_GROWTH=
50+
STRIPE_PRICE_ID_GROWTH_YEARLY=
51+
52+
HYPERDX_API_KEY=
53+
HDX_NODE_BETA_MODE=1
54+
55+
# set if you'd like to use the fire engine closed beta
56+
FIRE_ENGINE_BETA_URL=
57+
58+
# Proxy Settings for Playwright (Alternatively, you can use a proxy service like oxylabs, which rotates IPs for you on every request)
59+
PROXY_SERVER=
60+
PROXY_USERNAME=
61+
PROXY_PASSWORD=
62+
# set if you'd like to block media requests to save proxy bandwidth
63+
BLOCK_MEDIA=
64+
65+
# Set this to the URL of your webhook when using the self-hosted version of FireCrawl
66+
SELF_HOSTED_WEBHOOK_URL=
67+
68+
# Resend API Key for transactional emails
69+
RESEND_API_KEY=
70+
71+
# LOGGING_LEVEL determines the verbosity of logs that the system will output.
72+
# Available levels are:
73+
# NONE - No logs will be output.
74+
# ERROR - For logging error messages that indicate a failure in a specific operation.
75+
# WARN - For logging potentially harmful situations that are not necessarily errors.
76+
# INFO - For logging informational messages that highlight the progress of the application.
77+
# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging.
78+
# TRACE - For logging more detailed information than the DEBUG level.
79+
# Set LOGGING_LEVEL to one of the above options to control logging output.
80+
LOGGING_LEVEL=INFO

convenience.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,13 @@ build_typescript_client() {
3030
echo "Done building the TypeScript client."
3131
}
3232

33+
# Bring up the Firecrawl stack from docker-compose-firecrawl.yml in detached mode (-d).
# The four services are named explicitly: firecrawl-worker, firecrawl-api, playwright-service and redis.
start_firecrawl() {
34+
echo "Starting Firecrawl..."
35+
docker compose -f docker-compose-firecrawl.yml up -d firecrawl-worker firecrawl-api playwright-service redis
36+
}
37+
3338
# Main script logic
34-
while getopts ":qps3lc" opt; do
39+
while getopts ":qps3lcf" opt; do
3540
case $opt in
3641
q)
3742
reset_qdrant_database
@@ -42,6 +47,9 @@ while getopts ":qps3lc" opt; do
4247
l)
4348
start_local_services
4449
;;
50+
f)
51+
start_firecrawl
52+
;;
4553
c)
4654
build_typescript_client
4755
;;

docker-compose-firecrawl.yml

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
name: firecrawl
2+
services:
3+
# Firecrawl services
4+
# Playwright scraping service. It listens on port 3000 and is attached only to the
# `backend` network; no host port is published. The firecrawl services default
# PLAYWRIGHT_MICROSERVICE_URL to http://playwright-service:3000.
playwright-service:
5+
image: trieve/playwright-service-ts:latest
6+
environment:
7+
- PORT=3000
8+
# Proxy settings and BLOCK_MEDIA are interpolated from the environment / .env of the `docker compose` call.
- PROXY_SERVER=${PROXY_SERVER}
9+
- PROXY_USERNAME=${PROXY_USERNAME}
10+
- PROXY_PASSWORD=${PROXY_PASSWORD}
11+
- BLOCK_MEDIA=${BLOCK_MEDIA}
12+
networks:
13+
- backend
14+
15+
# Firecrawl HTTP API. Published on host port 3002 and started with `pnpm run start:production`.
firecrawl-api:
16+
image: trieve/firecrawl:latest
17+
networks:
18+
- backend
19+
environment:
20+
# Redis settings are read from the FIRECRAWL_-prefixed variables (the names the env
# files define and firecrawl-worker uses) rather than the generic REDIS_URL, so the
# API and the worker always talk to the same Redis instance.
- REDIS_URL=${FIRECRAWL_REDIS_URL:-redis://redis:6379}
21+
# Rate-limit Redis: dedicated variable first, then the main Firecrawl Redis URL, then the bundled redis service.
- REDIS_RATE_LIMIT_URL=${FIRECRAWL_REDIS_RATE_LIMIT_URL:-${FIRECRAWL_REDIS_URL:-redis://redis:6379}}
22+
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
23+
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
24+
# NOTE(review): the published port below is fixed at 3002; overriding PORT would require changing `ports` as well.
- PORT=${PORT:-3002}
25+
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
26+
- OPENAI_API_KEY=${OPENAI_API_KEY}
27+
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
28+
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
29+
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
30+
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
31+
- LOGTAIL_KEY=${LOGTAIL_KEY}
32+
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
33+
- TEST_API_KEY=${TEST_API_KEY}
34+
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
35+
- POSTHOG_HOST=${POSTHOG_HOST}
36+
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
37+
- SUPABASE_URL=${SUPABASE_URL}
38+
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
39+
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
40+
- HOST=${HOST:-0.0.0.0}
41+
- SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
42+
- LOGGING_LEVEL=${LOGGING_LEVEL}
43+
extra_hosts:
44+
- "host.docker.internal:host-gateway"
45+
depends_on:
46+
- playwright-service
47+
ports:
48+
- "3002:3002"
49+
command: [ "pnpm", "run", "start:production" ]
50+
51+
# Firecrawl queue worker. Same image as firecrawl-api, started with `pnpm run workers`; publishes no host ports.
firecrawl-worker:
52+
image: trieve/firecrawl:latest
53+
networks:
54+
- backend
55+
environment:
56+
- REDIS_URL=${FIRECRAWL_REDIS_URL:-redis://redis:6379}
57+
# Honour FIRECRAWL_REDIS_RATE_LIMIT_URL (defined in the env files) and fall back to
# the main Firecrawl Redis URL, then to the bundled redis service, when it is unset.
- REDIS_RATE_LIMIT_URL=${FIRECRAWL_REDIS_RATE_LIMIT_URL:-${FIRECRAWL_REDIS_URL:-redis://redis:6379}}
58+
- PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000}
59+
- USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION}
60+
- PORT=${PORT:-3002}
61+
- NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE}
62+
- OPENAI_API_KEY=${OPENAI_API_KEY}
63+
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
64+
- MODEL_NAME=${MODEL_NAME:-gpt-4o}
65+
- SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL}
66+
- LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY}
67+
- LOGTAIL_KEY=${LOGTAIL_KEY}
68+
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
69+
- TEST_API_KEY=${TEST_API_KEY}
70+
- POSTHOG_API_KEY=${POSTHOG_API_KEY}
71+
- POSTHOG_HOST=${POSTHOG_HOST}
72+
- SUPABASE_ANON_TOKEN=${SUPABASE_ANON_TOKEN}
73+
- SUPABASE_URL=${SUPABASE_URL}
74+
- SUPABASE_SERVICE_TOKEN=${SUPABASE_SERVICE_TOKEN}
75+
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
76+
- HOST=${HOST:-0.0.0.0}
77+
- SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
78+
- LOGGING_LEVEL=${LOGGING_LEVEL}
79+
extra_hosts:
80+
- "host.docker.internal:host-gateway"
81+
depends_on:
82+
- playwright-service
83+
- firecrawl-api
84+
command: [ "pnpm", "run", "workers" ]
85+
86+
# Redis instance for the Firecrawl services; their Redis URLs default to redis://redis:6379.
# Attached only to the `backend` network and publishes no host ports.
redis:
87+
image: redis:alpine
88+
networks:
89+
- backend
90+
# Listen on all interfaces inside the container so other services on the network can connect.
command: redis-server --bind 0.0.0.0
91+
92+
93+
networks:
94+
backend:
95+
driver: bridge
96+

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,4 +355,4 @@ volumes:
355355
s3-data:
356356
keycloak-data:
357357
pg-keycloak-data:
358-
clickhouse-data:
358+
clickhouse-data:

0 commit comments

Comments
 (0)