Merged
4 changes: 2 additions & 2 deletions docs/guides/code_examples/request_loaders/sitemap_example.py
@@ -1,13 +1,13 @@
import asyncio
import re

from crawlee.http_clients import HttpxHttpClient
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
# Create an HTTP client for fetching sitemaps
async with HttpxHttpClient() as http_client:
async with ImpitHttpClient() as http_client:
# Create a sitemap request loader with URL filtering
sitemap_loader = SitemapRequestLoader(
sitemap_urls=['https://crawlee.dev/sitemap.xml'],
14 changes: 7 additions & 7 deletions docs/guides/http_clients.mdx
@@ -36,24 +36,24 @@ class HttpClient {
%% Specific classes
%% ========================

class ImpitHttpClient

class HttpxHttpClient

class CurlImpersonateHttpClient

class ImpitHttpClient

%% ========================
%% Inheritance arrows
%% ========================

HttpClient --|> ImpitHttpClient
HttpClient --|> HttpxHttpClient
HttpClient --|> CurlImpersonateHttpClient
HttpClient --|> ImpitHttpClient
```

## Switching between HTTP clients

Crawlee currently provides three main HTTP clients: <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library, <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library, and <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>.
Crawlee currently provides three main HTTP clients: <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, which uses the `impit` library, <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, which uses the `httpx` library with `browserforge` for custom HTTP headers and fingerprints, and <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, which uses the `curl-cffi` library. You can switch between them by setting the `http_client` parameter when initializing a crawler class. The default HTTP client is <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>. For more details on anti-blocking features, see our [avoid getting blocked guide](./avoid-blocking).

Below are examples of how to configure the HTTP client for the <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink>:

@@ -77,18 +77,18 @@
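A minimal sketch of such a configuration, assuming the `http_client` constructor parameter described above; the start URL here is illustrative:

```python
import asyncio

from crawlee.crawlers import ParselCrawler
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # Pass the chosen HTTP client to the crawler via the `http_client` parameter.
    crawler = ParselCrawler(http_client=ImpitHttpClient())
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```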

## Installation requirements

Since <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.
Since <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink> is the default HTTP client, it's included with the base Crawlee installation and requires no additional packages.

For <ApiLink to="class/CurlImpersonateHttpClient">`CurlImpersonateHttpClient`</ApiLink>, you need to install Crawlee with the `curl-impersonate` extra:

```sh
python -m pip install 'crawlee[curl-impersonate]'
```

For <ApiLink to="class/ImpitHttpClient">`ImpitHttpClient`</ApiLink>, you need to install Crawlee with the `impit` extra:
For <ApiLink to="class/HttpxHttpClient">`HttpxHttpClient`</ApiLink>, you need to install Crawlee with the `httpx` extra:

```sh
python -m pip install 'crawlee[impit]'
python -m pip install 'crawlee[httpx]'
```

Alternatively, you can install all available extras to get access to all HTTP clients and features:
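```sh
# Installs every optional extra, including all HTTP clients (the `all` extra defined in pyproject.toml).
python -m pip install 'crawlee[all]'
```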
14 changes: 7 additions & 7 deletions pyproject.toml
@@ -33,11 +33,9 @@ keywords = [
"scraping",
]
dependencies = [
"apify_fingerprint_datapoints>=0.0.2",
"browserforge>=1.2.3",
"cachetools>=5.5.0",
"colorama>=0.4.0",
"httpx[brotli,http2,zstd]>=0.27.0",
"impit>=0.4.2",
"more-itertools>=10.2.0",
"protego>=0.5.0",
"psutil>=6.0.0",
@@ -52,18 +50,20 @@ dependencies = [
]

[project.optional-dependencies]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,parsel,playwright,otel]"]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
"scikit-learn>=1.6.0",
"apify_fingerprint_datapoints>=0.0.2",
"browserforge>=1.2.3"
]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0","impit>=0.4.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
impit = ["impit>=0.4.0"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
otel = [
"opentelemetry-api>=1.34.1",
"opentelemetry-distro[otlp]>=0.54",
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -56,7 +56,7 @@
UserDefinedErrorHandlerError,
)
from crawlee.events._types import Event, EventCrawlerStatusData
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_clients import ImpitHttpClient
from crawlee.router import Router
from crawlee.sessions import SessionPool
from crawlee.statistics import Statistics, StatisticsState
@@ -367,7 +367,7 @@ def __init__(
set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()
)

self._http_client = http_client or HttpxHttpClient()
self._http_client = http_client or ImpitHttpClient()

# Request router setup
self._router: Router[TCrawlingContext] | None = None
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -21,7 +21,7 @@
from crawlee.errors import SessionError
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_clients import ImpitHttpClient
from crawlee.sessions._cookies import PlaywrightCookieParam
from crawlee.statistics import StatisticsState

@@ -473,7 +473,7 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
Args:
url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
"""
http_client = HttpxHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
http_client = ImpitHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client

return await RobotsTxtFile.find(url, http_client=http_client)

6 changes: 3 additions & 3 deletions src/crawlee/http_clients/__init__.py
@@ -3,7 +3,7 @@

# These imports have only mandatory dependencies, so they are imported directly.
from ._base import HttpClient, HttpCrawlingResult, HttpResponse
from ._httpx import HttpxHttpClient
from ._impit import ImpitHttpClient

_install_import_hook(__name__)

@@ -12,8 +12,8 @@
with _try_import(__name__, 'CurlImpersonateHttpClient'):
from ._curl_impersonate import CurlImpersonateHttpClient

with _try_import(__name__, 'ImpitHttpClient'):
from ._impit import ImpitHttpClient
with _try_import(__name__, 'HttpxHttpClient'):
from ._httpx import HttpxHttpClient


__all__ = [
9 changes: 4 additions & 5 deletions src/crawlee/http_clients/_impit.py
@@ -102,7 +102,6 @@ def __init__(
persist_cookies_per_session: Whether to persist cookies per HTTP session.
http3: Whether to enable HTTP/3 support.
verify: SSL certificates used to verify the identity of requested hosts.
header_generator: Header generator instance to use for generating common headers.
browser: Browser to impersonate.
async_client_kwargs: Additional keyword arguments for `impit.AsyncClient`.
"""
@@ -135,7 +134,7 @@ async def crawl(
content=request.payload,
headers=dict(request.headers) if request.headers else None,
)
except (TransportError, HTTPError) as exc: # type: ignore[misc] # waiting for merge https://github.com/apify/impit/pull/207
except (TransportError, HTTPError) as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
raise
@@ -167,7 +166,7 @@ async def send_request(
response = await client.request(
method=method, url=url, content=payload, headers=dict(headers) if headers else None
)
except (TransportError, HTTPError) as exc: # type: ignore[misc] # waiting for merge https://github.com/apify/impit/pull/207
except (TransportError, HTTPError) as exc:
if self._is_proxy_error(exc):
raise ProxyError from exc
raise
@@ -194,7 +193,7 @@ async def stream(
url=url,
content=payload,
headers=dict(headers) if headers else None,
stream=True, # type: ignore[call-arg] # waiting for merge https://github.com/apify/impit/pull/207
stream=True,
)
try:
yield _ImpitResponse(response)
@@ -233,7 +232,7 @@
return client

@staticmethod
def _is_proxy_error(error: RuntimeError) -> bool:
def _is_proxy_error(error: HTTPError) -> bool:
"""Determine whether the given error is related to a proxy issue.

Check if the error message contains known proxy-related error keywords.
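A minimal instantiation sketch based on the `__init__` parameters documented above; the specific values (Firefox impersonation, HTTP/3 enabled) are illustrative assumptions, not defaults:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import ImpitHttpClient

# Parameter names follow the __init__ docstring above; the values are illustrative.
http_client = ImpitHttpClient(
    persist_cookies_per_session=True,  # keep cookies tied to each crawler session
    http3=True,                        # enable HTTP/3 support
    browser='firefox',                 # browser to impersonate
)

# The configured client can then be passed to any HTTP-based crawler.
crawler = HttpCrawler(http_client=http_client)
```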
2 changes: 1 addition & 1 deletion src/crawlee/project_template/cookiecutter.json
@@ -3,7 +3,7 @@
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright", "playwright-camoufox"],
"__crawler_type": "{{ cookiecutter.crawler_type|lower|replace('-', '_') }}",
"http_client": ["httpx", "curl-impersonate", "impit"],
"http_client": ["impit", "httpx", "curl-impersonate"],
"package_manager": ["poetry", "pip", "uv"],
"enable_apify_integration": false,
"install_project": true,
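For reference, a hedged sketch of generating a project non-interactively with these options, using the flags exercised in `tests/unit/test_cli.py` below; the project name and start URL are illustrative:

```sh
# Illustrative values; the flags mirror those passed in test_cli.py below.
crawlee create my_project \
    --crawler-type playwright \
    --http-client impit \
    --package-manager pip \
    --start-url 'https://crawlee.dev'
```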
9 changes: 9 additions & 0 deletions tests/unit/crawlers/_http/test_http_crawler.py
@@ -512,7 +512,16 @@ async def handler(context: HttpCrawlingContext) -> None:
'http_only': False,
}

# Some clients may ignore `.` at the beginning of the domain
# https://www.rfc-editor.org/rfc/rfc6265#section-4.1.2.3
assert session_cookies_dict['domain'] == {
'name': 'domain',
'value': '6',
'domain': {server_url.host},
'path': '/',
'secure': False,
'http_only': False,
} or {
'name': 'domain',
'value': '6',
'domain': f'.{server_url.host}',
6 changes: 2 additions & 4 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -31,7 +31,7 @@
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.http_clients import HttpxHttpClient
from crawlee.http_clients import ImpitHttpClient
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import Session, SessionPool
from crawlee.statistics import Statistics
@@ -694,9 +694,7 @@ async def test_send_request_with_client(server_url: URL) -> None:
"""Check that the persist context works with fingerprints."""
check_data: dict[str, Any] = {}

crawler = PlaywrightCrawler(
http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'})
)
crawler = PlaywrightCrawler(http_client=ImpitHttpClient(headers={'user-agent': 'My User-Agent'}))

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
16 changes: 8 additions & 8 deletions tests/unit/test_cli.py
@@ -45,7 +45,7 @@
'project_name': 'my_project',
'package_manager': 'poetry',
'crawler_type': 'beautifulsoup',
'http_client': 'httpx',
'http_client': 'impit',
'enable_apify_integration': False,
'start_url': 'https://crawlee.dev',
'install_project': True,
@@ -79,7 +79,7 @@
'project_name': 'my_project',
'package_manager': 'poetry',
'crawler_type': 'parsel',
'http_client': 'httpx',
'http_client': 'impit',
'enable_apify_integration': False,
'start_url': 'https://crawlee.dev',
'install_project': True,
Expand All @@ -96,7 +96,7 @@ def test_create_non_interactive(mock_cookiecutter: Mock) -> None:
'--crawler-type',
'playwright',
'--http-client',
'curl-impersonate',
'httpx',
'--package-manager',
'pip',
'--start-url',
@@ -113,7 +113,7 @@ def test_create_non_interactive(mock_cookiecutter: Mock) -> None:
'project_name': 'my_project',
'package_manager': 'pip',
'crawler_type': 'playwright',
'http_client': 'curl-impersonate',
'http_client': 'httpx',
'start_url': 'https://yr.no',
'enable_apify_integration': False,
'install_project': False,
@@ -144,7 +144,7 @@ def test_create_existing_folder(
'--crawler-type',
'playwright',
'--http-client',
'curl-impersonate',
'httpx',
'--package-manager',
'pip',
'--start-url',
@@ -162,7 +162,7 @@ def test_create_existing_folder(
'project_name': 'my_project',
'package_manager': 'pip',
'crawler_type': 'playwright',
'http_client': 'curl-impersonate',
'http_client': 'httpx',
'start_url': 'https://yr.no',
'enable_apify_integration': False,
'install_project': True,
@@ -202,7 +202,7 @@ def test_create_existing_folder_interactive(
'project_name': 'my_project',
'package_manager': 'poetry',
'crawler_type': 'playwright',
'http_client': 'httpx',
'http_client': 'impit',
'start_url': 'https://crawlee.dev',
'enable_apify_integration': False,
'install_project': True,
@@ -245,7 +245,7 @@ def test_create_existing_folder_interactive_multiple_attempts(
'project_name': 'my_project',
'package_manager': 'poetry',
'crawler_type': 'playwright',
'http_client': 'httpx',
'http_client': 'impit',
'start_url': 'https://crawlee.dev',
'enable_apify_integration': False,
'install_project': True,