Skip to content

Commit b63f9f9

Browse files
authored
feat: add header generator and integrate it into HTTPX client (#530)
### Description - This is the first version of the header generator, providing common HTTP headers including the user agent. - The user agent is picked at random from a pool of 1000 user agents taken from the Apify fingerprint dataset. - This is integrated into the HTTPX client and will be further used in the Playwright fingerprint injector (#401). ### Issues - Closes: #402 ### Testing - New unit tests implemented. ### Checklist - [x] CI passed
1 parent 0f968e8 commit b63f9f9

File tree

7 files changed

+1090
-2
lines changed

7 files changed

+1090
-2
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from ._header_generator import HeaderGenerator

src/crawlee/fingerprint_suite/_consts.py

Lines changed: 1009 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from __future__ import annotations
2+
3+
import random
4+
from typing import TYPE_CHECKING
5+
6+
from crawlee.fingerprint_suite._consts import COMMON_ACCEPT, COMMON_ACCEPT_LANGUAGE, USER_AGENT_POOL
7+
8+
if TYPE_CHECKING:
9+
from collections.abc import Mapping
10+
11+
12+
class HeaderGenerator:
    """Produces a baseline set of common headers for HTTP requests."""

    def get_common_headers(self) -> Mapping[str, str]:
        """Return common headers for HTTP requests.

        Headers such as 'Accept-Encoding' and 'Connection' are intentionally left out here;
        including and managing those is the responsibility of the HTTP client itself.

        Returns:
            Dictionary containing common headers.
        """
        # Pick a fresh user agent on every call so consecutive requests vary.
        user_agent = random.choice(USER_AGENT_POOL)

        common_headers = {
            'Accept': COMMON_ACCEPT,
            'Accept-Language': COMMON_ACCEPT_LANGUAGE,
            'User-Agent': user_agent,
        }
        return common_headers

src/crawlee/fingerprint_suite/py.typed

Whitespace-only changes.

src/crawlee/http_clients/_httpx.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,17 @@
66
import httpx
77
from typing_extensions import override
88

9+
from crawlee._types import HttpHeaders
910
from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
1011
from crawlee.errors import ProxyError
12+
from crawlee.fingerprint_suite import HeaderGenerator
1113
from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse
1214
from crawlee.sessions import Session
1315

1416
if TYPE_CHECKING:
1517
from collections.abc import Iterable
1618

17-
from crawlee._types import HttpHeaders, HttpMethod
19+
from crawlee._types import HttpMethod
1820
from crawlee.base_storage_client._models import Request
1921
from crawlee.proxy_configuration import ProxyInfo
2022
from crawlee.statistics import Statistics
@@ -77,6 +79,8 @@ class HttpxHttpClient(BaseHttpClient):
7779
See the `BaseHttpClient` class for more common information about HTTP clients.
7880
"""
7981

82+
_DEFAULT_HEADER_GENERATOR = HeaderGenerator()
83+
8084
def __init__(
8185
self,
8286
*,
@@ -85,6 +89,7 @@ def __init__(
8589
ignore_http_error_status_codes: Iterable[int] = (),
8690
http1: bool = True,
8791
http2: bool = True,
92+
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
8893
**async_client_kwargs: Any,
8994
) -> None:
9095
"""Create a new instance.
@@ -95,6 +100,7 @@ def __init__(
95100
ignore_http_error_status_codes: HTTP status codes to ignore as errors.
96101
http1: Whether to enable HTTP/1.1 support.
97102
http2: Whether to enable HTTP/2 support.
103+
header_generator: Header generator instance to use for generating common headers.
98104
async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`.
99105
"""
100106
super().__init__(
@@ -105,6 +111,7 @@ def __init__(
105111
self._http1 = http1
106112
self._http2 = http2
107113
self._async_client_kwargs = async_client_kwargs
114+
self._header_generator = header_generator
108115

109116
self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]()
110117

@@ -118,11 +125,12 @@ async def crawl(
118125
statistics: Statistics | None = None,
119126
) -> HttpCrawlingResult:
120127
client = self._get_client(proxy_info.url if proxy_info else None)
128+
headers = self._combine_headers(HttpHeaders(request.headers))
121129

122130
http_request = client.build_request(
123131
url=request.url,
124132
method=request.method,
125-
headers=request.headers,
133+
headers=headers,
126134
params=request.query_params,
127135
data=request.data,
128136
cookies=session.cookies if session else None,
@@ -164,6 +172,7 @@ async def send_request(
164172
proxy_info: ProxyInfo | None = None,
165173
) -> HttpResponse:
166174
client = self._get_client(proxy_info.url if proxy_info else None)
175+
headers = self._combine_headers(headers)
167176

168177
http_request = client.build_request(
169178
url=url,
@@ -215,6 +224,16 @@ def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
215224

216225
return self._client_by_proxy_url[proxy_url]
217226

227+
def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders | None:
    """Merge generated common headers with per-request headers.

    Explicitly supplied headers take precedence over the generated common
    ones when the same key appears in both.

    Returns:
        The combined headers, or `None` when there are no headers at all.
    """
    combined: dict = {}

    # Start from the generated common headers, if a generator is configured.
    if self._header_generator:
        combined.update(self._header_generator.get_common_headers())

    # Explicit per-request headers override the generated defaults.
    if explicit_headers:
        combined.update(explicit_headers)

    merged = HttpHeaders(combined)
    return merged if merged else None
236+
218237
@staticmethod
219238
def _is_proxy_error(error: httpx.TransportError) -> bool:
220239
"""Helper to check whether the given error is a proxy-related error."""
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from crawlee.fingerprint_suite import HeaderGenerator
2+
3+
4+
def test_get_common_headers() -> None:
    """The generator should always emit the three baseline headers."""
    generator = HeaderGenerator()
    common_headers = generator.get_common_headers()

    for expected_key in ('Accept', 'Accept-Language', 'User-Agent'):
        assert expected_key in common_headers

tests/unit/http_clients/test_httpx.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
from __future__ import annotations
22

3+
import json
34
import os
45
from typing import TYPE_CHECKING
56

67
import pytest
78

89
from crawlee import Request
910
from crawlee.errors import ProxyError
11+
from crawlee.fingerprint_suite._consts import COMMON_ACCEPT, COMMON_ACCEPT_LANGUAGE, USER_AGENT_POOL
1012
from crawlee.http_clients import HttpxHttpClient
1113
from crawlee.statistics import Statistics
1214

@@ -82,3 +84,22 @@ async def test_send_request_with_proxy_disabled(
8284

8385
with pytest.raises(ProxyError):
8486
await http_client.send_request(url, proxy_info=disabled_proxy)
87+
88+
89+
async def test_common_headers() -> None:
    """Headers generated by the client should reach the remote server.

    NOTE(review): this test depends on the external httpbin.org service being
    reachable — consider pointing it at a local echo server to avoid flakiness.
    """
    http_client = HttpxHttpClient()

    response = await http_client.send_request('https://httpbin.org/get')
    payload = json.loads(response.read().decode())
    sent_headers = payload.get('headers', {})

    # The generated common headers must have been sent verbatim.
    assert 'Accept' in sent_headers
    assert sent_headers['Accept'] == COMMON_ACCEPT

    assert 'Accept-Language' in sent_headers
    assert sent_headers['Accept-Language'] == COMMON_ACCEPT_LANGUAGE

    # By default, HTTPX uses its own User-Agent, which should be replaced by the one from the header generator.
    assert 'User-Agent' in sent_headers
    assert 'python-httpx' not in sent_headers['User-Agent']
    assert sent_headers['User-Agent'] in USER_AGENT_POOL

0 commit comments

Comments
 (0)