Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
"""Converts all header keys to lowercase and returns them sorted by key."""
normalized_headers = {k.lower(): v for k, v in headers.items()}
"""Converts all header keys to lowercase, strips whitespace, and returns them sorted by key."""
normalized_headers = {k.lower().strip(): v.strip() for k, v in headers.items()}
sorted_headers = sorted(normalized_headers.items())
return dict(sorted_headers)

Expand Down
10 changes: 5 additions & 5 deletions src/crawlee/_utils/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,11 @@ def compute_unique_key(

Args:
url: The request URL.
method: The HTTP method, defaults to 'GET'.
headers: The HTTP headers, defaults to None.
payload: The data to be sent as the request body, defaults to None.
keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
method: The HTTP method.
headers: The HTTP headers.
payload: The data to be sent as the request body.
keep_url_fragment: A flag indicating whether to keep the URL fragment.
use_extended_unique_key: A flag indicating whether to include a hashed payload in the key.

Returns:
A string representing the unique key for the request.
Expand Down
183 changes: 92 additions & 91 deletions tests/unit/_utils/test_requests.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

from crawlee._types import HttpHeaders
from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id

if TYPE_CHECKING:
from crawlee._types import HttpMethod, HttpPayload


def test_unique_key_to_request_id_length() -> None:
unique_key = 'exampleKey123'
Expand Down Expand Up @@ -41,7 +36,7 @@ def test_unique_key_to_request_id_consistency() -> None:
'url_unsafe_characters',
],
)
def test_unique_key_to_request_id_known_values(unique_key: str, expected_request_id: str) -> None:
def test_unique_key_to_request_id_matches_known_values(unique_key: str, expected_request_id: str) -> None:
request_id = unique_key_to_request_id(unique_key)
assert request_id == expected_request_id, f'Unique key "{unique_key}" should produce the expected request ID.'

Expand Down Expand Up @@ -78,93 +73,99 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
assert output == expected_output


@pytest.mark.parametrize(
('url', 'method', 'headers', 'payload', 'keep_url_fragment', 'use_extended_unique_key', 'expected_output'),
[
('http://example.com', 'GET', None, None, False, False, 'http://example.com'),
('http://example.com', 'POST', None, None, False, False, 'http://example.com'),
('http://example.com', 'GET', None, 'data', False, False, 'http://example.com'),
(
'http://example.com',
'GET',
None,
'data',
False,
True,
'GET|e3b0c442|3a6eb079|http://example.com',
),
(
'http://example.com',
'POST',
HttpHeaders({'Content-Type': 'application/json'}),
'data',
False,
True,
'POST|60d83e70|3a6eb079|http://example.com',
),
(
'http://example.com',
'POST',
HttpHeaders({'Content-Type': 'application/json', 'Custom-Header': 'should be ignored'}),
'data',
False,
True,
'POST|60d83e70|3a6eb079|http://example.com',
),
('http://example.com#fragment', 'GET', None, None, True, False, 'http://example.com#fragment'),
('http://example.com#fragment', 'GET', None, None, False, False, 'http://example.com'),
(
'http://example.com',
'DELETE',
None,
'test',
False,
True,
'DELETE|e3b0c442|9f86d081|http://example.com',
),
('https://example.com?utm_content=test', 'GET', None, None, False, False, 'https://example.com'),
('https://example.com?utm_content=test', 'GET', None, None, True, False, 'https://example.com'),
(
'http://example.com',
'GET',
HttpHeaders({'Accept': 'text/html'}),
None,
False,
True,
'GET|f1614162|e3b0c442|http://example.com',
),
],
ids=[
'simple_get',
'simple_post',
'get_with_payload',
'get_with_payload_extended',
'post_with_payload_extended',
'post_with_payload_and_headers',
'get_with_fragment',
'get_remove_fragment',
'delete_with_payload_extended',
'get_remove_utm',
'get_keep_utm_fragment',
'get_with_headers_extended',
],
)
def test_compute_unique_key(
url: str,
method: HttpMethod,
headers: HttpHeaders | None,
payload: HttpPayload | None,
*,
keep_url_fragment: bool,
use_extended_unique_key: bool,
expected_output: str,
) -> None:
output = compute_unique_key(
def test_compute_unique_key_basic() -> None:
    """With default options the unique key equals the URL for both GET and POST requests."""
    target = 'https://crawlee.dev'

    get_key = compute_unique_key(target, method='GET')
    post_key = compute_unique_key(target, method='POST')

    assert get_key == target
    assert post_key == target


def test_compute_unique_key_handles_fragments() -> None:
    """The URL fragment is preserved only when `keep_url_fragment` is enabled."""
    url_with_fragment = 'https://crawlee.dev#fragment'

    # Fragment kept: the unique key is the URL exactly as given.
    kept = compute_unique_key(url_with_fragment, keep_url_fragment=True)
    assert kept == url_with_fragment

    # Fragment dropped: only the part before '#' remains.
    dropped = compute_unique_key(url_with_fragment, 'GET', keep_url_fragment=False)
    assert dropped == 'https://crawlee.dev'


def test_compute_unique_key_handles_payload() -> None:
    """Check how the payload affects the unique key with and without the extended format."""
    url = 'https://crawlee.dev'
    payload = '{"key": "value"}'

    # Without the extended unique key the payload is ignored and the key is just the URL.
    plain_key = compute_unique_key(url, method='POST', payload=payload, use_extended_unique_key=False)
    assert plain_key == url

    # With the extended unique key: no payload, a string payload, and the same payload as bytes.
    # The string and bytes variants are expected to produce the same key.
    extended_cases = (
        (None, 'POST|e3b0c442|e3b0c442|https://crawlee.dev'),
        (payload, 'POST|e3b0c442|9724c1e2|https://crawlee.dev'),
        (payload.encode(), 'POST|e3b0c442|9724c1e2|https://crawlee.dev'),
    )
    for body, expected_key in extended_cases:
        extended_key = compute_unique_key(url, method='POST', payload=body, use_extended_unique_key=True)
        assert extended_key == expected_key


def test_compute_unique_key_handles_headers() -> None:
    """Headers affect the unique key only in extended mode, and `Accept-Encoding` does not change it."""
    url = 'https://crawlee.dev'
    expected_extended_key = 'GET|4e1a2cf6|e3b0c442|https://crawlee.dev'
    base_headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})

    # Extended mode off: the key is the plain URL even though headers are supplied.
    plain_key = compute_unique_key(url, headers=base_headers, use_extended_unique_key=False)
    assert plain_key == url

    # Extended mode on: the key matches the known extended value.
    extended_key = compute_unique_key(url, headers=base_headers, use_extended_unique_key=True)
    assert extended_key == expected_extended_key

    # Adding an Accept-Encoding header must yield the same extended key as before.
    headers_with_encoding = HttpHeaders(
        {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Content-Type': 'application/json'}
    )
    extended_key = compute_unique_key(url, headers=headers_with_encoding, use_extended_unique_key=True)
    assert extended_key == expected_extended_key


def test_compute_unique_key_complex() -> None:
    """Combine method, headers and a bytes payload, checking both the plain and the extended unique key."""
    url = 'https://crawlee.dev'
    headers = HttpHeaders({'Accept': '*/*', 'Content-Type': 'application/json'})
    payload = b'{"key": "value"}'

    # Extended mode off: method, headers and payload are all supplied, yet the key is just the URL.
    uk = compute_unique_key(
        url,
        method='POST',
        headers=headers,
        payload=payload,
        use_extended_unique_key=False,
    )
    assert uk == url

    # Extended mode on: each keyword argument is passed exactly once (repeating a keyword in a call
    # is a SyntaxError), and only names defined in this test are referenced.
    extended_uk = compute_unique_key(
        url,
        method='POST',
        headers=headers,
        payload=payload,
        use_extended_unique_key=True,
    )
    assert extended_uk == 'POST|4e1a2cf6|9724c1e2|https://crawlee.dev'


def test_compute_unique_key_post_with_none_payload() -> None:
    """An extended unique key for a POST request with `payload=None` matches the known value."""
    target = 'https://crawlee.dev'
    unique_key = compute_unique_key(target, 'POST', payload=None, use_extended_unique_key=True)
    assert unique_key == 'POST|e3b0c442|e3b0c442|https://crawlee.dev'


def test_compute_unique_key_with_whitespace_in_headers() -> None:
    """Surrounding whitespace in a header value must not change the extended unique key."""
    url = 'https://crawlee.dev'
    expected_key = 'GET|60d83e70|e3b0c442|https://crawlee.dev'

    # The same header, once clean and once padded with spaces around the value.
    header_variants = (
        HttpHeaders({'Content-Type': 'application/json'}),
        HttpHeaders({'Content-Type': ' application/json '}),
    )
    for variant in header_variants:
        unique_key = compute_unique_key(url, headers=variant, use_extended_unique_key=True)
        assert unique_key == expected_key
Loading