Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/crawlee/_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from typing_extensions import Self

from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams, JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
from crawlee._utils.urls import extract_query_params, validate_http_url

Expand Down Expand Up @@ -278,6 +279,7 @@ def from_url(
id: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
always_enqueue: bool = False,
**kwargs: Any,
) -> Self:
"""Create a new `Request` instance from a URL.
Expand All @@ -303,8 +305,12 @@ def from_url(
the `unique_key` computation. This is only relevant when `unique_key` is not provided.
use_extended_unique_key: Determines whether to include the HTTP method and payload in the `unique_key`
computation. This is only relevant when `unique_key` is not provided.
always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
**kwargs: Additional request properties.
"""
if unique_key is not None and always_enqueue:
raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`')

headers = headers or HttpHeaders()
query_params = query_params or {}

Expand All @@ -317,6 +323,9 @@ def from_url(
use_extended_unique_key=use_extended_unique_key,
)

if always_enqueue:
unique_key = f'{unique_key}_{crypto_random_object_id()}'

id = id or unique_key_to_request_id(unique_key)

request = cls(
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/storages/test_request_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,35 @@ async def test_complex_user_data_serialization(request_queue: RequestQueue) -> N
'maxRetries': 1,
'state': RequestState.ERROR_HANDLER,
}


async def test_deduplication_of_requests_with_custom_unique_key() -> None:
    """Check that `Request.from_url` rejects `always_enqueue=True` combined with an explicit `unique_key`."""
    expected_message = '`always_enqueue` cannot be used with a custom `unique_key`'

    with pytest.raises(ValueError, match=expected_message):
        Request.from_url('https://apify.com', unique_key='apify', always_enqueue=True)


async def test_deduplication_of_requests_with_invalid_custom_unique_key() -> None:
    """Check that two `always_enqueue` requests for the same URL are both kept by the queue.

    Despite the test name, no custom `unique_key` is passed here; both requests rely on the key generated
    by `Request.from_url` with `always_enqueue=True`.
    """
    first = Request.from_url('https://apify.com', always_enqueue=True)
    second = Request.from_url('https://apify.com', always_enqueue=True)

    # NOTE(review): another test in this file opens the same queue name and the queue is not dropped here;
    # presumably storage is reset between tests elsewhere - confirm.
    queue = await RequestQueue.open(name='my-rq')
    for item in (first, second):
        await queue.add_request(item)

    # Neither request may be discarded as a duplicate of the other.
    assert await queue.get_total_count() == 2

    # The queue is expected to hand the requests back in insertion order.
    for expected in (first, second):
        assert await queue.fetch_next_request() == expected


async def test_deduplication_of_requests_with_valid_custom_unique_key() -> None:
    """Check that two default requests for the same URL are stored only once by the queue.

    Despite the test name, no custom `unique_key` is passed here; both requests use the key that
    `Request.from_url` generates by default (without `always_enqueue`).
    """
    original = Request.from_url('https://apify.com')
    duplicate = Request.from_url('https://apify.com')

    # NOTE(review): another test in this file opens the same queue name and the queue is not dropped here;
    # presumably storage is reset between tests elsewhere - confirm.
    queue = await RequestQueue.open(name='my-rq')
    for item in (original, duplicate):
        await queue.add_request(item)

    # The second request is expected to be deduplicated against the first.
    assert await queue.get_total_count() == 1

    fetched = await queue.fetch_next_request()
    assert fetched == original
Loading