5 changes: 5 additions & 0 deletions docs/upgrading/upgrading_to_v1.md
@@ -102,6 +102,7 @@ Some older methods have been removed or replaced:

- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead.
- `get_info` and `storage_object` - replaced by the new `get_metadata` method.
- `get_request` now takes a `unique_key` argument instead of `request_id`, because the `id` field was removed from `Request` (see the sketch below).
- `set_metadata` method has been removed.
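As a minimal before/after sketch of the `get_request` change (the `request_queue` and `request` names here are assumed for illustration; they are not part of this guide):

```python
# Before: requests were looked up by their generated ID.
# stored = await request_queue.get_request(request.id)

# After: requests are looked up by their unique key.
stored = await request_queue.get_request(request.unique_key)
```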

Some changes in the related model classes:
@@ -200,6 +201,10 @@ We drop support for Python 3.9. The minimum supported version is now Python 3.10

The fields `persist_storage` and `persist_metadata` have been removed from the `Configuration`. Persistence is now determined only by which storage client class you use.

### Changes in Request

`Request` objects no longer have an `id` field; everywhere it was previously used, the `unique_key` field serves as the identifier.
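A short usage sketch (assuming the top-level `crawlee.Request` import; the URL and key are placeholders):

```python
from crawlee import Request

# `from_url` no longer accepts an `id` argument; `unique_key` is the identifier.
request = Request.from_url('https://example.com', unique_key='example-key')

# Code that previously read `request.id` should read `request.unique_key` instead.
print(request.unique_key)
```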

### Changes in HttpResponse

The method `HttpResponse.read` is now asynchronous. This affects all HTTP-based crawlers.
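For example, inside a request handler of an HTTP-based crawler (the `context` variable and its `http_response` attribute follow the usual crawling-context shape and are assumptions here):

```python
# Reading the raw response body now requires awaiting the call.
body: bytes = await context.http_response.read()
```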
12 changes: 1 addition & 11 deletions src/crawlee/_request.py
@@ -11,7 +11,7 @@
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
from crawlee._utils.requests import compute_unique_key
from crawlee._utils.urls import validate_http_url

if TYPE_CHECKING:
@@ -165,10 +165,6 @@ class Request(BaseModel):

model_config = ConfigDict(populate_by_name=True)

id: str
"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
with `unique_key`."""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.
@@ -239,7 +235,6 @@ def from_url(
label: str | None = None,
session_id: str | None = None,
unique_key: str | None = None,
id: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
always_enqueue: bool = False,
@@ -264,8 +259,6 @@
raised.
unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
the URL and other parameters. Requests with the same `unique_key` are treated as identical.
id: A unique identifier for the request. If not provided, it is automatically generated from the
`unique_key`.
keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
the `unique_key` computation. This is only relevant when `unique_key` is not provided.
use_extended_unique_key: Determines whether to include the HTTP method, session ID, and payload in the
@@ -296,12 +289,9 @@
if always_enqueue:
unique_key = f'{unique_key}_{crypto_random_object_id()}'

id = id or unique_key_to_request_id(unique_key)

request = cls(
url=url,
unique_key=unique_key,
id=id,
method=method,
headers=headers,
payload=payload,
26 changes: 0 additions & 26 deletions src/crawlee/_utils/requests.py
@@ -1,8 +1,5 @@
from __future__ import annotations

import re
from base64 import b64encode
from hashlib import sha256
from logging import getLogger
from typing import TYPE_CHECKING

@@ -16,29 +13,6 @@
logger = getLogger(__name__)


def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:
"""Generate a deterministic request ID based on a unique key.

Args:
unique_key: The unique key to convert into a request ID.
request_id_length: The length of the request ID.

Returns:
A URL-safe, truncated request ID based on the unique key.
"""
# Encode the unique key and compute its SHA-256 hash
hashed_key = sha256(unique_key.encode('utf-8')).digest()

# Encode the hash in base64 and decode it to get a string
base64_encoded = b64encode(hashed_key).decode('utf-8')

# Remove characters that are not URL-safe ('+', '/', or '=')
url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded)

# Truncate the key to the desired length
return url_safe_key[:request_id_length]


def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
"""Normalize a URL.

11 changes: 5 additions & 6 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1057,7 +1057,7 @@ async def _handle_request_retries(
max_retries=3,
)
await self._handle_failed_request(context, error)
self._statistics.record_request_processing_failure(request.id or request.unique_key)
self._statistics.record_request_processing_failure(request.unique_key)

async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
try:
@@ -1274,7 +1274,7 @@ async def __run_task_function(self) -> None:

if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
self._logger.warning(
f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
)

await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
@@ -1300,8 +1300,7 @@
)
self._context_result_map[context] = result

statistics_id = request.id or request.unique_key
self._statistics.record_request_processing_start(statistics_id)
self._statistics.record_request_processing_start(request.unique_key)

try:
request.state = RequestState.REQUEST_HANDLER
@@ -1328,7 +1327,7 @@
if context.session and context.session.is_usable:
context.session.mark_good()

self._statistics.record_request_processing_finish(statistics_id)
self._statistics.record_request_processing_finish(request.unique_key)

except RequestCollisionError as request_error:
context.request.no_retry = True
@@ -1374,7 +1373,7 @@
)

await self._handle_failed_request(context, session_error)
self._statistics.record_request_processing_failure(statistics_id)
self._statistics.record_request_processing_failure(request.unique_key)

except ContextPipelineInterruptedError as interrupted_error:
self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
4 changes: 2 additions & 2 deletions src/crawlee/request_loaders/_request_list.py
@@ -166,7 +166,7 @@ async def fetch_next_request(self) -> Request | None:
return None

state = await self._get_state()
state.in_progress.add(self._next[0].id)
state.in_progress.add(self._next[0].unique_key)
self._assumed_total_count += 1

next_request = self._next[0]
@@ -183,7 +183,7 @@
async def mark_request_as_handled(self, request: Request) -> None:
self._handled_count += 1
state = await self._get_state()
state.in_progress.remove(request.id)
state.in_progress.remove(request.unique_key)

async def _ensure_next_request(self) -> None:
await self._get_state()
6 changes: 3 additions & 3 deletions src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -153,15 +153,15 @@ async def fetch_next_request(self) -> Request | None:
url = await self._url_queue.get()

request = Request.from_url(url)
self._in_progress.add(request.id)
self._in_progress.add(request.unique_key)
return request

return None

async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
"""Mark a request as successfully handled."""
if request.id in self._in_progress:
self._in_progress.remove(request.id)
if request.unique_key in self._in_progress:
self._in_progress.remove(request.unique_key)
self._handled_count += 1
return None

4 changes: 2 additions & 2 deletions src/crawlee/storage_clients/_base/_request_queue_client.py
@@ -63,11 +63,11 @@ async def add_batch_of_requests(
"""

@abstractmethod
async def get_request(self, request_id: str) -> Request | None:
async def get_request(self, unique_key: str) -> Request | None:
"""Retrieve a request from the queue.

Args:
request_id: ID of the request to retrieve.
unique_key: Unique key of the request to retrieve.

Returns:
The retrieved request, or None, if it did not exist.
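To illustrate the new contract from an implementer's side, a hedged fragment of a custom client keyed on `unique_key` (the class name, the in-memory dict, and the exact import path are assumptions; the remaining abstract methods are omitted):

```python
from __future__ import annotations

from crawlee import Request
from crawlee.storage_clients._base import RequestQueueClient  # import path assumed


class SketchRequestQueueClient(RequestQueueClient):
    """Illustrative fragment only; the other abstract methods are not shown."""

    def __init__(self) -> None:
        self._by_unique_key: dict[str, Request] = {}

    async def get_request(self, unique_key: str) -> Request | None:
        # Requests are now looked up by `unique_key`; there is no separate `id`.
        return self._by_unique_key.get(unique_key)
```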