Merged
Changes from 4 commits
7 changes: 7 additions & 0 deletions docs/upgrading/upgrading_to_v1.md
@@ -5,6 +5,7 @@ title: Upgrading to v1

This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.


## Terminology change: "browser" in different contexts

The word "browser" is now used distinctly in two contexts:
@@ -102,6 +103,7 @@ Some older methods have been removed or replaced:

- `from_storage_object` - removed; use the `open` method with either a `name` or `id` instead.
- `get_info` and `storage_object` - replaced by the new `get_metadata` method.
- `get_request` now takes a `unique_key` argument instead of `request_id`, since the `id` field was removed from `Request` (see the sketch after this list).
- `set_metadata` method has been removed.
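
For illustration, a minimal sketch of the replacement calls on a `RequestQueue` (the queue name and URL are made up; `get_metadata` and `get_request` follow the bullets above):

```python
import asyncio

from crawlee import Request
from crawlee.storages import RequestQueue


async def main() -> None:
    # `open()` with a `name` (or `id`) replaces the removed `from_storage_object`.
    rq = await RequestQueue.open(name='my-queue')

    # `get_metadata()` replaces the older `get_info` / `storage_object`.
    metadata = await rq.get_metadata()
    print(metadata)

    # Requests are looked up by `unique_key` now that `id` is gone.
    request = Request.from_url('https://crawlee.dev')
    await rq.add_request(request)
    fetched = await rq.get_request(request.unique_key)
    print(fetched)


asyncio.run(main())
```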

Some changes in the related model classes:
@@ -200,6 +202,11 @@ We drop support for Python 3.9. The minimum supported version is now Python 3.10

The fields `persist_storage` and `persist_metadata` have been removed from the `Configuration`. Persistence is now determined only by which storage client class you use.
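
As an illustration, persistence is now selected by picking a storage client rather than by `Configuration` flags; a sketch assuming the `MemoryStorageClient` class and the `service_locator` registration (neither is shown in this diff):

```python
from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient

# Previously: Configuration(persist_storage=False, persist_metadata=False).
# Now nothing is persisted simply because the in-memory client is selected;
# choosing a persistent (e.g. file-system based) client is what re-enables persistence.
service_locator.set_storage_client(MemoryStorageClient())
```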

### Changes in Request

The `id` field has been removed from `Request`; all of its usages have been replaced by the `unique_key` field, and `Request.from_url` no longer accepts an `id` argument.
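
A minimal sketch of the change from the caller's perspective (URL and label are illustrative):

```python
from crawlee import Request

request = Request.from_url('https://crawlee.dev', label='detail')

# v0.6: request.id was derived from the unique key and used for lookups.
# v1.0: the `unique_key` itself is the identifier everywhere.
print(request.unique_key)
```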

### Changes in HttpResponse

The method `HttpResponse.read` is now asynchronous. This affects all HTTP-based crawlers.
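
For example, a handler sketch assuming the `HttpCrawler` API (URL is illustrative):

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

crawler = HttpCrawler()


@crawler.router.default_handler
async def handler(context: HttpCrawlingContext) -> None:
    # v0.6: body = context.http_response.read()
    # v1.0: `read()` is a coroutine and must be awaited.
    body = await context.http_response.read()
    context.log.info(f'Fetched {len(body)} bytes from {context.request.url}')


asyncio.run(crawler.run(['https://crawlee.dev']))
```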
12 changes: 1 addition & 11 deletions src/crawlee/_request.py
@@ -11,7 +11,7 @@
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group
-from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+from crawlee._utils.requests import compute_unique_key
from crawlee._utils.urls import validate_http_url

if TYPE_CHECKING:
@@ -165,10 +165,6 @@ class Request(BaseModel):

model_config = ConfigDict(populate_by_name=True)

-id: str
-"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
-with `unique_key`."""

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.
@@ -239,7 +235,6 @@ def from_url(
label: str | None = None,
session_id: str | None = None,
unique_key: str | None = None,
-id: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
always_enqueue: bool = False,
@@ -264,8 +259,6 @@
raised.
unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
the URL and other parameters. Requests with the same `unique_key` are treated as identical.
-id: A unique identifier for the request. If not provided, it is automatically generated from the
-`unique_key`.
keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
the `unique_key` computation. This is only relevant when `unique_key` is not provided.
use_extended_unique_key: Determines whether to include the HTTP method, session ID and payload in the
@@ -296,12 +289,9 @@
if always_enqueue:
unique_key = f'{unique_key}_{crypto_random_object_id()}'

-id = id or unique_key_to_request_id(unique_key)

request = cls(
url=url,
unique_key=unique_key,
-id=id,
method=method,
headers=headers,
payload=payload,
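
For reference, a short sketch of what the `from_url` change above means in practice: with `id` gone, deduplication hinges entirely on `unique_key`, and `always_enqueue=True` sidesteps it by appending a random suffix, as in the code above:

```python
from crawlee import Request

first = Request.from_url('https://crawlee.dev', always_enqueue=True)
second = Request.from_url('https://crawlee.dev', always_enqueue=True)

# The random suffix appended for `always_enqueue=True` makes the unique keys
# differ, so the two requests are not deduplicated; no `id` is generated anymore.
assert first.unique_key != second.unique_key
```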
26 changes: 0 additions & 26 deletions src/crawlee/_utils/requests.py
@@ -1,8 +1,5 @@
from __future__ import annotations

-import re
-from base64 import b64encode
-from hashlib import sha256
from logging import getLogger
from typing import TYPE_CHECKING

@@ -16,29 +13,6 @@
logger = getLogger(__name__)


-def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:
-    """Generate a deterministic request ID based on a unique key.
-
-    Args:
-        unique_key: The unique key to convert into a request ID.
-        request_id_length: The length of the request ID.
-
-    Returns:
-        A URL-safe, truncated request ID based on the unique key.
-    """
-    # Encode the unique key and compute its SHA-256 hash
-    hashed_key = sha256(unique_key.encode('utf-8')).digest()
-
-    # Encode the hash in base64 and decode it to get a string
-    base64_encoded = b64encode(hashed_key).decode('utf-8')
-
-    # Remove characters that are not URL-safe ('+', '/', or '=')
-    url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded)
-
-    # Truncate the key to the desired length
-    return url_safe_key[:request_id_length]


def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
"""Normalize a URL.

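If downstream code imported `unique_key_to_request_id`, a local copy mirroring the removed implementation above can serve as a drop-in replacement (a sketch, not a Crawlee API):

```python
import re
from base64 import b64encode
from hashlib import sha256


def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:
    """Generate a deterministic, URL-safe ID from a unique key (mirrors the removed helper)."""
    hashed_key = sha256(unique_key.encode('utf-8')).digest()
    base64_encoded = b64encode(hashed_key).decode('utf-8')
    url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded)
    return url_safe_key[:request_id_length]
```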
11 changes: 5 additions & 6 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1057,7 +1057,7 @@ async def _handle_request_retries(
max_retries=3,
)
await self._handle_failed_request(context, error)
-self._statistics.record_request_processing_failure(request.id or request.unique_key)
+self._statistics.record_request_processing_failure(request.unique_key)

async def _handle_request_error(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None:
try:
@@ -1274,7 +1274,7 @@ async def __run_task_function(self) -> None:

if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
self._logger.warning(
-f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
+f'Skipping request {request.url} ({request.unique_key}) because it is disallowed based on robots.txt'
)

await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
@@ -1300,8 +1300,7 @@ async def __run_task_function(self) -> None:
)
self._context_result_map[context] = result

-statistics_id = request.id or request.unique_key
-self._statistics.record_request_processing_start(statistics_id)
+self._statistics.record_request_processing_start(request.unique_key)

try:
request.state = RequestState.REQUEST_HANDLER
@@ -1328,7 +1327,7 @@ async def __run_task_function(self) -> None:
if context.session and context.session.is_usable:
context.session.mark_good()

-self._statistics.record_request_processing_finish(statistics_id)
+self._statistics.record_request_processing_finish(request.unique_key)

except RequestCollisionError as request_error:
context.request.no_retry = True
@@ -1374,7 +1373,7 @@ async def __run_task_function(self) -> None:
)

await self._handle_failed_request(context, session_error)
-self._statistics.record_request_processing_failure(statistics_id)
+self._statistics.record_request_processing_failure(request.unique_key)

except ContextPipelineInterruptedError as interrupted_error:
self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
4 changes: 2 additions & 2 deletions src/crawlee/request_loaders/_request_list.py
@@ -166,7 +166,7 @@ async def fetch_next_request(self) -> Request | None:
return None

state = await self._get_state()
-state.in_progress.add(self._next[0].id)
+state.in_progress.add(self._next[0].unique_key)
self._assumed_total_count += 1

next_request = self._next[0]
@@ -183,7 +183,7 @@
async def mark_request_as_handled(self, request: Request) -> None:
self._handled_count += 1
state = await self._get_state()
-state.in_progress.remove(request.id)
+state.in_progress.remove(request.unique_key)

async def _ensure_next_request(self) -> None:
await self._get_state()
6 changes: 3 additions & 3 deletions src/crawlee/request_loaders/_sitemap_request_loader.py
@@ -153,15 +153,15 @@ async def fetch_next_request(self) -> Request | None:
url = await self._url_queue.get()

request = Request.from_url(url)
-self._in_progress.add(request.id)
+self._in_progress.add(request.unique_key)
return request

return None

async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
"""Mark a request as successfully handled."""
-if request.id in self._in_progress:
-self._in_progress.remove(request.id)
+if request.unique_key in self._in_progress:
+self._in_progress.remove(request.unique_key)
self._handled_count += 1
return None

4 changes: 2 additions & 2 deletions src/crawlee/storage_clients/_base/_request_queue_client.py
@@ -63,11 +63,11 @@ async def add_batch_of_requests(
"""

@abstractmethod
-async def get_request(self, request_id: str) -> Request | None:
+async def get_request(self, unique_key: str) -> Request | None:
"""Retrieve a request from the queue.

Args:
-request_id: ID of the request to retrieve.
+unique_key: Unique key of the request to retrieve.

Returns:
The retrieved request, or None, if it did not exist.
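
For completeness, a sketch of calling the updated client method; the import path of the abstract `RequestQueueClient` is an assumption, and any concrete implementation would be used in practice:

```python
from __future__ import annotations

from crawlee import Request
from crawlee.storage_clients._base import RequestQueueClient  # assumed import path


async def reload_request(client: RequestQueueClient, request: Request) -> Request | None:
    # v0.6: await client.get_request(request.id)
    # v1.0: the lookup key is the request's unique key.
    return await client.get_request(request.unique_key)
```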