4 changes: 4 additions & 0 deletions docs/upgrading/upgrading_to_v0x.md
@@ -22,6 +22,10 @@ This section summarizes the breaking changes between v0.4.x and v0.5.0.

- The `crawlee.statistics.Statistics` class no longer accepts an event manager as an input argument. It uses the default, global one.

### Request

- Removed the deprecated internal properties `json_` and `order_no` from the `Request` class.

## Upgrading to v0.4

This section summarizes the breaking changes between v0.3.x and v0.4.0.
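For code that touched these properties: the ordering metadata they carried now lives on the storage-layer `InternalRequest` model added later in this PR. A minimal migration sketch, assuming the top-level `Request` export and the private `_models` module shown below:

```python
from decimal import Decimal

from crawlee import Request
from crawlee.base_storage_client._models import InternalRequest

request = Request.from_url('https://example.com')
# request.json_ and request.order_no now raise AttributeError in v0.5.0.

# Queue-ordering metadata is attached at the storage layer instead:
internal = InternalRequest.from_request(request, id=request.id, order_no=Decimal(1))
assert internal.to_request().url == 'https://example.com'
```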
48 changes: 1 addition & 47 deletions src/crawlee/_request.py
@@ -2,7 +2,6 @@

from collections.abc import Iterator, MutableMapping
from datetime import datetime
from decimal import Decimal
from enum import IntEnum
from typing import TYPE_CHECKING, Annotated, Any, cast

@@ -141,11 +140,7 @@ class BaseRequestData(BaseModel):
        BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
Collaborator:

Isn't the BeforeValidator still needed when loading an old request file?

Collaborator (author):

Good point. We need both so that the payload is correctly serialized to and deserialized from a file.
Updated the e2e test.
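To make the thread concrete, a standalone round-trip sketch using the same validator/serializer pair — a stripped-down stand-in for `BaseRequestData`, not the real model:

```python
from typing import Annotated

from pydantic import BaseModel, BeforeValidator, PlainSerializer


class PayloadModel(BaseModel):
    # str -> bytes on load (validator), bytes -> str on dump (serializer),
    # so the payload survives being written to and read from a JSON file.
    payload: Annotated[
        bytes | None,
        BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
    ] = None


dumped = PayloadModel(payload=b'{"query": "test"}').model_dump_json()
restored = PayloadModel.model_validate_json(dumped)
assert restored.payload == b'{"query": "test"}'
```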

        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
    ] = None
    """HTTP request payload.

    TODO: Re-check the need for `Validator` and `Serializer` once the issue is resolved.
    https://github.com/apify/crawlee-python/issues/94
    """
    """HTTP request payload."""

    user_data: Annotated[
        dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
@@ -256,18 +251,6 @@ class Request(BaseRequestData):
"""A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
with `unique_key`."""

json_: str | None = None
"""Deprecated internal field, do not use it.

Should be removed as part of https://github.com/apify/crawlee-python/issues/94.
"""

order_no: Decimal | None = None
"""Deprecated internal field, do not use it.

Should be removed as part of https://github.com/apify/crawlee-python/issues/94.
"""

@classmethod
def from_url(
cls,
@@ -436,35 +419,6 @@ def forefront(self) -> bool:
    def forefront(self, new_value: bool) -> None:
        self.crawlee_data.forefront = new_value

    def __eq__(self, other: object) -> bool:
        """Compare all relevant fields of the `Request` class, excluding deprecated fields `json_` and `order_no`.

        TODO: Remove this method once the issue is resolved.
        https://github.com/apify/crawlee-python/issues/94
        """
        if isinstance(other, Request):
            return (
                self.url == other.url
                and self.unique_key == other.unique_key
                and self.method == other.method
                and self.headers == other.headers
                and self.payload == other.payload
                and self.user_data == other.user_data
                and self.retry_count == other.retry_count
                and self.no_retry == other.no_retry
                and self.loaded_url == other.loaded_url
                and self.handled_at == other.handled_at
                and self.id == other.id
                and self.label == other.label
                and self.state == other.state
                and self.max_retries == other.max_retries
                and self.session_rotation_count == other.session_rotation_count
                and self.enqueue_strategy == other.enqueue_strategy
                and self.last_proxy_tier == other.last_proxy_tier
                and self.forefront == other.forefront
            )
        return NotImplemented
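With the deprecated fields gone, pydantic's default `BaseModel` equality (a field-by-field comparison) already gives the intended semantics, which is why the override above can be dropped. A quick sanity check, assuming `from_url` is deterministic for a fixed URL:

```python
from crawlee import Request

a = Request.from_url('https://example.com')
b = Request.from_url('https://example.com')
assert a == b  # default pydantic equality compares all model fields
```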


class RequestWithLock(Request):
"""A crawling request with information about locks."""
26 changes: 0 additions & 26 deletions src/crawlee/_utils/data_processing.py
@@ -10,32 +10,6 @@
from crawlee._types import StorageTypes


def filter_out_none_values_recursively(dictionary: dict, *, remove_empty_dicts: bool = False) -> dict | None:
"""Recursively filters out None values from a dictionary.
Args:
dictionary: The dictionary to filter.
remove_empty_dicts: Flag indicating whether to remove empty nested dictionaries.
Returns:
A copy of the dictionary with all None values (and potentially empty dictionaries) removed.
"""
result = {}
for k, v in dictionary.items():
# If the value is a dictionary, apply recursion
if isinstance(v, dict):
nested = filter_out_none_values_recursively(v, remove_empty_dicts=remove_empty_dicts)
if nested or not remove_empty_dicts:
result[k] = nested
elif v is not None:
result[k] = v

# If removing empty dictionaries and result is empty, return None
if remove_empty_dicts and not result:
return None
return result
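For reference, the removed helper behaved as follows — a worked example against the definition above, not taken from the source:

```python
data = {'a': 1, 'b': None, 'c': {'d': None}}

filter_out_none_values_recursively(data)
# -> {'a': 1, 'c': {}}  (None leaves dropped, empty nested dict kept)

filter_out_none_values_recursively(data, remove_empty_dicts=True)
# -> {'a': 1}  (the now-empty 'c' is dropped as well)
```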


def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
"""Extract the value of an enumeration member if it is an Enum, otherwise return the original value."""
if isinstance(maybe_enum_member, Enum):
32 changes: 32 additions & 0 deletions src/crawlee/base_storage_client/_models.py
@@ -1,6 +1,8 @@
from __future__ import annotations

import json
from datetime import datetime
from decimal import Decimal
from typing import Annotated, Any, Generic

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
@@ -230,3 +232,33 @@ class BatchRequestsOperationResponse(BaseModel):

    processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
    unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')]


class InternalRequest(BaseModel):
"""Internal representation of a queue request with additional metadata for ordering and storage."""

model_config = ConfigDict(populate_by_name=True)

id: str

unique_key: str

order_no: Decimal | None = None
"""Order number for maintaining request sequence in queue.
Used for restoring correct request order when recovering queue from storage."""

handled_at: datetime | None

request: Annotated[
Request, Field(alias='json_'), BeforeValidator(lambda v: json.loads(v) if isinstance(v, str) else v)
]
"""Original Request object. The alias 'json_' is required for backward compatibility with legacy code."""

@classmethod
def from_request(cls, request: Request, id: str, order_no: Decimal | None) -> InternalRequest:
return cls(
unique_key=request.unique_key, id=id, handled_at=request.handled_at, order_no=order_no, request=request
)

def to_request(self) -> Request:
return self.request
10 changes: 3 additions & 7 deletions src/crawlee/memory_storage_client/_creation_management.py
@@ -6,7 +6,6 @@
import os
import pathlib
from datetime import datetime, timezone
from decimal import Decimal
from logging import getLogger
from typing import TYPE_CHECKING

@@ -15,10 +14,10 @@
from crawlee._utils.file import json_dumps
from crawlee.base_storage_client._models import (
    DatasetMetadata,
    InternalRequest,
    KeyValueStoreMetadata,
    KeyValueStoreRecord,
    KeyValueStoreRecordMetadata,
    Request,
    RequestQueueMetadata,
)

@@ -358,7 +357,7 @@ def create_rq_from_directory(
    pending_request_count = resource_info.pending_request_count

    # Load request entries
    entries: dict[str, Request] = {}
    entries: dict[str, InternalRequest] = {}

    for entry in os.scandir(storage_directory):
        if entry.is_file():
@@ -368,10 +367,7 @@
            with open(os.path.join(storage_directory, entry.name), encoding='utf-8') as f:
                content = json.load(f)

            request = Request(**content)
            order_no = request.order_no
            if order_no:
                request.order_no = Decimal(order_no)
            request = InternalRequest(**content)

            entries[request.id] = request
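End to end, a dumped entry now reloads in a single validation step, with `order_no` coerced back to `Decimal` automatically. A hedged sketch, not code from the repo:

```python
import json
from decimal import Decimal

from crawlee import Request
from crawlee.base_storage_client._models import InternalRequest

request = Request.from_url('https://example.com')
internal = InternalRequest.from_request(request, id=request.id, order_no=Decimal(5))

# Persist the entry the way the queue directory stores it (aliases included)...
serialized = internal.model_dump_json(by_alias=True)

# ...and load it back exactly as the hunk above does, with no Decimal post-processing.
restored = InternalRequest(**json.loads(serialized))
assert restored.order_no == Decimal(5)
```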
