13 changes: 6 additions & 7 deletions src/crawlee/_request.py
@@ -13,14 +13,13 @@
     BeforeValidator,
     ConfigDict,
     Field,
-    JsonValue,
     PlainSerializer,
     PlainValidator,
     TypeAdapter,
 )
 from typing_extensions import Self
 
-from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
+from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams, JsonSerializable
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
 from crawlee._utils.urls import extract_query_params, validate_http_url
 
@@ -59,23 +58,23 @@ class CrawleeRequestData(BaseModel):
     forefront: Annotated[bool, Field()] = False
 
 
-class UserData(BaseModel, MutableMapping[str, JsonValue]):
+class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
     """Represents the `user_data` part of a Request.
 
     Apart from the well-known attributes (`label` and `__crawlee`), it can also contain arbitrary JSON-compatible
     values.
     """
 
     model_config = ConfigDict(extra='allow')
-    __pydantic_extra__: dict[str, JsonValue] = Field(init=False)  # pyright: ignore
+    __pydantic_extra__: dict[str, JsonSerializable] = Field(init=False)  # pyright: ignore
 
     crawlee_data: Annotated[CrawleeRequestData | None, Field(alias='__crawlee')] = None
     label: Annotated[str | None, Field()] = None
 
-    def __getitem__(self, key: str) -> JsonValue:
+    def __getitem__(self, key: str) -> JsonSerializable:
         return self.__pydantic_extra__[key]
 
-    def __setitem__(self, key: str, value: JsonValue) -> None:
+    def __setitem__(self, key: str, value: JsonSerializable) -> None:
         if key == 'label':
             if value is not None and not isinstance(value, str):
                 raise ValueError('`label` must be str or None')
Expand Down Expand Up @@ -139,7 +138,7 @@ class BaseRequestData(BaseModel):
data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}

user_data: Annotated[
dict[str, JsonValue], # Internally, the model contains `UserData`, this is just for convenience
dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
Field(alias='userData', default_factory=lambda: UserData()),
PlainValidator(user_data_adapter.validate_python),
PlainSerializer(
Expand Down
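Note: with `UserData` now typed as `MutableMapping[str, JsonSerializable]`, nested dicts and lists are valid values for the type checker as well as for pydantic. Below is a rough usage sketch, not taken from the PR: the keys and values are invented, and it assumes the mapping methods store extra keys in `__pydantic_extra__` as the diff shows.

from crawlee._request import UserData

user_data = UserData()
user_data['label'] = 'DETAIL'                     # `label` must remain a str or None
user_data['product'] = {'id': 42, 'tags': ['a']}  # arbitrarily nested JSON-compatible data
user_data['history'] = [{'visited': True}, None]

assert user_data['product'] == {'id': 42, 'tags': ['a']}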
20 changes: 15 additions & 5 deletions src/crawlee/_types.py
@@ -3,7 +3,7 @@
 from collections.abc import Iterator, Mapping
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, Union
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypeVar, Union
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
 from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack
@@ -21,10 +21,20 @@
     from crawlee.sessions._session import Session
     from crawlee.storages._dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
 
-# Type for representing json-serializable values. It's close enough to the real thing supported
-# by json.parse, and the best we can do until mypy supports recursive types. It was suggested
-# in a discussion with (and approved by) Guido van Rossum, so I'd consider it correct enough.
-JsonSerializable: TypeAlias = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
+    # Workaround for https://github.com/pydantic/pydantic/issues/9445
+    J = TypeVar('J', bound='JsonSerializable')
+    JsonSerializable: TypeAlias = Union[
+        list[J],
+        dict[str, J],
+        str,
+        bool,
+        int,
+        float,
+        None,
+    ]
+else:
+    from pydantic import JsonValue as JsonSerializable
 
 
 HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']
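The `if TYPE_CHECKING:` split gives type checkers a recursive alias built from a bound `TypeVar` (the workaround for pydantic/pydantic#9445 referenced in the comment), while at runtime the name resolves to pydantic's own `JsonValue`, so model validation keeps working. A minimal, self-contained sketch of the same pattern follows; the `Payload` model and its field are hypothetical and not part of this PR.

from __future__ import annotations

from typing import TYPE_CHECKING, TypeVar, Union

from pydantic import BaseModel
from typing_extensions import TypeAlias

if TYPE_CHECKING:
    # What static type checkers see: a recursive union via a bound TypeVar.
    J = TypeVar('J', bound='JsonSerializable')
    JsonSerializable: TypeAlias = Union[list[J], dict[str, J], str, bool, int, float, None]
else:
    # What actually runs: pydantic's recursive JSON type.
    from pydantic import JsonValue as JsonSerializable


class Payload(BaseModel):  # hypothetical example model
    data: dict[str, JsonSerializable]


payload = Payload(data={'nested': {'values': [1, 'two', None]}})
print(payload.model_dump())  # {'data': {'nested': {'values': [1, 'two', None]}}}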