
Commit 2752643

fix: merge payload and data fields of Request
1 parent 01f0746 commit 2752643

File tree

11 files changed: +118 -36 lines
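
In effect, the commit collapses the two overlapping request-body fields (`payload: str | bytes` and `data: dict`) into a single dict-typed `payload`. A minimal sketch of the resulting API; the top-level import path is assumed, not shown in this diff:

```python
from crawlee import Request  # import path assumed

# Before: Request.from_url(..., payload='raw body') or Request.from_url(..., data={...})
# After: a single dict-typed `payload` field.
request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    payload={'custname': 'John Doe'},
)
```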

docs/examples/code/fill_and_submit_web_form_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
         request = Request.from_url(
             url='https://httpbin.org/post',
             method='POST',
-            data={
+            payload={
                 'custname': 'John Doe',
                 'custtel': '1234567890',
                 'custemail': '[email protected]',

docs/examples/code/fill_and_submit_web_form_request.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 request = Request.from_url(
     url='https://httpbin.org/post',
     method='POST',
-    data={
+    payload={
         'custname': 'John Doe',
         'custtel': '1234567890',
         'custemail': '[email protected]',

docs/examples/fill_and_submit_web_form.mdx

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the
 {RequestExample}
 </CodeBlock>
 
-Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `data` parameter is generally a better approach.
+Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` parameter is generally a better approach.
 
 ## Implementing the crawler
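
The contrast the updated sentence draws can be made concrete. A short illustrative sketch, with the import path assumed and httpbin standing in for a real form endpoint:

```python
from crawlee import Request  # import path assumed

# Form data as URL parameters - suits GET-style forms.
get_request = Request.from_url(
    url='https://httpbin.org/get',
    query_params={'custname': 'John Doe'},
)

# Form data in the POST body - the generally preferred approach.
post_request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    payload={'custname': 'John Doe'},
)
```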

src/crawlee/_request.py

Lines changed: 51 additions & 4 deletions
@@ -127,15 +127,14 @@ class BaseRequestData(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""
 
-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders())] = HttpHeaders()
+    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
     """HTTP request headers."""
 
     query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
     """URL query parameters."""
 
-    payload: HttpPayload | None = None
-
-    data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}
+    payload: Annotated[HttpPayload, Field(default_factory=dict)] = {}
+    """HTTP request payload."""
 
     user_data: Annotated[
         dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
@@ -169,6 +168,8 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
+        headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
         payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
@@ -178,6 +179,10 @@
         **kwargs: Any,
     ) -> Self:
         """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
+        headers = headers or HttpHeaders()
+        query_params = query_params or {}
+        payload = payload or {}
+
         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
@@ -193,6 +198,8 @@
             unique_key=unique_key,
             id=id,
             method=method,
+            headers=headers,
+            query_params=query_params,
             payload=payload,
             **kwargs,
         )
@@ -243,6 +250,8 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
+        headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
         payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
@@ -261,6 +270,8 @@
         Args:
             url: The URL of the request.
             method: The HTTP method of the request.
+            headers: The HTTP headers of the request.
+            query_params: The query parameters of the URL.
             payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
             label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
                 used for request routing (different requests go to different handlers).
@@ -274,6 +285,10 @@
                 computation. This is only relevant when `unique_key` is not provided.
             **kwargs: Additional request properties.
         """
+        headers = headers or HttpHeaders()
+        query_params = query_params or {}
+        payload = payload or {}
+
         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
@@ -289,6 +304,8 @@
             unique_key=unique_key,
             id=id,
             method=method,
+            headers=headers,
+            query_params=query_params,
             payload=payload,
             **kwargs,
         )
@@ -377,6 +394,36 @@ def forefront(self) -> bool:
     def forefront(self, new_value: bool) -> None:
         self.crawlee_data.forefront = new_value
 
+    def __eq__(self, other: object) -> bool:
+        """Compare all relevant fields of the `Request` class, excluding deprecated fields `json_` and `order_no`.
+
+        TODO: Remove this method once the issue is resolved.
+        https://github.com/apify/crawlee-python/issues/94
+        """
+        if isinstance(other, Request):
+            return (
+                self.url == other.url
+                and self.unique_key == other.unique_key
+                and self.method == other.method
+                and self.headers == other.headers
+                and self.query_params == other.query_params
+                and self.payload == other.payload
+                and self.user_data == other.user_data
+                and self.retry_count == other.retry_count
+                and self.no_retry == other.no_retry
+                and self.loaded_url == other.loaded_url
+                and self.handled_at == other.handled_at
+                and self.id == other.id
+                and self.label == other.label
+                and self.state == other.state
+                and self.max_retries == other.max_retries
+                and self.session_rotation_count == other.session_rotation_count
+                and self.enqueue_strategy == other.enqueue_strategy
+                and self.last_proxy_tier == other.last_proxy_tier
+                and self.forefront == other.forefront
+            )
+        return NotImplemented
+
 
 class RequestWithLock(Request):
     """A crawling request with information about locks."""

src/crawlee/_types.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@
 
 HttpQueryParams: TypeAlias = dict[str, str]
 
-HttpPayload: TypeAlias = Union[str, bytes]
+HttpPayload: TypeAlias = dict[str, Any]
 
 
 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
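
The alias now describes a mapping of form fields rather than a pre-encoded body, leaving encoding to the HTTP client. A trivial standalone illustration, with the alias spelled out locally rather than imported:

```python
from typing import Any

HttpPayload = dict[str, Any]  # the new alias, spelled out locally

payload: HttpPayload = {'custname': 'John Doe'}  # accepted under the new alias
# raw: HttpPayload = b'custname=John+Doe'  # only the old Union[str, bytes] allowed this
```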

src/crawlee/_utils/requests.py

Lines changed: 1 addition & 7 deletions
@@ -119,13 +119,7 @@ def compute_unique_key(
 
     # Compute and return the extended unique key if required.
     if use_extended_unique_key:
-        if payload is None:
-            payload_in_bytes = b''
-        elif isinstance(payload, str):
-            payload_in_bytes = payload.encode('utf-8')
-        else:
-            payload_in_bytes = payload
-
+        payload_in_bytes = b'' if payload is None else str(payload).encode('utf-8')
         payload_hash = compute_short_hash(payload_in_bytes)
         return f'{normalized_method}({payload_hash}):{normalized_url}'
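
The collapsed branch stringifies the dict before hashing, so the extended unique key depends on the dict's `repr`. A small self-contained sketch of that behavior; note that because `str()` reflects insertion order, payloads with the same items inserted in different orders yield different hash inputs:

```python
def payload_to_bytes(payload: dict | None) -> bytes:
    # Mirrors the simplified branch in compute_unique_key.
    return b'' if payload is None else str(payload).encode('utf-8')

assert payload_to_bytes(None) == b''
assert payload_to_bytes({'a': 1}) == b"{'a': 1}"

# Insertion order leaks into the hash input:
assert payload_to_bytes({'a': 1, 'b': 2}) != payload_to_bytes({'b': 2, 'a': 1})
```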

src/crawlee/http_clients/_base.py

Lines changed: 4 additions & 4 deletions
@@ -2,15 +2,15 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Protocol
+from typing import TYPE_CHECKING, Protocol
 
 from crawlee._utils.http import is_status_code_error
 from crawlee.errors import HttpStatusCodeError
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
+    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -115,7 +115,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -128,7 +128,7 @@
             method: The HTTP method to use.
             headers: The headers to include in the request.
             query_params: The query parameters to include in the request.
-            data: The data to be sent as the request body.
+            payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
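
A hedged sketch of a caller after the rename, assuming the `HttpxHttpClient` subclass that appears later in this commit and a `read()` method on the `HttpResponse` protocol:

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient  # concrete subclass, name assumed


async def main() -> None:
    client = HttpxHttpClient()
    response = await client.send_request(
        'https://httpbin.org/post',
        method='POST',
        payload={'custname': 'John Doe'},  # formerly the `data` argument
    )
    print(response.read())


asyncio.run(main())
```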

src/crawlee/http_clients/_httpx.py

Lines changed: 4 additions & 4 deletions
@@ -16,7 +16,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    from crawlee._types import HttpMethod, HttpQueryParams
+    from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.statistics import Statistics
@@ -132,7 +132,7 @@ async def crawl(
             method=request.method,
             headers=headers,
             params=request.query_params,
-            data=request.data,
+            data=request.payload,
             cookies=session.cookies if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
@@ -167,7 +167,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -179,7 +179,7 @@
             method=method,
             headers=dict(headers) if headers else None,
             params=query_params,
-            data=data,
+            data=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
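
The payload dict is handed to httpx's `data=` argument, which form-encodes mappings, so merging the fields keeps form payloads reaching the wire exactly as the old `data` field did. A standalone check of that httpx behavior:

```python
import httpx

request = httpx.Request('POST', 'https://httpbin.org/post', data={'custname': 'John Doe'})

print(request.headers['content-type'])  # application/x-www-form-urlencoded
print(request.read())                   # b'custname=John+Doe'
```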

src/crawlee/http_clients/curl_impersonate.py

Lines changed: 3 additions & 3 deletions
@@ -16,7 +16,7 @@
 from curl_cffi.const import CurlHttpVersion
 from typing_extensions import override
 
-from crawlee._types import HttpHeaders
+from crawlee._types import HttpHeaders, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee.errors import ProxyError
 from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse
@@ -153,7 +153,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -166,7 +166,7 @@
             method=method.upper(),  # type: ignore # curl-cffi requires uppercase method
             headers=dict(headers) if headers else None,
             params=query_params,
-            data=data,
+            data=payload,
             cookies=session.cookies if session else None,
             allow_redirects=True,
         )

src/crawlee/memory_storage_client/_request_queue_client.py

Lines changed: 12 additions & 10 deletions
@@ -268,7 +268,7 @@ async def add_request(
             persist_storage=self._memory_storage_client.persist_storage,
         )
 
-        # We return wasAlreadyHandled is false even though the request may have been added as handled,
+        # We return was_already_handled=False even though the request may have been added as handled,
         # because that's how API behaves.
         return ProcessedRequest(
             id=request_model.id,
@@ -519,15 +519,17 @@ async def _create_internal_request(self, request: Request, forefront: bool | Non
         if request.id is not None and request.id != id:
             raise ValueError('Request ID does not match its unique_key.')
 
-        json_request = await json_dumps({**(request.model_dump()), 'id': id})
+        request_kwargs = {
+            **(request.model_dump()),
+            'id': id,
+            'order_no': order_no,
+        }
+
+        del request_kwargs['json_']
+
         return Request(
-            url=request.url,
-            unique_key=request.unique_key,
-            id=id,
-            method=request.method,
-            retry_count=request.retry_count,
-            order_no=order_no,
-            json_=json_request,
+            **request_kwargs,
+            json_=await json_dumps(request_kwargs),
        )
 
     def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decimal | None:
@@ -538,7 +540,7 @@ def _calculate_order_no(self, request: Request, forefront: bool | None) -> Decim
         timestamp = Decimal(datetime.now(timezone.utc).timestamp()) * 1000
         timestamp = round(timestamp, 6)
 
-        # Make sure that this timestamp was not used yet, so that we have unique orderNos
+        # Make sure that this timestamp was not used yet, so that we have unique order_nos
         if timestamp <= self._last_used_timestamp:
             timestamp = self._last_used_timestamp + Decimal(0.000001)

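The rewritten `_create_internal_request` swaps a hand-maintained field copy for a full `model_dump()` round-trip, so fields added to `Request` later (such as `payload`) survive the memory storage automatically. The underlying pydantic pattern, in a self-contained sketch with hypothetical fields:

```python
from pydantic import BaseModel


class Item(BaseModel):  # hypothetical stand-in for Request
    url: str
    retry_count: int = 0


original = Item(url='https://example.com')

# Dump every field, override a few, rebuild: new model fields are carried
# along automatically, unlike an explicit field-by-field copy.
clone = Item(**{**original.model_dump(), 'retry_count': 3})

assert clone.url == original.url
assert clone.retry_count == 3
```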