Merged
2 changes: 1 addition & 1 deletion docs/examples/fill_and_submit_web_form.mdx
@@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the
{RequestExample}
</CodeBlock>

Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach.
Alternatively, you can send form data as URL parameters by encoding them directly into the `url` argument. Whether this works depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` argument is generally the better approach.
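
For illustration, a minimal sketch of both approaches (the form endpoint and field names below are placeholders, not part of the original example):

```python
from urllib.parse import urlencode

from crawlee import Request

form_data = {'name': 'Alice', 'email': 'alice@example.com'}  # hypothetical form fields

# Preferred: send the form data as a POST request body via `payload` (bytes).
post_request = Request.from_url(
    'https://example.com/submit',  # placeholder form endpoint
    method='POST',
    payload=urlencode(form_data).encode(),
)

# Alternative: encode the form data directly into the URL as query parameters.
get_request = Request.from_url(f'https://example.com/submit?{urlencode(form_data)}')
```

Depending on the target form, the POST variant may also need a `content-type: application/x-www-form-urlencoded` header.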

## Implementing the crawler

13 changes: 1 addition & 12 deletions src/crawlee/_request.py
@@ -19,7 +19,7 @@
)
from typing_extensions import Self

from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams, JsonSerializable
from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
from crawlee._utils.urls import extract_query_params, validate_http_url
@@ -139,9 +139,6 @@ class BaseRequestData(BaseModel):
headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
"""HTTP request headers."""

query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
"""URL query parameters."""

payload: HttpPayload | None = None
"""HTTP request payload."""

@@ -182,7 +179,6 @@ def from_url(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
@@ -193,7 +189,6 @@
) -> Self:
"""Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
headers = headers or HttpHeaders()
query_params = query_params or {}

unique_key = unique_key or compute_unique_key(
url,
@@ -212,7 +207,6 @@
id=id,
method=method,
headers=headers,
query_params=query_params,
payload=payload,
**kwargs,
)
@@ -276,7 +270,6 @@ def from_url(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
label: str | None = None,
unique_key: str | None = None,
@@ -297,7 +290,6 @@
url: The URL of the request.
method: The HTTP method of the request.
headers: The HTTP headers of the request.
query_params: The query parameters of the URL.
payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
used for request routing (different requests go to different handlers).
@@ -317,7 +309,6 @@
raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`')

headers = headers or HttpHeaders()
query_params = query_params or {}

unique_key = unique_key or compute_unique_key(
url,
@@ -339,7 +330,6 @@
id=id,
method=method,
headers=headers,
query_params=query_params,
payload=payload,
**kwargs,
)
@@ -440,7 +430,6 @@ def __eq__(self, other: object) -> bool:
and self.unique_key == other.unique_key
and self.method == other.method
and self.headers == other.headers
and self.query_params == other.query_params
and self.payload == other.payload
and self.user_data == other.user_data
and self.retry_count == other.retry_count
2 changes: 0 additions & 2 deletions src/crawlee/_types.py
@@ -50,8 +50,6 @@

HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']

HttpQueryParams: TypeAlias = dict[str, str]

HttpPayload: TypeAlias = bytes


4 changes: 1 addition & 3 deletions src/crawlee/http_clients/_base.py
@@ -10,7 +10,7 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -112,7 +112,6 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
@@ -125,7 +124,6 @@
url: The URL to send the request to.
method: The HTTP method to use.
headers: The headers to include in the request.
query_params: The query parameters to include in the request.
payload: The data to be sent as the request body.
session: The session associated with the request.
proxy_info: The information about the proxy to be used.
5 changes: 1 addition & 4 deletions src/crawlee/http_clients/_httpx.py
@@ -16,7 +16,7 @@
if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
from crawlee._types import HttpMethod, HttpPayload
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics
@@ -141,7 +141,6 @@ async def crawl(
url=request.url,
method=request.method,
headers=headers,
params=request.query_params,
content=request.payload,
cookies=session.cookies if session else None,
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
@@ -176,7 +175,6 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
@@ -188,7 +186,6 @@
url=url,
method=method,
headers=dict(headers) if headers else None,
params=query_params,
content=payload,
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
)
5 changes: 1 addition & 4 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -26,7 +26,7 @@

from curl_cffi.requests import Response

from crawlee._types import HttpMethod, HttpQueryParams
from crawlee._types import HttpMethod
from crawlee.base_storage_client._models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -130,7 +130,6 @@ async def crawl(
url=request.url,
method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=request.headers,
params=request.query_params,
data=request.payload,
cookies=session.cookies if session else None,
allow_redirects=True,
@@ -162,7 +161,6 @@ async def send_request(
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | None = None,
query_params: HttpQueryParams | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
@@ -175,7 +173,6 @@
url=url,
method=method.upper(), # type: ignore # curl-cffi requires uppercase method
headers=dict(headers) if headers else None,
params=query_params,
data=payload,
cookies=session.cookies if session else None,
allow_redirects=True,
37 changes: 35 additions & 2 deletions tests/unit/http_crawler/test_http_crawler.py
@@ -246,12 +246,45 @@ async def request_handler(context: HttpCrawlingContext) -> None:
await crawler.run([request])

# The request handler should be called once.
assert len(responses) == 1
assert len(responses) == 1, 'The request handler should be called once.'

# The reconstructed payload data should match the original payload. We have to flatten the values, because
# parse_qs returns a list of values for each key.
response_data = {
k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data'].strip("b'").strip("'")).items()
}

assert response_data == payload
assert response_data == payload, 'The reconstructed payload data should match the original payload.'


@pytest.mark.parametrize(
'http_client_class',
[CurlImpersonateHttpClient, HttpxHttpClient],
ids=['curl', 'httpx'],
)
async def test_sending_url_query_params(http_client_class: type[BaseHttpClient]) -> None:
http_client = http_client_class()
crawler = HttpCrawler(http_client=http_client)

responses = []

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
response = json.loads(context.http_response.read())
# The httpbin.org/get endpoint returns the provided query parameters in the response.
responses.append(response)

base_url = 'https://httpbin.org/get'
query_params = {'param1': 'value1', 'param2': 'value2'}
request = Request.from_url(url=f'{base_url}?{urlencode(query_params)}')

await crawler.run([request])

# The request handler should be called once.
assert len(responses) == 1, 'The request handler should be called once.'

# Validate the response query parameters.
response_args = responses[0]['args']
assert (
response_args == query_params
), 'The reconstructed query parameters should match the original query parameters.'
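
For callers that previously passed the removed `query_params` argument, a hedged migration sketch mirroring the new test above (the parameter names are illustrative): query parameters are now encoded into the URL itself.

```python
from urllib.parse import urlencode

from crawlee import Request

params = {'param1': 'value1', 'param2': 'value2'}

# Before this change (no longer supported):
# request = Request.from_url('https://httpbin.org/get', query_params=params)

# After this change, encode the parameters into the URL itself:
request = Request.from_url(f'https://httpbin.org/get?{urlencode(params)}')
```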