From 30835132c10964c9917775b0b65b4cc59e84286a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 31 Oct 2024 15:50:34 +0100 Subject: [PATCH 1/2] chore!: rm Request.query_params field closes: #615 --- docs/examples/fill_and_submit_web_form.mdx | 2 +- src/crawlee/_request.py | 13 +------------ src/crawlee/_types.py | 2 -- src/crawlee/http_clients/_base.py | 4 +--- src/crawlee/http_clients/_httpx.py | 5 +---- src/crawlee/http_clients/curl_impersonate.py | 5 +---- 6 files changed, 5 insertions(+), 26 deletions(-) diff --git a/docs/examples/fill_and_submit_web_form.mdx b/docs/examples/fill_and_submit_web_form.mdx index 8498bb3014..b5d77ff5d3 100644 --- a/docs/examples/fill_and_submit_web_form.mdx +++ b/docs/examples/fill_and_submit_web_form.mdx @@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the {RequestExample} -Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach. +Alternatively, you can send form data as URL parameters using the `url` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` is generally a better approach. ## Implementing the crawler diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 0d6bad09f9..4b4af29cdb 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -19,7 +19,7 @@ ) from typing_extensions import Self -from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams, JsonSerializable +from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id from crawlee._utils.urls import extract_query_params, validate_http_url @@ -139,9 +139,6 @@ class BaseRequestData(BaseModel): headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders() """HTTP request headers.""" - query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {} - """URL query parameters.""" - payload: HttpPayload | None = None """HTTP request payload.""" @@ -182,7 +179,6 @@ def from_url( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: HttpQueryParams | None = None, payload: HttpPayload | None = None, label: str | None = None, unique_key: str | None = None, @@ -193,7 +189,6 @@ def from_url( ) -> Self: """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details.""" headers = headers or HttpHeaders() - query_params = query_params or {} unique_key = unique_key or compute_unique_key( url, @@ -212,7 +207,6 @@ def from_url( id=id, method=method, headers=headers, - query_params=query_params, payload=payload, **kwargs, ) @@ -276,7 +270,6 @@ def from_url( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: HttpQueryParams | None = None, payload: HttpPayload | None = None, label: str | None = None, unique_key: str | None = None, @@ -297,7 +290,6 @@ def from_url( url: The URL of the request. method: The HTTP method of the request. headers: The HTTP headers of the request. - query_params: The query parameters of the URL. payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests. 
label: A custom label to differentiate between request types. This is stored in `user_data`, and it is used for request routing (different requests go to different handlers). @@ -317,7 +309,6 @@ def from_url( raise ValueError('`always_enqueue` cannot be used with a custom `unique_key`') headers = headers or HttpHeaders() - query_params = query_params or {} unique_key = unique_key or compute_unique_key( url, @@ -339,7 +330,6 @@ def from_url( id=id, method=method, headers=headers, - query_params=query_params, payload=payload, **kwargs, ) @@ -440,7 +430,6 @@ def __eq__(self, other: object) -> bool: and self.unique_key == other.unique_key and self.method == other.method and self.headers == other.headers - and self.query_params == other.query_params and self.payload == other.payload and self.user_data == other.user_data and self.retry_count == other.retry_count diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index d63da27683..d69b297634 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -50,8 +50,6 @@ HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH'] -HttpQueryParams: TypeAlias = dict[str, str] - HttpPayload: TypeAlias = bytes diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 4c07902c87..bfbc8de4ea 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from collections.abc import Iterable - from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams + from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session @@ -112,7 +112,6 @@ async def send_request( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: HttpQueryParams | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, @@ -125,7 +124,6 @@ async def send_request( url: The URL to send the request to. method: The HTTP method to use. headers: The headers to include in the request. - query_params: The query parameters to include in the request. payload: The data to be sent as the request body. session: The session associated with the request. proxy_info: The information about the proxy to be used. 
diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index 321fd33a47..e401b1125a 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: from collections.abc import Iterable - from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams + from crawlee._types import HttpMethod, HttpPayload from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.statistics import Statistics @@ -141,7 +141,6 @@ async def crawl( url=request.url, method=request.method, headers=headers, - params=request.query_params, content=request.payload, cookies=session.cookies if session else None, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, @@ -176,7 +175,6 @@ async def send_request( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: HttpQueryParams | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, @@ -188,7 +186,6 @@ async def send_request( url=url, method=method, headers=dict(headers) if headers else None, - params=query_params, content=payload, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, ) diff --git a/src/crawlee/http_clients/curl_impersonate.py b/src/crawlee/http_clients/curl_impersonate.py index 51e434ff9c..5fe2ca179f 100644 --- a/src/crawlee/http_clients/curl_impersonate.py +++ b/src/crawlee/http_clients/curl_impersonate.py @@ -26,7 +26,7 @@ from curl_cffi.requests import Response - from crawlee._types import HttpMethod, HttpQueryParams + from crawlee._types import HttpMethod from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session @@ -130,7 +130,6 @@ async def crawl( url=request.url, method=request.method.upper(), # type: ignore # curl-cffi requires uppercase method headers=request.headers, - params=request.query_params, data=request.payload, cookies=session.cookies if session else None, allow_redirects=True, @@ -162,7 +161,6 @@ async def send_request( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: HttpQueryParams | None = None, payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, @@ -175,7 +173,6 @@ async def send_request( url=url, method=method.upper(), # type: ignore # curl-cffi requires uppercase method headers=dict(headers) if headers else None, - params=query_params, data=payload, cookies=session.cookies if session else None, allow_redirects=True, From 76be589f3da5469c52f974351da7de95850278db Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 31 Oct 2024 16:48:49 +0100 Subject: [PATCH 2/2] add test --- tests/unit/http_crawler/test_http_crawler.py | 37 ++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tests/unit/http_crawler/test_http_crawler.py b/tests/unit/http_crawler/test_http_crawler.py index afc9e79c7b..e29bff323c 100644 --- a/tests/unit/http_crawler/test_http_crawler.py +++ b/tests/unit/http_crawler/test_http_crawler.py @@ -246,7 +246,7 @@ async def request_handler(context: HttpCrawlingContext) -> None: await crawler.run([request]) # The request handler should be called once. - assert len(responses) == 1 + assert len(responses) == 1, 'The request handler should be called once.' 
# The reconstructed payload data should match the original payload. We have to flatten the values, because # parse_qs returns a list of values for each key. @@ -254,4 +254,37 @@ async def request_handler(context: HttpCrawlingContext) -> None: k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data'].strip("b'").strip("'")).items() } - assert response_data == payload + assert response_data == payload, 'The reconstructed payload data should match the original payload.' + + +@pytest.mark.parametrize( + 'http_client_class', + [CurlImpersonateHttpClient, HttpxHttpClient], + ids=['curl', 'httpx'], +) +async def test_sending_url_query_params(http_client_class: type[BaseHttpClient]) -> None: + http_client = http_client_class() + crawler = HttpCrawler(http_client=http_client) + + responses = [] + + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + response = json.loads(context.http_response.read()) + # The httpbin.org/get endpoint returns the provided query parameters in the response. + responses.append(response) + + base_url = 'https://httpbin.org/get' + query_params = {'param1': 'value1', 'param2': 'value2'} + request = Request.from_url(url=f'{base_url}?{urlencode(query_params)}') + + await crawler.run([request]) + + # The request handler should be called once. + assert len(responses) == 1, 'The request handler should be called once.' + + # Validate the response query parameters. + response_args = responses[0]['args'] + assert ( + response_args == query_params + ), 'The reconstructed query parameters should match the original query parameters.'
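
For users affected by this breaking change, a minimal migration sketch (assuming `Request` is importable from the top-level `crawlee` package, as the new test suggests): parameters that were previously passed via the removed `query_params` field are instead encoded directly into the request URL, for example with `urllib.parse.urlencode`.

```python
from urllib.parse import urlencode

from crawlee import Request  # assumed import path; adjust to your crawlee version

query_params = {'param1': 'value1', 'param2': 'value2'}

# Before this change (no longer supported):
# request = Request.from_url('https://httpbin.org/get', query_params=query_params)

# After this change, encode the parameters into the URL itself:
request = Request.from_url(f'https://httpbin.org/get?{urlencode(query_params)}')
```

The same approach applies to form data sent as URL parameters, as noted in the updated `fill_and_submit_web_form.mdx` docs; sending the data as a POST body via `payload` remains the preferred option.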