@@ -12,7 +12,7 @@ async def main() -> None:
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')
response = context.http_response.read().decode('utf-8')
response = (await context.http_response.read()).decode('utf-8')
context.log.info(f'Response: {response}') # To see the response in the logs.

# Prepare a POST request to the form endpoint.
@@ -8,10 +8,10 @@
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
"""Helper function for archiving response in WARC format."""
# Create WARC records for response
response_body = context.http_response.read()
response_body = await context.http_response.read()
response_payload_stream = io.BytesIO(response_body)

response_headers = StatusAndHeaders(
@@ -51,7 +51,7 @@ async def main() -> None:
@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
context.log.info(f'Archiving {context.request.url} ...')
archive_response(context=context, writer=writer)
await archive_response(context=context, writer=writer)
await context.enqueue_links(strategy='same-domain')

await crawler.run(['https://crawlee.dev/'])
@@ -30,7 +30,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
response = await context.send_request(
'https://placeholder.org/refresh', headers=headers
)
data = json.loads(response.read())
data = json.loads(await response.read())
# Add the new token to our `Request` headers
new_headers = {
**context.request.headers,
2 changes: 1 addition & 1 deletion docs/guides/code_examples/login_crawler/http_login.py
@@ -46,7 +46,7 @@ async def login_handler(context: HttpCrawlingContext) -> None:
raise RuntimeError('Session not found')

# Parse the API response containing authentication tokens and user data
data = json.loads(context.http_response.read())
data = json.loads(await context.http_response.read())

# Extract authentication data from the response
token = data['token']
2 changes: 1 addition & 1 deletion docs/guides/code_examples/session_management/sm_basic.py
@@ -30,7 +30,7 @@ async def default_handler(context: BasicCrawlingContext) -> None:
# and `context.proxy_info`.
response = await context.send_request(context.request.url)

page_content = response.read().decode()
page_content = (await response.read()).decode()
title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

if context.session and (title := title_match.group(1) if title_match else None):
2 changes: 1 addition & 1 deletion docs/guides/code_examples/session_management/sm_http.py
@@ -26,7 +26,7 @@ async def main() -> None:
# based on the response content and potential blocking
@crawler.router.default_handler
async def default_handler(context: HttpCrawlingContext) -> None:
page_content = context.http_response.read().decode()
page_content = (await context.http_response.read()).decode()
title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

if context.session and (title := title_match.group(1) if title_match else None):
2 changes: 2 additions & 0 deletions docs/upgrading/upgrading_to_v1.md
@@ -121,6 +121,8 @@ dataset = await Dataset.open(
The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class.
Persistence is now determined solely by the storage client class you use.

The `read` method of `HttpResponse` has been changed from synchronous to asynchronous; call sites must now `await` it.
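
As an illustrative sketch (not part of the diff), modeled on the documentation examples updated in this PR, the migration in a request handler looks roughly like this; the crawler setup around the handler is assumed:

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Before: body = context.http_response.read()
        # After: `read` returns a coroutine and must be awaited before decoding.
        body = (await context.http_response.read()).decode('utf-8')
        context.log.info(f'Fetched {len(body)} characters from {context.request.url}')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```

The same `await` applies to responses returned by `context.send_request` and, per this changeset, to `HttpResponse.read()` in every HTTP-based crawler and HTTP client implementation.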

### Storage client instance behavior

Instance caching is implemented for the storage open methods: `Dataset.open()`, `KeyValueStore.open()`,
4 changes: 3 additions & 1 deletion src/crawlee/_utils/robots.py
@@ -57,7 +57,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
"""
response = await http_client.send_request(url, proxy_info=proxy_info)
body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()
body = (
b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
)

robots = Protego.parse(body.decode('utf-8'))

@@ -26,7 +26,7 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons

async def get_snapshot(self) -> PageSnapshot:
"""Get snapshot of crawled page."""
return PageSnapshot(html=self.http_response.read().decode('utf-8'))
return PageSnapshot(html=(await self.http_response.read()).decode('utf-8'))


@dataclass(frozen=True)
@@ -21,7 +21,7 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:

@override
async def parse(self, response: HttpResponse) -> BeautifulSoup:
return BeautifulSoup(response.read(), features=self._parser)
return BeautifulSoup(await response.read(), features=self._parser)

@override
async def parse_text(self, text: str) -> BeautifulSoup:
2 changes: 1 addition & 1 deletion src/crawlee/crawlers/_http/_http_crawler.py
@@ -36,7 +36,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
# Extract data from the page.
data = {
'url': context.request.url,
'response': context.http_response.read().decode()[:100],
'response': (await context.http_response.read()).decode()[:100],
}

# Push the extracted data to the default dataset.
2 changes: 1 addition & 1 deletion src/crawlee/crawlers/_http/_http_parser.py
@@ -21,7 +21,7 @@ class NoParser(AbstractHttpParser[bytes, bytes]):

@override
async def parse(self, response: HttpResponse) -> bytes:
return response.read()
return await response.read()

@override
async def parse_text(self, text: str) -> bytes:
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -19,7 +19,8 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):

@override
async def parse(self, response: HttpResponse) -> Selector:
return await asyncio.to_thread(lambda: Selector(body=response.read()))
response_body = await response.read()
return await asyncio.to_thread(lambda: Selector(body=response_body))

@override
async def parse_text(self, text: str) -> Selector:
2 changes: 1 addition & 1 deletion src/crawlee/crawlers/_playwright/_types.py
@@ -41,7 +41,7 @@ class PlaywrightHttpResponse:
headers: HttpHeaders
_content: bytes

def read(self) -> bytes:
async def read(self) -> bytes:
return self._content

async def read_stream(self) -> AsyncGenerator[bytes, None]:
2 changes: 1 addition & 1 deletion src/crawlee/http_clients/_base.py
@@ -35,7 +35,7 @@ def status_code(self) -> int:
def headers(self) -> HttpHeaders:
"""The HTTP headers received in the response."""

def read(self) -> bytes:
async def read(self) -> bytes:
"""Read the entire content of the response body.

This method loads the complete response body into memory at once. It should be used
2 changes: 1 addition & 1 deletion src/crawlee/http_clients/_curl_impersonate.py
@@ -85,7 +85,7 @@ def status_code(self) -> int:
def headers(self) -> HttpHeaders:
return HttpHeaders({key: value for key, value in self._response.headers.items() if value})

def read(self) -> bytes:
async def read(self) -> bytes:
if self._response.astream_task:
raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
return self._response.content
4 changes: 2 additions & 2 deletions src/crawlee/http_clients/_httpx.py
@@ -46,10 +46,10 @@ def status_code(self) -> int:
def headers(self) -> HttpHeaders:
return HttpHeaders(dict(self._response.headers))

def read(self) -> bytes:
async def read(self) -> bytes:
if not self._response.is_closed:
raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
return self._response.read()
return await self._response.aread()

async def read_stream(self) -> AsyncIterator[bytes]:
if self._response.is_stream_consumed:
2 changes: 1 addition & 1 deletion tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -331,7 +331,7 @@ async def test_send_request_works(server_url: URL, method: HttpMethod, path: str
async def handler(context: BasicCrawlingContext) -> None:
response = await context.send_request(str(server_url / path), method=method, payload=payload)

response_data['body'] = json.loads(response.read())
response_data['body'] = json.loads(await response.read())
response_data['headers'] = response.headers

await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])
10 changes: 5 additions & 5 deletions tests/unit/crawlers/_http/test_http_crawler.py
@@ -240,7 +240,7 @@ async def test_sending_payload_as_raw_data(http_client: HttpClient, server_url:

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
response = json.loads(context.http_response.read())
response = json.loads(await context.http_response.read())
# The post endpoint returns the provided payload in the response.
responses.append(response)

@@ -271,7 +271,7 @@ async def test_sending_payload_as_form_data(http_client: HttpClient, server_url:

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
response = json.loads(context.http_response.read())
response = json.loads(await context.http_response.read())
# The /post endpoint returns the provided payload in the response.
responses.append(response)

@@ -297,7 +297,7 @@ async def test_sending_payload_as_json(http_client: HttpClient, server_url: URL)

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
response = json.loads(context.http_response.read())
response = json.loads(await context.http_response.read())
# The /post endpoint returns the provided payload in the response.
responses.append(response)

@@ -324,7 +324,7 @@ async def test_sending_url_query_params(http_client: HttpClient, server_url: URL

@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
response = json.loads(context.http_response.read())
response = json.loads(await context.http_response.read())
# The /get endpoint returns the provided query parameters in the response.
responses.append(response)

@@ -397,7 +397,7 @@ async def handler(context: HttpCrawlingContext) -> None:
sessions_cookies[context.session.id] = {
cookie['name']: cookie['value'] for cookie in context.session.cookies.get_cookies_as_dicts()
}
response_data = json.loads(context.http_response.read())
response_data = json.loads(await context.http_response.read())
response_cookies[context.session.id] = response_data.get('cookies')

if context.request.user_data.get('retire_session'):
6 changes: 3 additions & 3 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -672,14 +672,14 @@ async def test_send_request(server_url: URL) -> None:
@crawler.pre_navigation_hook
async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
send_request_response = await context.send_request(str(server_url / 'user-agent'))
check_data['pre_send_request'] = dict(json.loads(send_request_response.read()))
check_data['pre_send_request'] = dict(json.loads(await send_request_response.read()))

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
response = await context.response.text()
check_data['default'] = dict(json.loads(response))
send_request_response = await context.send_request(str(server_url / 'user-agent'))
check_data['send_request'] = dict(json.loads(send_request_response.read()))
check_data['send_request'] = dict(json.loads(await send_request_response.read()))

await crawler.run([str(server_url / 'user-agent')])

@@ -703,7 +703,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
response = await context.response.text()
check_data['default'] = dict(json.loads(response))
send_request_response = await context.send_request(str(server_url / 'user-agent'))
check_data['send_request'] = dict(json.loads(send_request_response.read()))
check_data['send_request'] = dict(json.loads(await send_request_response.read()))

await crawler.run([str(server_url / 'user-agent')])

2 changes: 1 addition & 1 deletion tests/unit/http_clients/test_curl_impersonate.py
@@ -162,7 +162,7 @@ async def test_stream_error_for_read(http_client: CurlImpersonateHttpClient, ser
assert response.status_code == 200

with pytest.raises(RuntimeError):
response.read()
await response.read()


async def test_send_request_error_for_read_stream(http_client: CurlImpersonateHttpClient, server_url: URL) -> None:
4 changes: 2 additions & 2 deletions tests/unit/http_clients/test_httpx.py
@@ -95,7 +95,7 @@ async def test_common_headers_and_user_agent(server_url: URL, header_network: di
client = HttpxHttpClient()

response = await client.send_request(str(server_url / 'headers'))
response_headers = json.loads(response.read().decode())
response_headers = json.loads((await response.read()).decode())

assert 'accept' in response_headers
assert response_headers['accept'] in get_available_header_values(header_network, {'Accept', 'accept'})
@@ -176,7 +176,7 @@ async def test_stream_error_for_read(http_client: HttpxHttpClient, server_url: U
assert response.status_code == 200

with pytest.raises(RuntimeError):
response.read()
await response.read()


async def test_send_request_error_for_read_stream(http_client: HttpxHttpClient, server_url: URL) -> None: