Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ async def _handle_blocked_request(
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code
if self._is_blocked_status_code(context.session, status_code):
if self._is_session_blocked_status_code(context.session, status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
if blocked_info := self._parser.is_blocked(context.parsed_content):
raise SessionError(blocked_info.reason)
Expand Down
11 changes: 10 additions & 1 deletion src/crawlee/basic_crawler/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1074,7 +1074,16 @@ async def __run_task_function(self) -> None:
async def __run_request_handler(self, context: BasicCrawlingContext) -> None:
await self._context_pipeline(context, self.router)

def _is_blocked_status_code(self, session: Session | None, status_code: int) -> bool:
def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool:
"""Check if the HTTP status code indicates that the session was blocked by the target website.

Args:
session: The session used for the request. If None, the method always returns False.
status_code: The HTTP status code to check.

Returns:
True if the status code indicates the session was blocked, False otherwise.
"""
return session is not None and session.is_blocked_status_code(
status_code=status_code,
additional_blocked_status_codes=self._http_client.additional_blocked_status_codes,
Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ async def _handle_blocked_request(
status_code = context.response.status

# Check if the session is blocked based on the HTTP status code.
if self._is_blocked_status_code(context.session, status_code):
if self._is_session_blocked_status_code(context.session, status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

matched_selectors = [
Expand Down
49 changes: 39 additions & 10 deletions tests/unit/http_crawler/test_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ async def server() -> AsyncGenerator[respx.MockRouter, None]:
</html>""",
)

mock.get('/404', name='404_endpoint').return_value = Response(
404,
mock.get('/403', name='403_endpoint').return_value = Response(
403,
text="""<html>
<head>
<title>Not found</title>
Expand Down Expand Up @@ -136,19 +136,48 @@ async def test_handles_redirects(
assert server['html_endpoint'].called


@pytest.mark.parametrize(
('additional_http_error_status_codes', 'ignore_http_error_status_codes', 'expected_number_error'),
[
([], [], 1),
([403], [], 3),
([], [403], 0),
([403], [403], 3),
],
ids=[
'default_behavior', # 4xx responses are treated as errors and are not retried
'additional_status_codes', # codes listed in `additional_http_error_status_codes` are retried
'ignore_error_status_codes', # codes listed in `ignore_http_error_status_codes` are treated as successful
'additional_and_ignore', # `additional_http_error_status_codes` takes precedence over `ignore_http_error_status_codes`
],
)
async def test_handles_client_errors(
crawler_without_retries: HttpCrawler,
additional_http_error_status_codes: list[int],
ignore_http_error_status_codes: list[int],
expected_number_error: int,
mock_request_handler: AsyncMock,
server: respx.MockRouter,
) -> None:
crawler = crawler_without_retries
crawler = HttpCrawler(
request_handler=mock_request_handler,
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
request_provider=RequestList(),
max_request_retries=3,
)

await crawler.add_requests(['https://test.io/404'])
await crawler.add_requests(['https://test.io/403'])
await crawler.run()

assert crawler.statistics.error_tracker.total == expected_number_error

# Request handler should not be called for error status codes.
mock_request_handler.assert_not_called()
assert server['404_endpoint'].called
if expected_number_error:
mock_request_handler.assert_not_called()
else:
mock_request_handler.assert_called()

assert server['403_endpoint'].called


async def test_handles_server_error(
Expand Down Expand Up @@ -211,19 +240,19 @@ async def test_do_not_retry_on_client_errors(crawler: HttpCrawler, server: respx

async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRouter) -> None:
await crawler.add_requests([f'https://test.io/500?id={i}' for i in range(10)])
await crawler.add_requests([f'https://test.io/404?id={i}' for i in range(10)])
await crawler.add_requests([f'https://test.io/403?id={i}' for i in range(10)])
await crawler.add_requests([f'https://test.io/html?id={i}' for i in range(10)])

await crawler.run()

assert crawler.statistics.state.requests_with_status_code == {
'200': 10,
'404': 10, # client errors are not retried by default
'403': 10, # client errors are not retried by default
'500': 30, # server errors are retried by default
}

assert len(server['html_endpoint'].calls) == 10
assert len(server['404_endpoint'].calls) == 10
assert len(server['403_endpoint'].calls) == 10
assert len(server['500_endpoint'].calls) == 30


Expand Down
Loading