diff --git a/src/crawlee/_utils/http.py b/src/crawlee/_utils/http.py index 8b2ea72400..9754d95275 100644 --- a/src/crawlee/_utils/http.py +++ b/src/crawlee/_utils/http.py @@ -1,11 +1,6 @@ from __future__ import annotations -def is_status_code_error(value: int) -> bool: - """Returns `True` for 4xx or 5xx status codes, `False` otherwise.""" - return is_status_code_client_error(value) or is_status_code_server_error(value) - - def is_status_code_client_error(value: int) -> bool: """Returns `True` for 4xx status codes, `False` otherwise.""" return 400 <= value <= 499 # noqa: PLR2004 diff --git a/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py index d53821b339..9809378345 100644 --- a/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py +++ b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py @@ -204,14 +204,7 @@ async def _handle_blocked_request( """ if self._retry_on_blocked: status_code = context.http_response.status_code - - # TODO: refactor to avoid private member access - # https://github.com/apify/crawlee-python/issues/708 - if ( - context.session - and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001 - and context.session.is_blocked_status_code(status_code=status_code) - ): + if self._is_session_blocked_status_code(context.session, status_code): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') if blocked_info := self._parser.is_blocked(context.parsed_content): raise SessionError(blocked_info.reason) diff --git a/src/crawlee/basic_crawler/_basic_crawler.py b/src/crawlee/basic_crawler/_basic_crawler.py index aeec0031e9..14946d3a4d 100644 --- a/src/crawlee/basic_crawler/_basic_crawler.py +++ b/src/crawlee/basic_crawler/_basic_crawler.py @@ -27,14 +27,13 @@ from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction from crawlee._utils.byte_size import ByteSize from crawlee._utils.docs import docs_group -from crawlee._utils.http import is_status_code_client_error from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for from crawlee.basic_crawler._context_pipeline import ContextPipeline from crawlee.errors import ( ContextPipelineInitializationError, ContextPipelineInterruptedError, - HttpStatusCodeError, + HttpClientStatusCodeError, RequestHandlerError, SessionError, UserDefinedErrorHandlerError, @@ -670,7 +669,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) return False # Do not retry on client errors. - if isinstance(error, HttpStatusCodeError) and is_status_code_client_error(error.status_code): + if isinstance(error, HttpClientStatusCodeError): return False if isinstance(error, SessionError): @@ -1074,3 +1073,19 @@ async def __run_task_function(self) -> None: async def __run_request_handler(self, context: BasicCrawlingContext) -> None: await self._context_pipeline(context, self.router) + + def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool: + """Check if the HTTP status code indicates that the session was blocked by the target website. + + Args: + session: The session used for the request. If None, the method always returns False. + status_code: The HTTP status code to check. + + Returns: + True if the status code indicates the session was blocked, False otherwise. + """ + return session is not None and session.is_blocked_status_code( + status_code=status_code, + additional_blocked_status_codes=self._http_client.additional_blocked_status_codes, + ignore_http_error_status_codes=self._http_client.ignore_http_error_status_codes, + ) diff --git a/src/crawlee/errors.py b/src/crawlee/errors.py index ff6348842e..544ea5377b 100644 --- a/src/crawlee/errors.py +++ b/src/crawlee/errors.py @@ -11,6 +11,7 @@ 'ContextPipelineFinalizationError', 'ContextPipelineInitializationError', 'ContextPipelineInterruptedError', + 'HttpClientStatusCodeError', 'HttpStatusCodeError', 'ProxyError', 'RequestHandlerError', @@ -50,6 +51,11 @@ def __init__(self, message: str, status_code: int) -> None: self.message = message +@docs_group('Errors') +class HttpClientStatusCodeError(HttpStatusCodeError): + """Raised when the response status code indicates an client error.""" + + @docs_group('Errors') class RequestHandlerError(Exception, Generic[TCrawlingContext]): """Wraps an exception thrown from a request handler (router) and extends it with crawling context.""" diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 9012fdc431..d7880fcf25 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Protocol from crawlee._utils.docs import docs_group -from crawlee._utils.http import is_status_code_error -from crawlee.errors import HttpStatusCodeError +from crawlee._utils.http import is_status_code_client_error, is_status_code_server_error +from crawlee.errors import HttpClientStatusCodeError, HttpStatusCodeError if TYPE_CHECKING: from collections.abc import Iterable @@ -147,11 +147,22 @@ def _raise_for_error_status_code( ignore_http_error_status_codes: set[int], ) -> None: """Raise an exception if the given status code is considered as an error.""" - exclude_error = status_code in ignore_http_error_status_codes - include_error = status_code in additional_http_error_status_codes + is_ignored_status = status_code in ignore_http_error_status_codes + is_explicit_error = status_code in additional_http_error_status_codes - if include_error or (is_status_code_error(status_code) and not exclude_error): - if include_error: - raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code) + if is_explicit_error: + raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code) + if is_status_code_client_error(status_code) and not is_ignored_status: + raise HttpClientStatusCodeError('Client error status code returned', status_code) + + if is_status_code_server_error(status_code) and not is_ignored_status: raise HttpStatusCodeError('Error status code returned', status_code) + + @property + def additional_blocked_status_codes(self) -> set[int]: + return self._additional_http_error_status_codes + + @property + def ignore_http_error_status_codes(self) -> set[int]: + return self._ignore_http_error_status_codes diff --git a/src/crawlee/playwright_crawler/_playwright_crawler.py b/src/crawlee/playwright_crawler/_playwright_crawler.py index a609674581..5520c58861 100644 --- a/src/crawlee/playwright_crawler/_playwright_crawler.py +++ b/src/crawlee/playwright_crawler/_playwright_crawler.py @@ -255,8 +255,8 @@ async def _handle_blocked_request( status_code = context.response.status # Check if the session is blocked based on the HTTP status code. - if context.session and context.session.is_blocked_status_code(status_code=status_code): - raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.') + if self._is_session_blocked_status_code(context.session, status_code): + raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') matched_selectors = [ selector for selector in RETRY_CSS_SELECTORS if (await context.page.query_selector(selector)) diff --git a/src/crawlee/sessions/_session.py b/src/crawlee/sessions/_session.py index cff8324686..963eb3a05a 100644 --- a/src/crawlee/sessions/_session.py +++ b/src/crawlee/sessions/_session.py @@ -65,7 +65,7 @@ def __init__( self._max_usage_count = max_usage_count self._error_score = error_score self._cookies = cookies or {} - self._blocked_status_codes = blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES + self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES) @classmethod def from_model(cls, model: SessionModel) -> Session: @@ -193,17 +193,21 @@ def is_blocked_status_code( self, *, status_code: int, - additional_blocked_status_codes: list[int] | None = None, + additional_blocked_status_codes: set[int] | None = None, + ignore_http_error_status_codes: set[int] | None = None, ) -> bool: """Evaluate whether a session should be retired based on the received HTTP status code. Args: status_code: The HTTP status code received from a server response. additional_blocked_status_codes: Optional additional status codes that should trigger session retirement. + ignore_http_error_status_codes: Optional status codes to allow suppression of + codes from `blocked_status_codes`. Returns: True if the session should be retired, False otherwise. """ - blocked_status_codes = self._blocked_status_codes + (additional_blocked_status_codes or []) + if additional_blocked_status_codes and status_code in additional_blocked_status_codes: + return True - return status_code in blocked_status_codes + return status_code in (self._blocked_status_codes - (ignore_http_error_status_codes or set())) diff --git a/tests/unit/http_crawler/test_http_crawler.py b/tests/unit/http_crawler/test_http_crawler.py index e625a48b0d..d38c122660 100644 --- a/tests/unit/http_crawler/test_http_crawler.py +++ b/tests/unit/http_crawler/test_http_crawler.py @@ -88,8 +88,8 @@ async def server() -> AsyncGenerator[respx.MockRouter, None]: """, ) - mock.get('/404', name='404_endpoint').return_value = Response( - 404, + mock.get('/403', name='403_endpoint').return_value = Response( + 403, text=""" Not found @@ -136,19 +136,48 @@ async def test_handles_redirects( assert server['html_endpoint'].called +@pytest.mark.parametrize( + ('additional_http_error_status_codes', 'ignore_http_error_status_codes', 'expected_number_error'), + [ + ([], [], 1), + ([403], [], 3), + ([], [403], 0), + ([403], [403], 3), + ], + ids=[ + 'default_behavior', # error without retry for all 4xx statuses + 'additional_status_codes', # make retry for codes in `additional_http_error_status_codes` list + 'ignore_error_status_codes', # take as successful status codes from the `ignore_http_error_status_codes` list + 'additional_and_ignore', # check precedence for `additional_http_error_status_codes` + ], +) async def test_handles_client_errors( - crawler_without_retries: HttpCrawler, + additional_http_error_status_codes: list[int], + ignore_http_error_status_codes: list[int], + expected_number_error: int, mock_request_handler: AsyncMock, server: respx.MockRouter, ) -> None: - crawler = crawler_without_retries + crawler = HttpCrawler( + request_handler=mock_request_handler, + additional_http_error_status_codes=additional_http_error_status_codes, + ignore_http_error_status_codes=ignore_http_error_status_codes, + request_provider=RequestList(), + max_request_retries=3, + ) - await crawler.add_requests(['https://test.io/404']) + await crawler.add_requests(['https://test.io/403']) await crawler.run() + assert crawler.statistics.error_tracker.total == expected_number_error + # Request handler should not be called for error status codes. - mock_request_handler.assert_not_called() - assert server['404_endpoint'].called + if expected_number_error: + mock_request_handler.assert_not_called() + else: + mock_request_handler.assert_called() + + assert server['403_endpoint'].called async def test_handles_server_error( @@ -211,19 +240,19 @@ async def test_do_not_retry_on_client_errors(crawler: HttpCrawler, server: respx async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRouter) -> None: await crawler.add_requests([f'https://test.io/500?id={i}' for i in range(10)]) - await crawler.add_requests([f'https://test.io/404?id={i}' for i in range(10)]) + await crawler.add_requests([f'https://test.io/403?id={i}' for i in range(10)]) await crawler.add_requests([f'https://test.io/html?id={i}' for i in range(10)]) await crawler.run() assert crawler.statistics.state.requests_with_status_code == { '200': 10, - '404': 10, # client errors are not retried by default + '403': 10, # client errors are not retried by default '500': 30, # server errors are retried by default } assert len(server['html_endpoint'].calls) == 10 - assert len(server['404_endpoint'].calls) == 10 + assert len(server['403_endpoint'].calls) == 10 assert len(server['500_endpoint'].calls) == 30