Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/crawlee/_utils/http.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
from __future__ import annotations


def is_status_code_error(value: int) -> bool:
    """Check whether `value` is an HTTP error status code.

    Args:
        value: The HTTP status code to classify.

    Returns:
        `True` for 4xx or 5xx status codes, `False` otherwise.
    """
    # Client errors are checked first; server errors are only consulted when that check fails.
    if is_status_code_client_error(value):
        return True
    return is_status_code_server_error(value)


def is_status_code_client_error(value: int) -> bool:
    """Check whether `value` is an HTTP client error status code.

    Args:
        value: The HTTP status code to classify.

    Returns:
        `True` when the code lies in the inclusive 400-499 range, `False` otherwise.
    """
    first_client_error, last_client_error = 400, 499
    return first_client_error <= value <= last_client_error
Expand Down
9 changes: 1 addition & 8 deletions src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,7 @@ async def _handle_blocked_request(
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

# TODO: refactor to avoid private member access
# https://github.com/apify/crawlee-python/issues/708
if (
context.session
and status_code not in self._http_client._ignore_http_error_status_codes # noqa: SLF001
and context.session.is_blocked_status_code(status_code=status_code)
):
if self._is_blocked_status_code(context.session, status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
if blocked_info := self._parser.is_blocked(context.parsed_content):
raise SessionError(blocked_info.reason)
Expand Down
12 changes: 9 additions & 3 deletions src/crawlee/basic_crawler/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@
from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.docs import docs_group
from crawlee._utils.http import is_status_code_client_error
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee._utils.wait import wait_for
from crawlee.basic_crawler._context_pipeline import ContextPipeline
from crawlee.errors import (
ContextPipelineInitializationError,
ContextPipelineInterruptedError,
HttpStatusCodeError,
HttpClientStatusCodeError,
RequestHandlerError,
SessionError,
UserDefinedErrorHandlerError,
Expand Down Expand Up @@ -670,7 +669,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
return False

# Do not retry on client errors.
if isinstance(error, HttpStatusCodeError) and is_status_code_client_error(error.status_code):
if isinstance(error, HttpClientStatusCodeError):
return False

if isinstance(error, SessionError):
Expand Down Expand Up @@ -1074,3 +1073,10 @@ async def __run_task_function(self) -> None:

async def __run_request_handler(self, context: BasicCrawlingContext) -> None:
    """Await the context pipeline, passing it `context` and this crawler's router.

    Args:
        context: The crawling context handed to the pipeline.
    """
    pipeline = self._context_pipeline
    await pipeline(context, self.router)

def _is_blocked_status_code(self, session: Session | None, status_code: int) -> bool:
return session is not None and session.is_blocked_status_code(
status_code=status_code,
additional_blocked_status_codes=self._http_client.additional_blocked_status_codes,
ignore_http_error_status_codes=self._http_client.ignore_http_error_status_codes,
)
6 changes: 6 additions & 0 deletions src/crawlee/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
'ContextPipelineFinalizationError',
'ContextPipelineInitializationError',
'ContextPipelineInterruptedError',
'HttpClientStatusCodeError',
'HttpStatusCodeError',
'ProxyError',
'RequestHandlerError',
Expand Down Expand Up @@ -50,6 +51,11 @@ def __init__(self, message: str, status_code: int) -> None:
self.message = message


@docs_group('Errors')
class HttpClientStatusCodeError(HttpStatusCodeError):
    """Raised when the response status code indicates a client error (4xx)."""


@docs_group('Errors')
class RequestHandlerError(Exception, Generic[TCrawlingContext]):
"""Wraps an exception thrown from a request handler (router) and extends it with crawling context."""
Expand Down
25 changes: 18 additions & 7 deletions src/crawlee/http_clients/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from typing import TYPE_CHECKING, Protocol

from crawlee._utils.docs import docs_group
from crawlee._utils.http import is_status_code_error
from crawlee.errors import HttpStatusCodeError
from crawlee._utils.http import is_status_code_client_error, is_status_code_server_error
from crawlee.errors import HttpClientStatusCodeError, HttpStatusCodeError

if TYPE_CHECKING:
from collections.abc import Iterable
Expand Down Expand Up @@ -147,11 +147,22 @@ def _raise_for_error_status_code(
ignore_http_error_status_codes: set[int],
) -> None:
"""Raise an exception if the given status code is considered as an error."""
exclude_error = status_code in ignore_http_error_status_codes
include_error = status_code in additional_http_error_status_codes
is_ignored_status = status_code in ignore_http_error_status_codes
is_explicit_error = status_code in additional_http_error_status_codes

if include_error or (is_status_code_error(status_code) and not exclude_error):
if include_error:
raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code)
if is_explicit_error:
raise HttpStatusCodeError('Error status code (user-configured) returned.', status_code)

if is_status_code_client_error(status_code) and not is_ignored_status:
raise HttpClientStatusCodeError('Client error status code returned', status_code)

if is_status_code_server_error(status_code) and not is_ignored_status:
raise HttpStatusCodeError('Error status code returned', status_code)

@property
def additional_blocked_status_codes(self) -> set[int]:
    """Return the set stored in `_additional_http_error_status_codes`."""
    # NOTE(review): the property name says "blocked" while the backing attribute says
    # "http_error" - confirm the two concepts are meant to share one set.
    return self._additional_http_error_status_codes

@property
def ignore_http_error_status_codes(self) -> set[int]:
    """Return the set stored in `_ignore_http_error_status_codes`."""
    return self._ignore_http_error_status_codes
4 changes: 2 additions & 2 deletions src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ async def _handle_blocked_request(
status_code = context.response.status

# Check if the session is blocked based on the HTTP status code.
if context.session and context.session.is_blocked_status_code(status_code=status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')
if self._is_blocked_status_code(context.session, status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if (await context.page.query_selector(selector))
Expand Down
12 changes: 8 additions & 4 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __init__(
self._max_usage_count = max_usage_count
self._error_score = error_score
self._cookies = cookies or {}
self._blocked_status_codes = blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES
self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES)

@classmethod
def from_model(cls, model: SessionModel) -> Session:
Expand Down Expand Up @@ -193,17 +193,21 @@ def is_blocked_status_code(
self,
*,
status_code: int,
additional_blocked_status_codes: list[int] | None = None,
additional_blocked_status_codes: set[int] | None = None,
ignore_http_error_status_codes: set[int] | None = None,
) -> bool:
"""Evaluate whether a session should be retired based on the received HTTP status code.

Args:
status_code: The HTTP status code received from a server response.
additional_blocked_status_codes: Optional additional status codes that should trigger session retirement.
ignore_http_error_status_codes: Optional status codes to allow suppression of
codes from `blocked_status_codes`.

Returns:
True if the session should be retired, False otherwise.
"""
blocked_status_codes = self._blocked_status_codes + (additional_blocked_status_codes or [])
if additional_blocked_status_codes and status_code in additional_blocked_status_codes:
return True

return status_code in blocked_status_codes
return status_code in (self._blocked_status_codes - (ignore_http_error_status_codes or set()))
Loading