apify · janbuchar · Dec 18, 2024 · Dec 13, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py b/src/crawlee/abstract_http_crawler/_abstract_http_crawler.py
@@ -204,7 +204,7 @@ async def _handle_blocked_request(
         """
         if self._retry_on_blocked:
             status_code = context.http_response.status_code
-            if self._is_blocked_status_code(context.session, status_code):
+            if self._is_session_blocked_status_code(context.session, status_code):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
             if blocked_info := self._parser.is_blocked(context.parsed_content):
                 raise SessionError(blocked_info.reason)

diff --git a/src/crawlee/basic_crawler/_basic_crawler.py b/src/crawlee/basic_crawler/_basic_crawler.py
@@ -1074,7 +1074,16 @@ async def __run_task_function(self) -> None:
     async def __run_request_handler(self, context: BasicCrawlingContext) -> None:
         await self._context_pipeline(context, self.router)
 
-    def _is_blocked_status_code(self, session: Session | None, status_code: int) -> bool:
+    def _is_session_blocked_status_code(self, session: Session | None, status_code: int) -> bool:
+        """Check if the HTTP status code indicates that the session was blocked by the target website.
+
+        Args:
+            session: The session used for the request. If None, the method always returns False.
+            status_code: The HTTP status code to check.
+
+        Returns:
+            True if the status code indicates the session was blocked, False otherwise.
+        """
         return session is not None and session.is_blocked_status_code(
             status_code=status_code,
             additional_blocked_status_codes=self._http_client.additional_blocked_status_codes,

diff --git a/src/crawlee/playwright_crawler/_playwright_crawler.py b/src/crawlee/playwright_crawler/_playwright_crawler.py
@@ -255,7 +255,7 @@ async def _handle_blocked_request(
             status_code = context.response.status
 
             # Check if the session is blocked based on the HTTP status code.
-            if self._is_blocked_status_code(context.session, status_code):
+            if self._is_session_blocked_status_code(context.session, status_code):
                 raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
 
             matched_selectors = [

diff --git a/tests/unit/http_crawler/test_http_crawler.py b/tests/unit/http_crawler/test_http_crawler.py
@@ -88,8 +88,8 @@ async def server() -> AsyncGenerator[respx.MockRouter, None]:
             </html>""",
         )
 
-        mock.get('/404', name='404_endpoint').return_value = Response(
-            404,
+        mock.get('/403', name='403_endpoint').return_value = Response(
+            403,
             text="""<html>
                 <head>
                     <title>Not found</title>
@@ -136,19 +136,48 @@ async def test_handles_redirects(
     assert server['html_endpoint'].called
 
 
+@pytest.mark.parametrize(
+    ('additional_http_error_status_codes', 'ignore_http_error_status_codes', 'expected_number_error'),
+    [
+        ([], [], 1),
+        ([403], [], 3),
+        ([], [403], 0),
+        ([403], [403], 3),
+    ],
+    ids=[
+        'default_behavior',  # by default a 4xx status is reported as an error and is not retried
+        'additional_status_codes',  # status codes in `additional_http_error_status_codes` are retried
+        'ignore_error_status_codes',  # status codes in `ignore_http_error_status_codes` are treated as successful
+        'additional_and_ignore',  # `additional_http_error_status_codes` takes precedence over the ignore list
+    ],
+)
 async def test_handles_client_errors(
-    crawler_without_retries: HttpCrawler,
+    additional_http_error_status_codes: list[int],
+    ignore_http_error_status_codes: list[int],
+    expected_number_error: int,
     mock_request_handler: AsyncMock,
     server: respx.MockRouter,
 ) -> None:
-    crawler = crawler_without_retries
+    crawler = HttpCrawler(
+        request_handler=mock_request_handler,
+        additional_http_error_status_codes=additional_http_error_status_codes,
+        ignore_http_error_status_codes=ignore_http_error_status_codes,
+        request_provider=RequestList(),
+        max_request_retries=3,
+    )
 
-    await crawler.add_requests(['https://test.io/404'])
+    await crawler.add_requests(['https://test.io/403'])
     await crawler.run()
 
+    assert crawler.statistics.error_tracker.total == expected_number_error
+
     # Request handler should not be called for error status codes.
-    mock_request_handler.assert_not_called()
-    assert server['404_endpoint'].called
+    if expected_number_error:
+        mock_request_handler.assert_not_called()
+    else:
+        mock_request_handler.assert_called()
+
+    assert server['403_endpoint'].called
 
 
 async def test_handles_server_error(
@@ -211,19 +240,19 @@ async def test_do_not_retry_on_client_errors(crawler: HttpCrawler, server: respx
 
 async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRouter) -> None:
     await crawler.add_requests([f'https://test.io/500?id={i}' for i in range(10)])
-    await crawler.add_requests([f'https://test.io/404?id={i}' for i in range(10)])
+    await crawler.add_requests([f'https://test.io/403?id={i}' for i in range(10)])
     await crawler.add_requests([f'https://test.io/html?id={i}' for i in range(10)])
 
     await crawler.run()
 
     assert crawler.statistics.state.requests_with_status_code == {
         '200': 10,
-        '404': 10,  # client errors are not retried by default
+        '403': 10,  # client errors are not retried by default
         '500': 30,  # server errors are retried by default
     }
 
     assert len(server['html_endpoint'].calls) == 10
-    assert len(server['404_endpoint'].calls) == 10
+    assert len(server['403_endpoint'].calls) == 10
     assert len(server['500_endpoint'].calls) == 30