
Commit 74fa1d9

fix: Fix retry count to not count the original request (#1328)
### Description

The `max_request_retries` argument of `BasicCrawler` previously counted the initial request as one of the retries. Now it counts only the retries themselves.

### Issues

- Closes: #1326
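To make the new semantics concrete: a crawler configured with `max_request_retries=2` now makes at most three attempts per request, the original plus two retries. A minimal sketch of the behavior (the placeholder URL and the always-failing handler are invented for illustration):

```python
import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BasicCrawler


async def main() -> None:
    # After this fix, max_request_retries=2 means 1 original attempt + 2 retries.
    crawler = BasicCrawler(max_request_retries=2)
    attempts = 0

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        nonlocal attempts
        attempts += 1
        raise RuntimeError('simulated failure')  # Force a retry on every attempt.

    await crawler.run(['https://a.placeholder.com'])
    assert attempts == 3  # Original request plus exactly two retries.


asyncio.run(main())
```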

4 files changed: 19 additions & 36 deletions

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -855,7 +855,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
         if max_request_retries is None:
             max_request_retries = self._max_request_retries
 
-        return (context.request.retry_count + 1) < max_request_retries
+        return context.request.retry_count < max_request_retries
 
     async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:
         """Ensure that the `loaded_url` still matches the enqueue strategy after redirects.
```

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 13 additions & 30 deletions

```diff
@@ -17,7 +17,7 @@
 
 from crawlee import ConcurrencySettings, Glob, service_locator
 from crawlee._request import Request
-from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders, HttpMethod
+from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee.configuration import Configuration
 from crawlee.crawlers import BasicCrawler
@@ -135,11 +135,12 @@ async def handler(context: BasicCrawlingContext) -> None:
         'https://c.placeholder.com',
         'https://b.placeholder.com',
         'https://b.placeholder.com',
+        'https://b.placeholder.com',
     ]
 
 
 async def test_respects_no_retry() -> None:
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
     calls = list[str]()
 
     @crawler.router.default_handler
@@ -167,7 +168,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
 
 async def test_respects_request_specific_max_retries() -> None:
-    crawler = BasicCrawler(max_request_retries=1)
+    crawler = BasicCrawler(max_request_retries=0)
     calls = list[str]()
 
     @crawler.router.default_handler
@@ -179,7 +180,7 @@ async def handler(context: BasicCrawlingContext) -> None:
         [
             'https://a.placeholder.com',
             'https://b.placeholder.com',
-            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}),
+            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}),
         ]
     )
 
@@ -188,8 +189,6 @@ async def handler(context: BasicCrawlingContext) -> None:
         'https://b.placeholder.com',
         'https://c.placeholder.com',
         'https://c.placeholder.com',
-        'https://c.placeholder.com',
-        'https://c.placeholder.com',
     ]
 
 
@@ -199,12 +198,11 @@ async def test_calls_error_handler() -> None:
     class Call:
         url: str
         error: Exception
-        custom_retry_count: int
 
     # List to store the information of calls to the error handler.
    calls = list[Call]()
 
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -213,34 +211,19 @@ async def handler(context: BasicCrawlingContext) -> None:
 
     @crawler.error_handler
     async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request:
-        # Retrieve or initialize the headers, and extract the current custom retry count.
-        headers = context.request.headers or HttpHeaders()
-        custom_retry_count = int(headers.get('custom_retry_count', '0'))
-
         # Append the current call information.
-        calls.append(Call(context.request.url, error, custom_retry_count))
-
-        # Update the request to include an incremented custom retry count in the headers and return it.
-        request = context.request.model_dump()
-        request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)})
-        return Request.model_validate(request)
+        calls.append(Call(context.request.url, error))
+        return context.request
 
     await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])
 
     # Verify that the error handler was called twice
     assert len(calls) == 2
 
-    # Check the first call...
-    first_call = calls[0]
-    assert first_call.url == 'https://b.placeholder.com'
-    assert isinstance(first_call.error, RuntimeError)
-    assert first_call.custom_retry_count == 0
-
-    # Check the second call...
-    second_call = calls[1]
-    assert second_call.url == 'https://b.placeholder.com'
-    assert isinstance(second_call.error, RuntimeError)
-    assert second_call.custom_retry_count == 1
+    # Check calls
+    for error_call in calls:
+        assert error_call.url == 'https://b.placeholder.com'
+        assert isinstance(error_call.error, RuntimeError)
 
 
 async def test_calls_error_handler_for_sesion_errors() -> None:
@@ -578,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
 
 async def test_final_statistics() -> None:
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
```
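As `test_respects_request_specific_max_retries` shows, the crawler-wide limit can be overridden per request through the reserved `__crawlee` key in `user_data`. Under the new semantics, `maxRetries: 1` means one original attempt plus one retry:

```python
from crawlee._request import Request

# Taken from the updated test: this request allows 1 retry (2 attempts total),
# regardless of the crawler-wide max_request_retries (0 in that test).
request = Request.from_url(
    url='https://c.placeholder.com',
    user_data={'__crawlee': {'maxRetries': 1}},
)
```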

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -104,7 +104,7 @@ async def test_handles_client_errors(
         request_handler=mock_request_handler,
         additional_http_error_status_codes=additional_http_error_status_codes,
         ignore_http_error_status_codes=ignore_http_error_status_codes,
-        max_request_retries=3,
+        max_request_retries=2,
     )
 
     await crawler.add_requests([str(server_url / 'status/402')])
@@ -230,7 +230,7 @@ async def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) ->
         '200': 10,
         '403': 100,  # block errors change session and retry
         '402': 10,  # client errors are not retried by default
-        '500': 30,  # server errors are retried by default
+        '500': 40,  # server errors are retried by default
     }
 
 
@@ -568,7 +568,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
         kvs_content[key_info.key] = await kvs.get_value(key_info.key)
 
     # One error, three time retried.
-    assert crawler.statistics.error_tracker.total == 3
+    assert crawler.statistics.error_tracker.total == 4
     assert crawler.statistics.error_tracker.unique_error_count == 1
     assert len(kvs_content) == 1
     assert key_info.key.endswith('.html')
```
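The adjusted expectations are plain off-by-one arithmetic: with `max_request_retries=3`, a failing request is now attempted four times instead of three. A quick check of both updated numbers:

```python
max_request_retries = 3
attempts_per_failing_request = 1 + max_request_retries  # original attempt + retries

# 10 requests against the 500 endpoint, each attempted 4 times.
assert 10 * attempts_per_failing_request == 40  # the '500': 40 expectation

# A single failing request records one error per attempt.
assert 1 * attempts_per_failing_request == 4  # error_tracker.total == 4
```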

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -150,7 +150,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
 
 
 async def test_nonexistent_url_invokes_error_handler() -> None:
-    crawler = PlaywrightCrawler(max_request_retries=4, request_handler=mock.AsyncMock())
+    crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock())
 
     error_handler = mock.AsyncMock(return_value=None)
     crawler.error_handler(error_handler)
@@ -617,7 +617,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8')
 
     # Three errors twice retried errors, but only 2 unique -> 4 (2 x (html and jpg)) artifacts expected.
-    assert crawler.statistics.error_tracker.total == 3 * max_retries
+    assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1)
     assert crawler.statistics.error_tracker.unique_error_count == 2
     assert len(list(kvs_content.keys())) == 4
```
