 
 from crawlee import ConcurrencySettings, Glob, service_locator
 from crawlee._request import Request
-from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders, HttpMethod
+from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee.configuration import Configuration
 from crawlee.crawlers import BasicCrawler
@@ -135,11 +135,12 @@ async def handler(context: BasicCrawlingContext) -> None:
         'https://c.placeholder.com',
         'https://b.placeholder.com',
         'https://b.placeholder.com',
+        'https://b.placeholder.com',
     ]
 
 
 async def test_respects_no_retry() -> None:
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
     calls = list[str]()
 
     @crawler.router.default_handler
@@ -167,7 +168,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
 
 async def test_respects_request_specific_max_retries() -> None:
-    crawler = BasicCrawler(max_request_retries=1)
+    crawler = BasicCrawler(max_request_retries=0)
     calls = list[str]()
 
     @crawler.router.default_handler
@@ -179,7 +180,7 @@ async def handler(context: BasicCrawlingContext) -> None:
         [
             'https://a.placeholder.com',
             'https://b.placeholder.com',
-            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}),
+            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}),
         ]
     )
 
@@ -188,8 +189,6 @@ async def handler(context: BasicCrawlingContext) -> None:
         'https://b.placeholder.com',
         'https://c.placeholder.com',
         'https://c.placeholder.com',
-        'https://c.placeholder.com',
-        'https://c.placeholder.com',
     ]
 
 
@@ -199,12 +198,11 @@ async def test_calls_error_handler() -> None:
     class Call:
         url: str
         error: Exception
-        custom_retry_count: int
 
     # List to store the information of calls to the error handler.
     calls = list[Call]()
 
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -213,34 +211,19 @@ async def handler(context: BasicCrawlingContext) -> None:
 
     @crawler.error_handler
     async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request:
-        # Retrieve or initialize the headers, and extract the current custom retry count.
-        headers = context.request.headers or HttpHeaders()
-        custom_retry_count = int(headers.get('custom_retry_count', '0'))
-
         # Append the current call information.
-        calls.append(Call(context.request.url, error, custom_retry_count))
-
-        # Update the request to include an incremented custom retry count in the headers and return it.
-        request = context.request.model_dump()
-        request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)})
-        return Request.model_validate(request)
+        calls.append(Call(context.request.url, error))
+        return context.request
 
     await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])
 
     # Verify that the error handler was called twice
     assert len(calls) == 2
 
-    # Check the first call...
-    first_call = calls[0]
-    assert first_call.url == 'https://b.placeholder.com'
-    assert isinstance(first_call.error, RuntimeError)
-    assert first_call.custom_retry_count == 0
-
-    # Check the second call...
-    second_call = calls[1]
-    assert second_call.url == 'https://b.placeholder.com'
-    assert isinstance(second_call.error, RuntimeError)
-    assert second_call.custom_retry_count == 1
+    # Check calls
+    for error_call in calls:
+        assert error_call.url == 'https://b.placeholder.com'
+        assert isinstance(error_call.error, RuntimeError)
 
 
 async def test_calls_error_handler_for_sesion_errors() -> None:
@@ -578,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
 
 async def test_final_statistics() -> None:
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
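
Across these hunks, max_request_retries drops by one while the expected number of handler invocations stays the same (one expected call is even added for b.placeholder.com), which suggests the option now counts retries made after the initial attempt rather than total attempts. Below is a minimal sketch of that reading, assuming the semantics implied by the updated assertions; the URL and the counting helper are illustrative, not part of the diff.

import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BasicCrawler


async def main() -> None:
    # Assumed semantics: max_request_retries=2 means one initial attempt
    # plus up to 2 retries, i.e. 3 handler invocations for a request
    # that always fails.
    crawler = BasicCrawler(max_request_retries=2)
    attempts = 0

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        nonlocal attempts
        attempts += 1
        raise RuntimeError('Always fails')

    await crawler.run(['https://b.placeholder.com'])
    assert attempts == 3  # initial attempt + 2 retries (assumed)


asyncio.run(main())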