@@ -335,6 +335,7 @@ async def handler(context: BasicCrawlingContext) -> None:
335335@dataclass
336336class AddRequestsTestInput :
337337 start_url : str
338+ loaded_url : str
338339 requests : Sequence [str | Request ]
339340 expected_urls : Sequence [str ]
340341 kwargs : EnqueueLinksKwargs
@@ -344,6 +345,7 @@ class AddRequestsTestInput:
344345 'https://someplace.com/' ,
345346 'http://someplace.com/index.html' ,
346347 'https://blog.someplace.com/index.html' ,
348+ 'https://redirect.someplace.com' ,
347349 'https://other.place.com/index.html' ,
348350)
349351
@@ -363,6 +365,7 @@ class AddRequestsTestInput:
363365 pytest .param (
364366 AddRequestsTestInput (
365367 start_url = 'https://a.com/' ,
368+ loaded_url = 'https://a.com/' ,
366369 requests = [
367370 'https://a.com/' ,
368371 Request .from_url ('http://b.com/' ),
@@ -377,52 +380,109 @@ class AddRequestsTestInput:
377380 pytest .param (
378381 AddRequestsTestInput (
379382 start_url = STRATEGY_TEST_URLS [0 ],
383+ loaded_url = STRATEGY_TEST_URLS [0 ],
380384 requests = STRATEGY_TEST_URLS ,
381385 kwargs = EnqueueLinksKwargs (),
382386 expected_urls = STRATEGY_TEST_URLS [1 :],
383387 ),
384- id = 'enqueue_strategy_1 ' ,
388+ id = 'enqueue_strategy_default ' ,
385389 ),
386390 pytest .param (
387391 AddRequestsTestInput (
388392 start_url = STRATEGY_TEST_URLS [0 ],
393+ loaded_url = STRATEGY_TEST_URLS [0 ],
389394 requests = STRATEGY_TEST_URLS ,
390395 kwargs = EnqueueLinksKwargs (strategy = 'all' ),
391396 expected_urls = STRATEGY_TEST_URLS [1 :],
392397 ),
393- id = 'enqueue_strategy_2 ' ,
398+ id = 'enqueue_strategy_all ' ,
394399 ),
395400 pytest .param (
396401 AddRequestsTestInput (
397402 start_url = STRATEGY_TEST_URLS [0 ],
398- requests = STRATEGY_TEST_URLS ,
403+ loaded_url = STRATEGY_TEST_URLS [0 ],
404+ requests = STRATEGY_TEST_URLS [:4 ],
399405 kwargs = EnqueueLinksKwargs (strategy = 'same-domain' ),
400- expected_urls = STRATEGY_TEST_URLS [1 :3 ],
406+ expected_urls = STRATEGY_TEST_URLS [1 :4 ],
401407 ),
402- id = 'enqueue_strategy_3 ' ,
408+ id = 'enqueue_strategy_same_domain ' ,
403409 ),
404410 pytest .param (
405411 AddRequestsTestInput (
406412 start_url = STRATEGY_TEST_URLS [0 ],
407- requests = STRATEGY_TEST_URLS ,
413+ loaded_url = STRATEGY_TEST_URLS [0 ],
414+ requests = STRATEGY_TEST_URLS [:4 ],
408415 kwargs = EnqueueLinksKwargs (strategy = 'same-hostname' ),
409416 expected_urls = [STRATEGY_TEST_URLS [1 ]],
410417 ),
411- id = 'enqueue_strategy_4 ' ,
418+ id = 'enqueue_strategy_same_hostname ' ,
412419 ),
413420 pytest .param (
414421 AddRequestsTestInput (
415422 start_url = STRATEGY_TEST_URLS [0 ],
423+ loaded_url = STRATEGY_TEST_URLS [0 ],
424+ requests = STRATEGY_TEST_URLS [:4 ],
425+ kwargs = EnqueueLinksKwargs (strategy = 'same-origin' ),
426+ expected_urls = [],
427+ ),
428+ id = 'enqueue_strategy_same_origin' ,
429+ ),
430+ # Enqueue strategy with redirect
431+ pytest .param (
432+ AddRequestsTestInput (
433+ start_url = STRATEGY_TEST_URLS [3 ],
434+ loaded_url = STRATEGY_TEST_URLS [0 ],
435+ requests = STRATEGY_TEST_URLS ,
436+ kwargs = EnqueueLinksKwargs (),
437+ expected_urls = STRATEGY_TEST_URLS [:3 ] + STRATEGY_TEST_URLS [4 :],
438+ ),
439+ id = 'redirect_enqueue_strategy_default' ,
440+ ),
441+ pytest .param (
442+ AddRequestsTestInput (
443+ start_url = STRATEGY_TEST_URLS [3 ],
444+ loaded_url = STRATEGY_TEST_URLS [0 ],
445+ requests = STRATEGY_TEST_URLS ,
446+ kwargs = EnqueueLinksKwargs (strategy = 'all' ),
447+ expected_urls = STRATEGY_TEST_URLS [:3 ] + STRATEGY_TEST_URLS [4 :],
448+ ),
449+ id = 'redirect_enqueue_strategy_all' ,
450+ ),
451+ pytest .param (
452+ AddRequestsTestInput (
453+ start_url = STRATEGY_TEST_URLS [3 ],
454+ loaded_url = STRATEGY_TEST_URLS [0 ],
455+ requests = STRATEGY_TEST_URLS ,
456+ kwargs = EnqueueLinksKwargs (strategy = 'same-domain' ),
457+ expected_urls = STRATEGY_TEST_URLS [:3 ],
458+ ),
459+ id = 'redirect_enqueue_strategy_same_domain' ,
460+ ),
461+ pytest .param (
462+ AddRequestsTestInput (
463+ start_url = STRATEGY_TEST_URLS [3 ],
464+ loaded_url = STRATEGY_TEST_URLS [0 ],
465+ requests = STRATEGY_TEST_URLS ,
466+ kwargs = EnqueueLinksKwargs (strategy = 'same-hostname' ),
467+ expected_urls = [],
468+ ),
469+ id = 'redirect_enqueue_strategy_same_hostname' ,
470+ ),
471+ pytest .param (
472+ AddRequestsTestInput (
473+ start_url = STRATEGY_TEST_URLS [3 ],
474+ loaded_url = STRATEGY_TEST_URLS [0 ],
416475 requests = STRATEGY_TEST_URLS ,
417476 kwargs = EnqueueLinksKwargs (strategy = 'same-origin' ),
418477 expected_urls = [],
419478 ),
420- id = 'enqueue_strategy_5 ' ,
479+ id = 'redirect_enqueue_strategy_same_origin ' ,
421480 ),
422481 # Include/exclude
423482 pytest .param (
424483 AddRequestsTestInput (
425484 start_url = INCLUDE_TEST_URLS [0 ],
485+ loaded_url = INCLUDE_TEST_URLS [0 ],
426486 requests = INCLUDE_TEST_URLS ,
427487 kwargs = EnqueueLinksKwargs (include = [Glob ('https://someplace.com/**/cats' )]),
428488 expected_urls = [INCLUDE_TEST_URLS [1 ], INCLUDE_TEST_URLS [4 ]],
@@ -432,6 +492,7 @@ class AddRequestsTestInput:
432492 pytest .param (
433493 AddRequestsTestInput (
434494 start_url = INCLUDE_TEST_URLS [0 ],
495+ loaded_url = INCLUDE_TEST_URLS [0 ],
435496 requests = INCLUDE_TEST_URLS ,
436497 kwargs = EnqueueLinksKwargs (exclude = [Glob ('https://someplace.com/**/cats' )]),
437498 expected_urls = [INCLUDE_TEST_URLS [2 ], INCLUDE_TEST_URLS [3 ]],
@@ -441,6 +502,7 @@ class AddRequestsTestInput:
441502 pytest .param (
442503 AddRequestsTestInput (
443504 start_url = INCLUDE_TEST_URLS [0 ],
505+ loaded_url = INCLUDE_TEST_URLS [0 ],
444506 requests = INCLUDE_TEST_URLS ,
445507 kwargs = EnqueueLinksKwargs (
446508 include = [Glob ('https://someplace.com/**/cats' )], exclude = [Glob ('https://**/archive/**' )]
@@ -458,6 +520,8 @@ async def test_enqueue_strategy(test_input: AddRequestsTestInput) -> None:
458520
459521 @crawler .router .handler ('start' )
460522 async def start_handler (context : BasicCrawlingContext ) -> None :
523+ # Assign the test value to loaded_url manually, because BasicCrawler does not do any navigation by itself
524+ context .request .loaded_url = test_input .loaded_url
461525 await context .add_requests (
462526 test_input .requests ,
463527 ** test_input .kwargs ,
0 commit comments