Skip to content

Commit d84c30c

Browse files
authored
fix: fix match check for specified enqueue strategy for requests with redirect (#1199)
### Description - Fixes the match check for the specified enqueue strategy for requests with a redirect. Before this PR, the check compared target URLs against the final URL after the redirect; with this PR, the original request URL is used instead. ### Issues - Closes: #1198 ### Testing - Added tests for the enqueue strategy with redirect simulation.
1 parent 9563ce3 commit d84c30c

File tree

2 files changed

+73
-9
lines changed

2 files changed

+73
-9
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1132,7 +1132,7 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) ->
11321132
and self._check_enqueue_strategy(
11331133
add_requests_call.get('strategy', 'all'),
11341134
target_url=urlparse(dst_request.url),
1135-
origin_url=urlparse(origin),
1135+
origin_url=urlparse(context.request.url),
11361136
)
11371137
and self._check_url_patterns(
11381138
dst_request.url,

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ async def handler(context: BasicCrawlingContext) -> None:
335335
@dataclass
336336
class AddRequestsTestInput:
337337
start_url: str
338+
loaded_url: str
338339
requests: Sequence[str | Request]
339340
expected_urls: Sequence[str]
340341
kwargs: EnqueueLinksKwargs
@@ -344,6 +345,7 @@ class AddRequestsTestInput:
344345
'https://someplace.com/',
345346
'http://someplace.com/index.html',
346347
'https://blog.someplace.com/index.html',
348+
'https://redirect.someplace.com',
347349
'https://other.place.com/index.html',
348350
)
349351

@@ -363,6 +365,7 @@ class AddRequestsTestInput:
363365
pytest.param(
364366
AddRequestsTestInput(
365367
start_url='https://a.com/',
368+
loaded_url='https://a.com/',
366369
requests=[
367370
'https://a.com/',
368371
Request.from_url('http://b.com/'),
@@ -377,52 +380,109 @@ class AddRequestsTestInput:
377380
pytest.param(
378381
AddRequestsTestInput(
379382
start_url=STRATEGY_TEST_URLS[0],
383+
loaded_url=STRATEGY_TEST_URLS[0],
380384
requests=STRATEGY_TEST_URLS,
381385
kwargs=EnqueueLinksKwargs(),
382386
expected_urls=STRATEGY_TEST_URLS[1:],
383387
),
384-
id='enqueue_strategy_1',
388+
id='enqueue_strategy_default',
385389
),
386390
pytest.param(
387391
AddRequestsTestInput(
388392
start_url=STRATEGY_TEST_URLS[0],
393+
loaded_url=STRATEGY_TEST_URLS[0],
389394
requests=STRATEGY_TEST_URLS,
390395
kwargs=EnqueueLinksKwargs(strategy='all'),
391396
expected_urls=STRATEGY_TEST_URLS[1:],
392397
),
393-
id='enqueue_strategy_2',
398+
id='enqueue_strategy_all',
394399
),
395400
pytest.param(
396401
AddRequestsTestInput(
397402
start_url=STRATEGY_TEST_URLS[0],
398-
requests=STRATEGY_TEST_URLS,
403+
loaded_url=STRATEGY_TEST_URLS[0],
404+
requests=STRATEGY_TEST_URLS[:4],
399405
kwargs=EnqueueLinksKwargs(strategy='same-domain'),
400-
expected_urls=STRATEGY_TEST_URLS[1:3],
406+
expected_urls=STRATEGY_TEST_URLS[1:4],
401407
),
402-
id='enqueue_strategy_3',
408+
id='enqueue_strategy_same_domain',
403409
),
404410
pytest.param(
405411
AddRequestsTestInput(
406412
start_url=STRATEGY_TEST_URLS[0],
407-
requests=STRATEGY_TEST_URLS,
413+
loaded_url=STRATEGY_TEST_URLS[0],
414+
requests=STRATEGY_TEST_URLS[:4],
408415
kwargs=EnqueueLinksKwargs(strategy='same-hostname'),
409416
expected_urls=[STRATEGY_TEST_URLS[1]],
410417
),
411-
id='enqueue_strategy_4',
418+
id='enqueue_strategy_same_hostname',
412419
),
413420
pytest.param(
414421
AddRequestsTestInput(
415422
start_url=STRATEGY_TEST_URLS[0],
423+
loaded_url=STRATEGY_TEST_URLS[0],
424+
requests=STRATEGY_TEST_URLS[:4],
425+
kwargs=EnqueueLinksKwargs(strategy='same-origin'),
426+
expected_urls=[],
427+
),
428+
id='enqueue_strategy_same_origin',
429+
),
430+
# Enqueue strategy with redirect
431+
pytest.param(
432+
AddRequestsTestInput(
433+
start_url=STRATEGY_TEST_URLS[3],
434+
loaded_url=STRATEGY_TEST_URLS[0],
435+
requests=STRATEGY_TEST_URLS,
436+
kwargs=EnqueueLinksKwargs(),
437+
expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:],
438+
),
439+
id='redirect_enqueue_strategy_default',
440+
),
441+
pytest.param(
442+
AddRequestsTestInput(
443+
start_url=STRATEGY_TEST_URLS[3],
444+
loaded_url=STRATEGY_TEST_URLS[0],
445+
requests=STRATEGY_TEST_URLS,
446+
kwargs=EnqueueLinksKwargs(strategy='all'),
447+
expected_urls=STRATEGY_TEST_URLS[:3] + STRATEGY_TEST_URLS[4:],
448+
),
449+
id='redirect_enqueue_strategy_all',
450+
),
451+
pytest.param(
452+
AddRequestsTestInput(
453+
start_url=STRATEGY_TEST_URLS[3],
454+
loaded_url=STRATEGY_TEST_URLS[0],
455+
requests=STRATEGY_TEST_URLS,
456+
kwargs=EnqueueLinksKwargs(strategy='same-domain'),
457+
expected_urls=STRATEGY_TEST_URLS[:3],
458+
),
459+
id='redirect_enqueue_strategy_same_domain',
460+
),
461+
pytest.param(
462+
AddRequestsTestInput(
463+
start_url=STRATEGY_TEST_URLS[3],
464+
loaded_url=STRATEGY_TEST_URLS[0],
465+
requests=STRATEGY_TEST_URLS,
466+
kwargs=EnqueueLinksKwargs(strategy='same-hostname'),
467+
expected_urls=[],
468+
),
469+
id='redirect_enqueue_strategy_same_hostname',
470+
),
471+
pytest.param(
472+
AddRequestsTestInput(
473+
start_url=STRATEGY_TEST_URLS[3],
474+
loaded_url=STRATEGY_TEST_URLS[0],
416475
requests=STRATEGY_TEST_URLS,
417476
kwargs=EnqueueLinksKwargs(strategy='same-origin'),
418477
expected_urls=[],
419478
),
420-
id='enqueue_strategy_5',
479+
id='redirect_enqueue_strategy_same_origin',
421480
),
422481
# Include/exclude
423482
pytest.param(
424483
AddRequestsTestInput(
425484
start_url=INCLUDE_TEST_URLS[0],
485+
loaded_url=INCLUDE_TEST_URLS[0],
426486
requests=INCLUDE_TEST_URLS,
427487
kwargs=EnqueueLinksKwargs(include=[Glob('https://someplace.com/**/cats')]),
428488
expected_urls=[INCLUDE_TEST_URLS[1], INCLUDE_TEST_URLS[4]],
@@ -432,6 +492,7 @@ class AddRequestsTestInput:
432492
pytest.param(
433493
AddRequestsTestInput(
434494
start_url=INCLUDE_TEST_URLS[0],
495+
loaded_url=INCLUDE_TEST_URLS[0],
435496
requests=INCLUDE_TEST_URLS,
436497
kwargs=EnqueueLinksKwargs(exclude=[Glob('https://someplace.com/**/cats')]),
437498
expected_urls=[INCLUDE_TEST_URLS[2], INCLUDE_TEST_URLS[3]],
@@ -441,6 +502,7 @@ class AddRequestsTestInput:
441502
pytest.param(
442503
AddRequestsTestInput(
443504
start_url=INCLUDE_TEST_URLS[0],
505+
loaded_url=INCLUDE_TEST_URLS[0],
444506
requests=INCLUDE_TEST_URLS,
445507
kwargs=EnqueueLinksKwargs(
446508
include=[Glob('https://someplace.com/**/cats')], exclude=[Glob('https://**/archive/**')]
@@ -458,6 +520,8 @@ async def test_enqueue_strategy(test_input: AddRequestsTestInput) -> None:
458520

459521
@crawler.router.handler('start')
460522
async def start_handler(context: BasicCrawlingContext) -> None:
523+
# Assign test value to loaded_url - BasicCrawler does not do any navigation by itself
524+
context.request.loaded_url = test_input.loaded_url
461525
await context.add_requests(
462526
test_input.requests,
463527
**test_input.kwargs,

0 commit comments

Comments
 (0)