Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,12 +941,25 @@ def _enqueue_links_filter_iterator(
"""Filter requests based on the enqueue strategy and URL patterns."""
limit = kwargs.get('limit')
parsed_origin_url = urlparse(origin_url)
strategy = kwargs.get('strategy', 'all')

if strategy == 'all' and not parsed_origin_url.hostname:
self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.')
return

# Emit a `warning` message to the log, only once per call
warning_flag = True

for request in request_iterator:
target_url = request.url if isinstance(request, Request) else request
parsed_target_url = urlparse(target_url)

if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')
warning_flag = False

if self._check_enqueue_strategy(
kwargs.get('strategy', 'all'), target_url=urlparse(target_url), origin_url=parsed_origin_url
strategy, target_url=parsed_target_url, origin_url=parsed_origin_url
) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
yield request

Expand All @@ -962,13 +975,20 @@ def _check_enqueue_strategy(
origin_url: ParseResult,
) -> bool:
"""Check if a URL matches the enqueue_strategy."""
if strategy == 'all':
return True

if origin_url.hostname is None or target_url.hostname is None:
self.log.debug(
f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or '
f'target_url = {target_url.geturl()}'
)
return False

if strategy == 'same-hostname':
return target_url.hostname == origin_url.hostname

if strategy == 'same-domain':
if origin_url.hostname is None or target_url.hostname is None:
raise ValueError('Both origin and target URLs must have a hostname')

origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
return origin_domain == target_domain
Expand All @@ -980,9 +1000,6 @@ def _check_enqueue_strategy(
and target_url.port == origin_url.port
)

if strategy == 'all':
return True

assert_never(strategy)

def _check_url_patterns(
Expand Down
Loading