@@ -942,12 +942,25 @@ def _enqueue_links_filter_iterator(
942942 """Filter requests based on the enqueue strategy and URL patterns."""
943943 limit = kwargs .get ('limit' )
944944 parsed_origin_url = urlparse (origin_url )
945+ strategy = kwargs .get ('strategy' , 'all' )
946+
947+ if strategy == 'all' and not parsed_origin_url .hostname :
948+ self .log .warning (f'Skipping enqueue: Missing hostname in origin_url = { origin_url } .' )
949+ return
950+
951+ # Emit a `warning` message to the log, only once per call
952+ warning_flag = True
945953
946954 for request in request_iterator :
947955 target_url = request .url if isinstance (request , Request ) else request
956+ parsed_target_url = urlparse (target_url )
957+
958+ if warning_flag and strategy != 'all' and not parsed_target_url .hostname :
959+ self .log .warning (f'Skipping enqueue url: Missing hostname in target_url = { target_url } .' )
960+ warning_flag = False
948961
949962 if self ._check_enqueue_strategy (
950- kwargs . get ( ' strategy' , 'all' ), target_url = urlparse ( target_url ) , origin_url = parsed_origin_url
963+ strategy , target_url = parsed_target_url , origin_url = parsed_origin_url
951964 ) and self ._check_url_patterns (target_url , kwargs .get ('include' ), kwargs .get ('exclude' )):
952965 yield request
953966
@@ -963,13 +976,20 @@ def _check_enqueue_strategy(
963976 origin_url : ParseResult ,
964977 ) -> bool :
965978 """Check if a URL matches the enqueue_strategy."""
979+ if strategy == 'all' :
980+ return True
981+
982+ if origin_url .hostname is None or target_url .hostname is None :
983+ self .log .debug (
984+ f'Skipping enqueue: Missing hostname in origin_url = { origin_url .geturl ()} or '
985+ f'target_url = { target_url .geturl ()} '
986+ )
987+ return False
988+
966989 if strategy == 'same-hostname' :
967990 return target_url .hostname == origin_url .hostname
968991
969992 if strategy == 'same-domain' :
970- if origin_url .hostname is None or target_url .hostname is None :
971- raise ValueError ('Both origin and target URLs must have a hostname' )
972-
973993 origin_domain = self ._tld_extractor .extract_str (origin_url .hostname ).domain
974994 target_domain = self ._tld_extractor .extract_str (target_url .hostname ).domain
975995 return origin_domain == target_domain
@@ -981,9 +1001,6 @@ def _check_enqueue_strategy(
9811001 and target_url .port == origin_url .port
9821002 )
9831003
984- if strategy == 'all' :
985- return True
986-
9871004 assert_never (strategy )
9881005
9891006 def _check_url_patterns (
0 commit comments