44from collections import OrderedDict
55from datetime import datetime , timedelta , timezone
66from logging import getLogger
7- from typing import TYPE_CHECKING
8- from typing import OrderedDict as OrderedDictType
7+ from typing import TYPE_CHECKING , Generic , TypedDict , TypeVar
98
109from typing_extensions import override
1110
3130
3231logger = getLogger (__name__ )
3332
33+ __all__ = ['RequestQueue' ]
34+
35+
36+ T = TypeVar ('T' )
37+
38+
39+ class BoundedSet (Generic [T ]):
40+ """A simple set datastructure that removes the least recently accessed item when it reaches `max_length`."""
41+
42+ def __init__ (self , max_length : int ) -> None :
43+ self ._max_length = max_length
44+ self ._data = OrderedDict [T , object ]()
45+
46+ def __contains__ (self , item : T ) -> bool :
47+ found = item in self ._data
48+ if found :
49+ self ._data .move_to_end (item , last = True )
50+ return found
51+
52+ def add (self , item : T ) -> None :
53+ self ._data [item ] = True
54+ self ._data .move_to_end (item )
55+
56+ if len (self ._data ) > self ._max_length :
57+ self ._data .popitem (last = False )
58+
59+ def clear (self ) -> None :
60+ self ._data .clear ()
61+
62+
class CachedRequest(TypedDict):
    """Minimal information about a processed request kept in the in-memory requests cache."""

    # Storage-assigned id of the request.
    id: str
    # Whether the request had already been handled at the time it was cached.
    was_already_handled: bool
66+
3467
3568class RequestQueue (BaseStorage , RequestProvider ):
3669 """Represents a queue storage for HTTP requests to crawl.
@@ -97,12 +130,12 @@ def __init__(
97130 self ._internal_timeout_seconds = 5 * 60
98131 self ._assumed_total_count = 0
99132 self ._assumed_handled_count = 0
100- self ._queue_head_dict : OrderedDictType [str , str ] = OrderedDict ()
133+ self ._queue_head_dict : OrderedDict [str , str ] = OrderedDict ()
101134 self ._query_queue_head_task : asyncio .Task | None = None
102135 self ._in_progress : set [str ] = set ()
103136 self ._last_activity = datetime .now (timezone .utc )
104- self ._recently_handled : LRUCache [ bool ] = LRUCache (max_length = self ._RECENTLY_HANDLED_CACHE_SIZE )
105- self ._requests_cache : LRUCache [dict ] = LRUCache (max_length = self ._MAX_CACHED_REQUESTS )
137+ self ._recently_handled : BoundedSet [ str ] = BoundedSet (max_length = self ._RECENTLY_HANDLED_CACHE_SIZE )
138+ self ._requests_cache : LRUCache [CachedRequest ] = LRUCache (max_length = self ._MAX_CACHED_REQUESTS )
106139
107140 @override
108141 @property
@@ -125,13 +158,17 @@ async def open(
125158 ) -> RequestQueue :
126159 from crawlee .storages ._creation_management import open_storage
127160
128- return await open_storage (
161+ storage = await open_storage (
129162 storage_class = cls ,
130163 id = id ,
131164 name = name ,
132165 configuration = configuration ,
133166 )
134167
168+ await storage ._ensure_head_is_non_empty () # noqa: SLF001 - accessing private members from factories is OK
169+
170+ return storage
171+
135172 @override
136173 async def drop (self , * , timeout : timedelta | None = None ) -> None :
137174 from crawlee .storages ._creation_management import remove_storage_from_cache
@@ -208,7 +245,7 @@ async def add_request(
208245 not is_handled
209246 and not was_already_present
210247 and request_id not in self ._in_progress
211- and self . _recently_handled . get ( request_id ) is None
248+ and request_id not in self . _recently_handled
212249 ):
213250 self ._assumed_total_count += 1
214251 self ._maybe_add_request_to_queue_head (request_id , forefront = forefront )
@@ -284,7 +321,7 @@ async def fetch_next_request(self) -> Request | None:
284321 Returns:
285322 The request or `None` if there are no more pending requests.
286323 """
287- await self .ensure_head_is_non_empty ()
324+ await self ._ensure_head_is_non_empty ()
288325
289326 # We are likely done at this point.
290327 if len (self ._queue_head_dict ) == 0 :
@@ -293,7 +330,7 @@ async def fetch_next_request(self) -> Request | None:
293330 next_request_id , _ = self ._queue_head_dict .popitem (last = False ) # ~removeFirst()
294331
295332 # This should never happen, but...
296- if next_request_id in self ._in_progress or self ._recently_handled . get ( next_request_id ) :
333+ if next_request_id in self ._in_progress or next_request_id in self ._recently_handled :
297334 logger .warning (
298335 'Queue head returned a request that is already in progress?!' ,
299336 extra = {
@@ -343,7 +380,7 @@ async def fetch_next_request(self) -> Request | None:
343380 'Request fetched from the beginning of queue was already handled' ,
344381 extra = {'nextRequestId' : next_request_id },
345382 )
346- self ._recently_handled [ next_request_id ] = True
383+ self ._recently_handled . add ( next_request_id )
347384 return None
348385
349386 return request
@@ -372,7 +409,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest |
372409 processed_request .unique_key = request .unique_key
373410
374411 self ._in_progress .remove (request .id )
375- self ._recently_handled [ request .id ] = True
412+ self ._recently_handled . add ( request .id )
376413
377414 if not processed_request .was_already_handled :
378415 self ._assumed_handled_count += 1
@@ -431,7 +468,7 @@ async def is_empty(self) -> bool:
431468 Returns:
432469 bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`.
433470 """
434- await self .ensure_head_is_non_empty ()
471+ await self ._ensure_head_is_non_empty ()
435472 return len (self ._queue_head_dict ) == 0
436473
437474 async def is_finished (self ) -> bool :
@@ -457,7 +494,7 @@ async def is_finished(self) -> bool:
457494
458495 # TODO: set ensure_consistency to True once the following issue is resolved:
459496 # https://github.com/apify/crawlee-python/issues/203
460- is_head_consistent = await self .ensure_head_is_non_empty (ensure_consistency = False )
497+ is_head_consistent = await self ._ensure_head_is_non_empty (ensure_consistency = False )
461498 return is_head_consistent and len (self ._queue_head_dict ) == 0 and self ._in_progress_count () == 0
462499
463500 async def get_info (self ) -> RequestQueueMetadata | None :
@@ -472,7 +509,7 @@ async def get_handled_count(self) -> int:
    async def get_total_count(self) -> int:
        """Return the locally assumed total number of requests in the queue (`_assumed_total_count`)."""
        return self._assumed_total_count
474511
475- async def ensure_head_is_non_empty (
512+ async def _ensure_head_is_non_empty (
476513 self ,
477514 * ,
478515 ensure_consistency : bool = False ,
@@ -556,7 +593,7 @@ async def ensure_head_is_non_empty(
556593 logger .info (f'Waiting for { delay_seconds } for queue finalization, to ensure data consistency.' )
557594 await asyncio .sleep (delay_seconds )
558595
559- return await self .ensure_head_is_non_empty (
596+ return await self ._ensure_head_is_non_empty (
560597 ensure_consistency = ensure_consistency ,
561598 limit = next_limit ,
562599 iteration = iteration + 1 ,
@@ -578,8 +615,6 @@ def _reset(self) -> None:
578615 def _cache_request (self , cache_key : str , processed_request : ProcessedRequest ) -> None :
579616 self ._requests_cache [cache_key ] = {
580617 'id' : processed_request .id ,
581- 'is_handled' : processed_request .was_already_handled ,
582- 'unique_key' : processed_request .unique_key ,
583618 'was_already_handled' : processed_request .was_already_handled ,
584619 }
585620
@@ -595,7 +630,7 @@ async def _queue_query_head(self, limit: int) -> RequestQueueHeadState:
595630 not request .id
596631 or not request .unique_key
597632 or request .id in self ._in_progress
598- or self . _recently_handled . get ( request .id )
633+ or request .id in self . _recently_handled
599634 ):
600635 continue
601636
0 commit comments