diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index f4ea248346..0c5045f543 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -110,9 +110,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.

@@ -125,21 +125,24 @@ def __init__(
             desired_concurrency: The desired number of tasks that should be running parallel on the start
                 of the pool, if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')

         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')

+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
             raise ValueError('max_tasks_per_minute must be positive')

         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
index 2e59cc5b0f..055e2731d9 100644
--- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
+++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
@@ -12,7 +12,7 @@
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -158,6 +158,10 @@ def __init__(
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)

         # Sub crawlers related.
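
With the hunks above, a plain `ConcurrencySettings()` now resolves to `min_concurrency=1`, `desired_concurrency=10`, `max_concurrency=100`, and `desired_concurrency` is validated against both bounds instead of only `>= 1`. A minimal sketch of the new behaviour (not part of the patch; it assumes the top-level re-export `from crawlee import ConcurrencySettings`):

```python
from crawlee import ConcurrencySettings  # assumed top-level re-export of crawlee._types.ConcurrencySettings

# Defaults after this change: min=1, desired=10, max=100.
settings = ConcurrencySettings()
print(settings.min_concurrency, settings.desired_concurrency, settings.max_concurrency)  # 1 10 100

# desired_concurrency must now lie between min_concurrency and max_concurrency.
try:
    ConcurrencySettings(min_concurrency=5, desired_concurrency=2, max_concurrency=10)
except ValueError as error:
    print(error)  # desired_concurrency cannot be less than min_concurrency
```
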
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index b0b6dc9244..c32a9c9f27 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -12,6 +12,7 @@
 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -194,6 +195,10 @@ def __init__(
         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)

     async def _open_page(
diff --git a/tests/unit/_autoscaling/test_autoscaled_pool.py b/tests/unit/_autoscaling/test_autoscaled_pool.py
index b312961022..31e03cf73d 100644
--- a/tests/unit/_autoscaling/test_autoscaled_pool.py
+++ b/tests/unit/_autoscaling/test_autoscaled_pool.py
@@ -135,6 +135,7 @@ async def run() -> None:
         is_finished_function=lambda: future(started_count > 0),
         concurrency_settings=ConcurrencySettings(
             min_concurrency=1,
+            desired_concurrency=1,
             max_concurrency=1,
         ),
     )
@@ -320,6 +321,7 @@ async def run() -> None:
         is_finished_function=lambda: future(done_count >= 4),
         concurrency_settings=ConcurrencySettings(
             min_concurrency=4,
+            desired_concurrency=4,
             max_concurrency=4,
         ),
     )
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 62ede11e67..4937f0cedb 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -799,7 +799,7 @@ async def test_max_requests_per_crawl() -> None:

     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
         max_requests_per_crawl=3,
     )
@@ -820,7 +820,7 @@ async def test_max_crawl_depth() -> None:

     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
         max_crawl_depth=2,
     )
@@ -859,7 +859,10 @@ async def test_abort_on_error(
 ) -> None:
     starts_urls = []

-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1), abort_on_error=True)
+    crawler = BasicCrawler(
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        abort_on_error=True,
+    )

     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -991,7 +994,7 @@ async def test_crawler_manual_stop() -> None:
     processed_urls = []

     # Set max_concurrency to 1 to ensure testing urls are visited one by one in order.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1))
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1))

     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -1018,8 +1021,8 @@ async def test_crawler_multiple_stops_in_parallel() -> None:
     ]
     processed_urls = []

-    # Set max_concurrency to 2 to ensure two urls are being visited in parallel.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=2))
+    # Set concurrency to 2 to ensure two urls are being visited in parallel.
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=2))
     both_handlers_started = asyncio.Barrier(2)  # type:ignore[attr-defined]  # Test is skipped in older Python versions.
     only_one_handler_at_a_time = asyncio.Semaphore(1)
@@ -1298,7 +1301,7 @@ async def test_keep_alive(
         keep_alive=keep_alive,
         max_requests_per_crawl=max_requests_per_crawl,
         # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )

     mocked_handler = Mock()
diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index 37f2d1b8ed..ff0113659e 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -79,7 +79,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl

     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BeautifulSoupCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
     )

     @crawler.router.default_handler
diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py
index 8a3edfd986..35699a45f9 100644
--- a/tests/unit/crawlers/_http/test_http_crawler.py
+++ b/tests/unit/crawlers/_http/test_http_crawler.py
@@ -381,7 +381,7 @@ async def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> No
         ),
         http_client=http_client,
         max_request_retries=10,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )

     @crawler.router.default_handler
diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
index 909563d822..4785fffea9 100644
--- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py
+++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -102,7 +102,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl

     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = ParselCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
     )

     @crawler.router.default_handler
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index 7e666489c8..e9a01cf17b 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -334,7 +334,7 @@ async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL)
     crawler = PlaywrightCrawler(
         session_pool=SessionPool(max_pool_size=1),
         use_incognito_pages=use_incognito_pages,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )

     @crawler.router.default_handler
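
Taken together, the crawler-side hunks mean that `PlaywrightCrawler` and `AdaptivePlaywrightCrawler` fall back to `ConcurrencySettings(desired_concurrency=1)` whenever the caller does not supply `concurrency_settings`, so browser crawls now ramp up from a single page instead of the new library-wide default of ten. A rough usage sketch (not part of the patch; the URL and the override values are illustrative only):

```python
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # Omitting concurrency_settings would apply the new browser-friendly default,
    # ConcurrencySettings(desired_concurrency=1). Passing explicit settings,
    # as below, still takes precedence over that default.
    crawler = PlaywrightCrawler(
        concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=5),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visiting {context.request.url}')

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())
```
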