15 changes: 9 additions & 6 deletions src/crawlee/_types.py
@@ -110,9 +110,9 @@ class ConcurrencySettings:
    def __init__(
        self,
        min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
        max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
    ) -> None:
        """Initialize a new instance.

@@ -125,21 +125,24 @@ def __init__(
            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                if there is a large enough supply of them. By default, it is `min_concurrency`.
        """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
        if min_concurrency < 1:
            raise ValueError('min_concurrency must be 1 or larger')

        if max_concurrency < min_concurrency:
            raise ValueError('max_concurrency cannot be less than min_concurrency')

+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
        if max_tasks_per_minute <= 0:
            raise ValueError('max_tasks_per_minute must be positive')

        self.min_concurrency = min_concurrency
        self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
        self.max_tasks_per_minute = max_tasks_per_minute


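As a quick illustration of the new behavior (not part of this PR): a minimal sketch of the updated defaults and the eager validation above, importing from the crawlee._types module path used elsewhere in this diff.

from crawlee._types import ConcurrencySettings

# New defaults after this change: min_concurrency=1, desired_concurrency=10, max_concurrency=100.
settings = ConcurrencySettings()
print(settings.desired_concurrency, settings.max_concurrency)  # 10 100

# Bounds are now validated up front instead of treating None specially.
try:
    ConcurrencySettings(min_concurrency=4, desired_concurrency=2)
except ValueError as err:
    print(err)  # desired_concurrency cannot be less than min_concurrency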
@@ -12,7 +12,7 @@
from parsel import Selector
from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
from crawlee._utils.docs import docs_group
from crawlee._utils.wait import wait_for
from crawlee.crawlers import (
@@ -158,6 +158,10 @@ def __init__(
        self.result_checker = result_checker or (lambda _: True)
        self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
        super().__init__(statistics=statistics, **kwargs)

        # Sub crawlers related.
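The fallback above only applies when the caller passed no concurrency_settings at all, or passed None; an explicit value is left untouched. A minimal sketch of that idiom in isolation (the make_kwargs helper is illustrative, not part of the PR):

from crawlee._types import ConcurrencySettings

def make_kwargs(**kwargs):
    # Inject the browser-friendly default only when the caller did not supply settings.
    if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
        kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
    return kwargs

print(make_kwargs()['concurrency_settings'].desired_concurrency)  # 1
explicit = ConcurrencySettings(desired_concurrency=5)
print(make_kwargs(concurrency_settings=explicit)['concurrency_settings'].desired_concurrency)  # 5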
5 changes: 5 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -12,6 +12,7 @@

from crawlee import service_locator
from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.docs import docs_group
from crawlee._utils.robots import RobotsTxtFile
@@ -194,6 +195,10 @@ def __init__(

        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
        super().__init__(**kwargs)

    async def _open_page(
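From the user's perspective, a browser-based crawler constructed without explicit settings now starts from a single desired task instead of the generic default of 10, while an explicit value still takes precedence. A minimal sketch (assuming the playwright extra is installed; all other constructor arguments are omitted):

from crawlee._types import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

# No explicit settings: the crawler falls back to ConcurrencySettings(desired_concurrency=1).
default_crawler = PlaywrightCrawler()

# Explicit settings win over the injected default.
tuned_crawler = PlaywrightCrawler(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)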
2 changes: 2 additions & 0 deletions tests/unit/_autoscaling/test_autoscaled_pool.py
@@ -135,6 +135,7 @@ async def run() -> None:
        is_finished_function=lambda: future(started_count > 0),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=1,
+            desired_concurrency=1,
            max_concurrency=1,
        ),
    )
@@ -320,6 +321,7 @@ async def run() -> None:
        is_finished_function=lambda: future(done_count >= 4),
        concurrency_settings=ConcurrencySettings(
            min_concurrency=4,
+            desired_concurrency=4,
            max_concurrency=4,
        ),
    )
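The test fixtures above gain an explicit desired_concurrency because, with the new default of 10, capping only max_concurrency would now trip the eager validation added in this PR. A minimal sketch of the failure the explicit value avoids (not part of the PR):

from crawlee._types import ConcurrencySettings

try:
    ConcurrencySettings(max_concurrency=1)  # desired_concurrency still defaults to 10
except ValueError as err:
    print(err)  # desired_concurrency cannot be greater than max_concurrency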
17 changes: 10 additions & 7 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -799,7 +799,7 @@ async def test_max_requests_per_crawl() -> None:

    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
    crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
        max_requests_per_crawl=3,
    )

@@ -820,7 +820,7 @@ async def test_max_crawl_depth() -> None:

    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
    crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
        max_crawl_depth=2,
    )

@@ -859,7 +859,10 @@ async def test_abort_on_error(
) -> None:
    starts_urls = []

-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1), abort_on_error=True)
+    crawler = BasicCrawler(
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        abort_on_error=True,
+    )

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
@@ -991,7 +994,7 @@ async def test_crawler_manual_stop() -> None:
    processed_urls = []

    # Set max_concurrency to 1 to ensure testing urls are visited one by one in order.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1))
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1))

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
@@ -1018,8 +1021,8 @@ async def test_crawler_multiple_stops_in_parallel() -> None:
    ]
    processed_urls = []

-    # Set max_concurrency to 2 to ensure two urls are being visited in parallel.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=2))
+    # Set concurrency to 2 to ensure two urls are being visited in parallel.
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=2))

    both_handlers_started = asyncio.Barrier(2)  # type:ignore[attr-defined] # Test is skipped in older Python versions.
    only_one_handler_at_a_time = asyncio.Semaphore(1)
@@ -1298,7 +1301,7 @@ async def test_keep_alive(
        keep_alive=keep_alive,
        max_requests_per_crawl=max_requests_per_crawl,
        # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
    )
    mocked_handler = Mock()

@@ -79,7 +79,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl

    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
    crawler = BeautifulSoupCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
    )

    @crawler.router.default_handler
2 changes: 1 addition & 1 deletion tests/unit/crawlers/_http/test_http_crawler.py
@@ -381,7 +381,7 @@ async def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> No
        ),
        http_client=http_client,
        max_request_retries=10,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
    )

    @crawler.router.default_handler
4 changes: 3 additions & 1 deletion tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -102,7 +102,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl

    # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
    crawler = ParselCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
    )

    @crawler.router.default_handler
2 changes: 1 addition & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -334,7 +334,7 @@ async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL)
    crawler = PlaywrightCrawler(
        session_pool=SessionPool(max_pool_size=1),
        use_incognito_pages=use_incognito_pages,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
    )

    @crawler.router.default_handler