
Commit ce172c4

fix: Resolve compatibility issue between SqlStorageClient and AdaptivePlaywrightCrawler (#1496)
### Description

Remove the use of `deepcopy` for `kwargs` in `AdaptivePlaywrightCrawler` for compatibility with `SqlStorageClient`.

### Issues

- Closes: #1495
1 parent aeacd96 commit ce172c4
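
A minimal sketch of the problem this addresses, assuming (per the description and linked issue) that `deepcopy(kwargs)` breaks when `kwargs` carries a `SqlStorageClient`, since such a client holds a live database engine whose locks and connections cannot be deep-copied. `FakeStorageClient` below is a hypothetical stand-in, not the real class; the commit avoids the copy entirely by merging `kwargs` into a fresh dict per sub crawler.

```python
import threading
from copy import deepcopy


class FakeStorageClient:
    """Hypothetical stand-in for a storage client that owns a live DB engine.

    Such objects typically hold locks/connections that cannot be deep-copied.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()  # deepcopy of a lock raises TypeError


kwargs = {'storage_client': FakeStorageClient()}

# Before this commit, the adaptive crawler deep-copied kwargs once per sub crawler:
try:
    deepcopy(kwargs)
except TypeError as exc:
    print(f'deepcopy failed: {exc}')  # e.g. "cannot pickle '_thread.lock' object"

# The fix instead builds a new dict per sub crawler via merging, so the same
# objects (including the storage client) are shared by reference, not copied:
sub_crawler_kwargs = {'_logger': 'placeholder-logger', **kwargs}
print(sub_crawler_kwargs['storage_client'] is kwargs['storage_client'])  # True
```

Because dict unpacking is a shallow merge, both sub crawlers end up sharing the exact same storage client instance instead of each receiving a failed (or silently duplicated) copy.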

2 files changed (+24, -6 lines)


src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 2 additions & 6 deletions
@@ -149,10 +149,6 @@ def __init__(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
@@ -170,11 +166,11 @@ def __init__(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py

Lines changed: 22 additions & 0 deletions
@@ -30,10 +30,12 @@
     AdaptiveContextError,
 )
 from crawlee.statistics import Statistics
+from crawlee.storage_clients import SqlStorageClient
 from crawlee.storages import KeyValueStore

 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator, Iterator
+    from pathlib import Path

     from yarl import URL

@@ -726,3 +728,23 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
     await crawler.run(test_urls[:1])

     mocked_h3_handler.assert_called_once_with(None)
+
+
+async def test_adaptive_playwright_crawler_with_sql_storage(test_urls: list[str], tmp_path: Path) -> None:
+    """Tests that AdaptivePlaywrightCrawler can be initialized with SqlStorageClient."""
+    storage_dir = tmp_path / 'test_table.db'
+
+    async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:
+        crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
+            storage_client=storage_client,
+        )
+
+        mocked_handler = Mock()
+
+        @crawler.router.default_handler
+        async def request_handler(_context: AdaptivePlaywrightCrawlingContext) -> None:
+            mocked_handler()
+
+        await crawler.run(test_urls[:1])
+
+        mocked_handler.assert_called()
