
Commit 734af8c

docs: Update Request loaders guide (#1376)
- Adaptive crawler - prefer combining with Parsel.
- Highlight some important parts of the code samples.
- Prefer Impit over Httpx.
- Expose `FingerprintGenerator` in the public API docs.
- Add a note to the request loaders guide to highlight the usage with crawlers.
1 parent a513ac0 commit 734af8c
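
The commit prefers ImpitHttpClient over HttpxHttpClient, and the diffs below apply that swap inside the SitemapRequestLoader examples. A minimal sketch of applying the same preference to a crawler's own HTTP client, assuming ParselCrawler accepts an http_client argument (that argument is not shown in this diff):

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    # Assumption: http_client is a ParselCrawler constructor argument; the
    # commit itself only swaps the client inside SitemapRequestLoader.
    crawler = ParselCrawler(http_client=ImpitHttpClient())

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Impit is used here only as a drop-in replacement; nothing else in the crawler setup changes.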

File tree: 12 files changed, +147 -89 lines


docs/guides/code_examples/playwright_crawler_adaptive/handler.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@


 async def main() -> None:
-    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()
+    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

     @crawler.router.default_handler
     async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:

docs/guides/code_examples/request_loaders/rl_basic_example.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ async def main() -> None:
     # Fetch and process requests from the queue.
     while request := await request_list.fetch_next_request():
         # Do something with it...
+        print(f'Processing {request.url}')

         # And mark it as handled.
         await request_list.mark_request_as_handled(request)

docs/guides/code_examples/request_loaders/rl_tandem_example.py

Lines changed: 13 additions & 0 deletions
@@ -8,9 +8,11 @@ async def main() -> None:
     # Create a static request list.
     request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])

+    # highlight-start
     # Convert the request list to a request manager using the to_tandem method.
     # It is a tandem with the default request queue.
     request_manager = await request_list.to_tandem()
+    # highlight-end

     # Create a crawler and pass the request manager to it.
     crawler = ParselCrawler(
@@ -20,9 +22,20 @@ async def main() -> None:

     @crawler.router.default_handler
     async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
         # New links will be enqueued directly to the queue.
         await context.enqueue_links()

+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
     await crawler.run()

docs/guides/code_examples/request_loaders/rl_tandem_example_explicit.py

Lines changed: 11 additions & 0 deletions
@@ -23,9 +23,20 @@ async def main() -> None:

     @crawler.router.default_handler
     async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
         # New links will be enqueued directly to the queue.
         await context.enqueue_links()

+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
     await crawler.run()


docs/guides/code_examples/request_loaders/sitemap_basic_example.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import asyncio
+import re
+
+from crawlee.http_clients import ImpitHttpClient
+from crawlee.request_loaders import SitemapRequestLoader
+
+
+async def main() -> None:
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    while request := await sitemap_loader.fetch_next_request():
+        # Do something with it...
+        print(f'Processing {request.url}')
+
+        # And mark it as handled.
+        await sitemap_loader.mark_request_as_handled(request)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())

docs/guides/code_examples/request_loaders/sitemap_example.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

docs/guides/code_examples/request_loaders/sitemap_tandem_example.py

Lines changed: 41 additions & 28 deletions
@@ -2,38 +2,51 @@
 import re

 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.request_loaders import SitemapRequestLoader


 async def main() -> None:
-    # Create an HTTP client for fetching sitemaps
-    async with HttpxHttpClient() as http_client:
-        # Create a sitemap request loader with URL filtering
-        sitemap_loader = SitemapRequestLoader(
-            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
-            http_client=http_client,
-            # Include only URLs that contain 'docs'
-            include=[re.compile(r'.*docs.*')],
-            max_buffer_size=500,  # Buffer up to 500 URLs in memory
-        )
-
-        # Convert the sitemap loader to a request manager using the to_tandem method.
-        # It is a tandem with the default request queue.
-        request_manager = await sitemap_loader.to_tandem()
-
-        # Create a crawler and pass the request manager to it.
-        crawler = ParselCrawler(
-            request_manager=request_manager,
-            max_requests_per_crawl=10,  # Limit the max requests per crawl.
-        )
-
-        @crawler.router.default_handler
-        async def handler(context: ParselCrawlingContext) -> None:
-            # New links will be enqueued directly to the queue.
-            await context.enqueue_links()
-
-        await crawler.run()
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # highlight-start
+    # Convert the sitemap loader into a request manager linked
+    # to the default request queue.
+    request_manager = await sitemap_loader.to_tandem()
+    # highlight-end
+
+    # Create a crawler and pass the request manager to it.
+    crawler = ParselCrawler(
+        request_manager=request_manager,
+        max_requests_per_crawl=10,  # Limit the max requests per crawl.
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
+        # New links will be enqueued directly to the queue.
+        await context.enqueue_links()
+
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
+    await crawler.run()


 if __name__ == '__main__':
docs/guides/code_examples/request_loaders/sitemap_tandem_example_explicit.py

Lines changed: 41 additions & 30 deletions
@@ -2,41 +2,52 @@
 import re

 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
-from crawlee.http_clients import HttpxHttpClient
+from crawlee.http_clients import ImpitHttpClient
 from crawlee.request_loaders import RequestManagerTandem, SitemapRequestLoader
 from crawlee.storages import RequestQueue


 async def main() -> None:
-    # Create an HTTP client for fetching sitemaps
-    async with HttpxHttpClient() as http_client:
-        # Create a sitemap request loader with URL filtering
-        sitemap_loader = SitemapRequestLoader(
-            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
-            http_client=http_client,
-            # Include only URLs that contain 'docs'
-            include=[re.compile(r'.*docs.*')],
-            max_buffer_size=500,  # Buffer up to 500 URLs in memory
-        )
-
-        # Open the default request queue.
-        request_queue = await RequestQueue.open()
-
-        # And combine them together to a single request manager.
-        request_manager = RequestManagerTandem(sitemap_loader, request_queue)
-
-        # Create a crawler and pass the request manager to it.
-        crawler = ParselCrawler(
-            request_manager=request_manager,
-            max_requests_per_crawl=10,  # Limit the max requests per crawl.
-        )
-
-        @crawler.router.default_handler
-        async def handler(context: ParselCrawlingContext) -> None:
-            # New links will be enqueued directly to the queue.
-            await context.enqueue_links()
-
-        await crawler.run()
+    # Create an HTTP client for fetching the sitemap.
+    http_client = ImpitHttpClient()
+
+    # Create a sitemap request loader with filtering rules.
+    sitemap_loader = SitemapRequestLoader(
+        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
+        http_client=http_client,
+        include=[re.compile(r'.*docs.*')],  # Only include URLs containing 'docs'.
+        max_buffer_size=500,  # Keep up to 500 URLs in memory before processing.
+    )
+
+    # Open the default request queue.
+    request_queue = await RequestQueue.open()
+
+    # And combine them together to a single request manager.
+    request_manager = RequestManagerTandem(sitemap_loader, request_queue)
+
+    # Create a crawler and pass the request manager to it.
+    crawler = ParselCrawler(
+        request_manager=request_manager,
+        max_requests_per_crawl=10,  # Limit the max requests per crawl.
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url}')
+
+        # New links will be enqueued directly to the queue.
+        await context.enqueue_links()
+
+        # Extract data using Parsel's XPath and CSS selectors.
+        data = {
+            'url': context.request.url,
+            'title': context.selector.xpath('//title/text()').get(),
+        }
+
+        # Push extracted data to the dataset.
+        await context.push_data(data)
+
+    await crawler.run()


 if __name__ == '__main__':

docs/guides/request_loaders.mdx

Lines changed: 5 additions & 1 deletion
@@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
 import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

 import RlBasicExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_basic_example.py';
-import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_example.py';
+import SitemapExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_basic_example.py';
 import RlTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example.py';
 import RlExplicitTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/rl_tandem_example_explicit.py';
 import SitemapTandemExample from '!!raw-loader!roa-loader!./code_examples/request_loaders/sitemap_tandem_example.py';
@@ -102,6 +102,10 @@ RequestManager --|> RequestManagerTandem

 The <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> interface defines the foundation for fetching requests during a crawl. It provides abstract methods for basic operations like retrieving, marking, and checking the status of requests. Concrete implementations, such as <ApiLink to="class/RequestList">`RequestList`</ApiLink>, build on this interface to handle specific scenarios. You can create your own custom loader that reads from an external file, web endpoint, database, or any other specific data source. For more details, refer to the <ApiLink to="class/RequestLoader">`RequestLoader`</ApiLink> API reference.

+:::info NOTE
+To learn how to use request loaders in your crawlers, see the [Request manager tandem](#request-manager-tandem) section below.
+:::
+
 ### Request list

 The <ApiLink to="class/RequestList">`RequestList`</ApiLink> can accept an asynchronous generator as input, allowing requests to be streamed rather than loading them all into memory at once. This can significantly reduce memory usage, especially when working with large sets of URLs.
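
The paragraph above notes that RequestList can accept an asynchronous generator so URLs are streamed rather than held in memory all at once. A minimal sketch of that usage, assuming the constructor accepts the generator in place of the plain list used in rl_basic_example.py, with hypothetical URLs for illustration:

import asyncio
from collections.abc import AsyncGenerator

from crawlee.request_loaders import RequestList


async def generate_urls() -> AsyncGenerator[str, None]:
    # URLs are produced lazily, so they never have to sit in memory all at once.
    for page in range(1, 4):
        yield f'https://crawlee.dev/?page={page}'  # Hypothetical URLs for illustration.


async def main() -> None:
    # Assumption: the constructor accepts an async generator in place of a plain list.
    request_list = RequestList(generate_urls())

    while request := await request_list.fetch_next_request():
        print(f'Processing {request.url}')
        await request_list.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())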

docs/guides/service_locator.mdx

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ There are three core services that are managed by the service locator:

 ### Configuration

-<ApiLink to="class/Configuration">`Configuration`</ApiLink> is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistance intervals, and various other settings. The configuration can be set directly in the code or via environment variables.
+<ApiLink to="class/Configuration">`Configuration`</ApiLink> is a class that provides access to application-wide settings and parameters. It allows you to configure various aspects of Crawlee, such as timeouts, logging level, persistence intervals, and various other settings. The configuration can be set directly in the code or via environment variables.

 ### StorageClient
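
The corrected paragraph says Configuration can be set directly in code or via environment variables. A minimal sketch of the in-code path, assuming log_level is a Configuration field and that the service locator exposes set_configuration() as described in this guide (neither detail appears in this diff):

from crawlee import service_locator
from crawlee.configuration import Configuration


def configure_crawlee() -> None:
    # Assumption: 'log_level' is a Configuration field; the equivalent
    # environment variable would carry the CRAWLEE_ prefix.
    config = Configuration(log_level='DEBUG')

    # Assumption: the service locator exposes set_configuration() for
    # registering the configuration before any crawler is created.
    service_locator.set_configuration(config)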
