Skip to content

Commit 0f968e8

Browse files
vdusekjanbuchar
andauthored
docs: add request storage guide (#521)
### Description - Add request storage guide. - It is highly inspired by the TS version. ### Issues - Closes: #355 - Closes: #478 ### Testing - It was rendered locally. ### Checklist - [x] CI passed --------- Co-authored-by: Jan Buchar <[email protected]>
1 parent eb20cfe commit 0f968e8

10 files changed

+361
-0
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
import asyncio

from crawlee.configuration import Configuration
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Run a crawler whose default storages are kept between runs (no purge on start)."""
    # Disabling purge_on_start preserves previously stored requests and results.
    # highlight-next-line
    config = Configuration(purge_on_start=False)
    crawler = HttpCrawler(configuration=config)

    # Default handler, invoked for every request the crawler processes.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Demonstrate the context's `add_requests` helper inside a request handler."""
    crawler = BeautifulSoupCrawler()

    # Default handler, invoked for every request the crawler processes.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Add an explicit URL to the crawler's request storage from within the handler.
        # highlight-next-line
        await context.add_requests(['https://apify.com/'])

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    """Demonstrate the context's `enqueue_links` helper inside a request handler."""
    crawler = BeautifulSoupCrawler()

    # Default handler, invoked for every request the crawler processes.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Discover links on the current page and add them to the request storage.
        # highlight-next-line
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
import asyncio

from crawlee.memory_storage_client import MemoryStorageClient


async def main() -> None:
    """Trigger the storage purge explicitly, before any crawler is started."""
    storage_client = MemoryStorageClient()
    # Purges the underlying storage right away instead of waiting for crawler startup.
    # highlight-next-line
    await storage_client.purge_on_start()


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
import asyncio

from crawlee.storages import RequestList


async def main() -> None:
    """Basic standalone usage of the in-memory RequestList."""
    # Create the request list with an optional name and the initial requests.
    # NOTE: unlike RequestQueue, a RequestList is constructed directly and lives
    # only in memory - nothing is opened or persisted here.
    request_list = RequestList(
        name='my-request-list',
        requests=['https://apify.com/', 'https://crawlee.dev/', 'https://crawlee.dev/python/'],
    )

    # You can interact with the request list in the same way as with the request queue.
    await request_list.add_requests_batched(
        [
            'https://crawlee.dev/python/docs/quick-start',
            'https://crawlee.dev/python/api',
        ]
    )

    # Fetch and process requests from the list until it is exhausted.
    while request := await request_list.fetch_next_request():
        # Do something with it...

        # And mark it as handled.
        await request_list.mark_request_as_handled(request)

    # Remove the request list.
    await request_list.drop()


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
import asyncio

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
from crawlee.storages import RequestList


async def main() -> None:
    """Use a RequestList as the request provider of a crawler."""
    # Create the request list with an optional name and the initial requests.
    request_list = RequestList(
        name='my-request-list',
        requests=['https://apify.com/', 'https://crawlee.dev/'],
    )

    # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
    # list as request provider to it. It will be managed by the crawler.
    crawler = HttpCrawler(request_provider=request_list)

    # Default handler, invoked for every request the crawler processes.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Use context's add_requests method helper to add new requests from the handler.
        await context.add_requests(['https://crawlee.dev/python/docs/quick-start'])

    # Use crawler's add_requests method helper to add new requests.
    await crawler.add_requests(['https://crawlee.dev/python/api'])

    # Run the crawler. You can optionally pass the list of initial requests.
    await crawler.run(['https://crawlee.dev/python/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
import asyncio

from crawlee.storages import RequestQueue


async def main() -> None:
    """Basic standalone usage of the RequestQueue."""
    # Open the request queue; it is created on first access.
    # Leave the name empty to use the default request queue.
    request_queue = await RequestQueue.open(name='my-request-queue')

    # Add a single request.
    await request_queue.add_request('https://apify.com/')

    # Add multiple requests as a batch.
    await request_queue.add_requests_batched(['https://crawlee.dev/', 'https://crawlee.dev/python/'])

    # Fetch and process requests from the queue until it is exhausted.
    while request := await request_queue.fetch_next_request():
        # Do something with it...

        # And mark it as handled.
        await request_queue.mark_request_as_handled(request)

    # Remove the request queue.
    await request_queue.drop()


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
import asyncio

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Let the crawler manage its default RequestQueue implicitly."""
    # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is a default
    # request provider, it will be opened, and fully managed if not specified.
    crawler = HttpCrawler()

    # Default handler, invoked for every request the crawler processes.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Use context's add_requests method helper to add new requests from the handler.
        await context.add_requests(['https://crawlee.dev/python/'])

    # Use crawler's add_requests method helper to add new requests.
    await crawler.add_requests(['https://apify.com/'])

    # Run the crawler. You can optionally pass the list of initial requests.
    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
import asyncio

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
from crawlee.storages import RequestQueue


async def main() -> None:
    """Open a RequestQueue explicitly and hand it to a crawler as its request provider."""
    # Open the request queue; it is created on first access.
    # Leave the name empty to use the default request queue.
    request_queue = await RequestQueue.open(name='my-request-queue')

    # Interact with the request queue directly, e.g. add a batch of requests.
    await request_queue.add_requests_batched(['https://apify.com/', 'https://crawlee.dev/'])

    # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request
    # queue as request provider to it. It will be managed by the crawler.
    crawler = HttpCrawler(request_provider=request_queue)

    # Default handler, invoked for every request the crawler processes.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # And execute the crawler.
    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())

docs/guides/request_storage.mdx

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
---
2+
id: request-storage
3+
title: Request storage
4+
description: How to store the requests your crawler will go through
5+
---
6+
7+
import ApiLink from '@site/src/components/ApiLink';
8+
9+
import Tabs from '@theme/Tabs';
10+
import TabItem from '@theme/TabItem';
11+
import CodeBlock from '@theme/CodeBlock';
12+
13+
import RqBasicExample from '!!raw-loader!./code/request_storage_rq_basic.py';
14+
import RqWithCrawlerExample from '!!raw-loader!./code/request_storage_rq_with_crawler.py';
15+
import RqWithCrawlerExplicitExample from '!!raw-loader!./code/request_storage_rq_with_crawler_explicit.py';
16+
17+
import RlBasicExample from '!!raw-loader!./code/request_storage_rl_basic.py';
18+
import RlWithCrawlerExample from '!!raw-loader!./code/request_storage_rl_with_crawler.py';
19+
20+
import RsHelperAddRequestsExample from '!!raw-loader!./code/request_storage_helper_add_requests.py';
21+
import RsHelperEnqueueLinksExample from '!!raw-loader!./code/request_storage_helper_enqueue_links.py';
22+
23+
import RsDoNotPurgeExample from '!!raw-loader!./code/request_storage_do_not_purge.py';
24+
import RsPurgeExplicitlyExample from '!!raw-loader!./code/request_storage_purge_explicitly.py';
25+
26+
This guide explains the different types of request storage available in Crawlee, how to store the requests that your crawler will process, and which storage type to choose based on your needs.
27+
28+
## Request providers overview
29+
30+
All request storage types in Crawlee implement the same interface - <ApiLink to="class/RequestProvider">`RequestProvider`</ApiLink>. This unified interface allows them to be used in a consistent manner, regardless of the storage backend. The request providers are managed by storage clients - subclasses of <ApiLink to="class/BaseStorageClient">`BaseStorageClient`</ApiLink>. For instance, <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> stores data in memory while it can also offload them to the local directory. Data are stored in the following directory structure:
31+
32+
```text
33+
{CRAWLEE_STORAGE_DIR}/{request_provider}/{QUEUE_ID}/
34+
```
35+
:::note
36+
37+
Local directory is specified by the `CRAWLEE_STORAGE_DIR` environment variable with default value `./storage`. `{QUEUE_ID}` is the name or ID of the specific request storage. The default value is `default`, unless we override it by setting the `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID` environment variable.
38+
39+
:::
40+
41+
## Request queue
42+
43+
The <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> is the primary storage for URLs in Crawlee, especially useful for deep crawling. It supports dynamic addition and removal of URLs, making it ideal for recursive tasks where URLs are discovered and added during the crawling process (e.g., following links across multiple pages). Each Crawlee project has a **default request queue**, which can be used to store URLs during a specific run. The <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink> is highly useful for large-scale and complex crawls.
44+
45+
The following code demonstrates the usage of the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>:
46+
47+
<Tabs groupId="request_queue">
48+
<TabItem value="request_queue_basic_example" label="Basic usage" default>
49+
<CodeBlock className="language-python">
50+
{RqBasicExample}
51+
</CodeBlock>
52+
</TabItem>
53+
<TabItem value="request_queue_with_crawler" label="Usage with Crawler">
54+
<CodeBlock className="language-python">
55+
{RqWithCrawlerExample}
56+
</CodeBlock>
57+
</TabItem>
58+
<TabItem value="request_queue_with_crawler_explicit" label="Explicit usage with Crawler" default>
59+
<CodeBlock className="language-python">
60+
{RqWithCrawlerExplicitExample}
61+
</CodeBlock>
62+
</TabItem>
63+
</Tabs>
64+
65+
## Request list
66+
67+
The <ApiLink to="class/RequestList">`RequestList`</ApiLink> is a simpler, lightweight storage option, used when all URLs to be crawled are known upfront. It represents the list of URLs to crawl that is stored in a crawler run memory (or optionally in default <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink> associated with the run, if specified). The list is used for the crawling of a large number of URLs, when we know all the URLs which should be visited by the crawler and no URLs would be added during the run. The URLs can be provided either in code or parsed from a text file hosted on the web. The <ApiLink to="class/RequestList">`RequestList`</ApiLink> is typically created exclusively for a single crawler run, and its usage must be explicitly specified.
68+
69+
:::warning
70+
71+
The <ApiLink to="class/RequestList">`RequestList`</ApiLink> class is in its early version and is not fully
72+
implemented. It is currently intended mainly for testing purposes and small-scale projects. The current
73+
implementation is only in-memory storage and is very limited. It will be (re)implemented in the future.
74+
For more details, see the GitHub issue [crawlee-python#99](https://github.com/apify/crawlee-python/issues/99).
75+
For production use, we recommend using the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>.
76+
77+
:::
78+
79+
The following code demonstrates the usage of the <ApiLink to="class/RequestList">`RequestList`</ApiLink>:
80+
81+
<Tabs groupId="request_list">
82+
<TabItem value="request_list_basic_example" label="Basic usage" default>
83+
<CodeBlock className="language-python">
84+
{RlBasicExample}
85+
</CodeBlock>
86+
</TabItem>
87+
<TabItem value="request_list_with_crawler" label="Usage with Crawler">
88+
<CodeBlock className="language-python">
89+
{RlWithCrawlerExample}
90+
</CodeBlock>
91+
</TabItem>
92+
</Tabs>
93+
94+
{/*
95+
96+
## Which one to choose?
97+
98+
TODO: write this section, once https://github.com/apify/crawlee-python/issues/99 is resolved
99+
100+
*/}
101+
102+
## Request-related helpers
103+
104+
We offer several helper functions to simplify interactions with request storages:
105+
106+
- The <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> function allows you to manually add specific URLs to the configured request storage. In this case, you must explicitly provide the URLs you want to be added to the request storage. If you need to specify further details of the request, such as a `label` or `user_data`, you have to pass instances of the <ApiLink to="class/Request">`Request`</ApiLink> class to the helper.
107+
- The <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> function is designed to discover new URLs in the current page and add them to the request storage. It can be used with default settings, requiring no arguments, or you can customize its behavior by specifying link element selectors, choosing different enqueue strategies, or applying include/exclude filters to control which URLs are added. See [Crawl website with relative links](../examples/crawl-website-with-relative-links) example for more details.
108+
109+
<Tabs groupId="request_helpers">
110+
<TabItem value="request_helper_add_requests" label="Add requests" default>
111+
<CodeBlock className="language-python">
112+
{RsHelperAddRequestsExample}
113+
</CodeBlock>
114+
</TabItem>
115+
<TabItem value="request_helper_enqueue_links" label="Enqueue links">
116+
<CodeBlock className="language-python">
117+
{RsHelperEnqueueLinksExample}
118+
</CodeBlock>
119+
</TabItem>
120+
</Tabs>
121+
122+
## Cleaning up the storages
123+
124+
Default storages are purged before the crawler starts, unless explicitly configured otherwise. For that case, see <ApiLink to="class/Configuration#purge_on_start">`Configuration.purge_on_start`</ApiLink>. This cleanup happens as soon as a storage is accessed, either when you open a storage (e.g. using <ApiLink to="class/RequestQueue#open">`RequestQueue.open`</ApiLink>) or when interacting with a storage through one of the helper functions (e.g. <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> or <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>, which implicitly opens the request storage).
125+
126+
<CodeBlock className="language-python">
127+
{RsDoNotPurgeExample}
128+
</CodeBlock>
129+
130+
If you do not explicitly interact with storages in your code, the purging will occur automatically when the <ApiLink to="class/BasicCrawler#run">`BasicCrawler.run`</ApiLink> method is invoked.
131+
132+
If you need to purge storages earlier, you can call <ApiLink to="class/MemoryStorageClient#purge_on_start">`MemoryStorageClient.purge_on_start`</ApiLink> directly. This method triggers the purging process for the underlying storage implementation you are currently using.
133+
134+
<CodeBlock className="language-python">
135+
{RsPurgeExplicitlyExample}
136+
</CodeBlock>

0 commit comments

Comments
 (0)