apify · Pijukatel · Dec 18, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 10, 2024
diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+# Tags whose whole subtree `html_to_text` leaves out of the produced text.
+SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}
+# Tags that `html_to_text` treats as blocks: their content is separated from the surrounding text by newlines.
+BLOCK_TAGS = {
+    'p',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'ol',
+    'ul',
+    'li',
+    'pre',
+    'address',
+    'blockquote',
+    'dl',
+    'div',
+    'fieldset',
+    'form',
+    'table',
+    'tr',
+    'select',
+    'option',
+}
+
+# Matches a string that is empty or whose last character is whitespace.
+_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
+# Matches a string that is empty or ends with a newline.
+_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
+# Matches any run of one or more whitespace characters.
+_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')
+
+
+def html_to_text(source: str | BeautifulSoup) -> str:
+    """Convert a markup string or BeautifulSoup object to newline separated plain text without tags.
+
+    Text nodes have runs of whitespace collapsed into a single space, except for direct children of `<pre>`.
+    Tags listed in `SKIP_TAGS` are dropped together with their content, `<br>` becomes a newline, every `<td>`
+    is followed by a tab, and tags listed in `BLOCK_TAGS` are surrounded by newlines.
+
+    Args:
+        source: HTML markup as a string, or an already parsed `BeautifulSoup` object.
+
+    Returns:
+        The extracted text with leading and trailing whitespace stripped.
+
+    Raises:
+        TypeError: If `source` is neither a string nor a `BeautifulSoup` object.
+    """
+    if isinstance(source, str):
+        # NOTE(review): no parser is named here, so bs4 chooses one from whatever is installed; consider passing
+        # an explicit `features=` value once the parser this project standardizes on is confirmed.
+        soup = BeautifulSoup(source)
+    elif isinstance(source, BeautifulSoup):
+        soup = source
+    else:
+        raise TypeError('Source must be either a string or a BeautifulSoup object.')
+
+    text = ''
+
+    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
+        """Append the text of `page_elements` to `text`, mirroring the Javascript version of Crawlee."""
+        nonlocal text
+        for page_element in page_elements:
+            if isinstance(page_element, NavigableString):
+                parent = page_element.parent
+                if isinstance(parent, Tag) and parent.name.lower() == 'pre':
+                    compr = page_element.get_text()
+                else:
+                    # Compress white spaces outside of pre block
+                    compr = _ANY_CONSECUTIVE_WHITE_SPACES.sub(' ', page_element.get_text())
+                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
+                if compr.startswith((' ', '\n')) and _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE.search(text):
+                    compr = compr[1:]
+                text += compr
+                continue
+
+            if not isinstance(page_element, Tag):
+                # Nothing else carries text or children to descend into
+                continue
+
+            tag_name = page_element.name.lower()
+            if tag_name in SKIP_TAGS:
+                # Special elements are skipped together with everything inside them
+                continue
+
+            if tag_name == 'br':
+                text += '\n'
+            elif tag_name == 'td':
+                # Table cells are separated by a tab that follows each cell
+                _page_element_to_text(page_element.children)
+                text += '\t'
+            else:
+                # Block elements must be surrounded by newlines (unless beginning of text)
+                is_block_tag = tag_name in BLOCK_TAGS
+                if is_block_tag and not _EMPTY_OR_ENDS_WITH_NEW_LINE.search(text):
+                    text += '\n'
+                _page_element_to_text(page_element.children)
+                if is_block_tag and not text.endswith('\n'):
+                    text += '\n'
+
+    _page_element_to_text(soup.children)
+    return text.strip()
diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,4 +1,6 @@
 try:
+    from crawlee._utils.html_to_text import html_to_text
+
     from ._beautifulsoup_crawler import BeautifulSoupCrawler
     from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
     from ._beautifulsoup_parser import BeautifulSoupParserType
@@ -8,4 +10,4 @@
         "For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
     ) from exc
 
-__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType']
+__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', 'html_to_text']
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+import pytest
+from bs4 import BeautifulSoup
+
+from crawlee.beautifulsoup_crawler import html_to_text
+
+# Text that `html_to_text` is expected to produce for `_EXAMPLE_HTML`.
+_EXPECTED_TEXT = (
+    "Let's start with a simple text. \n"
+    "The ships hung in the sky, much the way that bricks don't. \n"
+    "These aren't the Droids you're looking for\n"
+    "I'm sorry, Dave. I'm afraid I can't do that.\n"
+    "I'm sorry, Dave. I'm afraid I can't do that.\n"
+    'A1\tA2\tA3\t\n'
+    'B1\tB2\tB3\tB 4\t\n'
+    'This is some text with inline elements and HTML entities (>bla<) \n'
+    'Test\n'
+    'a\n'
+    'few\n'
+    'line\n'
+    'breaks\n'
+    'Spaces in an inline text should be completely ignored. \n'
+    'But,\n'
+    '    a pre-formatted\n'
+    '                block  should  be  kept\n'
+    '                                       pre-formatted.\n'
+    'The Greatest Science Fiction Quotes Of All Time \n'
+    "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You "
+    'Nexus, huh? I design your eyes.'
+)
+
+# HTML fixture mixing plain text, a paragraph with links, a list, a table, inline tags with entities, line breaks,
+# whitespace runs, a <pre> block, comments and elements that must be skipped (title, svg, script, style, canvas).
+_EXAMPLE_HTML = """
+<html>
+<head>
+    <title>Title SHOULD NOT be converted</title>
+
+    <!-- Comments SHOULD NOT be converted -->
+</head>
+<body with='some attributes'>
+Let's start with a        simple text.
+<p>
+    The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't.
+</p>
+<ul>
+    <li>These aren't the Droids you're looking for</li>
+    <li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
+    <li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
+</ul>
+
+<img src="something" alt="This should be ignored" />
+
+<!-- Comments SHOULD NOT be converted -->
+
+<table>
+    <tr class="something">
+        <td>A1</td>
+        <td attributes="are ignored">A2</td>
+        <td>A3</td>
+    </tr>
+    <tr class="something">
+        <td>B1</td>
+        <td attributes="are ignored" even="second attribute">B2</td>
+        <td>B3</td>
+        <td>B     4</td>
+    </tr>
+</table>
+
+<p>
+    This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML&nbsp;entities (&gt;bla&lt;)
+</p>
+
+<div>
+    Test<br>
+    a<br />
+    few<br>
+    line<br>
+    breaks<br>
+</div>
+
+
+
+
+    Spaces
+
+
+    in
+
+
+    an inline text                                should be
+
+
+    completely ignored.
+
+
+
+<pre>
+But,
+    a pre-formatted
+                block  should  be  kept
+                                       pre-formatted.
+</pre>
+
+<svg>
+    These special elements SHOULD NOT BE CONVERTED.
+</svg>
+
+<script>
+    // These special elements should be completely skipped.
+    skipThis();
+</script>
+
+<style>
+    /* These special elements should be completely skipped. */
+    .skip_this {}
+</style>
+
+<canvas>
+    This should be skipped too.
+</canvas>
+
+<a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a>
+<p>
+    Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design,
+    just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>.
+</p>
+</body>
+</html>
+"""
+
+
+@pytest.mark.parametrize(
+    ('source', 'expected_text'),
+    [
+        (_EXAMPLE_HTML, _EXPECTED_TEXT),
+        (BeautifulSoup(_EXAMPLE_HTML), _EXPECTED_TEXT),
+        ('   Plain    text     node    ', 'Plain text node'),
+        ('   \nPlain    text     node  \n  ', 'Plain text node'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'),
+        ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'),
+        ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'),
+        ('<div>Div1</div><style>Skip styles</style>', 'Div1'),
+        ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),
+        ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),
+        ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),
+        ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),
+        ('<b>A  B  C  D  E\n\nF  G</b>', 'A B C D E F G'),
+        ('<pre>A  B  C  D  E\n\nF  G</pre>', 'A  B  C  D  E\n\nF  G'),
+        (
+            '<h1>Heading 1</h1><div><div><div><div>Deep  Div</div></div></div></div><h2>Heading       2</h2>',
+            'Heading 1\nDeep Div\nHeading 2',
+        ),
+        ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),
+        ('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'),
+        (
+            """<table>
+    <tr>
+        <td>Cell    A1</td><td>Cell A2</td>
+        <td>    Cell A3    </td>
+    </tr>
+    <tr>
+        <td>Cell    B1</td><td>Cell B2</td>
+    </tr>
+</table>""",
+            'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2',
+        ),
+        ('<span>&aacute; &eacute;</span>', 'á é'),
+    ],
+)
+def test_html_to_text(source: str | BeautifulSoup, expected_text: str) -> None:
+    """Check that `html_to_text` turns each markup string or parsed soup into the expected plain text."""
+    assert html_to_text(source) == expected_text
+
+
+def test_html_to_text_raises_on_wrong_input_type() -> None:
+    """Check that a source which is neither a string nor a BeautifulSoup object raises `TypeError`."""
+    not_markup = 1
+    with pytest.raises(TypeError):
+        html_to_text(not_markup)  # type: ignore[arg-type]  # Intentional wrong type test.
diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py
@@ -4,10 +4,10 @@
 import asyncio
 import json
 import logging
+import os
 from collections import Counter
 from dataclasses import dataclass
 from datetime import timedelta
-import os
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from unittest.mock import AsyncMock, Mock