apify · Pijukatel · Dec 18, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 10, 2024
diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
@@ -0,0 +1,33 @@
+# This file contains shared constants used by the different implementations of the `html_to_text` function.
+from __future__ import annotations
+
+import re
+
+SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}  # Tags skipped together with their content.
+BLOCK_TAGS = {  # Tags whose content is surrounded by newlines in the output.
+    'p',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'ol',
+    'ul',
+    'li',
+    'pre',
+    'address',
+    'blockquote',
+    'dl',
+    'div',
+    'fieldset',
+    'form',
+    'table',
+    'tr',
+    'select',
+    'option',
+}
+
+_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')  # Matches an empty string or one ending in whitespace.
+_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')  # Matches an empty string or one ending in a newline.
+_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')  # Matches a run of one or more whitespace characters.
diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -5,6 +5,7 @@
 
 from crawlee._utils.docs import docs_group
 from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
+from crawlee.beautifulsoup_crawler._utils import html_to_text
 
 
 @dataclass(frozen=True)
@@ -24,3 +25,7 @@ def soup(self) -> BeautifulSoup:
     def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:
         """Create a new context by copying every field of an existing `ParsedHttpCrawlingContext[BeautifulSoup]`."""
         return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
+
+    def html_to_text(self) -> str:
+        """Convert `parsed_content` to newline-separated plain text without tags."""
+        return html_to_text(self.parsed_content)  # Calls the imported module-level function, not this method.
diff --git a/src/crawlee/beautifulsoup_crawler/_utils.py b/src/crawlee/beautifulsoup_crawler/_utils.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+
+from crawlee._utils.html_to_text import (
+    _ANY_CONSECUTIVE_WHITE_SPACES,
+    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
+    _EMPTY_OR_ENDS_WITH_NEW_LINE,
+    BLOCK_TAGS,
+    SKIP_TAGS,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+
+def html_to_text(source: str | BeautifulSoup) -> str:
+    """Convert a markup string or a `BeautifulSoup` object to newline-separated plain text without tags."""
+    if isinstance(source, BeautifulSoup):
+        soup = source
+    elif isinstance(source, str):
+        # NOTE(review): no parser is named here, so BeautifulSoup picks one itself - consider pinning it.
+        soup = BeautifulSoup(source)
+    else:
+        raise TypeError('Source must be either a string or a `BeautifulSoup` object.')
+
+    text = ''
+
+    def _collect(nodes: Iterable[PageElement]) -> None:
+        """Append the text of `nodes` to `text`, modelled on the JavaScript version of Crawlee."""
+        nonlocal text
+        for node in nodes:
+            if isinstance(node, NavigableString):
+                in_pre = isinstance(node.parent, Tag) and node.parent.name.lower() == 'pre'
+                # Whitespace runs collapse to one space unless the string sits directly inside <pre>.
+                chunk = node.get_text() if in_pre else re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', node.get_text())
+                # Drop one leading space/newline if the output is empty or already ends with whitespace.
+                if re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text) and chunk.startswith((' ', '\n')):
+                    chunk = chunk[1:]
+                text += chunk
+                continue
+            if not isinstance(node, Tag):
+                continue
+            name = node.name.lower()
+            if name in SKIP_TAGS:
+                continue
+            if name == 'br':
+                text += '\n'
+                continue
+            if name == 'td':
+                _collect(node.children)
+                text += '\t'
+                continue
+            is_block = name in BLOCK_TAGS
+            # Block elements are surrounded by newlines, except at the very beginning of the text.
+            if is_block and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
+                text += '\n'
+            _collect(node.children)
+            if is_block and not text.endswith('\n'):
+                text += '\n'
+
+    _collect(soup.children)
+    return text.strip()
diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
@@ -5,6 +5,7 @@
 
 from crawlee._utils.docs import docs_group
 from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
+from crawlee.parsel_crawler._utils import html_to_text
 
 
 @dataclass(frozen=True)
@@ -24,3 +25,7 @@ def selector(self) -> Selector:
     def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self:
         """Create a new context by copying every field of an existing `ParsedHttpCrawlingContext[Selector]`."""
         return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
+
+    def html_to_text(self) -> str:
+        """Convert `parsed_content` to newline-separated plain text without tags."""
+        return html_to_text(self.parsed_content)  # Calls the imported module-level function, not this method.
diff --git a/src/crawlee/parsel_crawler/_utils.py b/src/crawlee/parsel_crawler/_utils.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import re
+
+from parsel import Selector
+
+from crawlee._utils.html_to_text import (
+    _ANY_CONSECUTIVE_WHITE_SPACES,
+    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
+    _EMPTY_OR_ENDS_WITH_NEW_LINE,
+    BLOCK_TAGS,
+    SKIP_TAGS,
+)
+
+
+def html_to_text(source: str | Selector) -> str:
+    """Convert a markup string or a Parsel `Selector` to newline-separated plain text without tags."""
+    if isinstance(source, Selector):
+        selector = source
+    elif isinstance(source, str):
+        selector = Selector(text=source)
+    else:
+        raise TypeError('Source must be either a string or a `Selector` object.')
+
+    text = ''
+
+    def _collect(nodes: list[Selector], *, compress: bool = True) -> None:
+        """Append the text of `nodes` to `text`, modelled on the JavaScript version of Crawlee."""
+        nonlocal text
+        for node in nodes:
+            root = node.root
+            tag = getattr(root, 'tag', None)
+            if tag is None:
+                # A node without a tag is handled as text; whitespace runs collapse unless `compress` is off.
+                chunk = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', root) if compress else root
+                # Drop one leading space/newline if the output is empty or already ends with whitespace.
+                if re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text) and chunk.startswith((' ', '\n')):
+                    chunk = chunk[1:]
+                text += chunk
+                continue
+            # Skipped elements and nodes whose tag is not a string contribute nothing.
+            if tag in SKIP_TAGS or not isinstance(tag, str):
+                continue
+            if tag == 'br':
+                text += '\n'
+                continue
+            if tag == 'td':
+                _collect(node.xpath('./node()'))
+                text += '\t'
+                continue
+            is_block = tag in BLOCK_TAGS
+            # Block elements are surrounded by newlines, except at the very beginning of the text.
+            if is_block and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
+                text += '\n'
+            # Whitespace is kept verbatim only for the direct children of <pre>.
+            _collect(node.xpath('./node()'), compress=tag != 'pre')
+            if is_block and not text.endswith('\n'):
+                text += '\n'
+
+    # Walk the tree starting from the root element(s).
+    _collect(selector.xpath('/*'))
+
+    return text.strip()
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
@@ -0,0 +1,198 @@
+from __future__ import annotations
+
+from typing import Callable
+
+import pytest
+from bs4 import BeautifulSoup
+from parsel import Selector
+
+from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup
+from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel
+
+_EXPECTED_TEXT = (  # Text both implementations are expected to produce for `_EXAMPLE_HTML`.
+    "Let's start with a simple text. \n"
+    "The ships hung in the sky, much the way that bricks don't. \n"
+    "These aren't the Droids you're looking for\n"
+    "I'm sorry, Dave. I'm afraid I can't do that.\n"
+    "I'm sorry, Dave. I'm afraid I can't do that.\n"
+    'A1\tA2\tA3\t\n'
+    'B1\tB2\tB3\tB 4\t\n'
+    'This is some text with inline elements and HTML entities (>bla<) \n'
+    'Test\n'
+    'a\n'
+    'few\n'
+    'line\n'
+    'breaks\n'
+    'Spaces in an inline text should be completely ignored. \n'
+    'But,\n'
+    '    a pre-formatted\n'
+    '                block  should  be  kept\n'
+    '                                       pre-formatted.\n'
+    'The Greatest Science Fiction Quotes Of All Time \n'
+    "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You "
+    'Nexus, huh? I design your eyes.'
+)
+
+_EXAMPLE_HTML = """
+<html>
+<head>
+    <title>Title SHOULD NOT be converted</title>
+
+    <!-- Comments SHOULD NOT be converted -->
+</head>
+<body with='some attributes'>
+Let's start with a        simple text.
+<p>
+    The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't.
+</p>
+<ul>
+    <li>These aren't the Droids you're looking for</li>
+    <li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
+    <li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
+</ul>
+
+<img src="something" alt="This should be ignored" />
+
+<!-- Comments SHOULD NOT be converted -->
+
+<table>
+    <tr class="something">
+        <td>A1</td>
+        <td attributes="are ignored">A2</td>
+        <td>A3</td>
+    </tr>
+    <tr class="something">
+        <td>B1</td>
+        <td attributes="are ignored" even="second attribute">B2</td>
+        <td>B3</td>
+        <td>B     4</td>
+    </tr>
+</table>
+
+<p>
+    This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML&nbsp;entities (&gt;bla&lt;)
+</p>
+
+<div>
+    Test<br>
+    a<br />
+    few<br>
+    line<br>
+    breaks<br>
+</div>
+
+
+
+
+    Spaces
+
+
+    in
+
+
+    an inline text                                should be
+
+
+    completely ignored.
+
+
+
+<pre>
+But,
+    a pre-formatted
+                block  should  be  kept
+                                       pre-formatted.
+</pre>
+
+<svg>
+    These special elements SHOULD NOT BE CONVERTED.
+</svg>
+
+<script>
+    // These special elements should be completely skipped.
+    skipThis();
+</script>
+
+<style>
+    /* These special elements should be completely skipped. */
+    .skip_this {}
+</style>
+
+<canvas>
+    This should be skipped too.
+</canvas>
+
+<a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a>
+<p>
+    Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design,
+    just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>.
+</p>
+</body>
+</html>
+"""
+
+
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
+@pytest.mark.parametrize(  # Stacked parametrization: every case below runs against both implementations.
+    ('source', 'expected_text'),
+    [
+        (_EXAMPLE_HTML, _EXPECTED_TEXT),
+        ('   Plain    text     node    ', 'Plain text node'),
+        ('   \nPlain    text     node  \n  ', 'Plain text node'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'),  # A trailing <br> is stripped.
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'),
+        ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'),
+        ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'),
+        ('<div>Div1</div><style>Skip styles</style>', 'Div1'),
+        ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),
+        ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),  # Upper-case tag names are skipped as well.
+        ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),
+        ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),
+        ('<b>A  B  C  D  E\n\nF  G</b>', 'A B C D E F G'),  # Whitespace runs collapse outside <pre>.
+        ('<pre>A  B  C  D  E\n\nF  G</pre>', 'A  B  C  D  E\n\nF  G'),  # Whitespace is kept verbatim inside <pre>.
+        (
+            '<h1>Heading 1</h1><div><div><div><div>Deep  Div</div></div></div></div><h2>Heading       2</h2>',
+            'Heading 1\nDeep Div' '\nHeading 2',
+        ),
+        ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),
+        ('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'),
+        (
+            (
+                """<table>
+    <tr>
+        <td>Cell    A1</td><td>Cell A2</td>
+        <td>    Cell A3    </td>
+    </tr>
+    <tr>
+        <td>Cell    B1</td><td>Cell B2</td>
+    </tr>
+</table>"""
+            ),
+            'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2',
+        ),
+        ('<span>&aacute; &eacute;</span>', 'á é'),  # HTML entities are decoded.
+    ],
+)
+def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None:
+    assert html_to_text(source) == expected_text
+
+
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
+def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
+    with pytest.raises(TypeError):
+        html_to_text(1)  # type: ignore[arg-type]  # An int is neither markup nor a parsed document.
+
+
+def test_html_to_text_parsel() -> None:  # A pre-built `Selector` must yield the same text as the raw markup.
+    assert html_to_text_parsel(Selector(_EXAMPLE_HTML)) == _EXPECTED_TEXT
+
+
+def test_html_to_text_beautifulsoup() -> None:  # A pre-built `BeautifulSoup` must yield the same text as the raw markup.
+    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML)) == _EXPECTED_TEXT