From c803951ebc7c6957a5a9a36ed07281c4f954fdf4 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Mon, 9 Dec 2024 10:23:32 +0100
Subject: [PATCH 01/13] Add simple html_to_text helper function. Add tests.

---
 src/crawlee/_utils/html_to_text.py     | 14 ++++++++++++++
 tests/unit/_utils/test_html_to_text.py | 19 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 src/crawlee/_utils/html_to_text.py
 create mode 100644 tests/unit/_utils/test_html_to_text.py
diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
new file mode 100644
index 0000000000..6dfc72094b
--- /dev/null
+++ b/src/crawlee/_utils/html_to_text.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from bs4 import BeautifulSoup
+
+
+def html_to_text(source: str | BeautifulSoup) -> str:
+    """Converts markup string or BeautifulSoup object to newline separated plain text without tags."""
+    if isinstance(source, str):
+        soup = BeautifulSoup(source)
+    elif isinstance(source, BeautifulSoup):
+        soup = source
+    else:
+        raise TypeError('Source must be either a string or a BeautifulSoup object.')
+    return soup.get_text('\n', strip=True)
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
new file mode 100644
index 0000000000..222156fd63
--- /dev/null
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import pytest
+from bs4 import BeautifulSoup
+
+from crawlee._utils.html_to_text import html_to_text
+
+_EXPECTED_LINES = ('line 1', 'line2', 'line3')
+_EXAMPLE_HTML = f'<a href="http://example.com/">{_EXPECTED_LINES[0]}<i>{_EXPECTED_LINES[1]}</i>\n</a><code>{_EXPECTED_LINES[2]}</code>'
+
+
+@pytest.mark.parametrize('source', [_EXAMPLE_HTML, BeautifulSoup(_EXAMPLE_HTML)], ids=('String', 'BeautifulSoup'))
+def test_html_to_text(source: str | BeautifulSoup) -> None:
+    assert html_to_text(source) == '\n'.join(_EXPECTED_LINES)
+
+
+def test_html_to_text_raises_on_wrong_input_type() -> None:
+    with pytest.raises(TypeError):
+        html_to_text(1)  # type: ignore[arg-type]  # Intentional wrong type test.

From c571caa4312c697252bef3b7f6a8c753d8677e82 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Mon, 9 Dec 2024 17:09:23 +0100
Subject: [PATCH 02/13] WIP

---
 src/crawlee/_utils/html_to_text.py     |   8 ++
 tests/unit/_utils/test_html_to_text.py | 125 ++++++++++++++++++++++++-
 2 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index 6dfc72094b..7397da1c12 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -1,7 +1,11 @@
 from __future__ import annotations
 
+import re
+
 from bs4 import BeautifulSoup
 
+SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript"}
+BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"}
 
 def html_to_text(source: str | BeautifulSoup) -> str:
     """Converts markup string or BeautifulSoup object to newline separated plain text without tags."""
@@ -11,4 +15,8 @@ def html_to_text(source: str | BeautifulSoup) -> str:
         soup = source
     else:
         raise TypeError('Source must be either a string or a BeautifulSoup object.')
+    for tag in soup.findAll():
+        print(tag)
+        if tag.c
+
     return soup.get_text('\n', strip=True)
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index 222156fd63..0ca7af3eb5 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -5,13 +5,132 @@
 
 from crawlee._utils.html_to_text import html_to_text
 
-_EXPECTED_LINES = ('line 1', 'line2', 'line3')
-_EXAMPLE_HTML = f'<a href="http://example.com/">{_EXPECTED_LINES[0]}<i>{_EXPECTED_LINES[1]}</i>\n</a><code>{_EXPECTED_LINES[2]}</code>'
+_EXPECTED_TEXT = (
+"Let's start with a simple text. \n" +
+"The ships hung in the sky, much the way that bricks don't. \n" +
+"These aren't the Droids you're looking for\n" +
+"I'm sorry, Dave. I'm afraid I can't do that.\n" +
+"I'm sorry, Dave. I'm afraid I can't do that.\n" +
+'A1\tA2\tA3\t\n' +
+'B1\tB2\tB3\tB 4\t\n' +
+'This is some text with inline elements and HTML entities (>bla<) \n' +
+'Test\n' +
+'a\n' +
+'few\n' +
+'line\n' +
+'breaks\n' +
+'Spaces in an inline text should be completely ignored. \n' +
+'But,\n' +
+'    a pre-formatted\n' +
+'                block  should  be  kept\n' +
+'                                       pre-formatted.\n' +
+'The Greatest Science Fiction Quotes Of All Time \n' +
+"Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes."
+)
 
+_EXAMPLE_HTML = (
+"""
+<html>
+<head>
+    <title>Title SHOULD NOT be converted</title>
+
+    <!-- Comments SHOULD NOT be converted -->
+</head>
+<body with='some attributes'>
+Let's start with a        simple text.
+<p>
+    The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't.
+</p>
+<ul>
+    <li>These aren't the Droids you're looking for</li>
+    <li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
+    <li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
+</ul>
+
+<img src="something" alt="This should be ignored" />
+
+<!-- Comments SHOULD NOT be converted -->
+
+<table>
+    <tr class="something">
+        <td>A1</td>
+        <td attributes="are ignored">A2</td>
+        <td>A3</td>
+    </tr>
+    <tr class="something">
+        <td>B1</td>
+        <td attributes="are ignored" even="second attribute">B2</td>
+        <td>B3</td>
+        <td>B     4</td>
+    </tr>
+</table>
+
+<p>
+    This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML&nbsp;entities (&gt;bla&lt;)
+</p>
+
+<div>
+    Test<br>
+    a<br />
+    few<br>
+    line<br>
+    breaks<br>
+</div>
+
+
+
+
+    Spaces
+
+
+    in
+
+
+    an inline text                                should be
+
+
+    completely ignored.
+
+
+
+<pre>
+But,
+    a pre-formatted
+                block  should  be  kept
+                                       pre-formatted.
+</pre>
+
+<svg>
+    These special elements SHOULD NOT BE CONVERTED.
+</svg>
+
+<script>
+    // These special elements should be completely skipped.
+    skipThis();
+</script>
+
+<style>
+    /* These special elements should be completely skipped. */
+    .skip_this {}
+</style>
+
+<canvas>
+    This should be skipped too.
+</canvas>
+
+<a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a>
+<p>
+    Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design,
+    just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>.
+</p>
+</body>
+</html>
+"""
+)
 
 @pytest.mark.parametrize('source', [_EXAMPLE_HTML, BeautifulSoup(_EXAMPLE_HTML)], ids=('String', 'BeautifulSoup'))
 def test_html_to_text(source: str | BeautifulSoup) -> None:
-    assert html_to_text(source) == '\n'.join(_EXPECTED_LINES)
+    assert html_to_text(source) == _EXPECTED_TEXT
 
 
 def test_html_to_text_raises_on_wrong_input_type() -> None:

From f26f8b6b7ef7ff0a1564e4f9f702c0a00c9c3197 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Tue, 10 Dec 2024 17:38:41 +0100
Subject: [PATCH 03/13] WIP follow JS implementation

---
 src/crawlee/_utils/html_to_text.py | 45 ++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index 7397da1c12..e6371d4791 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -2,9 +2,10 @@
 
 import re
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString, Tag, PageElement
 
-SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript"}
+
+SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript", "title"}
 BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"}
 
 def html_to_text(source: str | BeautifulSoup) -> str:
@@ -15,8 +16,40 @@ def html_to_text(source: str | BeautifulSoup) -> str:
         soup = source
     else:
         raise TypeError('Source must be either a string or a BeautifulSoup object.')
-    for tag in soup.findAll():
-        print(tag)
-        if tag.c
 
-    return soup.get_text('\n', strip=True)
+    text = ""
+
+    def _page_element_to_text(page_element: PageElement) -> str:
+        nonlocal text
+        if isinstance(page_element, NavigableString):
+            compr: str
+            if page_element.parent.name.lower() == 'pre':
+                compr = page_element.get_text()
+            else:
+                # Compares white spaces outside of pre block
+                compr = re.sub(r"\s+", " ", page_element.get_text())
+            if compr.startswith(" ") and re.match(r"^|\s", page_element.get_text()):
+                compr = compr[1:]
+            text += compr
+            if page_element.parent.name.lower() == 'br':
+                text += "\n"
+            if page_element.parent.name.lower() == 'td':
+                text += "\t"
+            if page_element.parent.name.lower() in BLOCK_TAGS:
+                text = f"\n{compr}"
+            return compr
+
+        if isinstance(page_element, Tag) and page_element.name.lower() in SKIP_TAGS:
+            return ""
+        x = list(page_element.stripped_strings)
+        text_parts = [_page_element_to_text(child) for child in page_element.children]
+
+        return "".join(text_parts)
+
+
+
+
+    return _page_element_to_text(soup)
+
+
+

From db0cc6509b016a257e22377a10ad682c0eb11fad Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Tue, 10 Dec 2024 20:28:30 +0100
Subject: [PATCH 04/13] Almost same as JS implementation. TODO: Fix last
 differences and add more tests according to JS implementation.

---
 src/crawlee/_utils/html_to_text.py | 60 +++++++++++++++---------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index e6371d4791..37500673c3 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -5,7 +5,7 @@
 from bs4 import BeautifulSoup, NavigableString, Tag, PageElement
 
 
-SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript", "title"}
+SKIP_TAGS = {"script", "style", "canvas", "svg", "noscript", "title"}
 BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"}
 
 def html_to_text(source: str | BeautifulSoup) -> str:
@@ -19,37 +19,39 @@ def html_to_text(source: str | BeautifulSoup) -> str:
 
     text = ""
 
-    def _page_element_to_text(page_element: PageElement) -> str:
+    def _page_element_to_text(page_elements: PageElement) -> str:
         nonlocal text
-        if isinstance(page_element, NavigableString):
-            compr: str
-            if page_element.parent.name.lower() == 'pre':
-                compr = page_element.get_text()
-            else:
-                # Compares white spaces outside of pre block
-                compr = re.sub(r"\s+", " ", page_element.get_text())
-            if compr.startswith(" ") and re.match(r"^|\s", page_element.get_text()):
-                compr = compr[1:]
-            text += compr
-            if page_element.parent.name.lower() == 'br':
+        for page_element in page_elements:
+            if isinstance(page_element, NavigableString):
+                compr: str
+                if page_element.parent.name.lower() == 'pre':
+                    compr = page_element.get_text()
+                else:
+                    # Compares white spaces outside of pre block
+                    compr = re.sub(r"\s+", " ", page_element.get_text())
+                # If text is empty or ends with a whitespace, don't add the leading whitespace
+                if compr.startswith(" ") and re.search(r"(^|\s)$", text):
+                    compr = compr[1:]
+                text += compr
+            elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int):
+                # Skip comments and special elements
+                pass
+            elif page_element.name.lower() == 'br':
                 text += "\n"
-            if page_element.parent.name.lower() == 'td':
+            elif page_element.name.lower() == 'td':
+                _page_element_to_text(page_element.children)
                 text += "\t"
-            if page_element.parent.name.lower() in BLOCK_TAGS:
-                text = f"\n{compr}"
-            return compr
-
-        if isinstance(page_element, Tag) and page_element.name.lower() in SKIP_TAGS:
-            return ""
-        x = list(page_element.stripped_strings)
-        text_parts = [_page_element_to_text(child) for child in page_element.children]
-
-        return "".join(text_parts)
-
-
-
-
-    return _page_element_to_text(soup)
+            else:
+                # Block elements must be surrounded by newlines(unless beginning of text)
+                is_block_tag = page_element.name.lower() in BLOCK_TAGS
+                if is_block_tag and not re.search(r"(^|\n)$", text):
+                    text += '\n'
+                _page_element_to_text(page_element.children)
+                if (is_block_tag and not text.endswith('\n')):
+                    text += '\n'
+
+    _page_element_to_text(soup)
+    return text.strip()
 
 
 

From 26d5aee1e958ea3badffbce9b328a33241001002 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Wed, 11 Dec 2024 10:32:24 +0100
Subject: [PATCH 05/13] Same behavior as JS implementation.

---
 src/crawlee/_utils/html_to_text.py     | 95 +++++++++++++++---------
 tests/unit/_utils/test_html_to_text.py | 99 +++++++++++++++++++-------
 2 files changed, 133 insertions(+), 61 deletions(-)

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index 37500673c3..a65786a768 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -1,12 +1,38 @@
 from __future__ import annotations
 
 import re
+from typing import TYPE_CHECKING
 
-from bs4 import BeautifulSoup, NavigableString, Tag, PageElement
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}
+BLOCK_TAGS = {
+    'p',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'ol',
+    'ul',
+    'li',
+    'pre',
+    'address',
+    'blockquote',
+    'dl',
+    'div',
+    'fieldset',
+    'form',
+    'table',
+    'tr',
+    'select',
+    'option',
+}
 
-SKIP_TAGS = {"script", "style", "canvas", "svg", "noscript", "title"}
-BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"}
 
 def html_to_text(source: str | BeautifulSoup) -> str:
     """Converts markup string or BeautifulSoup object to newline separated plain text without tags."""
@@ -17,41 +43,40 @@ def html_to_text(source: str | BeautifulSoup) -> str:
     else:
         raise TypeError('Source must be either a string or a BeautifulSoup object.')
 
-    text = ""
+    text = ''
 
-    def _page_element_to_text(page_elements: PageElement) -> str:
+    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
+        """Custom html parsing that performs as implementation from Javascript version of Crawlee."""
         nonlocal text
         for page_element in page_elements:
-            if isinstance(page_element, NavigableString):
-                compr: str
-                if page_element.parent.name.lower() == 'pre':
-                    compr = page_element.get_text()
-                else:
-                    # Compares white spaces outside of pre block
-                    compr = re.sub(r"\s+", " ", page_element.get_text())
-                # If text is empty or ends with a whitespace, don't add the leading whitespace
-                if compr.startswith(" ") and re.search(r"(^|\s)$", text):
-                    compr = compr[1:]
-                text += compr
-            elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int):
-                # Skip comments and special elements
-                pass
-            elif page_element.name.lower() == 'br':
-                text += "\n"
-            elif page_element.name.lower() == 'td':
-                _page_element_to_text(page_element.children)
-                text += "\t"
-            else:
-                # Block elements must be surrounded by newlines(unless beginning of text)
-                is_block_tag = page_element.name.lower() in BLOCK_TAGS
-                if is_block_tag and not re.search(r"(^|\n)$", text):
-                    text += '\n'
-                _page_element_to_text(page_element.children)
-                if (is_block_tag and not text.endswith('\n')):
+            if isinstance(page_element, (Tag, NavigableString)):
+                if isinstance(page_element, NavigableString):
+                    compr: str
+                    if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre':
+                        compr = page_element.get_text()
+                    else:
+                        # Compress white spaces outside of pre block
+                        compr = re.sub(r'\s+', ' ', page_element.get_text())
+                    # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
+                    if (compr.startswith((' ', '\n'))) and re.search(r'(^|\s)$', text):
+                        compr = compr[1:]
+                    text += compr
+                elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int):
+                    # Skip comments and special elements
+                    pass
+                elif page_element.name.lower() == 'br':
                     text += '\n'
+                elif page_element.name.lower() == 'td':
+                    _page_element_to_text(page_element.children)
+                    text += '\t'
+                else:
+                    # Block elements must be surrounded by newlines(unless beginning of text)
+                    is_block_tag = page_element.name.lower() in BLOCK_TAGS
+                    if is_block_tag and not re.search(r'(^|\n)$', text):
+                        text += '\n'
+                    _page_element_to_text(page_element.children)
+                    if is_block_tag and not text.endswith('\n'):
+                        text += '\n'
 
-    _page_element_to_text(soup)
+    _page_element_to_text(soup.children)
     return text.strip()
-
-
-
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index 0ca7af3eb5..4e462c05cc 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -6,30 +6,30 @@
 from crawlee._utils.html_to_text import html_to_text
 
 _EXPECTED_TEXT = (
-"Let's start with a simple text. \n" +
-"The ships hung in the sky, much the way that bricks don't. \n" +
-"These aren't the Droids you're looking for\n" +
-"I'm sorry, Dave. I'm afraid I can't do that.\n" +
-"I'm sorry, Dave. I'm afraid I can't do that.\n" +
-'A1\tA2\tA3\t\n' +
-'B1\tB2\tB3\tB 4\t\n' +
-'This is some text with inline elements and HTML entities (>bla<) \n' +
-'Test\n' +
-'a\n' +
-'few\n' +
-'line\n' +
-'breaks\n' +
-'Spaces in an inline text should be completely ignored. \n' +
-'But,\n' +
-'    a pre-formatted\n' +
-'                block  should  be  kept\n' +
-'                                       pre-formatted.\n' +
-'The Greatest Science Fiction Quotes Of All Time \n' +
-"Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes."
+    "Let's start with a simple text. \n"
+    "The ships hung in the sky, much the way that bricks don't. \n"
+    "These aren't the Droids you're looking for\n"
+    "I'm sorry, Dave. I'm afraid I can't do that.\n"
+    "I'm sorry, Dave. I'm afraid I can't do that.\n"
+    'A1\tA2\tA3\t\n'
+    'B1\tB2\tB3\tB 4\t\n'
+    'This is some text with inline elements and HTML entities (>bla<) \n'
+    'Test\n'
+    'a\n'
+    'few\n'
+    'line\n'
+    'breaks\n'
+    'Spaces in an inline text should be completely ignored. \n'
+    'But,\n'
+    '    a pre-formatted\n'
+    '                block  should  be  kept\n'
+    '                                       pre-formatted.\n'
+    'The Greatest Science Fiction Quotes Of All Time \n'
+    "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You "
+    'Nexus, huh? I design your eyes.'
 )
 
-_EXAMPLE_HTML = (
-"""
+_EXAMPLE_HTML = """
 <html>
 <head>
     <title>Title SHOULD NOT be converted</title>
@@ -126,11 +126,58 @@
 </body>
 </html>
 """
-)
 
-@pytest.mark.parametrize('source', [_EXAMPLE_HTML, BeautifulSoup(_EXAMPLE_HTML)], ids=('String', 'BeautifulSoup'))
-def test_html_to_text(source: str | BeautifulSoup) -> None:
-    assert html_to_text(source) == _EXPECTED_TEXT
+
+@pytest.mark.parametrize(
+    ('source', 'expected_text'),
+    [
+        (_EXAMPLE_HTML, _EXPECTED_TEXT),
+        (BeautifulSoup(_EXAMPLE_HTML), _EXPECTED_TEXT),
+        ('   Plain    text     node    ', 'Plain text node'),
+        ('   \nPlain    text     node  \n  ', 'Plain text node'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'),
+        ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'),
+        ('<h1>Header 1</h1>  \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'),
+        ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'),
+        ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'),
+        ('<div>Div1</div><style>Skip styles</style>', 'Div1'),
+        ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),
+        ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),
+        ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),
+        ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),
+        ('<b>A  B  C  D  E\n\nF  G</b>', 'A B C D E F G'),
+        ('<pre>A  B  C  D  E\n\nF  G</pre>', 'A  B  C  D  E\n\nF  G'),
+        (
+            '<h1>Heading 1</h1><div><div><div><div>Deep  Div</div></div></div></div><h2>Heading       2</h2>',
+            'Heading 1\nDeep Div' '\nHeading 2',
+        ),
+        ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),
+        ('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'),
+        (
+            (
+                """<table>
+    <tr>
+        <td>Cell    A1</td><td>Cell A2</td>
+        <td>    Cell A3    </td>
+    </tr>
+    <tr>
+        <td>Cell    B1</td><td>Cell B2</td>
+    </tr>
+</table>"""
+            ),
+            'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2',
+        ),
+        ('<span>&aacute; &eacute;</span>', 'á é'),
+    ],
+)
+def test_html_to_text(source: str | BeautifulSoup, expected_text: str) -> None:
+    assert html_to_text(source) == expected_text
 
 
 def test_html_to_text_raises_on_wrong_input_type() -> None:

From 27cfedd7a3a526a7c034a64a46b91333c9c6347a Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Wed, 11 Dec 2024 11:07:47 +0100
Subject: [PATCH 06/13] Reformat import in test_basic_crawler.py

---
 tests/unit/basic_crawler/test_basic_crawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py
index 9e3768064e..5c9399c699 100644
--- a/tests/unit/basic_crawler/test_basic_crawler.py
+++ b/tests/unit/basic_crawler/test_basic_crawler.py
@@ -4,10 +4,10 @@
 import asyncio
 import json
 import logging
+import os
 from collections import Counter
 from dataclasses import dataclass
 from datetime import timedelta
-import os
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from unittest.mock import AsyncMock, Mock

From bab689c42882a9182d21a26c9e78bfe54678712a Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Fri, 13 Dec 2024 09:05:09 +0100
Subject: [PATCH 07/13] Pre-compile used re patterns. Expose this function in
 BS crawler.

---
 src/crawlee/_utils/html_to_text.py            | 10 +++++++---
 src/crawlee/beautifulsoup_crawler/__init__.py |  4 +++-
 tests/unit/_utils/test_html_to_text.py        |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index a65786a768..1f35d5e4f3 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -33,6 +33,10 @@
     'option',
 }
 
+_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
+_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
+_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')
+
 
 def html_to_text(source: str | BeautifulSoup) -> str:
     """Converts markup string or BeautifulSoup object to newline separated plain text without tags."""
@@ -56,9 +60,9 @@ def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
                         compr = page_element.get_text()
                     else:
                         # Compress white spaces outside of pre block
-                        compr = re.sub(r'\s+', ' ', page_element.get_text())
+                        compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text())
                     # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
-                    if (compr.startswith((' ', '\n'))) and re.search(r'(^|\s)$', text):
+                    if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
                         compr = compr[1:]
                     text += compr
                 elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int):
@@ -72,7 +76,7 @@ def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
                 else:
                     # Block elements must be surrounded by newlines(unless beginning of text)
                     is_block_tag = page_element.name.lower() in BLOCK_TAGS
-                    if is_block_tag and not re.search(r'(^|\n)$', text):
+                    if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
                         text += '\n'
                     _page_element_to_text(page_element.children)
                     if is_block_tag and not text.endswith('\n'):
diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py
index 59b0264cc1..f9e991f62d 100644
--- a/src/crawlee/beautifulsoup_crawler/__init__.py
+++ b/src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,4 +1,6 @@
 try:
+    from crawlee._utils.html_to_text import html_to_text
+
     from ._beautifulsoup_crawler import BeautifulSoupCrawler
     from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
     from ._beautifulsoup_parser import BeautifulSoupParserType
@@ -8,4 +10,4 @@
         "For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
     ) from exc
 
-__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType']
+__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', 'html_to_text']
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index 4e462c05cc..1273cb4775 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -3,7 +3,7 @@
 import pytest
 from bs4 import BeautifulSoup
 
-from crawlee._utils.html_to_text import html_to_text
+from crawlee.beautifulsoup_crawler import html_to_text
 
 _EXPECTED_TEXT = (
     "Let's start with a simple text. \n"

From 6f32156f36ab1611f6edde4dcfa7ede5f46b8446 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Tue, 17 Dec 2024 09:33:57 +0100
Subject: [PATCH 08/13] Add Parsel version of html_to_text. Add both
 implementations to respective contexts. Set same tests for both.

---
 src/crawlee/_utils/html_to_text.py            | 55 +---------------
 src/crawlee/beautifulsoup_crawler/__init__.py |  4 +-
 .../_beautifulsoup_crawling_context.py        |  5 ++
 src/crawlee/beautifulsoup_crawler/_utils.py   | 66 +++++++++++++++++++
 .../_parsel_crawling_context.py               |  5 ++
 src/crawlee/parsel_crawler/_utils.py          | 63 ++++++++++++++++++
 tests/unit/_utils/test_html_to_text.py        | 21 ++++--
 7 files changed, 158 insertions(+), 61 deletions(-)
 create mode 100644 src/crawlee/beautifulsoup_crawler/_utils.py
 create mode 100644 src/crawlee/parsel_crawler/_utils.py

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index 1f35d5e4f3..3cc813c3a6 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -1,12 +1,7 @@
+# This file contains shared constants used by different implementations of html_to_text function.
 from __future__ import annotations
 
 import re
-from typing import TYPE_CHECKING
-
-from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable
 
 SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}
 BLOCK_TAGS = {
@@ -36,51 +31,3 @@
 _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
 _EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
 _ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')
-
-
-def html_to_text(source: str | BeautifulSoup) -> str:
-    """Converts markup string or BeautifulSoup object to newline separated plain text without tags."""
-    if isinstance(source, str):
-        soup = BeautifulSoup(source)
-    elif isinstance(source, BeautifulSoup):
-        soup = source
-    else:
-        raise TypeError('Source must be either a string or a BeautifulSoup object.')
-
-    text = ''
-
-    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
-        """Custom html parsing that performs as implementation from Javascript version of Crawlee."""
-        nonlocal text
-        for page_element in page_elements:
-            if isinstance(page_element, (Tag, NavigableString)):
-                if isinstance(page_element, NavigableString):
-                    compr: str
-                    if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre':
-                        compr = page_element.get_text()
-                    else:
-                        # Compress white spaces outside of pre block
-                        compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text())
-                    # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
-                    if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
-                        compr = compr[1:]
-                    text += compr
-                elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int):
-                    # Skip comments and special elements
-                    pass
-                elif page_element.name.lower() == 'br':
-                    text += '\n'
-                elif page_element.name.lower() == 'td':
-                    _page_element_to_text(page_element.children)
-                    text += '\t'
-                else:
-                    # Block elements must be surrounded by newlines(unless beginning of text)
-                    is_block_tag = page_element.name.lower() in BLOCK_TAGS
-                    if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
-                        text += '\n'
-                    _page_element_to_text(page_element.children)
-                    if is_block_tag and not text.endswith('\n'):
-                        text += '\n'
-
-    _page_element_to_text(soup.children)
-    return text.strip()
diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py
index f9e991f62d..59b0264cc1 100644
--- a/src/crawlee/beautifulsoup_crawler/__init__.py
+++ b/src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1,6 +1,4 @@
 try:
-    from crawlee._utils.html_to_text import html_to_text
-
     from ._beautifulsoup_crawler import BeautifulSoupCrawler
     from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
     from ._beautifulsoup_parser import BeautifulSoupParserType
@@ -10,4 +8,4 @@
         "For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
     ) from exc
 
-__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', 'html_to_text']
+__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType']
diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
index f01d66a1c0..520b678199 100644
--- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
+++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -5,6 +5,7 @@
 
 from crawlee._utils.docs import docs_group
 from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
+from crawlee.beautifulsoup_crawler._utils import html_to_text
 
 
 @dataclass(frozen=True)
@@ -24,3 +25,7 @@ def soup(self) -> BeautifulSoup:
     def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:
         """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`."""
         return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
+
+    def html_to_text(self) -> str:
+        """Converts parsed_content to newline-separated plain text without tags."""
+        return html_to_text(self.parsed_content)
diff --git a/src/crawlee/beautifulsoup_crawler/_utils.py b/src/crawlee/beautifulsoup_crawler/_utils.py
new file mode 100644
index 0000000000..52f15684f1
--- /dev/null
+++ b/src/crawlee/beautifulsoup_crawler/_utils.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+
+from crawlee._utils.html_to_text import (
+    _ANY_CONSECUTIVE_WHITE_SPACES,
+    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
+    _EMPTY_OR_ENDS_WITH_NEW_LINE,
+    BLOCK_TAGS,
+    SKIP_TAGS,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+
+def html_to_text(source: str | BeautifulSoup) -> str:
+    """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup."""
+    if isinstance(source, str):
+        soup = BeautifulSoup(source)
+    elif isinstance(source, BeautifulSoup):
+        soup = source
+    else:
+        raise TypeError('Source must be either a string or a `BeautifulSoup` object.')
+
+    text = ''
+
+    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
+        """Custom HTML parsing that mirrors the implementation from the JavaScript version of Crawlee."""
+        nonlocal text
+        for page_element in page_elements:
+            if isinstance(page_element, (Tag, NavigableString)):
+                if isinstance(page_element, NavigableString):
+                    compr: str
+                    if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre':
+                        compr = page_element.get_text()
+                    else:
+                        # Compress white spaces outside of pre block
+                        compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text())
+                    # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
+                    if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
+                        compr = compr[1:]
+                    text += compr
+                elif page_element.name.lower() in SKIP_TAGS:
+                    # Skip comments and special elements
+                    pass
+                elif page_element.name.lower() == 'br':
+                    text += '\n'
+                elif page_element.name.lower() == 'td':
+                    _page_element_to_text(page_element.children)
+                    text += '\t'
+                else:
+                    # Block elements must be surrounded by newlines (unless at the beginning of the text)
+                    is_block_tag = page_element.name.lower() in BLOCK_TAGS
+                    if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
+                        text += '\n'
+                    _page_element_to_text(page_element.children)
+                    if is_block_tag and not text.endswith('\n'):
+                        text += '\n'
+
+    _page_element_to_text(soup.children)
+
+    return text.strip()
diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
index 5dd13e3868..96fb9bf9cc 100644
--- a/src/crawlee/parsel_crawler/_parsel_crawling_context.py
+++ b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
@@ -5,6 +5,7 @@
 
 from crawlee._utils.docs import docs_group
 from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
+from crawlee.parsel_crawler._utils import html_to_text
 
 
 @dataclass(frozen=True)
@@ -24,3 +25,7 @@ def selector(self) -> Selector:
     def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self:
         """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`."""
         return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
+
+    def html_to_text(self) -> str:
+        """Converts parsed_content to newline-separated plain text without tags."""
+        return html_to_text(self.parsed_content)
diff --git a/src/crawlee/parsel_crawler/_utils.py b/src/crawlee/parsel_crawler/_utils.py
new file mode 100644
index 0000000000..1300278d04
--- /dev/null
+++ b/src/crawlee/parsel_crawler/_utils.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import re
+
+from parsel import Selector
+
+from crawlee._utils.html_to_text import (
+    _ANY_CONSECUTIVE_WHITE_SPACES,
+    _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
+    _EMPTY_OR_ENDS_WITH_NEW_LINE,
+    BLOCK_TAGS,
+    SKIP_TAGS,
+)
+
+
+def html_to_text(source: str | Selector) -> str:
+    """Converts markup string or `Selector` to newline-separated plain text without tags using Parsel."""
+    if isinstance(source, str):
+        selector = Selector(text=source)
+    elif isinstance(source, Selector):
+        selector = source
+    else:
+        raise TypeError('Source must be either a string or a `Selector` object.')
+
+    text = ''
+
+    def _extract_text(elements: list[Selector], *, compress: bool = True) -> None:
+        """Custom HTML parsing that mirrors the implementation from the JavaScript version of Crawlee."""
+        nonlocal text
+        for element in elements:
+            tag = element.root.tag if hasattr(element.root, 'tag') else None
+
+            if tag is None:
+                # Compress white spaces outside of pre block
+                compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', element.root) if compress else element.root
+                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
+                if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
+                    compr = compr[1:]
+                text += compr
+
+            if tag in SKIP_TAGS or not isinstance(tag, str):
+                continue
+
+            if tag == 'br':
+                text += '\n'
+            elif tag == 'td':
+                _extract_text(element.xpath('./node()'))
+                text += '\t'
+            else:
+                is_block_tag = tag in BLOCK_TAGS if tag else False
+
+                if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
+                    text += '\n'
+
+                _extract_text(element.xpath('./node()'), compress=tag != 'pre')
+
+                if is_block_tag and not text.endswith('\n'):
+                    text += '\n'
+
+    # Start processing the root elements
+    _extract_text(selector.xpath('/*'))
+
+    return text.strip()
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index 1273cb4775..394470b75c 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -1,9 +1,13 @@
 from __future__ import annotations
 
+from typing import Callable
+
 import pytest
 from bs4 import BeautifulSoup
+from parsel import Selector
 
-from crawlee.beautifulsoup_crawler import html_to_text
+from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup
+from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel
 
 _EXPECTED_TEXT = (
     "Let's start with a simple text. \n"
@@ -128,11 +132,11 @@
 """
 
 
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
 @pytest.mark.parametrize(
     ('source', 'expected_text'),
     [
         (_EXAMPLE_HTML, _EXPECTED_TEXT),
-        (BeautifulSoup(_EXAMPLE_HTML), _EXPECTED_TEXT),
         ('   Plain    text     node    ', 'Plain text node'),
         ('   \nPlain    text     node  \n  ', 'Plain text node'),
         ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
@@ -176,10 +180,19 @@
         ('<span>&aacute; &eacute;</span>', 'á é'),
     ],
 )
-def test_html_to_text(source: str | BeautifulSoup, expected_text: str) -> None:
+def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None:
     assert html_to_text(source) == expected_text
 
 
-def test_html_to_text_raises_on_wrong_input_type() -> None:
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
+def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
     with pytest.raises(TypeError):
         html_to_text(1)  # type: ignore[arg-type]  # Intentional wrong type test.
+
+
+def test_html_to_text_parsel() -> None:
+    assert html_to_text_parsel(Selector(_EXAMPLE_HTML)) == _EXPECTED_TEXT
+
+
+def test_html_to_text_beautifulsoup() -> None:
+    assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML)) == _EXPECTED_TEXT

From 964c2cfbd1afab4a6ea89735b1ddcff6eeab6216 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Tue, 17 Dec 2024 10:03:32 +0100
Subject: [PATCH 09/13] Add public function as well.

---
 src/crawlee/_utils/html_to_text.py     |  3 +++
 src/crawlee/utils.py                   | 17 +++++++++++++++++
 tests/unit/_utils/test_html_to_text.py |  5 +++--
 3 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 src/crawlee/utils.py

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index 3cc813c3a6..bf7c3d3291 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -3,6 +3,9 @@
 
 import re
 
+# Tags based on Javascript implementation of text_to_html from:
+# https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11
+# Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9
 SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}
 BLOCK_TAGS = {
     'p',
diff --git a/src/crawlee/utils.py b/src/crawlee/utils.py
new file mode 100644
index 0000000000..7f487cbb05
--- /dev/null
+++ b/src/crawlee/utils.py
@@ -0,0 +1,17 @@
+from typing import Callable
+
+
+def html_to_text(source: str) -> str:
+    """Converts markup string to newline separated plain text without tags."""
+    _html_to_text: Callable[[str], str]
+    try:
+        from crawlee.beautifulsoup_crawler._utils import html_to_text as _html_to_text
+    except ImportError:
+        try:
+            from crawlee.parsel_crawler._utils import html_to_text as _html_to_text
+        except ImportError as e:
+            raise ImportError(
+                'html_to_text requires either Parsel or BeautifulSoup package to be installed. Please '
+                'install one of following: crawlee[beautifulsoup], crawlee[parsel] or crawlee[all].'
+            ) from e
+    return _html_to_text(source)
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index 394470b75c..c36d3b5811 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -8,6 +8,7 @@
 
 from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup
 from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel
+from crawlee.utils import html_to_text as html_to_text_public
 
 _EXPECTED_TEXT = (
     "Let's start with a simple text. \n"
@@ -132,7 +133,7 @@
 """
 
 
-@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public])
 @pytest.mark.parametrize(
     ('source', 'expected_text'),
     [
@@ -184,7 +185,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s
     assert html_to_text(source) == expected_text
 
 
-@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public])
 def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
     with pytest.raises(TypeError):
         html_to_text(1)  # type: ignore[arg-type]  # Intentional wrong type test.

From 0cb68af859edf40b9926a1b5b0133c3806d00159 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= <josef.prochazka@apify.com>
Date: Tue, 17 Dec 2024 10:26:30 +0100
Subject: [PATCH 10/13] Apply suggestions from code review

Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
---
 .../beautifulsoup_crawler/_beautifulsoup_crawling_context.py    | 2 +-
 src/crawlee/parsel_crawler/_parsel_crawling_context.py          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
index 520b678199..1a3751b97d 100644
--- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
+++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py
@@ -27,5 +27,5 @@ def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Be
         return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
 
     def html_to_text(self) -> str:
-        """Converts parsed_content to newline-separated plain text without tags."""
+        """Convert the parsed HTML content to newline-separated plain text without tags."""
         return html_to_text(self.parsed_content)
diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
index 96fb9bf9cc..2b9b33df58 100644
--- a/src/crawlee/parsel_crawler/_parsel_crawling_context.py
+++ b/src/crawlee/parsel_crawler/_parsel_crawling_context.py
@@ -27,5 +27,5 @@ def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Se
         return cls(**{field.name: getattr(context, field.name) for field in fields(context)})
 
     def html_to_text(self) -> str:
-        """Converts parsed_content to newline-separated plain text without tags."""
+        """Convert the parsed HTML content to newline-separated plain text without tags."""
         return html_to_text(self.parsed_content)

From 210749e6d6ef495cf1a9d08e536677f4e3130efc Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Tue, 17 Dec 2024 10:56:44 +0100
Subject: [PATCH 11/13] Add docs decorator

---
 src/crawlee/utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/crawlee/utils.py b/src/crawlee/utils.py
index 7f487cbb05..e52c261de2 100644
--- a/src/crawlee/utils.py
+++ b/src/crawlee/utils.py
@@ -1,8 +1,17 @@
 from typing import Callable
 
+from crawlee._utils.docs import docs_group
 
+
+@docs_group('Functions')
 def html_to_text(source: str) -> str:
-    """Converts markup string to newline separated plain text without tags."""
+    """Converts markup string to newline separated plain text without tags.
+
+    Args:
+        source: Input markup string
+    Returns:
+        Newline separated plain text without tags.
+    """
     _html_to_text: Callable[[str], str]
     try:
         from crawlee.beautifulsoup_crawler._utils import html_to_text as _html_to_text

From 50968bd6d8c5ccc2c9f5f7054e48750ba50a6cfa Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Wed, 18 Dec 2024 09:21:17 +0100
Subject: [PATCH 12/13] Do not expose in crawlee.utils - review comments

---
 src/crawlee/beautifulsoup_crawler/_utils.py |  9 ++++++-
 src/crawlee/parsel_crawler/_utils.py        |  9 ++++++-
 src/crawlee/utils.py                        | 26 ---------------------
 tests/unit/_utils/test_html_to_text.py      |  5 ++--
 4 files changed, 18 insertions(+), 31 deletions(-)
 delete mode 100644 src/crawlee/utils.py

diff --git a/src/crawlee/beautifulsoup_crawler/_utils.py b/src/crawlee/beautifulsoup_crawler/_utils.py
index 52f15684f1..f92990ba9b 100644
--- a/src/crawlee/beautifulsoup_crawler/_utils.py
+++ b/src/crawlee/beautifulsoup_crawler/_utils.py
@@ -18,7 +18,14 @@
 
 
 def html_to_text(source: str | BeautifulSoup) -> str:
-    """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup."""
+    """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.
+
+    Args:
+        source: Input markup string or `BeautifulSoup` object.
+
+    Returns:
+        Newline separated plain text without tags.
+    """
     if isinstance(source, str):
         soup = BeautifulSoup(source)
     elif isinstance(source, BeautifulSoup):
diff --git a/src/crawlee/parsel_crawler/_utils.py b/src/crawlee/parsel_crawler/_utils.py
index 1300278d04..cf0bf59ee8 100644
--- a/src/crawlee/parsel_crawler/_utils.py
+++ b/src/crawlee/parsel_crawler/_utils.py
@@ -14,7 +14,14 @@
 
 
 def html_to_text(source: str | Selector) -> str:
-    """Converts markup string or `Selector` to newline-separated plain text without tags using Parsel."""
+    """Converts markup string or `Selector` to newline-separated plain text without tags using Parsel.
+
+    Args:
+        source: Input markup string or `Selector` object.
+
+    Returns:
+        Newline separated plain text without tags.
+    """
     if isinstance(source, str):
         selector = Selector(text=source)
     elif isinstance(source, Selector):
diff --git a/src/crawlee/utils.py b/src/crawlee/utils.py
deleted file mode 100644
index e52c261de2..0000000000
--- a/src/crawlee/utils.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from typing import Callable
-
-from crawlee._utils.docs import docs_group
-
-
-@docs_group('Functions')
-def html_to_text(source: str) -> str:
-    """Converts markup string to newline separated plain text without tags.
-
-    Args:
-        source: Input markup string
-    Returns:
-        Newline separated plain text without tags.
-    """
-    _html_to_text: Callable[[str], str]
-    try:
-        from crawlee.beautifulsoup_crawler._utils import html_to_text as _html_to_text
-    except ImportError:
-        try:
-            from crawlee.parsel_crawler._utils import html_to_text as _html_to_text
-        except ImportError as e:
-            raise ImportError(
-                'html_to_text requires either Parsel or BeautifulSoup package to be installed. Please '
-                'install one of following: crawlee[beautifulsoup], crawlee[parsel] or crawlee[all].'
-            ) from e
-    return _html_to_text(source)
diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py
index c36d3b5811..394470b75c 100644
--- a/tests/unit/_utils/test_html_to_text.py
+++ b/tests/unit/_utils/test_html_to_text.py
@@ -8,7 +8,6 @@
 
 from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup
 from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel
-from crawlee.utils import html_to_text as html_to_text_public
 
 _EXPECTED_TEXT = (
     "Let's start with a simple text. \n"
@@ -133,7 +132,7 @@
 """
 
 
-@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public])
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
 @pytest.mark.parametrize(
     ('source', 'expected_text'),
     [
@@ -185,7 +184,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s
     assert html_to_text(source) == expected_text
 
 
-@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public])
+@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
 def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
     with pytest.raises(TypeError):
         html_to_text(1)  # type: ignore[arg-type]  # Intentional wrong type test.

From 9f15a9148e646a003716fa392b6d1b668b81490a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= <josef.prochazka@apify.com>
Date: Wed, 18 Dec 2024 16:00:29 +0100
Subject: [PATCH 13/13] Update src/crawlee/_utils/html_to_text.py

Co-authored-by: Jan Buchar <jan.buchar@apify.com>
---
 src/crawlee/_utils/html_to_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py
index bf7c3d3291..804b55f464 100644
--- a/src/crawlee/_utils/html_to_text.py
+++ b/src/crawlee/_utils/html_to_text.py
@@ -3,7 +3,7 @@
 
 import re
 
-# Tags based on Javascript implementation of text_to_html from:
+# Tags based on Javascript implementation of htmlToText from:
 # https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11
 # Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9
 SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}