From c803951ebc7c6957a5a9a36ed07281c4f954fdf4 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 9 Dec 2024 10:23:32 +0100 Subject: [PATCH 01/13] Add simple html_to_text helper function. Add tests. --- src/crawlee/_utils/html_to_text.py | 14 ++++++++++++++ tests/unit/_utils/test_html_to_text.py | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 src/crawlee/_utils/html_to_text.py create mode 100644 tests/unit/_utils/test_html_to_text.py diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py new file mode 100644 index 0000000000..6dfc72094b --- /dev/null +++ b/src/crawlee/_utils/html_to_text.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from bs4 import BeautifulSoup + + +def html_to_text(source: str | BeautifulSoup) -> str: + """Converts markup string or BeautifulSoup object to newline separated plain text without tags.""" + if isinstance(source, str): + soup = BeautifulSoup(source) + elif isinstance(source, BeautifulSoup): + soup = source + else: + raise TypeError('Source must be either a string or a BeautifulSoup object.') + return soup.get_text('\n', strip=True) diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py new file mode 100644 index 0000000000..222156fd63 --- /dev/null +++ b/tests/unit/_utils/test_html_to_text.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +import pytest +from bs4 import BeautifulSoup + +from crawlee._utils.html_to_text import html_to_text + +_EXPECTED_LINES = ('line 1', 'line2', 'line3') +_EXAMPLE_HTML = f'{_EXPECTED_LINES[0]}{_EXPECTED_LINES[1]}\n{_EXPECTED_LINES[2]}' + + +@pytest.mark.parametrize('source', [_EXAMPLE_HTML, BeautifulSoup(_EXAMPLE_HTML)], ids=('String', 'BeautifulSoup')) +def test_html_to_text(source: str | BeautifulSoup) -> None: + assert html_to_text(source) == '\n'.join(_EXPECTED_LINES) + + +def test_html_to_text_raises_on_wrong_input_type() -> None: + with pytest.raises(TypeError): + html_to_text(1) # type: ignore[arg-type] # Intentional wrong type test. From c571caa4312c697252bef3b7f6a8c753d8677e82 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 9 Dec 2024 17:09:23 +0100 Subject: [PATCH 02/13] WIP --- src/crawlee/_utils/html_to_text.py | 8 ++ tests/unit/_utils/test_html_to_text.py | 125 ++++++++++++++++++++++++- 2 files changed, 130 insertions(+), 3 deletions(-) diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index 6dfc72094b..7397da1c12 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -1,7 +1,11 @@ from __future__ import annotations +import re + from bs4 import BeautifulSoup +SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript"} +BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"} def html_to_text(source: str | BeautifulSoup) -> str: """Converts markup string or BeautifulSoup object to newline separated plain text without tags.""" @@ -11,4 +15,8 @@ def html_to_text(source: str | BeautifulSoup) -> str: soup = source else: raise TypeError('Source must be either a string or a BeautifulSoup object.') + for tag in soup.findAll(): + print(tag) + if tag.c + return soup.get_text('\n', strip=True) diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index 222156fd63..0ca7af3eb5 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -5,13 +5,132 @@ from crawlee._utils.html_to_text import html_to_text -_EXPECTED_LINES = ('line 1', 'line2', 'line3') -_EXAMPLE_HTML = f'{_EXPECTED_LINES[0]}{_EXPECTED_LINES[1]}\n{_EXPECTED_LINES[2]}' +_EXPECTED_TEXT = ( +"Let's start with a simple text. \n" + +"The ships hung in the sky, much the way that bricks don't. \n" + +"These aren't the Droids you're looking for\n" + +"I'm sorry, Dave. I'm afraid I can't do that.\n" + +"I'm sorry, Dave. I'm afraid I can't do that.\n" + +'A1\tA2\tA3\t\n' + +'B1\tB2\tB3\tB 4\t\n' + +'This is some text with inline elements and HTML entities (>bla<) \n' + +'Test\n' + +'a\n' + +'few\n' + +'line\n' + +'breaks\n' + +'Spaces in an inline text should be completely ignored. \n' + +'But,\n' + +' a pre-formatted\n' + +' block should be kept\n' + +' pre-formatted.\n' + +'The Greatest Science Fiction Quotes Of All Time \n' + +"Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes." +) +_EXAMPLE_HTML = ( +""" + + + Title SHOULD NOT be converted + + + + +Let's start with a simple text. +

+ The ships hung in the sky, much the way that bricks don't. +

+ + +This should be ignored + + + + + + + + + + + + + + + +
A1A2A3
B1B2B3B 4
+ +

+ This is some text with inline elements and HTML entities (>bla<) +

+ +
+ Test
+ a
+ few
+ line
+ breaks
+
+ + + + + Spaces + + + in + + + an inline text should be + + + completely ignored. + + + +
+But,
+    a pre-formatted
+                block  should  be  kept
+                                       pre-formatted.
+
+ + + These special elements SHOULD NOT BE CONVERTED. + + + + + + + + This should be skipped too. + + +The Greatest Science Fiction Quotes Of All Time +

+ Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, + just eyes. You Nexus, huh? I design your eyes. +

+ + +""" +) @pytest.mark.parametrize('source', [_EXAMPLE_HTML, BeautifulSoup(_EXAMPLE_HTML)], ids=('String', 'BeautifulSoup')) def test_html_to_text(source: str | BeautifulSoup) -> None: - assert html_to_text(source) == '\n'.join(_EXPECTED_LINES) + assert html_to_text(source) == _EXPECTED_TEXT def test_html_to_text_raises_on_wrong_input_type() -> None: From f26f8b6b7ef7ff0a1564e4f9f702c0a00c9c3197 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 10 Dec 2024 17:38:41 +0100 Subject: [PATCH 03/13] WIP follow JS implementation --- src/crawlee/_utils/html_to_text.py | 45 ++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index 7397da1c12..e6371d4791 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -2,9 +2,10 @@ import re -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString, Tag, PageElement -SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript"} + +SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript", "title"} BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"} def html_to_text(source: str | BeautifulSoup) -> str: @@ -15,8 +16,40 @@ def html_to_text(source: str | BeautifulSoup) -> str: soup = source else: raise TypeError('Source must be either a string or a BeautifulSoup object.') - for tag in soup.findAll(): - print(tag) - if tag.c - return soup.get_text('\n', strip=True) + text = "" + + def _page_element_to_text(page_element: PageElement) -> str: + nonlocal text + if isinstance(page_element, NavigableString): + compr: str + if page_element.parent.name.lower() == 'pre': + compr = page_element.get_text() + else: + # Compares white spaces outside of pre block + compr = re.sub(r"\s+", " ", page_element.get_text()) + if compr.startswith(" ") and re.match(r"^|\s", page_element.get_text()): + compr = compr[1:] + text += compr + if page_element.parent.name.lower() == 'br': + text += "\n" + if page_element.parent.name.lower() == 'td': + text += "\t" + if page_element.parent.name.lower() in BLOCK_TAGS: + text = f"\n{compr}" + return compr + + if isinstance(page_element, Tag) and page_element.name.lower() in SKIP_TAGS: + return "" + x = list(page_element.stripped_strings) + text_parts = [_page_element_to_text(child) for child in page_element.children] + + return "".join(text_parts) + + + + + return _page_element_to_text(soup) + + + From db0cc6509b016a257e22377a10ad682c0eb11fad Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 10 Dec 2024 20:28:30 +0100 Subject: [PATCH 04/13] Almost same as JS implementation. TODO: Fix last differences and add more tests according to JS implementation. --- src/crawlee/_utils/html_to_text.py | 60 +++++++++++++++--------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index e6371d4791..37500673c3 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -5,7 +5,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag, PageElement -SKIP_TAGS = {"script", "style" "canvas", "svg", "noscript", "title"} +SKIP_TAGS = {"script", "style", "canvas", "svg", "noscript", "title"} BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"} def html_to_text(source: str | BeautifulSoup) -> str: @@ -19,37 +19,39 @@ def html_to_text(source: str | BeautifulSoup) -> str: text = "" - def _page_element_to_text(page_element: PageElement) -> str: + def _page_element_to_text(page_elements: PageElement) -> str: nonlocal text - if isinstance(page_element, NavigableString): - compr: str - if page_element.parent.name.lower() == 'pre': - compr = page_element.get_text() - else: - # Compares white spaces outside of pre block - compr = re.sub(r"\s+", " ", page_element.get_text()) - if compr.startswith(" ") and re.match(r"^|\s", page_element.get_text()): - compr = compr[1:] - text += compr - if page_element.parent.name.lower() == 'br': + for page_element in page_elements: + if isinstance(page_element, NavigableString): + compr: str + if page_element.parent.name.lower() == 'pre': + compr = page_element.get_text() + else: + # Compares white spaces outside of pre block + compr = re.sub(r"\s+", " ", page_element.get_text()) + # If text is empty or ends with a whitespace, don't add the leading whitespace + if compr.startswith(" ") and re.search(r"(^|\s)$", text): + compr = compr[1:] + text += compr + elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int): + # Skip comments and special elements + pass + elif page_element.name.lower() == 'br': text += "\n" - if page_element.parent.name.lower() == 'td': + elif page_element.name.lower() == 'td': + _page_element_to_text(page_element.children) text += "\t" - if page_element.parent.name.lower() in BLOCK_TAGS: - text = f"\n{compr}" - return compr - - if isinstance(page_element, Tag) and page_element.name.lower() in SKIP_TAGS: - return "" - x = list(page_element.stripped_strings) - text_parts = [_page_element_to_text(child) for child in page_element.children] - - return "".join(text_parts) - - - - - return _page_element_to_text(soup) + else: + # Block elements must be surrounded by newlines(unless beginning of text) + is_block_tag = page_element.name.lower() in BLOCK_TAGS + if is_block_tag and not re.search(r"(^|\n)$", text): + text += '\n' + _page_element_to_text(page_element.children) + if (is_block_tag and not text.endswith('\n')): + text += '\n' + + _page_element_to_text(soup) + return text.strip() From 26d5aee1e958ea3badffbce9b328a33241001002 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 11 Dec 2024 10:32:24 +0100 Subject: [PATCH 05/13] Same behavior as JS implementation. --- src/crawlee/_utils/html_to_text.py | 95 +++++++++++++++--------- tests/unit/_utils/test_html_to_text.py | 99 +++++++++++++++++++------- 2 files changed, 133 insertions(+), 61 deletions(-) diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index 37500673c3..a65786a768 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -1,12 +1,38 @@ from __future__ import annotations import re +from typing import TYPE_CHECKING -from bs4 import BeautifulSoup, NavigableString, Tag, PageElement +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag +if TYPE_CHECKING: + from collections.abc import Iterable + +SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'} +BLOCK_TAGS = { + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'ol', + 'ul', + 'li', + 'pre', + 'address', + 'blockquote', + 'dl', + 'div', + 'fieldset', + 'form', + 'table', + 'tr', + 'select', + 'option', +} -SKIP_TAGS = {"script", "style", "canvas", "svg", "noscript", "title"} -BLOCK_TAGS = {"p" , "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li", "pre", "address", "blockquote","dl", "div","fieldset", "form", "table" ,"tr","select","option"} def html_to_text(source: str | BeautifulSoup) -> str: """Converts markup string or BeautifulSoup object to newline separated plain text without tags.""" @@ -17,41 +43,40 @@ def html_to_text(source: str | BeautifulSoup) -> str: else: raise TypeError('Source must be either a string or a BeautifulSoup object.') - text = "" + text = '' - def _page_element_to_text(page_elements: PageElement) -> str: + def _page_element_to_text(page_elements: Iterable[PageElement]) -> None: + """Custom html parsing that performs as implementation from Javascript version of Crawlee.""" nonlocal text for page_element in page_elements: - if isinstance(page_element, NavigableString): - compr: str - if page_element.parent.name.lower() == 'pre': - compr = page_element.get_text() - else: - # Compares white spaces outside of pre block - compr = re.sub(r"\s+", " ", page_element.get_text()) - # If text is empty or ends with a whitespace, don't add the leading whitespace - if compr.startswith(" ") and re.search(r"(^|\s)$", text): - compr = compr[1:] - text += compr - elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int): - # Skip comments and special elements - pass - elif page_element.name.lower() == 'br': - text += "\n" - elif page_element.name.lower() == 'td': - _page_element_to_text(page_element.children) - text += "\t" - else: - # Block elements must be surrounded by newlines(unless beginning of text) - is_block_tag = page_element.name.lower() in BLOCK_TAGS - if is_block_tag and not re.search(r"(^|\n)$", text): - text += '\n' - _page_element_to_text(page_element.children) - if (is_block_tag and not text.endswith('\n')): + if isinstance(page_element, (Tag, NavigableString)): + if isinstance(page_element, NavigableString): + compr: str + if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre': + compr = page_element.get_text() + else: + # Compress white spaces outside of pre block + compr = re.sub(r'\s+', ' ', page_element.get_text()) + # If text is empty or ends with a whitespace, don't add the leading whitespace or new line + if (compr.startswith((' ', '\n'))) and re.search(r'(^|\s)$', text): + compr = compr[1:] + text += compr + elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int): + # Skip comments and special elements + pass + elif page_element.name.lower() == 'br': text += '\n' + elif page_element.name.lower() == 'td': + _page_element_to_text(page_element.children) + text += '\t' + else: + # Block elements must be surrounded by newlines(unless beginning of text) + is_block_tag = page_element.name.lower() in BLOCK_TAGS + if is_block_tag and not re.search(r'(^|\n)$', text): + text += '\n' + _page_element_to_text(page_element.children) + if is_block_tag and not text.endswith('\n'): + text += '\n' - _page_element_to_text(soup) + _page_element_to_text(soup.children) return text.strip() - - - diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index 0ca7af3eb5..4e462c05cc 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -6,30 +6,30 @@ from crawlee._utils.html_to_text import html_to_text _EXPECTED_TEXT = ( -"Let's start with a simple text. \n" + -"The ships hung in the sky, much the way that bricks don't. \n" + -"These aren't the Droids you're looking for\n" + -"I'm sorry, Dave. I'm afraid I can't do that.\n" + -"I'm sorry, Dave. I'm afraid I can't do that.\n" + -'A1\tA2\tA3\t\n' + -'B1\tB2\tB3\tB 4\t\n' + -'This is some text with inline elements and HTML entities (>bla<) \n' + -'Test\n' + -'a\n' + -'few\n' + -'line\n' + -'breaks\n' + -'Spaces in an inline text should be completely ignored. \n' + -'But,\n' + -' a pre-formatted\n' + -' block should be kept\n' + -' pre-formatted.\n' + -'The Greatest Science Fiction Quotes Of All Time \n' + -"Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes." + "Let's start with a simple text. \n" + "The ships hung in the sky, much the way that bricks don't. \n" + "These aren't the Droids you're looking for\n" + "I'm sorry, Dave. I'm afraid I can't do that.\n" + "I'm sorry, Dave. I'm afraid I can't do that.\n" + 'A1\tA2\tA3\t\n' + 'B1\tB2\tB3\tB 4\t\n' + 'This is some text with inline elements and HTML entities (>bla<) \n' + 'Test\n' + 'a\n' + 'few\n' + 'line\n' + 'breaks\n' + 'Spaces in an inline text should be completely ignored. \n' + 'But,\n' + ' a pre-formatted\n' + ' block should be kept\n' + ' pre-formatted.\n' + 'The Greatest Science Fiction Quotes Of All Time \n' + "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You " + 'Nexus, huh? I design your eyes.' ) -_EXAMPLE_HTML = ( -""" +_EXAMPLE_HTML = """ Title SHOULD NOT be converted @@ -126,11 +126,58 @@ """ -) -@pytest.mark.parametrize('source', [_EXAMPLE_HTML, BeautifulSoup(_EXAMPLE_HTML)], ids=('String', 'BeautifulSoup')) -def test_html_to_text(source: str | BeautifulSoup) -> None: - assert html_to_text(source) == _EXPECTED_TEXT + +@pytest.mark.parametrize( + ('source', 'expected_text'), + [ + (_EXAMPLE_HTML, _EXPECTED_TEXT), + (BeautifulSoup(_EXAMPLE_HTML), _EXPECTED_TEXT), + (' Plain text node ', 'Plain text node'), + (' \nPlain text node \n ', 'Plain text node'), + ('

Header 1

Header 2

', 'Header 1\nHeader 2'), + ('

Header 1

Header 2


', 'Header 1\nHeader 2'), + ('

Header 1

Header 2



', 'Header 1\nHeader 2'), + ('

Header 1

Header 2




', 'Header 1\nHeader 2'), + ('

Header 1


Header 2




', 'Header 1\n\nHeader 2'), + ('

Header 1


Header 2




', 'Header 1\n\nHeader 2'), + ('

Header 1

\n
\n

Header 2




', 'Header 1\n\nHeader 2'), + ('

Header 1

\n
\n

Header 2




', 'Header 1\n\n\nHeader 2'), + ('

Header 1

\n
\n

Header 2




', 'Header 1\n\n\n\nHeader 2'), + ('
Div

Paragraph

', 'Div\nParagraph'), + ('
Div1
Div2
', 'Div1\nDiv2'), + ('
Div1
', 'Div1'), + ('
Div1
', 'Div1'), + ('
Div1
', 'Div1'), + ('Skip svg
Div1
', 'Div1'), + ('Skip canvas
Div1
', 'Div1'), + ('A B C D E\n\nF G', 'A B C D E F G'), + ('
A  B  C  D  E\n\nF  G
', 'A B C D E\n\nF G'), + ( + '

Heading 1

Deep Div

Heading 2

', + 'Heading 1\nDeep Div' '\nHeading 2', + ), + ('this_word_should_be_one', 'this_word_should_be_one'), + ('some text', 'some text'), + ( + ( + """ + + + + + + + +
Cell A1Cell A2 Cell A3
Cell B1Cell B2
""" + ), + 'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2', + ), + ('á é', 'á é'), + ], +) +def test_html_to_text(source: str | BeautifulSoup, expected_text: str) -> None: + assert html_to_text(source) == expected_text def test_html_to_text_raises_on_wrong_input_type() -> None: From 27cfedd7a3a526a7c034a64a46b91333c9c6347a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 11 Dec 2024 11:07:47 +0100 Subject: [PATCH 06/13] Reformat import in test_base_crawler.py --- tests/unit/basic_crawler/test_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py index 9e3768064e..5c9399c699 100644 --- a/tests/unit/basic_crawler/test_basic_crawler.py +++ b/tests/unit/basic_crawler/test_basic_crawler.py @@ -4,10 +4,10 @@ import asyncio import json import logging +import os from collections import Counter from dataclasses import dataclass from datetime import timedelta -import os from pathlib import Path from typing import TYPE_CHECKING, Any from unittest.mock import AsyncMock, Mock From bab689c42882a9182d21a26c9e78bfe54678712a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 13 Dec 2024 09:05:09 +0100 Subject: [PATCH 07/13] Pre-compile used re patterns. Expose this function in BS crawler. --- src/crawlee/_utils/html_to_text.py | 10 +++++++--- src/crawlee/beautifulsoup_crawler/__init__.py | 4 +++- tests/unit/_utils/test_html_to_text.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index a65786a768..1f35d5e4f3 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -33,6 +33,10 @@ 'option', } +_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$') +_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$') +_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+') + def html_to_text(source: str | BeautifulSoup) -> str: """Converts markup string or BeautifulSoup object to newline separated plain text without tags.""" @@ -56,9 +60,9 @@ def _page_element_to_text(page_elements: Iterable[PageElement]) -> None: compr = page_element.get_text() else: # Compress white spaces outside of pre block - compr = re.sub(r'\s+', ' ', page_element.get_text()) + compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text()) # If text is empty or ends with a whitespace, don't add the leading whitespace or new line - if (compr.startswith((' ', '\n'))) and re.search(r'(^|\s)$', text): + if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text): compr = compr[1:] text += compr elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int): @@ -72,7 +76,7 @@ def _page_element_to_text(page_elements: Iterable[PageElement]) -> None: else: # Block elements must be surrounded by newlines(unless beginning of text) is_block_tag = page_element.name.lower() in BLOCK_TAGS - if is_block_tag and not re.search(r'(^|\n)$', text): + if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text): text += '\n' _page_element_to_text(page_element.children) if is_block_tag and not text.endswith('\n'): diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py index 59b0264cc1..f9e991f62d 100644 --- a/src/crawlee/beautifulsoup_crawler/__init__.py +++ b/src/crawlee/beautifulsoup_crawler/__init__.py @@ -1,4 +1,6 @@ try: + from crawlee._utils.html_to_text import html_to_text + from ._beautifulsoup_crawler import BeautifulSoupCrawler from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext from ._beautifulsoup_parser import BeautifulSoupParserType @@ -8,4 +10,4 @@ "For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.", ) from exc -__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType'] +__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', 'html_to_text'] diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index 4e462c05cc..1273cb4775 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -3,7 +3,7 @@ import pytest from bs4 import BeautifulSoup -from crawlee._utils.html_to_text import html_to_text +from crawlee.beautifulsoup_crawler import html_to_text _EXPECTED_TEXT = ( "Let's start with a simple text. \n" From 6f32156f36ab1611f6edde4dcfa7ede5f46b8446 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Dec 2024 09:33:57 +0100 Subject: [PATCH 08/13] Add Parsel version of html_to_text. Add both implementations to respective contexts. Set same tests for both. --- src/crawlee/_utils/html_to_text.py | 55 +--------------- src/crawlee/beautifulsoup_crawler/__init__.py | 4 +- .../_beautifulsoup_crawling_context.py | 5 ++ src/crawlee/beautifulsoup_crawler/_utils.py | 66 +++++++++++++++++++ .../_parsel_crawling_context.py | 5 ++ src/crawlee/parsel_crawler/_utils.py | 63 ++++++++++++++++++ tests/unit/_utils/test_html_to_text.py | 21 ++++-- 7 files changed, 158 insertions(+), 61 deletions(-) create mode 100644 src/crawlee/beautifulsoup_crawler/_utils.py create mode 100644 src/crawlee/parsel_crawler/_utils.py diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index 1f35d5e4f3..3cc813c3a6 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -1,12 +1,7 @@ +# This file contains shared constants used by different implementations of html_to_text function. from __future__ import annotations import re -from typing import TYPE_CHECKING - -from bs4 import BeautifulSoup, NavigableString, PageElement, Tag - -if TYPE_CHECKING: - from collections.abc import Iterable SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'} BLOCK_TAGS = { @@ -36,51 +31,3 @@ _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$') _EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$') _ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+') - - -def html_to_text(source: str | BeautifulSoup) -> str: - """Converts markup string or BeautifulSoup object to newline separated plain text without tags.""" - if isinstance(source, str): - soup = BeautifulSoup(source) - elif isinstance(source, BeautifulSoup): - soup = source - else: - raise TypeError('Source must be either a string or a BeautifulSoup object.') - - text = '' - - def _page_element_to_text(page_elements: Iterable[PageElement]) -> None: - """Custom html parsing that performs as implementation from Javascript version of Crawlee.""" - nonlocal text - for page_element in page_elements: - if isinstance(page_element, (Tag, NavigableString)): - if isinstance(page_element, NavigableString): - compr: str - if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre': - compr = page_element.get_text() - else: - # Compress white spaces outside of pre block - compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text()) - # If text is empty or ends with a whitespace, don't add the leading whitespace or new line - if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text): - compr = compr[1:] - text += compr - elif page_element.name.lower() in SKIP_TAGS or isinstance(page_element, int): - # Skip comments and special elements - pass - elif page_element.name.lower() == 'br': - text += '\n' - elif page_element.name.lower() == 'td': - _page_element_to_text(page_element.children) - text += '\t' - else: - # Block elements must be surrounded by newlines(unless beginning of text) - is_block_tag = page_element.name.lower() in BLOCK_TAGS - if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text): - text += '\n' - _page_element_to_text(page_element.children) - if is_block_tag and not text.endswith('\n'): - text += '\n' - - _page_element_to_text(soup.children) - return text.strip() diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py index f9e991f62d..59b0264cc1 100644 --- a/src/crawlee/beautifulsoup_crawler/__init__.py +++ b/src/crawlee/beautifulsoup_crawler/__init__.py @@ -1,6 +1,4 @@ try: - from crawlee._utils.html_to_text import html_to_text - from ._beautifulsoup_crawler import BeautifulSoupCrawler from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext from ._beautifulsoup_parser import BeautifulSoupParserType @@ -10,4 +8,4 @@ "For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.", ) from exc -__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType', 'html_to_text'] +__all__ = ['BeautifulSoupCrawler', 'BeautifulSoupCrawlingContext', 'BeautifulSoupParserType'] diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py index f01d66a1c0..520b678199 100644 --- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py @@ -5,6 +5,7 @@ from crawlee._utils.docs import docs_group from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext +from crawlee.beautifulsoup_crawler._utils import html_to_text @dataclass(frozen=True) @@ -24,3 +25,7 @@ def soup(self) -> BeautifulSoup: def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) + + def html_to_text(self) -> str: + """Converts parsed_content to newline-separated plain text without tags.""" + return html_to_text(self.parsed_content) diff --git a/src/crawlee/beautifulsoup_crawler/_utils.py b/src/crawlee/beautifulsoup_crawler/_utils.py new file mode 100644 index 0000000000..52f15684f1 --- /dev/null +++ b/src/crawlee/beautifulsoup_crawler/_utils.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag + +from crawlee._utils.html_to_text import ( + _ANY_CONSECUTIVE_WHITE_SPACES, + _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, + _EMPTY_OR_ENDS_WITH_NEW_LINE, + BLOCK_TAGS, + SKIP_TAGS, +) + +if TYPE_CHECKING: + from collections.abc import Iterable + + +def html_to_text(source: str | BeautifulSoup) -> str: + """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.""" + if isinstance(source, str): + soup = BeautifulSoup(source) + elif isinstance(source, BeautifulSoup): + soup = source + else: + raise TypeError('Source must be either a string or a `BeautifulSoup` object.') + + text = '' + + def _page_element_to_text(page_elements: Iterable[PageElement]) -> None: + """Custom html parsing that performs as implementation from Javascript version of Crawlee.""" + nonlocal text + for page_element in page_elements: + if isinstance(page_element, (Tag, NavigableString)): + if isinstance(page_element, NavigableString): + compr: str + if isinstance(page_element.parent, Tag) and page_element.parent.name.lower() == 'pre': + compr = page_element.get_text() + else: + # Compress white spaces outside of pre block + compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', page_element.get_text()) + # If text is empty or ends with a whitespace, don't add the leading whitespace or new line + if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text): + compr = compr[1:] + text += compr + elif page_element.name.lower() in SKIP_TAGS: + # Skip comments and special elements + pass + elif page_element.name.lower() == 'br': + text += '\n' + elif page_element.name.lower() == 'td': + _page_element_to_text(page_element.children) + text += '\t' + else: + # Block elements must be surrounded by newlines(unless beginning of text) + is_block_tag = page_element.name.lower() in BLOCK_TAGS + if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text): + text += '\n' + _page_element_to_text(page_element.children) + if is_block_tag and not text.endswith('\n'): + text += '\n' + + _page_element_to_text(soup.children) + + return text.strip() diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py index 5dd13e3868..96fb9bf9cc 100644 --- a/src/crawlee/parsel_crawler/_parsel_crawling_context.py +++ b/src/crawlee/parsel_crawler/_parsel_crawling_context.py @@ -5,6 +5,7 @@ from crawlee._utils.docs import docs_group from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext +from crawlee.parsel_crawler._utils import html_to_text @dataclass(frozen=True) @@ -24,3 +25,7 @@ def selector(self) -> Selector: def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self: """Convenience constructor that creates new context from existing `ParsedHttpCrawlingContext[BeautifulSoup]`.""" return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) + + def html_to_text(self) -> str: + """Converts parsed_content to newline-separated plain text without tags.""" + return html_to_text(self.parsed_content) diff --git a/src/crawlee/parsel_crawler/_utils.py b/src/crawlee/parsel_crawler/_utils.py new file mode 100644 index 0000000000..1300278d04 --- /dev/null +++ b/src/crawlee/parsel_crawler/_utils.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import re + +from parsel import Selector + +from crawlee._utils.html_to_text import ( + _ANY_CONSECUTIVE_WHITE_SPACES, + _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, + _EMPTY_OR_ENDS_WITH_NEW_LINE, + BLOCK_TAGS, + SKIP_TAGS, +) + + +def html_to_text(source: str | Selector) -> str: + """Converts markup string or `Selector` to newline-separated plain text without tags using Parsel.""" + if isinstance(source, str): + selector = Selector(text=source) + elif isinstance(source, Selector): + selector = source + else: + raise TypeError('Source must be either a string or a `Selector` object.') + + text = '' + + def _extract_text(elements: list[Selector], *, compress: bool = True) -> None: + """Custom html parsing that performs as implementation from Javascript version of Crawlee.""" + nonlocal text + for element in elements: + tag = element.root.tag if hasattr(element.root, 'tag') else None + + if tag is None: + # Compress white spaces outside of pre block + compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', element.root) if compress else element.root + # If text is empty or ends with a whitespace, don't add the leading whitespace or new line + if (compr.startswith((' ', '\n'))) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text): + compr = compr[1:] + text += compr + + if tag in SKIP_TAGS or not isinstance(tag, str): + continue + + if tag == 'br': + text += '\n' + elif tag == 'td': + _extract_text(element.xpath('./node()')) + text += '\t' + else: + is_block_tag = tag in BLOCK_TAGS if tag else False + + if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text): + text += '\n' + + _extract_text(element.xpath('./node()'), compress=tag != 'pre') + + if is_block_tag and not text.endswith('\n'): + text += '\n' + + # Start processing the root elements + _extract_text(selector.xpath('/*')) + + return text.strip() diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index 1273cb4775..394470b75c 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -1,9 +1,13 @@ from __future__ import annotations +from typing import Callable + import pytest from bs4 import BeautifulSoup +from parsel import Selector -from crawlee.beautifulsoup_crawler import html_to_text +from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup +from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel _EXPECTED_TEXT = ( "Let's start with a simple text. \n" @@ -128,11 +132,11 @@ """ +@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) @pytest.mark.parametrize( ('source', 'expected_text'), [ (_EXAMPLE_HTML, _EXPECTED_TEXT), - (BeautifulSoup(_EXAMPLE_HTML), _EXPECTED_TEXT), (' Plain text node ', 'Plain text node'), (' \nPlain text node \n ', 'Plain text node'), ('

Header 1

Header 2

', 'Header 1\nHeader 2'), @@ -176,10 +180,19 @@ ('á é', 'á é'), ], ) -def test_html_to_text(source: str | BeautifulSoup, expected_text: str) -> None: +def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None: assert html_to_text(source) == expected_text -def test_html_to_text_raises_on_wrong_input_type() -> None: +@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) +def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None: with pytest.raises(TypeError): html_to_text(1) # type: ignore[arg-type] # Intentional wrong type test. + + +def test_html_to_text_parsel() -> None: + assert html_to_text_parsel(Selector(_EXAMPLE_HTML)) == _EXPECTED_TEXT + + +def test_html_to_text_beautifulsoup() -> None: + assert html_to_text_beautifulsoup(BeautifulSoup(_EXAMPLE_HTML)) == _EXPECTED_TEXT From 964c2cfbd1afab4a6ea89735b1ddcff6eeab6216 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Dec 2024 10:03:32 +0100 Subject: [PATCH 09/13] Add public function as well. --- src/crawlee/_utils/html_to_text.py | 3 +++ src/crawlee/utils.py | 17 +++++++++++++++++ tests/unit/_utils/test_html_to_text.py | 5 +++-- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/crawlee/utils.py diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index 3cc813c3a6..bf7c3d3291 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -3,6 +3,9 @@ import re +# Tags based on Javascript implementation of text_to_html from: +# https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11 +# Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9 SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'} BLOCK_TAGS = { 'p', diff --git a/src/crawlee/utils.py b/src/crawlee/utils.py new file mode 100644 index 0000000000..7f487cbb05 --- /dev/null +++ b/src/crawlee/utils.py @@ -0,0 +1,17 @@ +from typing import Callable + + +def html_to_text(source: str) -> str: + """Converts markup string to newline separated plain text without tags.""" + _html_to_text: Callable[[str], str] + try: + from crawlee.beautifulsoup_crawler._utils import html_to_text as _html_to_text + except ImportError: + try: + from crawlee.parsel_crawler._utils import html_to_text as _html_to_text + except ImportError as e: + raise ImportError( + 'html_to_text requires either Parsel or BeautifulSoup package to be installed. Please ' + 'install one of following: crawlee[beautifulsoup], crawlee[parsel] or crawlee[all].' + ) from e + return _html_to_text(source) diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index 394470b75c..c36d3b5811 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -8,6 +8,7 @@ from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel +from crawlee.utils import html_to_text as html_to_text_public _EXPECTED_TEXT = ( "Let's start with a simple text. \n" @@ -132,7 +133,7 @@ """ -@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) +@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public]) @pytest.mark.parametrize( ('source', 'expected_text'), [ @@ -184,7 +185,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s assert html_to_text(source) == expected_text -@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) +@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public]) def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None: with pytest.raises(TypeError): html_to_text(1) # type: ignore[arg-type] # Intentional wrong type test. From 0cb68af859edf40b9926a1b5b0133c3806d00159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Tue, 17 Dec 2024 10:26:30 +0100 Subject: [PATCH 10/13] Apply suggestions from code review Co-authored-by: Vlada Dusek --- .../beautifulsoup_crawler/_beautifulsoup_crawling_context.py | 2 +- src/crawlee/parsel_crawler/_parsel_crawling_context.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py index 520b678199..1a3751b97d 100644 --- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawling_context.py @@ -27,5 +27,5 @@ def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Be return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) def html_to_text(self) -> str: - """Converts parsed_content to newline-separated plain text without tags.""" + """Convert the parsed HTML content to newline-separated plain text without tags.""" return html_to_text(self.parsed_content) diff --git a/src/crawlee/parsel_crawler/_parsel_crawling_context.py b/src/crawlee/parsel_crawler/_parsel_crawling_context.py index 96fb9bf9cc..2b9b33df58 100644 --- a/src/crawlee/parsel_crawler/_parsel_crawling_context.py +++ b/src/crawlee/parsel_crawler/_parsel_crawling_context.py @@ -27,5 +27,5 @@ def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Se return cls(**{field.name: getattr(context, field.name) for field in fields(context)}) def html_to_text(self) -> str: - """Converts parsed_content to newline-separated plain text without tags.""" + """Convert the parsed HTML content to newline-separated plain text without tags.""" return html_to_text(self.parsed_content) From 210749e6d6ef495cf1a9d08e536677f4e3130efc Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 17 Dec 2024 10:56:44 +0100 Subject: [PATCH 11/13] Add docs decorator --- src/crawlee/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/crawlee/utils.py b/src/crawlee/utils.py index 7f487cbb05..e52c261de2 100644 --- a/src/crawlee/utils.py +++ b/src/crawlee/utils.py @@ -1,8 +1,17 @@ from typing import Callable +from crawlee._utils.docs import docs_group + +@docs_group('Functions') def html_to_text(source: str) -> str: - """Converts markup string to newline separated plain text without tags.""" + """Converts markup string to newline separated plain text without tags. + + Args: + source: Input markup string + Returns: + Newline separated plain text without tags. + """ _html_to_text: Callable[[str], str] try: from crawlee.beautifulsoup_crawler._utils import html_to_text as _html_to_text From 50968bd6d8c5ccc2c9f5f7054e48750ba50a6cfa Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 18 Dec 2024 09:21:17 +0100 Subject: [PATCH 12/13] Do not expose in crawlee.utils - review comments --- src/crawlee/beautifulsoup_crawler/_utils.py | 9 ++++++- src/crawlee/parsel_crawler/_utils.py | 9 ++++++- src/crawlee/utils.py | 26 --------------------- tests/unit/_utils/test_html_to_text.py | 5 ++-- 4 files changed, 18 insertions(+), 31 deletions(-) delete mode 100644 src/crawlee/utils.py diff --git a/src/crawlee/beautifulsoup_crawler/_utils.py b/src/crawlee/beautifulsoup_crawler/_utils.py index 52f15684f1..f92990ba9b 100644 --- a/src/crawlee/beautifulsoup_crawler/_utils.py +++ b/src/crawlee/beautifulsoup_crawler/_utils.py @@ -18,7 +18,14 @@ def html_to_text(source: str | BeautifulSoup) -> str: - """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup.""" + """Converts markup string or `BeautifulSoup` to newline separated plain text without tags using BeautifulSoup. + + Args: + source: Input markup string or `BeautifulSoup` object. + + Returns: + Newline separated plain text without tags. + """ if isinstance(source, str): soup = BeautifulSoup(source) elif isinstance(source, BeautifulSoup): diff --git a/src/crawlee/parsel_crawler/_utils.py b/src/crawlee/parsel_crawler/_utils.py index 1300278d04..cf0bf59ee8 100644 --- a/src/crawlee/parsel_crawler/_utils.py +++ b/src/crawlee/parsel_crawler/_utils.py @@ -14,7 +14,14 @@ def html_to_text(source: str | Selector) -> str: - """Converts markup string or `Selector` to newline-separated plain text without tags using Parsel.""" + """Converts markup string or `Selector` to newline-separated plain text without tags using Parsel. + + Args: + source: Input markup string or `Selector` object. + + Returns: + Newline separated plain text without tags. + """ if isinstance(source, str): selector = Selector(text=source) elif isinstance(source, Selector): diff --git a/src/crawlee/utils.py b/src/crawlee/utils.py deleted file mode 100644 index e52c261de2..0000000000 --- a/src/crawlee/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import Callable - -from crawlee._utils.docs import docs_group - - -@docs_group('Functions') -def html_to_text(source: str) -> str: - """Converts markup string to newline separated plain text without tags. - - Args: - source: Input markup string - Returns: - Newline separated plain text without tags. - """ - _html_to_text: Callable[[str], str] - try: - from crawlee.beautifulsoup_crawler._utils import html_to_text as _html_to_text - except ImportError: - try: - from crawlee.parsel_crawler._utils import html_to_text as _html_to_text - except ImportError as e: - raise ImportError( - 'html_to_text requires either Parsel or BeautifulSoup package to be installed. Please ' - 'install one of following: crawlee[beautifulsoup], crawlee[parsel] or crawlee[all].' - ) from e - return _html_to_text(source) diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index c36d3b5811..394470b75c 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -8,7 +8,6 @@ from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel -from crawlee.utils import html_to_text as html_to_text_public _EXPECTED_TEXT = ( "Let's start with a simple text. \n" @@ -133,7 +132,7 @@ """ -@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public]) +@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) @pytest.mark.parametrize( ('source', 'expected_text'), [ @@ -185,7 +184,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s assert html_to_text(source) == expected_text -@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup, html_to_text_public]) +@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None: with pytest.raises(TypeError): html_to_text(1) # type: ignore[arg-type] # Intentional wrong type test. From 9f15a9148e646a003716fa392b6d1b668b81490a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Wed, 18 Dec 2024 16:00:29 +0100 Subject: [PATCH 13/13] Update src/crawlee/_utils/html_to_text.py Co-authored-by: Jan Buchar --- src/crawlee/_utils/html_to_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/_utils/html_to_text.py b/src/crawlee/_utils/html_to_text.py index bf7c3d3291..804b55f464 100644 --- a/src/crawlee/_utils/html_to_text.py +++ b/src/crawlee/_utils/html_to_text.py @@ -3,7 +3,7 @@ import re -# Tags based on Javascript implementation of text_to_html from: +# Tags based on Javascript implementation of htmlToText from: # https://github.com/apify/crawlee/blob/master/packages/utils/src/internals/cheerio.ts#L11 # Originally added here: https://github.com/apify/apify-ts/commit/4c0e5e3e7377536a449bb7b205132348ad3b0fe9 SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}