-
Notifications
You must be signed in to change notification settings - Fork 513
feat: Add html_to_text helper function
#792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 9 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
c803951
Add simple html_to_text helper function.
Pijukatel c571caa
WIP
Pijukatel f26f8b6
WIP follow JS implementation
Pijukatel db0cc65
Almost same as JS implementation.
Pijukatel 26d5aee
Same behavior as JS implementation.
Pijukatel b45e2f5
Merge remote-tracking branch 'origin/master' into helper-function-tag…
Pijukatel 27cfedd
Reformat import in test_base_crawler.py
Pijukatel bab689c
Pre-compile used re patterns.
Pijukatel 6f32156
Add Parsel version of html_to_text.
Pijukatel 964c2cf
Add public function as well.
Pijukatel 0cb68af
Apply suggestions from code review
Pijukatel 210749e
Add docs decorator
Pijukatel 50968bd
Do not expose in crawlee.utils - review comments
Pijukatel 9f15a91
Update src/crawlee/_utils/html_to_text.py
Pijukatel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| # This file contains shared constants used by the different implementations of the html_to_text function. | |
| from __future__ import annotations | ||
|
|
||
| import re | ||
|
|
||
# Elements that the converters pass over without descending into their children.
SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}

# Elements whose content the converters surround with newlines.
BLOCK_TAGS = {
    # Paragraph-like text containers
    'p', 'pre', 'address', 'blockquote', 'div',
    # Headings
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Lists
    'ol', 'ul', 'li', 'dl',
    # Forms
    'fieldset', 'form', 'select', 'option',
    # Tables
    'table', 'tr',
}

# Matches a string that is empty or whose last character is whitespace.
_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
# Matches a string that is empty or whose last character is a newline.
_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
# Matches any run of one or more whitespace characters.
_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| from typing import TYPE_CHECKING | ||
|
|
||
| from bs4 import BeautifulSoup, NavigableString, PageElement, Tag | ||
|
|
||
| from crawlee._utils.html_to_text import ( | ||
| _ANY_CONSECUTIVE_WHITE_SPACES, | ||
| _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, | ||
| _EMPTY_OR_ENDS_WITH_NEW_LINE, | ||
| BLOCK_TAGS, | ||
| SKIP_TAGS, | ||
| ) | ||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import Iterable | ||
|
|
||
|
|
||
def html_to_text(source: str | BeautifulSoup) -> str:
    """Convert a markup string or `BeautifulSoup` object to newline-separated plain text without tags.

    The parsed tree is walked node by node, following the Javascript version of Crawlee:
    whitespace is compressed outside of `pre`, elements listed in `SKIP_TAGS` are ignored,
    `br` emits a newline, `td` emits a trailing tab and elements listed in `BLOCK_TAGS`
    are surrounded by newlines.

    Args:
        source: Markup string, or an already parsed `BeautifulSoup` object.

    Returns:
        The extracted text with leading and trailing whitespace stripped.

    Raises:
        TypeError: If `source` is neither a string nor a `BeautifulSoup` object.
    """
    if isinstance(source, str):
        # NOTE(review): no parser is named here, so bs4 picks one on its own - consider pinning it.
        soup = BeautifulSoup(source)
    elif isinstance(source, BeautifulSoup):
        soup = source
    else:
        raise TypeError('Source must be either a string or a `BeautifulSoup` object.')

    # The output is collected as non-empty chunks and joined once at the end. Only the last emitted
    # character is ever inspected, so the work per node does not grow with the size of the output.
    chunks: list[str] = []

    def _tail() -> str:
        """Return the last emitted character, or an empty string when nothing was emitted yet."""
        return chunks[-1][-1:] if chunks else ''

    def _emit(chunk: str) -> None:
        """Append `chunk` to the output; empty chunks are dropped so that `_tail` stays accurate."""
        if chunk:
            chunks.append(chunk)

    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
        """Custom html parsing that performs as implementation from Javascript version of Crawlee."""
        for page_element in page_elements:
            if isinstance(page_element, NavigableString):
                raw = page_element.get_text()
                parent = page_element.parent
                if isinstance(parent, Tag) and parent.name.lower() == 'pre':
                    compr = raw
                else:
                    # Compress white spaces outside of pre block
                    compr = _ANY_CONSECUTIVE_WHITE_SPACES.sub(' ', raw)
                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
                if compr.startswith((' ', '\n')) and _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE.search(_tail()):
                    compr = compr[1:]
                _emit(compr)
                continue

            if not isinstance(page_element, Tag):
                # Any other kind of page element contributes nothing.
                continue

            name = page_element.name.lower()
            if name in SKIP_TAGS:
                # Special elements are skipped together with everything inside them
                continue

            if name == 'br':
                _emit('\n')
            elif name == 'td':
                _page_element_to_text(page_element.children)
                _emit('\t')
            else:
                # Block elements must be surrounded by newlines (unless beginning of text)
                is_block_tag = name in BLOCK_TAGS
                if is_block_tag and not _EMPTY_OR_ENDS_WITH_NEW_LINE.search(_tail()):
                    _emit('\n')
                _page_element_to_text(page_element.children)
                if is_block_tag and _tail() != '\n':
                    _emit('\n')

    _page_element_to_text(soup.children)

    return ''.join(chunks).strip()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import re | ||
|
|
||
| from parsel import Selector | ||
|
|
||
| from crawlee._utils.html_to_text import ( | ||
| _ANY_CONSECUTIVE_WHITE_SPACES, | ||
| _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, | ||
| _EMPTY_OR_ENDS_WITH_NEW_LINE, | ||
| BLOCK_TAGS, | ||
| SKIP_TAGS, | ||
| ) | ||
|
|
||
|
|
||
def html_to_text(source: str | Selector) -> str:
    """Convert a markup string or `Selector` to newline-separated plain text without tags using Parsel.

    The parsed tree is walked node by node, following the Javascript version of Crawlee:
    whitespace is compressed outside of `pre`, elements listed in `SKIP_TAGS` are ignored,
    `br` emits a newline, `td` emits a trailing tab and elements listed in `BLOCK_TAGS`
    are surrounded by newlines.

    Args:
        source: Markup string, or an already constructed `Selector`.

    Returns:
        The extracted text with leading and trailing whitespace stripped.

    Raises:
        TypeError: If `source` is neither a string nor a `Selector` object.
    """
    if isinstance(source, str):
        selector = Selector(text=source)
    elif isinstance(source, Selector):
        selector = source
    else:
        raise TypeError('Source must be either a string or a `Selector` object.')

    # The output is collected as non-empty chunks and joined once at the end. Only the last emitted
    # character is ever inspected, so the work per node does not grow with the size of the output.
    chunks: list[str] = []

    def _tail() -> str:
        """Return the last emitted character, or an empty string when nothing was emitted yet."""
        return chunks[-1][-1:] if chunks else ''

    def _emit(chunk: str) -> None:
        """Append `chunk` to the output; empty chunks are dropped so that `_tail` stays accurate."""
        if chunk:
            chunks.append(chunk)

    def _extract_text(elements: list[Selector], *, compress: bool = True) -> None:
        """Custom html parsing that performs as implementation from Javascript version of Crawlee."""
        for element in elements:
            tag = getattr(element.root, 'tag', None)

            if tag is None:
                # A root without a `tag` attribute is handled as text.
                raw = element.root
                # Compress white spaces outside of pre block
                compr = _ANY_CONSECUTIVE_WHITE_SPACES.sub(' ', raw) if compress else raw
                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
                if compr.startswith((' ', '\n')) and _EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE.search(_tail()):
                    compr = compr[1:]
                _emit(compr)
                continue

            # Nodes whose tag is not a string are ignored (presumably comments and similar - confirm
            # against lxml), and so are the special elements listed in SKIP_TAGS.
            if not isinstance(tag, str) or tag in SKIP_TAGS:
                continue

            if tag == 'br':
                _emit('\n')
            elif tag == 'td':
                _extract_text(element.xpath('./node()'))
                _emit('\t')
            else:
                # Block elements must be surrounded by newlines (unless beginning of text)
                is_block_tag = tag in BLOCK_TAGS

                if is_block_tag and not _EMPTY_OR_ENDS_WITH_NEW_LINE.search(_tail()):
                    _emit('\n')

                # Only the direct children of a pre element keep their white spaces untouched.
                _extract_text(element.xpath('./node()'), compress=tag != 'pre')

                if is_block_tag and _tail() != '\n':
                    _emit('\n')

    # Start processing the root elements
    _extract_text(selector.xpath('/*'))

    return ''.join(chunks).strip()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,198 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from typing import Callable | ||
|
|
||
| import pytest | ||
| from bs4 import BeautifulSoup | ||
| from parsel import Selector | ||
|
|
||
| from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup | ||
| from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel | ||
|
|
||
| _EXPECTED_TEXT = ( | ||
| "Let's start with a simple text. \n" | ||
| "The ships hung in the sky, much the way that bricks don't. \n" | ||
| "These aren't the Droids you're looking for\n" | ||
| "I'm sorry, Dave. I'm afraid I can't do that.\n" | ||
| "I'm sorry, Dave. I'm afraid I can't do that.\n" | ||
| 'A1\tA2\tA3\t\n' | ||
| 'B1\tB2\tB3\tB 4\t\n' | ||
| 'This is some text with inline elements and HTML entities (>bla<) \n' | ||
| 'Test\n' | ||
| 'a\n' | ||
| 'few\n' | ||
| 'line\n' | ||
| 'breaks\n' | ||
| 'Spaces in an inline text should be completely ignored. \n' | ||
| 'But,\n' | ||
| ' a pre-formatted\n' | ||
| ' block should be kept\n' | ||
| ' pre-formatted.\n' | ||
| 'The Greatest Science Fiction Quotes Of All Time \n' | ||
| "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You " | ||
| 'Nexus, huh? I design your eyes.' | ||
| ) | ||
|
|
||
| _EXAMPLE_HTML = """ | ||
| <html> | ||
| <head> | ||
| <title>Title SHOULD NOT be converted</title> | ||
|
|
||
| <!-- Comments SHOULD NOT be converted --> | ||
| </head> | ||
| <body with='some attributes'> | ||
| Let's start with a simple text. | ||
| <p> | ||
| The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't. | ||
| </p> | ||
| <ul> | ||
| <li>These aren't the Droids you're looking for</li> | ||
| <li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li> | ||
| <li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li> | ||
| </ul> | ||
|
|
||
| <img src="something" alt="This should be ignored" /> | ||
|
|
||
| <!-- Comments SHOULD NOT be converted --> | ||
|
|
||
| <table> | ||
| <tr class="something"> | ||
| <td>A1</td> | ||
| <td attributes="are ignored">A2</td> | ||
| <td>A3</td> | ||
| </tr> | ||
| <tr class="something"> | ||
| <td>B1</td> | ||
| <td attributes="are ignored" even="second attribute">B2</td> | ||
| <td>B3</td> | ||
| <td>B 4</td> | ||
| </tr> | ||
| </table> | ||
|
|
||
| <p> | ||
| This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML entities (>bla<) | ||
| </p> | ||
|
|
||
| <div> | ||
| Test<br> | ||
| a<br /> | ||
| few<br> | ||
| line<br> | ||
| breaks<br> | ||
| </div> | ||
|
|
||
|
|
||
|
|
||
|
|
||
| Spaces | ||
|
|
||
|
|
||
| in | ||
|
|
||
|
|
||
| an inline text should be | ||
|
|
||
|
|
||
| completely ignored. | ||
|
|
||
|
|
||
|
|
||
| <pre> | ||
| But, | ||
| a pre-formatted | ||
| block should be kept | ||
| pre-formatted. | ||
| </pre> | ||
|
|
||
| <svg> | ||
| These special elements SHOULD NOT BE CONVERTED. | ||
| </svg> | ||
|
|
||
| <script> | ||
| // These special elements should be completely skipped. | ||
| skipThis(); | ||
| </script> | ||
|
|
||
| <style> | ||
| /* These special elements should be completely skipped. */ | ||
| .skip_this {} | ||
| </style> | ||
|
|
||
| <canvas> | ||
| This should be skipped too. | ||
| </canvas> | ||
|
|
||
| <a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a> | ||
| <p> | ||
| Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, | ||
| just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>. | ||
| </p> | ||
| </body> | ||
| </html> | ||
| """ | ||
|
|
||
|
|
||
@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
@pytest.mark.parametrize(
    ('source', 'expected_text'),
    [
        # Complete example document
        (_EXAMPLE_HTML, _EXPECTED_TEXT),
        # Bare text without any markup
        (' Plain text node ', 'Plain text node'),
        (' \nPlain text node \n ', 'Plain text node'),
        # Headings combined with line breaks
        ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'),
        ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
        ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
        ('<h1>Header 1</h1> \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
        ('<h1>Header 1</h1> \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'),
        ('<h1>Header 1</h1> \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'),
        # Nested block elements
        ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'),
        # Comments and special elements produce no text
        ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'),
        ('<div>Div1</div><style>Skip styles</style>', 'Div1'),
        ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),
        ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),
        ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),
        ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),
        # White space handling inside inline and pre elements
        ('<b>A B C D E\n\nF G</b>', 'A B C D E F G'),
        ('<pre>A B C D E\n\nF G</pre>', 'A B C D E\n\nF G'),
        # Deep nesting adds no extra newlines
        (
            '<h1>Heading 1</h1><div><div><div><div>Deep Div</div></div></div></div><h2>Heading 2</h2>',
            'Heading 1\nDeep Div\nHeading 2',
        ),
        # Inline elements do not split words and attributes are ignored
        ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),
        ('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'),
        # Table cells are separated by tabs and rows by newlines
        (
            '<table>\n'
            '<tr>\n'
            '<td>Cell A1</td><td>Cell A2</td>\n'
            '<td> Cell A3 </td>\n'
            '</tr>\n'
            '<tr>\n'
            '<td>Cell B1</td><td>Cell B2</td>\n'
            '</tr>\n'
            '</table>',
            'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2',
        ),
        # Non-ASCII characters
        ('<span>á é</span>', 'á é'),
    ],
)
def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None:
    """Check that each implementation converts `source` to exactly `expected_text`."""
    actual_text = html_to_text(source)
    assert actual_text == expected_text
|
|
||
|
|
||
@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
    """Check that each implementation raises `TypeError` for an integer source."""
    not_markup = 1
    with pytest.raises(TypeError):
        html_to_text(not_markup)  # type: ignore[arg-type]  # Intentional wrong type test.
|
|
||
|
|
||
def test_html_to_text_parsel() -> None:
    """Check that the Parsel implementation also accepts a ready-made `Selector`."""
    parsed_document = Selector(_EXAMPLE_HTML)
    actual_text = html_to_text_parsel(parsed_document)
    assert actual_text == _EXPECTED_TEXT
|
|
||
|
|
||
def test_html_to_text_beautifulsoup() -> None:
    """Check that the BeautifulSoup implementation also accepts a ready-made `BeautifulSoup` object."""
    parsed_document = BeautifulSoup(_EXAMPLE_HTML)
    actual_text = html_to_text_beautifulsoup(parsed_document)
    assert actual_text == _EXPECTED_TEXT
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.