-
Notifications
You must be signed in to change notification settings - Fork 514
feat: Add html_to_text helper function
#792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
c803951
c571caa
f26f8b6
db0cc65
26d5aee
b45e2f5
27cfedd
bab689c
6f32156
964c2cf
0cb68af
210749e
50968bd
9f15a91
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,86 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| from typing import TYPE_CHECKING | ||
|
|
||
| from bs4 import BeautifulSoup, NavigableString, PageElement, Tag | ||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import Iterable | ||
|
|
||
# Elements that are dropped from the extracted text together with everything inside them.
SKIP_TAGS = {'canvas', 'noscript', 'script', 'style', 'svg', 'title'}

# Elements whose content is placed on its own line(s) in the extracted text.
BLOCK_TAGS = {
    # Headings and paragraphs.
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',
    # Lists.
    'dl', 'li', 'ol', 'ul',
    # Generic containers and quoted / pre-formatted sections.
    'address', 'blockquote', 'div', 'pre',
    # Forms.
    'fieldset', 'form', 'option', 'select',
    # Tables.
    'table', 'tr',
}  # fmt: skip
janbuchar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
# One or more whitespace characters in a row; used to collapse such runs into a single space.
_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')
# Matches a string that is empty or ends with a newline.
_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
# Matches a string that is empty or ends with any whitespace character.
_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
|
|
||
|
|
||
def html_to_text(source: str | BeautifulSoup) -> str:
    """Convert a markup string or BeautifulSoup object to newline separated plain text without tags.

    Text nodes are emitted in document order. Outside of `<pre>`, every run of whitespace is collapsed to
    a single space. Elements listed in `BLOCK_TAGS` are put on their own lines, `<br>` becomes a newline,
    every `<td>` is followed by a tab, and elements listed in `SKIP_TAGS` are dropped together with their
    content. Leading and trailing whitespace of the final result is stripped.

    Args:
        source: Markup string or an already parsed `BeautifulSoup` object. Strings are parsed with the
            standard library `html.parser` backend.

    Returns:
        The extracted plain text.

    Raises:
        TypeError: If `source` is neither a string nor a `BeautifulSoup` object.
    """
    if isinstance(source, str):
        # Name the parser explicitly: letting bs4 guess emits a warning and makes the output depend on
        # which optional parser packages happen to be installed.
        soup = BeautifulSoup(source, features='html.parser')
    elif isinstance(source, BeautifulSoup):
        soup = source
    else:
        raise TypeError('Source must be either a string or a BeautifulSoup object.')

    # The output is collected as pieces and joined once at the end. Every formatting decision below only
    # depends on the last character emitted so far, so that is all that is tracked ('' = nothing yet).
    pieces: list[str] = []
    last_char = ''

    def _emit(piece: str) -> None:
        """Append a piece of output and remember its final character."""
        nonlocal last_char
        if piece:
            pieces.append(piece)
            last_char = piece[-1]

    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
        """Custom html parsing that performs as implementation from Javascript version of Crawlee."""
        for page_element in page_elements:
            if isinstance(page_element, NavigableString):
                parent = page_element.parent
                if isinstance(parent, Tag) and parent.name.lower() == 'pre':
                    # Pre-formatted text keeps its whitespace untouched.
                    compr = page_element.get_text()
                else:
                    # Compress white spaces outside of pre block
                    compr = _ANY_CONSECUTIVE_WHITE_SPACES.sub(' ', page_element.get_text())
                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
                if compr.startswith((' ', '\n')) and (not last_char or last_char.isspace()):
                    compr = compr[1:]
                _emit(compr)
                continue

            if not isinstance(page_element, Tag):
                # Anything that is neither a text node nor a tag contributes nothing.
                continue

            tag_name = page_element.name.lower()
            if tag_name in SKIP_TAGS:
                # Special elements are skipped including everything nested in them.
                continue

            if tag_name == 'br':
                _emit('\n')
            elif tag_name == 'td':
                # Table cells are separated by tabs.
                _page_element_to_text(page_element.children)
                _emit('\t')
            else:
                # Block elements must be surrounded by newlines (unless beginning of text)
                is_block_tag = tag_name in BLOCK_TAGS
                if is_block_tag and last_char not in ('', '\n'):
                    _emit('\n')
                _page_element_to_text(page_element.children)
                if is_block_tag and last_char != '\n':
                    _emit('\n')

    _page_element_to_text(soup.children)
    return ''.join(pieces).strip()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,185 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import pytest | ||
| from bs4 import BeautifulSoup | ||
|
|
||
| from crawlee.beautifulsoup_crawler import html_to_text | ||
|
|
||
| # Exact plain text that html_to_text is expected to return for _EXAMPLE_HTML (the two are paired in the | ||
| # parametrized cases of test_html_to_text). | ||
| # NOTE(review): runs of spaces inside the pre-formatted lines look collapsed in this view - confirm the | ||
| # literal whitespace against the repository before editing these strings. | ||
| _EXPECTED_TEXT = ( | ||
| "Let's start with a simple text. \n" | ||
| "The ships hung in the sky, much the way that bricks don't. \n" | ||
| "These aren't the Droids you're looking for\n" | ||
| "I'm sorry, Dave. I'm afraid I can't do that.\n" | ||
| "I'm sorry, Dave. I'm afraid I can't do that.\n" | ||
| 'A1\tA2\tA3\t\n' | ||
| 'B1\tB2\tB3\tB 4\t\n' | ||
| 'This is some text with inline elements and HTML entities (>bla<) \n' | ||
| 'Test\n' | ||
| 'a\n' | ||
| 'few\n' | ||
| 'line\n' | ||
| 'breaks\n' | ||
| 'Spaces in an inline text should be completely ignored. \n' | ||
| 'But,\n' | ||
| ' a pre-formatted\n' | ||
| ' block should be kept\n' | ||
| ' pre-formatted.\n' | ||
| 'The Greatest Science Fiction Quotes Of All Time \n' | ||
| "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You " | ||
| 'Nexus, huh? I design your eyes.' | ||
| ) | ||
|
|
||
| # HTML document used as the main conversion fixture; its expected plain-text form is _EXPECTED_TEXT. | ||
| # It mixes a title, comments, inline and block elements, a list, a table, line breaks, a <pre> block and | ||
| # <svg>/<script>/<style>/<canvas> elements. | ||
| # NOTE(review): the bare `>` / `<` in "(>bla<)" were presumably character entities, and indentation and | ||
| # blank lines look altered in this view - confirm against the repository before editing the markup. | ||
| _EXAMPLE_HTML = """ | ||
| <html> | ||
| <head> | ||
| <title>Title SHOULD NOT be converted</title> | ||
|
|
||
| <!-- Comments SHOULD NOT be converted --> | ||
| </head> | ||
| <body with='some attributes'> | ||
| Let's start with a simple text. | ||
| <p> | ||
| The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't. | ||
| </p> | ||
| <ul> | ||
| <li>These aren't the Droids you're looking for</li> | ||
| <li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li> | ||
| <li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li> | ||
| </ul> | ||
|
|
||
| <img src="something" alt="This should be ignored" /> | ||
|
|
||
| <!-- Comments SHOULD NOT be converted --> | ||
|
|
||
| <table> | ||
| <tr class="something"> | ||
| <td>A1</td> | ||
| <td attributes="are ignored">A2</td> | ||
| <td>A3</td> | ||
| </tr> | ||
| <tr class="something"> | ||
| <td>B1</td> | ||
| <td attributes="are ignored" even="second attribute">B2</td> | ||
| <td>B3</td> | ||
| <td>B 4</td> | ||
| </tr> | ||
| </table> | ||
|
|
||
| <p> | ||
| This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML entities (>bla<) | ||
| </p> | ||
|
|
||
| <div> | ||
| Test<br> | ||
| a<br /> | ||
| few<br> | ||
| line<br> | ||
| breaks<br> | ||
| </div> | ||
|
|
||
|
|
||
|
|
||
|
|
||
| Spaces | ||
|
|
||
|
|
||
| in | ||
|
|
||
|
|
||
| an inline text should be | ||
|
|
||
|
|
||
| completely ignored. | ||
|
|
||
|
|
||
|
|
||
| <pre> | ||
| But, | ||
| a pre-formatted | ||
| block should be kept | ||
| pre-formatted. | ||
| </pre> | ||
|
|
||
| <svg> | ||
| These special elements SHOULD NOT BE CONVERTED. | ||
| </svg> | ||
|
|
||
| <script> | ||
| // These special elements should be completely skipped. | ||
| skipThis(); | ||
| </script> | ||
|
|
||
| <style> | ||
| /* These special elements should be completely skipped. */ | ||
| .skip_this {} | ||
| </style> | ||
|
|
||
| <canvas> | ||
| This should be skipped too. | ||
| </canvas> | ||
|
|
||
| <a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a> | ||
| <p> | ||
| Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, | ||
| just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>. | ||
| </p> | ||
| </body> | ||
| </html> | ||
| """ | ||
|
|
||
|
|
||
| # Each case pairs a source (markup string or parsed BeautifulSoup object) with the exact text that | ||
| # html_to_text must return for it. | ||
| # NOTE(review): several literals below look whitespace-collapsed / entity-decoded in this view, and the | ||
| # 'Heading 1\nDeep Div' '\nHeading 2' value is two adjacent literals joined implicitly - confirm against | ||
| # the repository before editing. | ||
| @pytest.mark.parametrize( | ||
| ('source', 'expected_text'), | ||
| [ | ||
| (_EXAMPLE_HTML, _EXPECTED_TEXT), | ||
| (BeautifulSoup(_EXAMPLE_HTML), _EXPECTED_TEXT), | ||
| (' Plain text node ', 'Plain text node'), | ||
| (' \nPlain text node \n ', 'Plain text node'), | ||
| ('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'), | ||
| ('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'), | ||
| ('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'), | ||
| ('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'), | ||
| ('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'), | ||
| ('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'), | ||
| ('<h1>Header 1</h1> \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'), | ||
| ('<h1>Header 1</h1> \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'), | ||
| ('<h1>Header 1</h1> \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'), | ||
| ('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'), | ||
| ('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'), | ||
| ('<div>Div1</div><style>Skip styles</style>', 'Div1'), | ||
| ('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'), | ||
| ('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'), | ||
| ('<svg>Skip svg</svg><div>Div1</div>', 'Div1'), | ||
| ('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'), | ||
| ('<b>A B C D E\n\nF G</b>', 'A B C D E F G'), | ||
| ('<pre>A B C D E\n\nF G</pre>', 'A B C D E\n\nF G'), | ||
| ( | ||
| '<h1>Heading 1</h1><div><div><div><div>Deep Div</div></div></div></div><h2>Heading 2</h2>', | ||
| 'Heading 1\nDeep Div' '\nHeading 2', | ||
| ), | ||
| ('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'), | ||
| ('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'), | ||
| ( | ||
| ( | ||
| """<table> | ||
| <tr> | ||
| <td>Cell A1</td><td>Cell A2</td> | ||
| <td> Cell A3 </td> | ||
| </tr> | ||
| <tr> | ||
| <td>Cell B1</td><td>Cell B2</td> | ||
| </tr> | ||
| </table>""" | ||
| ), | ||
| 'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2', | ||
| ), | ||
| ('<span>á é</span>', 'á é'), | ||
| ], | ||
| ) | ||
| def test_html_to_text(source: str | BeautifulSoup, expected_text: str) -> None: | ||
| """Check that html_to_text returns exactly the expected text for the given source.""" | ||
| assert html_to_text(source) == expected_text | ||
|
|
||
|
|
||
def test_html_to_text_raises_on_wrong_input_type() -> None:
    """A source that is neither a string nor a BeautifulSoup object must be rejected with TypeError."""
    unsupported_source = 1
    with pytest.raises(TypeError):
        html_to_text(unsupported_source)  # type: ignore[arg-type]  # Intentional wrong type test.
Uh oh!
There was an error while loading. Please reload this page.