Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions src/crawlee/_utils/html_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# This file contains shared constants used by the different implementations of the `html_to_text` function.
from __future__ import annotations

import re

# Tags matched against lower-cased element names; elements listed here are treated as skippable.
SKIP_TAGS = {'script', 'style', 'canvas', 'svg', 'noscript', 'title'}

# Tags treated as block-level elements.
BLOCK_TAGS = {
    # Headings and paragraphs
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'p',
    # Lists
    'dl',
    'li',
    'ol',
    'ul',
    # Generic containers and pre-formatted / quoted text
    'address',
    'blockquote',
    'div',
    'pre',
    # Forms
    'fieldset',
    'form',
    'option',
    'select',
    # Tables
    'table',
    'tr',
}

# Matches a string that is empty or whose last character is whitespace.
_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE = re.compile(r'(^|\s)$')
# Matches a string that is empty or whose last character is a newline.
_EMPTY_OR_ENDS_WITH_NEW_LINE = re.compile(r'(^|\n)$')
# Matches any run of one or more whitespace characters.
_ANY_CONSECUTIVE_WHITE_SPACES = re.compile(r'\s+')
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from crawlee._utils.docs import docs_group
from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
from crawlee.beautifulsoup_crawler._utils import html_to_text


@dataclass(frozen=True)
Expand All @@ -24,3 +25,7 @@ def soup(self) -> BeautifulSoup:
def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[BeautifulSoup]) -> Self:
    """Create a new context by copying every dataclass field of a `ParsedHttpCrawlingContext[BeautifulSoup]`."""
    field_names = [field.name for field in fields(context)]
    init_kwargs = {name: getattr(context, name) for name in field_names}
    return cls(**init_kwargs)

def html_to_text(self) -> str:
    """Return `parsed_content` as newline-separated plain text with all tags removed."""
    document = self.parsed_content
    return html_to_text(document)
66 changes: 66 additions & 0 deletions src/crawlee/beautifulsoup_crawler/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup, NavigableString, PageElement, Tag

from crawlee._utils.html_to_text import (
_ANY_CONSECUTIVE_WHITE_SPACES,
_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
_EMPTY_OR_ENDS_WITH_NEW_LINE,
BLOCK_TAGS,
SKIP_TAGS,
)

if TYPE_CHECKING:
from collections.abc import Iterable


def html_to_text(source: str | BeautifulSoup) -> str:
    """Convert a markup string or a `BeautifulSoup` object to newline-separated plain text without tags.

    Args:
        source: Raw markup, or a document that was already parsed by BeautifulSoup.

    Returns:
        The extracted text with leading and trailing whitespace stripped.

    Raises:
        TypeError: If `source` is neither a `str` nor a `BeautifulSoup` instance.
    """
    if isinstance(source, str):
        # Name the parser explicitly. When it is omitted, BeautifulSoup picks whichever parser happens to be
        # installed and emits `GuessedAtParserWarning`, so the same markup can yield different trees (and
        # therefore different text) on different machines. `lxml` is the first choice of that guess, so
        # environments that have it installed keep their current output.
        soup = BeautifulSoup(source, features='lxml')
    elif isinstance(source, BeautifulSoup):
        soup = source
    else:
        raise TypeError('Source must be either a string or a `BeautifulSoup` object.')

    text = ''

    def _page_element_to_text(page_elements: Iterable[PageElement]) -> None:
        """Append the text of `page_elements` to `text`, performing as the Javascript version of Crawlee does."""
        nonlocal text
        for page_element in page_elements:
            if isinstance(page_element, NavigableString):
                # NOTE(review): comment nodes are `NavigableString` subclasses and therefore reach this branch.
                # The tests expect them to contribute no text, which presumably relies on `get_text()`
                # returning an empty string for them - confirm against the installed bs4 version.
                raw = page_element.get_text()
                parent = page_element.parent
                if isinstance(parent, Tag) and parent.name.lower() == 'pre':
                    # Only text placed directly inside <pre> keeps its original whitespace.
                    compr = raw
                else:
                    # Compress white spaces outside of pre block
                    compr = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', raw)
                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
                if compr.startswith((' ', '\n')) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
                    compr = compr[1:]
                text += compr
                continue

            if not isinstance(page_element, Tag):
                # Any other kind of page element carries no text for the output.
                continue

            tag_name = page_element.name.lower()
            if tag_name in SKIP_TAGS:
                # Skip special elements together with everything nested inside them
                continue

            if tag_name == 'br':
                text += '\n'
            elif tag_name == 'td':
                # Table cells are separated by tabs; the enclosing <tr> block supplies the line break.
                _page_element_to_text(page_element.children)
                text += '\t'
            else:
                # Block elements must be surrounded by newlines (unless beginning of text)
                is_block_tag = tag_name in BLOCK_TAGS
                if is_block_tag and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
                    text += '\n'
                _page_element_to_text(page_element.children)
                if is_block_tag and not text.endswith('\n'):
                    text += '\n'

    _page_element_to_text(soup.children)

    return text.strip()
5 changes: 5 additions & 0 deletions src/crawlee/parsel_crawler/_parsel_crawling_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from crawlee._utils.docs import docs_group
from crawlee.abstract_http_crawler._http_crawling_context import ParsedHttpCrawlingContext
from crawlee.parsel_crawler._utils import html_to_text


@dataclass(frozen=True)
Expand All @@ -24,3 +25,7 @@ def selector(self) -> Selector:
def from_parsed_http_crawling_context(cls, context: ParsedHttpCrawlingContext[Selector]) -> Self:
    """Create a new context by copying every dataclass field of a `ParsedHttpCrawlingContext[Selector]`."""
    field_names = [field.name for field in fields(context)]
    init_kwargs = {name: getattr(context, name) for name in field_names}
    return cls(**init_kwargs)

def html_to_text(self) -> str:
    """Return `parsed_content` as newline-separated plain text with all tags removed."""
    document = self.parsed_content
    return html_to_text(document)
63 changes: 63 additions & 0 deletions src/crawlee/parsel_crawler/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from __future__ import annotations

import re

from parsel import Selector

from crawlee._utils.html_to_text import (
_ANY_CONSECUTIVE_WHITE_SPACES,
_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE,
_EMPTY_OR_ENDS_WITH_NEW_LINE,
BLOCK_TAGS,
SKIP_TAGS,
)


def html_to_text(source: str | Selector) -> str:
    """Convert a markup string or a `Selector` to newline-separated plain text without tags using Parsel.

    Args:
        source: Raw markup, or a document that was already parsed into a `Selector`.

    Returns:
        The extracted text with leading and trailing whitespace stripped.

    Raises:
        TypeError: If `source` is neither a `str` nor a `Selector` instance.
    """
    if isinstance(source, Selector):
        selector = source
    elif isinstance(source, str):
        selector = Selector(text=source)
    else:
        raise TypeError('Source must be either a string or a `Selector` object.')

    text = ''

    def _extract_text(elements: list[Selector], *, compress: bool = True) -> None:
        """Append the text of `elements` to `text`, performing as the Javascript version of Crawlee does."""
        nonlocal text
        for element in elements:
            node = element.root
            # Nodes without a `tag` attribute are handled as text.
            tag = getattr(node, 'tag', None)

            if tag is None:
                # Compress white spaces outside of pre block
                chunk = re.sub(_ANY_CONSECUTIVE_WHITE_SPACES, ' ', node) if compress else node
                # If text is empty or ends with a whitespace, don't add the leading whitespace or new line
                if chunk.startswith((' ', '\n')) and re.search(_EMPTY_OR_ENDS_WITH_ANY_WHITE_SPACE, text):
                    chunk = chunk[1:]
                text += chunk
                continue

            # Nodes whose tag is not a plain string, and the special elements, contribute nothing.
            if not isinstance(tag, str) or tag in SKIP_TAGS:
                continue

            if tag == 'br':
                text += '\n'
                continue

            children = element.xpath('./node()')

            if tag == 'td':
                # Table cells are separated by tabs; the enclosing <tr> block supplies the line break.
                _extract_text(children)
                text += '\t'
                continue

            # Block elements must be surrounded by newlines (unless beginning of text)
            surround_with_newlines = tag in BLOCK_TAGS

            if surround_with_newlines and not re.search(_EMPTY_OR_ENDS_WITH_NEW_LINE, text):
                text += '\n'

            # Only the direct children of <pre> keep their whitespace untouched.
            _extract_text(children, compress=tag != 'pre')

            if surround_with_newlines and not text.endswith('\n'):
                text += '\n'

    # Start processing the root elements
    _extract_text(selector.xpath('/*'))

    return text.strip()
198 changes: 198 additions & 0 deletions tests/unit/_utils/test_html_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
from __future__ import annotations

from typing import Callable

import pytest
from bs4 import BeautifulSoup
from parsel import Selector

from crawlee.beautifulsoup_crawler._utils import html_to_text as html_to_text_beautifulsoup
from crawlee.parsel_crawler._utils import html_to_text as html_to_text_parsel

# Reference output: the plain text both `html_to_text` implementations must produce for `_EXAMPLE_HTML`.
# Trailing spaces before `\n` and the tab after every table cell are part of the expected result.
_EXPECTED_TEXT = (
"Let's start with a simple text. \n"
"The ships hung in the sky, much the way that bricks don't. \n"
"These aren't the Droids you're looking for\n"
"I'm sorry, Dave. I'm afraid I can't do that.\n"
"I'm sorry, Dave. I'm afraid I can't do that.\n"
'A1\tA2\tA3\t\n'
'B1\tB2\tB3\tB 4\t\n'
'This is some text with inline elements and HTML entities (>bla<) \n'
'Test\n'
'a\n'
'few\n'
'line\n'
'breaks\n'
'Spaces in an inline text should be completely ignored. \n'
'But,\n'
' a pre-formatted\n'
' block should be kept\n'
' pre-formatted.\n'
'The Greatest Science Fiction Quotes Of All Time \n'
"Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You "
'Nexus, huh? I design your eyes.'
)

# Sample document used by the tests below; it is expected to convert to `_EXPECTED_TEXT`.
# NOTE(review): whitespace inside this literal is significant - the lines inside <pre> must carry the same
# leading spaces that `_EXPECTED_TEXT` expects; confirm the literal's indentation is intact.
_EXAMPLE_HTML = """
<html>
<head>
<title>Title SHOULD NOT be converted</title>

<!-- Comments SHOULD NOT be converted -->
</head>
<body with='some attributes'>
Let's start with a simple text.
<p>
The ships hung in the sky, much the <a class="click" href="https://example.com/a/b/first">way that</a> bricks don't.
</p>
<ul>
<li>These aren't the Droids you're looking for</li>
<li some="attribute"><a href="https://example.com/a/second">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
<li><a class="click" href="https://example.com/a/b/third">I'm sorry, Dave. I'm afraid I can't do that.</a></li>
</ul>

<img src="something" alt="This should be ignored" />

<!-- Comments SHOULD NOT be converted -->

<table>
<tr class="something">
<td>A1</td>
<td attributes="are ignored">A2</td>
<td>A3</td>
</tr>
<tr class="something">
<td>B1</td>
<td attributes="are ignored" even="second attribute">B2</td>
<td>B3</td>
<td>B 4</td>
</tr>
</table>

<p>
This is <b>some<i> text <b>with</b></i></b> inline <span>elements</span> and HTML&nbsp;entities (&gt;bla&lt;)
</p>

<div>
Test<br>
a<br />
few<br>
line<br>
breaks<br>
</div>




Spaces


in


an inline text should be


completely ignored.



<pre>
But,
a pre-formatted
block should be kept
pre-formatted.
</pre>

<svg>
These special elements SHOULD NOT BE CONVERTED.
</svg>

<script>
// These special elements should be completely skipped.
skipThis();
</script>

<style>
/* These special elements should be completely skipped. */
.skip_this {}
</style>

<canvas>
This should be skipped too.
</canvas>

<a class="click" href="https://another.com/a/fifth">The Greatest Science Fiction Quotes Of All Time</a>
<p>
Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design,
just eyes. You Nexus, huh? I design your <a class="click" href="http://cool.com/">eyes</a>.
</p>
</body>
</html>
"""


# Runs every (source, expected_text) pair against both the Parsel and the BeautifulSoup implementation,
# so the two must produce identical output for identical markup.
@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
@pytest.mark.parametrize(
('source', 'expected_text'),
[
# Full sample document.
(_EXAMPLE_HTML, _EXPECTED_TEXT),
# Bare text: surrounding whitespace is stripped.
(' Plain text node ', 'Plain text node'),
(' \nPlain text node \n ', 'Plain text node'),
# Block elements and <br>: trailing line breaks are stripped, inner ones are kept.
('<h1>Header 1</h1> <h2>Header 2</h2>', 'Header 1\nHeader 2'),
('<h1>Header 1</h1> <h2>Header 2</h2><br>', 'Header 1\nHeader 2'),
('<h1>Header 1</h1> <h2>Header 2</h2><br><br>', 'Header 1\nHeader 2'),
('<h1>Header 1</h1> <h2>Header 2</h2><br><br><br>', 'Header 1\nHeader 2'),
('<h1>Header 1</h1><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
('<h1>Header 1</h1> <br> <h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
('<h1>Header 1</h1> \n <br>\n<h2>Header 2</h2><br><br><br>', 'Header 1\n\nHeader 2'),
('<h1>Header 1</h1> \n <br>\n<br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\nHeader 2'),
('<h1>Header 1</h1> \n <br>\n<br><br><h2>Header 2</h2><br><br><br>', 'Header 1\n\n\n\nHeader 2'),
('<div><div>Div</div><p>Paragraph</p></div>', 'Div\nParagraph'),
# Comments and special elements produce no text; tag names are matched case-insensitively.
('<div>Div1</div><!-- Some comments --><div>Div2</div>', 'Div1\nDiv2'),
('<div>Div1</div><style>Skip styles</style>', 'Div1'),
('<script>Skip_scripts();</script><div>Div1</div>', 'Div1'),
('<SCRIPT>Skip_scripts();</SCRIPT><div>Div1</div>', 'Div1'),
('<svg>Skip svg</svg><div>Div1</div>', 'Div1'),
('<canvas>Skip canvas</canvas><div>Div1</div>', 'Div1'),
# Whitespace is compressed in inline text but preserved inside <pre>.
('<b>A B C D E\n\nF G</b>', 'A B C D E F G'),
('<pre>A B C D E\n\nF G</pre>', 'A B C D E\n\nF G'),
(
'<h1>Heading 1</h1><div><div><div><div>Deep Div</div></div></div></div><h2>Heading 2</h2>',
'Heading 1\nDeep Div' '\nHeading 2',
),
# Inline elements add no separators of their own.
('<a>this_word</a>_should_<b></b>be_<span>one</span>', 'this_word_should_be_one'),
('<span attributes="should" be="ignored">some <span>text</span></span>', 'some text'),
# Table cells are followed by a tab, rows end with a newline.
(
(
"""<table>
<tr>
<td>Cell A1</td><td>Cell A2</td>
<td> Cell A3 </td>
</tr>
<tr>
<td>Cell B1</td><td>Cell B2</td>
</tr>
</table>"""
),
'Cell A1\tCell A2\tCell A3 \t\nCell B1\tCell B2',
),
# HTML entities are decoded.
('<span>&aacute; &eacute;</span>', 'á é'),
],
)
def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[str], str]) -> None:
assert html_to_text(source) == expected_text


@pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup])
def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None:
    """Both implementations must reject a source that is neither markup text nor a parsed document."""
    not_markup = 1
    with pytest.raises(TypeError):
        html_to_text(not_markup)  # type: ignore[arg-type]  # Intentional wrong type test.


def test_html_to_text_parsel() -> None:
    """An already built `Selector` is accepted as input and yields the reference text."""
    parsed_document = Selector(_EXAMPLE_HTML)
    assert html_to_text_parsel(parsed_document) == _EXPECTED_TEXT


def test_html_to_text_beautifulsoup() -> None:
    """An already parsed `BeautifulSoup` document is accepted as input and yields the reference text."""
    # Pin the parser instead of letting BeautifulSoup guess one: guessing emits `GuessedAtParserWarning` and
    # makes the parsed tree depend on what is installed. `lxml` is what the guess resolves to whenever lxml is
    # available, and `parsel` (imported by this module) depends on lxml, so the expectation is unchanged.
    soup = BeautifulSoup(_EXAMPLE_HTML, features='lxml')
    assert html_to_text_beautifulsoup(soup) == _EXPECTED_TEXT
Loading
Loading