Skip to content

Commit 93f84a8

Browse files
authored
Merge pull request #6083 from BookStackApp/better_plain_text
New HTML to Plaintext handling
2 parents 4feb50e + abed4ea commit 93f84a8

File tree

9 files changed

+132
-22
lines changed

9 files changed

+132
-22
lines changed

app/Activity/Models/Comment.php

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
use BookStack\Users\Models\OwnableInterface;
1010
use BookStack\Util\HtmlContentFilter;
1111
use BookStack\Util\HtmlContentFilterConfig;
12+
use BookStack\Util\HtmlToPlainText;
1213
use Illuminate\Database\Eloquent\Builder;
1314
use Illuminate\Database\Eloquent\Factories\HasFactory;
1415
use Illuminate\Database\Eloquent\Relations\BelongsTo;
@@ -87,6 +88,12 @@ public function safeHtml(): string
8788
return $filter->filterString($this->html ?? '');
8889
}
8990

91+
/**
 * Get the content of this comment as plain text, converted
 * from its HTML. A null HTML value is treated as an empty string.
 */
public function getPlainText(): string
{
    $html = $this->html ?? '';

    return (new HtmlToPlainText())->convert($html);
}
96+
9097
public function jointPermissions(): HasMany
9198
{
9299
return $this->hasMany(JointPermission::class, 'entity_id', 'commentable_id')

app/Activity/Notifications/Messages/CommentCreationNotification.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public function toMail(User $notifiable): MailMessage
2424
$locale->trans('notifications.detail_page_name') => new EntityLinkMessageLine($page),
2525
$locale->trans('notifications.detail_page_path') => $this->buildPagePathLine($page, $notifiable),
2626
$locale->trans('notifications.detail_commenter') => $this->user->name,
27-
$locale->trans('notifications.detail_comment') => strip_tags($comment->html),
27+
$locale->trans('notifications.detail_comment') => $comment->getPlainText(),
2828
]);
2929

3030
return $this->newMailMessage($locale)

app/Activity/Notifications/Messages/CommentMentionNotification.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public function toMail(User $notifiable): MailMessage
2424
$locale->trans('notifications.detail_page_name') => new EntityLinkMessageLine($page),
2525
$locale->trans('notifications.detail_page_path') => $this->buildPagePathLine($page, $notifiable),
2626
$locale->trans('notifications.detail_commenter') => $this->user->name,
27-
$locale->trans('notifications.detail_comment') => strip_tags($comment->html),
27+
$locale->trans('notifications.detail_comment') => $comment->getPlainText(),
2828
]);
2929

3030
return $this->newMailMessage($locale)

app/Entities/Repos/BaseRepo.php

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
use BookStack\Sorting\BookSorter;
1717
use BookStack\Uploads\ImageRepo;
1818
use BookStack\Util\HtmlDescriptionFilter;
19+
use BookStack\Util\HtmlToPlainText;
1920
use Illuminate\Http\UploadedFile;
2021

2122
class BaseRepo
@@ -151,9 +152,10 @@ protected function updateDescription(Entity $entity, array $input): void
151152
}
152153

153154
if (isset($input['description_html'])) {
155+
$plainTextConverter = new HtmlToPlainText();
154156
$entity->descriptionInfo()->set(
155157
HtmlDescriptionFilter::filterFromString($input['description_html']),
156-
html_entity_decode(strip_tags($input['description_html']))
158+
$plainTextConverter->convert($input['description_html']),
157159
);
158160
} else if (isset($input['description'])) {
159161
$entity->descriptionInfo()->set('', $input['description']);

app/Entities/Tools/PageContent.php

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
use BookStack\Util\HtmlContentFilter;
1717
use BookStack\Util\HtmlContentFilterConfig;
1818
use BookStack\Util\HtmlDocument;
19+
use BookStack\Util\HtmlToPlainText;
1920
use BookStack\Util\WebSafeMimeSniffer;
2021
use Closure;
2122
use DOMElement;
@@ -303,8 +304,8 @@ protected function setUniqueId(DOMNode $element, array &$idMap): array
303304
public function toPlainText(): string
304305
{
305306
$html = $this->render(true);
306-
307-
return html_entity_decode(strip_tags($html));
307+
$converter = new HtmlToPlainText();
308+
return $converter->convert($html);
308309
}
309310

310311
/**

app/Exports/ExportFormatter.php

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
use BookStack\Uploads\ImageService;
1212
use BookStack\Util\CspService;
1313
use BookStack\Util\HtmlDocument;
14+
use BookStack\Util\HtmlToPlainText;
1415
use DOMElement;
1516
use Exception;
1617
use Throwable;
@@ -242,32 +243,21 @@ protected function containHtml(string $htmlContent): string
242243

243244
/**
244245
* Converts the page contents into simple plain text.
245-
* This method filters any bad looking content to provide a nice final output.
246+
* We re-generate the plain text from HTML at this point, post-page-content rendering.
246247
*/
247248
public function pageToPlainText(Page $page, bool $pageRendered = false, bool $fromParent = false): string
248249
{
249250
$html = $pageRendered ? $page->html : (new PageContent($page))->render();
250-
// Add proceeding spaces before tags so spaces remain between
251-
// text within elements after stripping tags.
252-
$html = str_replace('<', " <", $html);
253-
$text = trim(strip_tags($html));
254-
// Replace multiple spaces with single spaces
255-
$text = preg_replace('/ {2,}/', ' ', $text);
256-
// Reduce multiple horrid whitespace characters.
257-
$text = preg_replace('/(\x0A|\xA0|\x0A|\r|\n){2,}/su', "\n\n", $text);
258-
$text = html_entity_decode($text);
259-
// Add title
260-
$text = $page->name . ($fromParent ? "\n" : "\n\n") . $text;
261-
262-
return $text;
251+
$contentText = (new HtmlToPlainText())->convert($html);
252+
return $page->name . ($fromParent ? "\n" : "\n\n") . $contentText;
263253
}
264254

265255
/**
266256
* Convert a chapter into a plain text string.
267257
*/
268258
public function chapterToPlainText(Chapter $chapter): string
269259
{
270-
$text = $chapter->name . "\n" . $chapter->description;
260+
$text = $chapter->name . "\n" . $chapter->descriptionInfo()->getPlain();
271261
$text = trim($text) . "\n\n";
272262

273263
$parts = [];

app/Util/HtmlToPlainText.php

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
<?php
2+
3+
namespace BookStack\Util;
4+
5+
class HtmlToPlainText
{
    /**
     * Tag names whose content should stay on the current line
     * rather than being started on a new line.
     * NOTE(review): 'div' is listed here even though it is normally a
     * block-level element; presumably intentional, confirm before changing.
     */
    protected array $inlineTags = [
        'a', 'b', 'i', 'u', 'strong', 'em', 'small', 'sup', 'sub', 'span', 'div',
    ];

    /**
     * Convert the provided HTML to relatively clean plain text.
     * Runs of newlines are collapsed to a single newline and
     * leading/trailing whitespace is removed from the result.
     */
    public function convert(string $html): string
    {
        $doc = new HtmlDocument($html);
        $text = $this->nodeToText($doc->getBody());

        // Remove repeated newlines.
        // preg_replace() returns null on failure, so keep the un-collapsed
        // text in that case instead of passing null onward.
        $text = preg_replace('/\n+/', "\n", $text) ?? $text;

        // Remove leading/trailing whitespace
        return trim($text);
    }

    /**
     * Recursively build the text for the given node and its descendants.
     * Text nodes provide their content as-is, and any node whose name
     * is not in the inline tag list is prefixed with a newline.
     */
    protected function nodeToText(\DOMNode $node): string
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            return $node->textContent;
        }

        // Comments and processing instructions hold no visible text and must
        // not introduce a line break in the middle of surrounding content.
        if ($node->nodeType === XML_COMMENT_NODE || $node->nodeType === XML_PI_NODE) {
            return '';
        }

        $text = '';
        if (!in_array($node->nodeName, $this->inlineTags, true)) {
            $text .= "\n";
        }

        foreach ($node->childNodes as $childNode) {
            $text .= $this->nodeToText($childNode);
        }

        return $text;
    }
}

tests/Exports/TextExportTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public function test_book_text_export_format()
5252
$resp = $this->asEditor()->get($entities['book']->getUrl('/export/plaintext'));
5353

5454
$expected = "Export Book\nThis is a book with stuff to export\n\nExport chapter\nA test chapter to be exported\nIt has loads of info within\n\n";
55-
$expected .= "My wonderful page!\nMy great page Full of great stuff";
55+
$expected .= "My wonderful page!\nMy great page\nFull of great stuff";
5656
$resp->assertSee($expected);
5757
}
5858

@@ -82,7 +82,7 @@ public function test_chapter_text_export_format()
8282
$resp = $this->asEditor()->get($entities['book']->getUrl('/export/plaintext'));
8383

8484
$expected = "Export chapter\nA test chapter to be exported\nIt has loads of info within\n\n";
85-
$expected .= "My wonderful page!\nMy great page Full of great stuff";
85+
$expected .= "My wonderful page!\nMy great page\nFull of great stuff";
8686
$resp->assertSee($expected);
8787
}
8888
}

tests/Util/HtmlToPlainTextTest.php

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<?php
2+
3+
namespace Tests\Util;
4+
5+
use BookStack\Util\HtmlToPlainText;
6+
use Tests\TestCase;
7+
8+
/**
 * Tests for the HTML to plain text converter, covering block-level
 * newline handling, entity decoding and inline tag handling.
 */
class HtmlToPlainTextTest extends TestCase
{
    public function test_it_converts_html_to_plain_text()
    {
        $html = <<<HTML
<p>This is a test</p>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
<h2>A Header</h2>
<p>more &lt;&copy;&gt; text <strong>with bold</strong></p>
HTML;
        $expected = <<<TEXT
This is a test
Item 1
Item 2
A Header
more <©> text with bold
TEXT;

        $this->assertConvertsTo($html, $expected);
    }

    public function test_adjacent_list_items_are_separated_by_newline()
    {
        $html = <<<HTML
<ul><li>Item A</li><li>Item B</li></ul>
HTML;
        $expected = <<<TEXT
Item A
Item B
TEXT;

        $this->assertConvertsTo($html, $expected);
    }

    public function test_inline_formats_dont_cause_newlines()
    {
        $html = <<<HTML
<p><strong>H</strong><a>e</a><sup>l</sup><span>l</span><em>o</em></p>
HTML;
        $expected = <<<TEXT
Hello
TEXT;

        $this->assertConvertsTo($html, $expected);
    }

    /**
     * Convert the given HTML and assert the result matches the expected text.
     * Both the input and the expected output are trimmed before use.
     * Deliberately not named "runTest", to avoid clashing with the
     * method of that name which PHPUnit's TestCase defines for its own use.
     */
    protected function assertConvertsTo(string $html, string $expected): void
    {
        $converter = new HtmlToPlainText();
        $result = $converter->convert(trim($html));
        $this->assertSame(trim($expected), $result);
    }
}

0 commit comments

Comments
 (0)