Skip to content

Commit 61f0f4e

Browse files
authored
fix: kept elements being removed (#166)
Elements marked to keep were removable under specific circumstances. This PR fixes these edge cases.
1 parent 5741910 commit 61f0f4e

8 files changed

Lines changed: 654 additions & 17 deletions

File tree

fixtures/obamawhitehouse.archives.gov/1490227791307.html

Lines changed: 605 additions & 0 deletions
Large diffs are not rendered by default.

src/cleaners/content.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ export default function extractCleanNode(
3131
// this can sometimes be too aggressive.
3232
if (defaultCleaner) cleanImages(article, $);
3333

34+
// Make links absolute
35+
makeLinksAbsolute(article, $, url);
36+
3437
// Mark elements to keep that would normally be removed.
3538
// E.g., stripJunkTags will remove iframes, so we're going to mark
3639
// YouTube/Vimeo videos as elements we want to keep.
@@ -48,9 +51,6 @@ export default function extractCleanNode(
4851
// Clean headers
4952
cleanHeaders(article, $, title);
5053

51-
// Make links absolute
52-
makeLinksAbsolute(article, $, url);
53-
5454
// We used to clean UL's and OL's here, but it was leading to
5555
// too many in-article lists being removed. Consider a better
5656
// way to detect menus particularly and remove them.

src/extractors/custom/obamawhitehouse.archives.gov/index.js

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@ export const ObamawhitehouseArchivesGovExtractor = {
4646
// Is there anything in the content you selected that needs transformed
4747
// before it's consumable content? E.g., unusual lazy loaded images
4848
transforms: {
49-
'iframe[src*=youtube]': ($node) => {
50-
$node.parents('.panel-pane').replaceWith($node);
51-
},
5249
},
5350

5451
// Is there anything that is in the result that shouldn't be?

src/extractors/custom/obamawhitehouse.archives.gov/index.test.js

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,30 @@ describe('ObamawhitehouseArchivesGovExtractor', () => {
115115
assert.equal($('iframe[src*="youtube"]').length, 1);
116116
});
117117
});
118+
119+
describe('gets more failing blogs', () => {
120+
let result;
121+
let url;
122+
beforeAll(() => {
123+
url =
124+
'https://obamawhitehouse.archives.gov/the-press-office/2016/12/24/weekly-address-merry-christmas-president-and-first-lady';
125+
const html =
126+
fs.readFileSync('./fixtures/obamawhitehouse.archives.gov/1490227791307.html');
127+
result =
128+
Mercury.parse(url, html, { fallback: false });
129+
});
130+
131+
it('gets the words and video', async () => {
132+
const { content } = await result;
133+
134+
const $ = cheerio.load(content || '');
135+
136+
const first13 = excerptContent($('*').first().text(), 13);
137+
138+
// Update these values with the expected values from
139+
// the article.
140+
assert.equal(first13, 'In this week’s address, the President and the First Lady wished all Americans');
141+
assert.equal($('iframe[src*="youtube"]').length, 1);
142+
});
143+
});
118144
});

src/utils/dom/clean-attributes.js

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ import {
33
setAttrs,
44
} from 'utils/dom';
55

6-
import { WHITELIST_ATTRS_RE } from './constants';
6+
import {
7+
WHITELIST_ATTRS_RE,
8+
KEEP_CLASS,
9+
} from './constants';
710

8-
function removeAllButWhitelist($article) {
11+
function removeAllButWhitelist($article, $) {
912
$article.find('*').each((index, node) => {
1013
const attrs = getAttrs(node);
1114

@@ -18,6 +21,9 @@ function removeAllButWhitelist($article) {
1821
}, {}));
1922
});
2023

24+
// Remove the mercury-parser-keep class from result
25+
$(`.${KEEP_CLASS}`, $article).removeClass(KEEP_CLASS);
26+
2127
return $article;
2228
}
2329

@@ -28,12 +34,12 @@ function removeAllButWhitelist($article) {
2834
// }
2935

3036
// Remove attributes like style or align
31-
export default function cleanAttributes($article) {
37+
export default function cleanAttributes($article, $) {
3238
// Grabbing the parent because at this point
3339
// $article will be wrapped in a div which will
3440
// have a score set on it.
3541
return removeAllButWhitelist(
36-
$article.parent().length ?
37-
$article.parent() : $article
42+
$article.parent().length ? $article.parent() : $article,
43+
$,
3844
);
3945
}

src/utils/dom/clean-attributes.test.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ describe('cleanAttributes($)', () => {
99
it('removes style attributes from nodes', () => {
1010
const $ = cheerio.load(HTML.removeStyle.before);
1111

12-
const result = cleanAttributes($('*').first());
12+
const result = cleanAttributes($('*').first(), $);
1313
assertClean($.html(result), HTML.removeStyle.after);
1414
});
1515

1616
it('removes align attributes from nodes', () => {
1717
const $ = cheerio.load(HTML.removeAlign.before);
1818

19-
const result = cleanAttributes($('*').first());
19+
const result = cleanAttributes($('*').first(), $);
2020
assertClean($.html(result), HTML.removeAlign.after);
2121
});
2222
});

src/utils/dom/clean-tags.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ import {
55
scoreCommas,
66
} from 'extractors/generic/content/scoring';
77

8-
import { CLEAN_CONDITIONALLY_TAGS } from './constants';
8+
import {
9+
CLEAN_CONDITIONALLY_TAGS,
10+
KEEP_CLASS,
11+
} from './constants';
912
import { normalizeSpaces } from '../text';
1013
import { linkDensity } from './index';
1114

@@ -89,6 +92,9 @@ function removeUnlessContent($node, $, weight) {
8992
export default function cleanTags($article, $) {
9093
$(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => {
9194
const $node = $(node);
95+
// If marked to keep, skip it
96+
if ($node.hasClass(KEEP_CLASS) || $node.find(`.${KEEP_CLASS}`).length > 0) return;
97+
9298
let weight = getScore($node);
9399
if (!weight) {
94100
weight = getOrInitScore($node, $);

src/utils/dom/strip-junk-tags.js

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,5 @@ export default function stripJunkTags(article, $, tags = []) {
1212
// any element with a class of mercury-parser-keep
1313
$(tags.join(','), article).not(`.${KEEP_CLASS}`).remove();
1414

15-
// Remove the mercury-parser-keep class from result
16-
$(`.${KEEP_CLASS}`, article).removeClass(KEEP_CLASS);
17-
1815
return $;
1916
}

0 commit comments

Comments
 (0)