Skip to content

Commit 3ed778b

Browse files
toufic-madampash
authored and committed
fix: Adapt CNBC extractor to article redesign (#336)
1 parent da9606a commit 3ed778b

3 files changed

Lines changed: 88 additions & 2 deletions

File tree

fixtures/www.cnbc.com/1553160766510.html

Lines changed: 32 additions & 0 deletions
Large diffs are not rendered by default.

src/extractors/custom/www.cnbc.com/index.js

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ export const WwwCnbcComExtractor = {
22
domain: 'www.cnbc.com',
33

44
title: {
5-
selectors: ['h1.title'],
5+
selectors: ['h1.title', 'h1.ArticleHeader-headline'],
66
},
77

88
author: {
@@ -18,7 +18,11 @@ export const WwwCnbcComExtractor = {
1818
},
1919

2020
content: {
21-
selectors: ['div#article_body.content', 'div.story'],
21+
selectors: [
22+
'div#article_body.content',
23+
'div.story',
24+
'div.ArticleBody-articleBody',
25+
],
2226

2327
// Is there anything in the content you selected that needs transformed
2428
// before it's consumable content? E.g., unusual lazy loaded images

src/extractors/custom/www.cnbc.com/index.test.js

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -99,4 +99,54 @@ describe('WwwCnbcComExtractor', () => {
9999
);
100100
});
101101
});
102+
103+
describe('website redesign', () => {
104+
let result;
105+
let url;
106+
beforeAll(() => {
107+
url =
108+
'https://www.cnbc.com/2019/03/18/heres-how-cybersecurity-vendors-drive-the-hacking-news-cycle.html';
109+
const html = fs.readFileSync(
110+
'./fixtures/www.cnbc.com/1553160766510.html'
111+
);
112+
result = Mercury.parse(url, { html, fallback: false });
113+
});
114+
115+
it('returns the title', async () => {
116+
// To pass this test, fill out the title selector
117+
// in ./src/extractors/custom/www.cnbc.com/index.js.
118+
const { title } = await result;
119+
120+
// Update these values with the expected values from
121+
// the article.
122+
assert.equal(
123+
title,
124+
'Desperate to get through to executives, some cybersecurity vendors are resorting to lies and blackmail'
125+
);
126+
});
127+
128+
it('returns the content', async () => {
129+
// To pass this test, fill out the content selector
130+
// in ./src/extractors/custom/www.cnbc.com/index.js.
131+
// You may also want to make use of the clean and transform
132+
// options.
133+
const { content } = await result;
134+
135+
const $ = cheerio.load(content || '');
136+
137+
const first13 = excerptContent(
138+
$('*')
139+
.first()
140+
.text(),
141+
13
142+
);
143+
144+
// Update these values with the expected values from
145+
// the article.
146+
assert.equal(
147+
first13,
148+
'The cybersecurity vendor marketplace is growing so crowded that some companies have been'
149+
);
150+
});
151+
});
102152
});

0 commit comments

Comments
 (0)