Skip to content

Commit e66ad8b

Browse files
WajeehZantout authored and toufic-m committed
feat: add le monde extractor (#415)
1 parent f81dc63 commit e66ad8b

4 files changed

Lines changed: 125 additions & 0 deletions

File tree

fixtures/www.lemonde.fr/1557235525251.html

Lines changed: 7 additions & 0 deletions
Large diffs are not rendered by default.

src/extractors/custom/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,4 @@ export * from './techlog.iij.ad.jp';
126126
export * from './wired.jp';
127127
export * from './japan.zdnet.com';
128128
export * from './www.rbbtoday.com';
129+
export * from './www.lemonde.fr';
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
/**
 * Custom extractor definition for Le Monde articles (www.lemonde.fr).
 *
 * Every field carries a `selectors` list. A plain string entry is a CSS
 * selector; a two-element array pairs a selector with an attribute name
 * (presumably the attribute whose value is read instead of the node text —
 * confirm against the extractor runtime).
 */
export const WwwLemondeFrExtractor = {
  domain: 'www.lemonde.fr',

  title: {
    selectors: ['h1.article__title'],
  },

  author: {
    selectors: ['.author__name'],
  },

  date_published: {
    // Taken from the article's published-time <meta> tag.
    selectors: [['meta[name="og:article:published_time"]', 'value']],
  },

  dek: {
    selectors: ['.article__desc'],
  },

  lead_image_url: {
    // Taken from the page's og:image <meta> tag.
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['.article__content'],

    // No element transforms are defined for this site.
    transforms: {},

    // No selectors are listed for removal from the extracted content.
    clean: [],
  },
};
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import assert from 'assert';
2+
import URL from 'url';
3+
import cheerio from 'cheerio';
4+
5+
import Mercury from 'mercury';
6+
import getExtractor from 'extractors/get-extractor';
7+
import { excerptContent } from 'utils/text';
8+
9+
const fs = require('fs');
10+
// Test suite for the www.lemonde.fr custom extractor. It parses a saved HTML
// fixture and checks each extracted field against the expected value.
describe('WwwLemondeFrExtractor', () => {
  describe('initial test case', () => {
    // `result` holds the value returned by Mercury.parse; each test awaits it.
    let result;
    let url;

    beforeAll(() => {
      url =
        'https://www.lemonde.fr/economie/article/2019/05/07/dans-ses-previsions-economiques-bruxelles-confirme-la-montee-des-perils_5459325_3234.html';
      // The fixture HTML is passed to the parser alongside the URL.
      const html = fs.readFileSync(
        './fixtures/www.lemonde.fr/1557235525251.html'
      );
      // NOTE(review): `fallback: false` presumably restricts extraction to the
      // custom extractor's own selectors — confirm against Mercury's options.
      result = Mercury.parse(url, { html, fallback: false });
    });

    it('is selected properly', () => {
      // The extractor resolved for the URL must match the URL's hostname.
      const extractor = getExtractor(url);
      assert.equal(extractor.domain, URL.parse(url).hostname);
    });

    it('returns the title', async () => {
      const { title } = await result;

      assert.equal(
        title,
        `Les sombres perspectives économiques de la Commission européenne`
      );
    });

    it('returns the author', async () => {
      const { author } = await result;

      assert.equal(author, `Cécile Ducourtieux`);
    });

    it('returns the date_published', async () => {
      const { date_published } = await result;

      assert.equal(date_published, `2019-05-07T11:59:43.000Z`);
    });

    it('returns the dek', async () => {
      const { dek } = await result;

      assert.equal(
        dek,
        'Elle abaisse ses prévisions pour 2019, avec un PIB à 1,4 % pour l’ensemble de l’UE, et à 1,2 % pour la zone euro.'
      );
    });

    it('returns the lead_image_url', async () => {
      const { lead_image_url } = await result;

      assert.equal(
        lead_image_url,
        `https://img.lemde.fr/2019/05/07/316/0/3824/1912/1440/720/60/0/d105b14_dfjDE1I-caggQrT4gvHf2nZP.jpg`
      );
    });

    it('returns the content', async () => {
      const { content } = await result;

      // Load an empty document when no content was extracted, so the
      // assertion below fails on the text comparison rather than on load.
      const $ = cheerio.load(content || '');

      // Compare only the opening of the article text (13 is the length
      // argument handed to excerptContent).
      const first13 = excerptContent(
        $('*')
          .first()
          .text(),
        13
      );

      assert.equal(
        first13,
        'Les dirigeants européens qui doivent se réunir, jeudi 9 mai à Sibiu (Roumanie),'
      );
    });
  });
});

0 commit comments

Comments
 (0)