Skip to content

Commit 2fb4764

Browse files
authored
Feat: detect platforms (#52)
Adds detectors that inspect a page's HTML to pick the matching custom extractor for a publishing platform when the URL's domain alone does not identify it. Currently supports Medium and Blogger.
1 parent 64c0fad commit 2fb4764

5 files changed

Lines changed: 58 additions & 6 deletions

File tree

src/extractors/detect-by-html.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import {
2+
MediumExtractor,
3+
BloggerExtractor,
4+
} from './custom/';
5+
6+
// Lookup table used for platform sniffing: each key is a CSS selector for a
// <meta> tag, and each value is the extractor paired with that selector.
// Entries are added in priority order, since the first selector that matches
// the document wins.
const Detectors = {};
Detectors['meta[name="al:ios:app_name"][value="Medium"]'] = MediumExtractor;
Detectors['meta[name="generator"][value="blogger"]'] = BloggerExtractor;
10+
11+
/**
 * Find the extractor whose detector selector matches the given document.
 *
 * @param {Function} $ - Query function for the loaded document; it is called
 *   as `$(selector)` and the result's `length` is inspected.
 * @returns {Object|undefined} The extractor registered for the first selector
 *   in `Detectors` that matches at least one element, or `undefined` when no
 *   selector matches or `$` is not callable.
 */
export default function detectByHtml($) {
  // Callers may have no usable document (argument omitted, or a non-callable
  // value standing in for a failed load). Report "no match" instead of
  // throwing a TypeError on `$(s)`.
  if (typeof $ !== 'function') {
    return undefined;
  }

  // Object.keys yields only the string selectors, in insertion order.
  const selector = Object.keys(Detectors).find(s => $(s).length > 0);

  // When nothing matched, `selector` is undefined and so is the lookup.
  return Detectors[selector];
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import assert from 'assert';
2+
import cheerio from 'cheerio';
3+
4+
import detectByHtml from './detect-by-html';
5+
6+
// Exercises detectByHtml against minimal documents loaded with cheerio.
describe('detectByHtml', () => {
  it('detects a medium post from the html', () => {
    // A head containing only the app-name meta tag with value "Medium".
    const $ = cheerio.load(
      '<head><meta name="al:ios:app_name" value="Medium" /></head>'
    );
    const detected = detectByHtml($);

    assert.equal(detected.domain, 'medium.com');
  });

  it('returns nothing if no match is found', () => {
    // Markup with no meta tags at all.
    const $ = cheerio.load('<div></div>');
    const detected = detectByHtml($);

    // Loose comparison: passes for either null or undefined.
    assert.equal(detected, null);
  });
});

src/extractors/get-extractor.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@ import URL from 'url';
22

33
import Extractors from './all';
44
import GenericExtractor from './generic';
5+
import detectByHtml from './detect-by-html';
56

6-
export default function getExtractor(url, parsedUrl) {
7+
/**
 * Choose the extractor to use for a page.
 *
 * Lookup order: exact hostname, then the last two hostname labels, then
 * detection from the page's HTML, and finally the generic extractor.
 *
 * @param {string} url - Page URL; parsed only when `parsedUrl` is falsy.
 * @param {Object} [parsedUrl] - Already-parsed URL exposing `hostname`.
 * @param {Function} [$] - Query function for the loaded document. Optional:
 *   when it is missing or not callable, HTML detection is skipped so callers
 *   using the two-argument form keep working.
 * @returns {Object} The selected extractor (never undefined).
 */
export default function getExtractor(url, parsedUrl, $) {
  const { hostname } = parsedUrl || URL.parse(url);
  // Last two labels, e.g. "en.wikipedia.org" -> "wikipedia.org".
  const baseDomain = hostname.split('.').slice(-2).join('.');

  const byDomain = Extractors[hostname] || Extractors[baseDomain];
  if (byDomain) {
    return byDomain;
  }

  // Only sniff the markup when there is a callable document to query.
  // Anything else (argument omitted, or a non-function value) falls through
  // to the generic extractor instead of throwing.
  const byHtml = typeof $ === 'function' ? detectByHtml($) : undefined;

  return byHtml || GenericExtractor;
}

src/extractors/get-extractor.test.js

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import assert from 'assert';
2+
import cheerio from 'cheerio';
23

34
import getExtractor from './get-extractor';
45

56
describe('getExtractor(url)', () => {
67
it('returns GenericExtractor if no custom extractor is found', () => {
7-
const extractor = getExtractor('http://example.com');
8+
const extractor = getExtractor('http://example.com', null, cheerio.load('<div />'));
89

910
assert.equal(extractor.domain, '*');
1011
});
@@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {
2627

2728
assert.equal(extractor.domain, 'wikipedia.org');
2829
});
30+
31+
it('returns a custom extractor based on detectors', () => {
32+
const html =
33+
'<head><meta name="al:ios:app_name" value="Medium" /></head>';
34+
35+
const $ = cheerio.load(html);
36+
const extractor = getExtractor('http://foo.com', null, $);
37+
38+
assert.equal(extractor.domain, 'medium.com');
39+
});
2940
});

src/mercury.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ const Mercury = {
3131
return Errors.badUrl;
3232
}
3333

34-
const Extractor = getExtractor(url, parsedUrl);
35-
// console.log(`Using extractor for ${Extractor.domain}`);
36-
3734
const $ = await Resource.create(url, html, parsedUrl);
3835

36+
const Extractor = getExtractor(url, parsedUrl, $);
37+
// console.log(`Using extractor for ${Extractor.domain}`);
38+
3939
// If we found an error creating the resource, return that error
4040
if ($.failed) {
4141
return $;

0 commit comments

Comments
 (0)