Feat: detect platforms (#52)

adampash · web-flow · commit 2fb47640f243 · 2016-12-06T12:17:03.000-05:00
Adds detectors that select a custom extractor for a publishing platform from the page's HTML when the URL's domain has no registered extractor. Currently supports Medium and Blogger.
diff --git a/src/extractors/detect-by-html.js b/src/extractors/detect-by-html.js
@@ -0,0 +1,32 @@
+import {
+  MediumExtractor,
+  BloggerExtractor,
+} from './custom/';
+
+// Maps a CSS selector to the custom extractor to use when a page matches it.
+// Insertion order is the match priority: the first matching selector wins.
+const Detectors = {
+  'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
+  'meta[name="generator"][value="blogger"]': BloggerExtractor,
+};
+
+/**
+ * Picks a platform-specific extractor by looking for known markers in a page.
+ *
+ * @param {Function} $ - Query function for the loaded document (e.g. the
+ *   result of `cheerio.load(html)`).
+ * @returns {Object|undefined} The extractor registered for the first matching
+ *   selector, or `undefined` when nothing matches or `$` cannot be queried.
+ */
+export default function detectByHtml($) {
+  // Callers may omit the document, or hand over the result of a failed
+  // resource load, which may not be callable. Report "no match" in that
+  // case instead of throwing a TypeError.
+  if (typeof $ !== 'function') {
+    return undefined;
+  }
+
+  const selector = Object.keys(Detectors).find(s => $(s).length > 0);
+
+  return selector === undefined ? undefined : Detectors[selector];
+}
diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js
@@ -0,0 +1,24 @@
+import assert from 'assert';
+import cheerio from 'cheerio';
+
+import detectByHtml from './detect-by-html';
+
+describe('detectByHtml', () => {
+  // Medium is identified by the app-name meta tag that the Medium
+  // detector's selector looks for.
+  it('detects a medium post from the html', () => {
+    const markup = '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+    const extractor = detectByHtml(cheerio.load(markup));
+
+    assert.equal(extractor.domain, 'medium.com');
+  });
+
+  // Markup without any detector marker yields no extractor; the loose
+  // assert.equal accepts either null or undefined.
+  it('returns nothing if no match is found', () => {
+    const markup = '<div></div>';
+    const extractor = detectByHtml(cheerio.load(markup));
+
+    assert.equal(extractor, null);
+  });
+});
diff --git a/src/extractors/get-extractor.js b/src/extractors/get-extractor.js
@@ -2,11 +2,13 @@ import URL from 'url';
 
 import Extractors from './all';
 import GenericExtractor from './generic';
+import detectByHtml from './detect-by-html';
 
-export default function getExtractor(url, parsedUrl) {
+export default function getExtractor(url, parsedUrl, $) {
   parsedUrl = parsedUrl || URL.parse(url);
   const { hostname } = parsedUrl;
   const baseDomain = hostname.split('.').slice(-2).join('.');
 
-  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
+  return Extractors[hostname] || Extractors[baseDomain] ||
+    detectByHtml($) || GenericExtractor; // HTML detection only runs if both domain lookups miss
 }
diff --git a/src/extractors/get-extractor.test.js b/src/extractors/get-extractor.test.js
@@ -1,10 +1,11 @@
 import assert from 'assert';
+import cheerio from 'cheerio';
 
 import getExtractor from './get-extractor';
 
 describe('getExtractor(url)', () => {
   it('returns GenericExtractor if no custom extractor is found', () => {
-    const extractor = getExtractor('http://example.com');
+    const extractor = getExtractor('http://example.com', null, cheerio.load('<div />'));
 
     assert.equal(extractor.domain, '*');
   });
@@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {
 
     assert.equal(extractor.domain, 'wikipedia.org');
   });
+
+  it('returns a custom extractor based on detectors', () => {
+    const html =
+      '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+
+    const $ = cheerio.load(html);
+    const extractor = getExtractor('http://foo.com', null, $);
+
+    assert.equal(extractor.domain, 'medium.com');
+  });
 });
diff --git a/src/mercury.js b/src/mercury.js
@@ -31,11 +31,11 @@ const Mercury = {
       return Errors.badUrl;
     }
 
-    const Extractor = getExtractor(url, parsedUrl);
-    // console.log(`Using extractor for ${Extractor.domain}`);
-
     const $ = await Resource.create(url, html, parsedUrl);
 
+    const Extractor = getExtractor(url, parsedUrl, $);
+    // console.log(`Using extractor for ${Extractor.domain}`);
+
     // If we found an error creating the resource, return that error
     if ($.failed) {
       return $;