From 2fb47640f24345772f685cbe5b14cb900e0b0b74 Mon Sep 17 00:00:00 2001
From: Adam Pash <adam.pash@gmail.com>
Date: Tue, 6 Dec 2016 12:17:03 -0500
Subject: [PATCH] Feat: detect platforms (#52)

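Add HTML-based detectors that pick the matching custom extractor for pages served by a known publishing platform, even when the URL's domain has no extractor of its own. Currently supports Medium and Blogger.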
---
 src/extractors/detect-by-html.js      | 15 +++++++++++++++
 src/extractors/detect-by-html.test.js | 24 ++++++++++++++++++++++++
 src/extractors/get-extractor.js       |  6 ++++--
 src/extractors/get-extractor.test.js  | 13 ++++++++++++-
 src/mercury.js                        |  6 +++---
 5 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 src/extractors/detect-by-html.js
 create mode 100644 src/extractors/detect-by-html.test.js
diff --git a/src/extractors/detect-by-html.js b/src/extractors/detect-by-html.js
new file mode 100644
index 00000000..21fae75d
--- /dev/null
+++ b/src/extractors/detect-by-html.js
@@ -0,0 +1,15 @@
+import {
+  MediumExtractor,
+  BloggerExtractor,
+} from './custom/';
+
+const Detectors = { // CSS selector -> extractor to use when that selector matches the page
+  'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
+  'meta[name="generator"][value="blogger"]': BloggerExtractor,
+}; // NOTE(review): keys match on `value`, not `content`; assumes normalized meta tags -- confirm
+
+// Returns the first extractor whose selector matches `$`; undefined if none or `$` is unusable.
+export default function detectByHtml($) {
+  if (typeof $ !== 'function') return undefined; // no document, or a failed resource object
+  return Detectors[Object.keys(Detectors).find(s => $(s).length > 0)];
+}
diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js
new file mode 100644
index 00000000..ae5f28ef
--- /dev/null
+++ b/src/extractors/detect-by-html.test.js
@@ -0,0 +1,24 @@
+import assert from 'assert';
+import cheerio from 'cheerio';
+
+import detectByHtml from './detect-by-html';
+
+// Exercises platform detection against minimal cheerio documents.
+describe('detectByHtml', () => {
+  it('detects a medium post from the html', () => {
+    // The Medium detector keys off this meta tag.
+    const markup = '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+    const matched = detectByHtml(cheerio.load(markup));
+
+    assert.equal(matched.domain, 'medium.com');
+  });
+
+  it('returns nothing if no match is found', () => {
+    // No meta tags at all, so no detector selector can match.
+    const markup = '<div></div>';
+    const matched = detectByHtml(cheerio.load(markup));
+
+    // assert.equal compares loosely, so an undefined result also satisfies this null check.
+    assert.equal(matched, null);
+  });
+});
diff --git a/src/extractors/get-extractor.js b/src/extractors/get-extractor.js
index 4b01297f..b92be9e6 100644
--- a/src/extractors/get-extractor.js
+++ b/src/extractors/get-extractor.js
@@ -2,11 +2,13 @@ import URL from 'url';
 
 import Extractors from './all';
 import GenericExtractor from './generic';
+import detectByHtml from './detect-by-html';
 
-export default function getExtractor(url, parsedUrl) {
+export default function getExtractor(url, parsedUrl, $) {
   parsedUrl = parsedUrl || URL.parse(url);
   const { hostname } = parsedUrl;
   const baseDomain = hostname.split('.').slice(-2).join('.');
 
-  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
+  const byUrl = Extractors[hostname] || Extractors[baseDomain]; // exact host, then base domain
+  return byUrl || detectByHtml($) || GenericExtractor; // then HTML detection, then generic
 }
diff --git a/src/extractors/get-extractor.test.js b/src/extractors/get-extractor.test.js
index 412e8309..3e31e0dd 100644
--- a/src/extractors/get-extractor.test.js
+++ b/src/extractors/get-extractor.test.js
@@ -1,10 +1,11 @@
 import assert from 'assert';
+import cheerio from 'cheerio';
 
 import getExtractor from './get-extractor';
 
 describe('getExtractor(url)', () => {
   it('returns GenericExtractor if no custom extractor is found', () => {
-    const extractor = getExtractor('http://example.com');
+    const extractor = getExtractor('http://example.com', null, cheerio.load('<div />'));
 
     assert.equal(extractor.domain, '*');
   });
@@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {
 
     assert.equal(extractor.domain, 'wikipedia.org');
   });
+
+  it('returns a custom extractor based on detectors', () => {
+    // The URL is expected to match no registered extractor, leaving detection to the meta tag.
+    const markup = '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+    const page = cheerio.load(markup);
+
+    const { domain } = getExtractor('http://foo.com', null, page);
+
+    assert.equal(domain, 'medium.com');
+  });
 });
diff --git a/src/mercury.js b/src/mercury.js
index c79fafc7..c9e3bda4 100644
--- a/src/mercury.js
+++ b/src/mercury.js
@@ -31,11 +31,11 @@ const Mercury = {
       return Errors.badUrl;
     }
 
-    const Extractor = getExtractor(url, parsedUrl);
-    // console.log(`Using extractor for ${Extractor.domain}`);
-
     const $ = await Resource.create(url, html, parsedUrl);
 
+    const Extractor = getExtractor(url, parsedUrl, $);
+    // console.log(`Using extractor for ${Extractor.domain}`);
+
     // If we found an error creating the resource, return that error
     if ($.failed) {
       return $;