Feat: detect platforms (#52)

Adds detectors that inspect a page's HTML to select the matching extractor for publishing platforms. Currently supports Medium and Blogger.
8 years ago · 2fb47640f2
parent 64c0fad2fd
commit 2fb47640f2
5 changed files with 58 additions and 6 deletions
--- a/src/extractors/detect-by-html.js
+++ b/src/extractors/detect-by-html.js
@ -0,0 +1,15 @@
+import {
+  MediumExtractor,
+  BloggerExtractor,
+} from './custom/';
+
+// Maps a CSS selector to the extractor to use when that selector matches the
+// page. Order matters: detectByHtml walks the keys in order and returns the
+// extractor for the first selector that matches.
+// NOTE(review): the selectors test a `value` attribute on <meta> rather than
+// `content`; presumably meta tags are normalized before detection runs —
+// confirm against the code that builds `$`.
+const Detectors = {
+  'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
+  'meta[name="generator"][value="blogger"]': BloggerExtractor,
+};
+
+export default function detectByHtml($) {
+  const selector = Reflect.ownKeys(Detectors).find(s => $(s).length > 0);
+
+  return Detectors[selector];
+}
--- a/src/extractors/detect-by-html.test.js
+++ b/src/extractors/detect-by-html.test.js
@ -0,0 +1,24 @@
+import assert from 'assert';
+import cheerio from 'cheerio';
+
+import detectByHtml from './detect-by-html';
+
+describe('detectByHtml', () => {
+  it('detects a medium post from the html', () => {
+    const html =
+      '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+
+    const $ = cheerio.load(html);
+
+    assert.equal(detectByHtml($).domain, 'medium.com');
+  });
+
+  it('returns nothing if no match is found', () => {
+    const html =
+      '<div></div>';
+
+    const $ = cheerio.load(html);
+
+    assert.equal(detectByHtml($), null);
+  });
+});
--- a/src/extractors/get-extractor.js
+++ b/src/extractors/get-extractor.js
@ -2,11 +2,13 @@ import URL from 'url';

 import Extractors from './all';
 import GenericExtractor from './generic';
+import detectByHtml from './detect-by-html';

-export default function getExtractor(url, parsedUrl) {
+/**
+ * Choose the extractor for a page.
+ *
+ * Lookup order: exact hostname, then base domain (the last two labels of the
+ * hostname), then HTML-based detection, and finally GenericExtractor.
+ *
+ * @param {string} url - Page URL; parsed with URL.parse when parsedUrl is falsy.
+ * @param {Object} [parsedUrl] - Pre-parsed URL; only `hostname` is read.
+ * @param {Function} $ - Loaded document, handed to detectByHtml only when
+ *   neither domain lookup finds an extractor.
+ * @returns {Object} The selected extractor.
+ */
+export default function getExtractor(url, parsedUrl, $) {
  parsedUrl = parsedUrl || URL.parse(url);
  const { hostname } = parsedUrl;
  const baseDomain = hostname.split('.').slice(-2).join('.');

-  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
+  // Domain-keyed extractors win; the markup is inspected only as a fallback
+  // before giving up and using the generic extractor.
+  return Extractors[hostname] || Extractors[baseDomain] ||
+    detectByHtml($) || GenericExtractor;
 }
--- a/src/extractors/get-extractor.test.js
+++ b/src/extractors/get-extractor.test.js
@ -1,10 +1,11 @@
 import assert from 'assert';
+import cheerio from 'cheerio';

 import getExtractor from './get-extractor';

 describe('getExtractor(url)', () => {
  it('returns GenericExtractor if no custom extractor is found', () => {
-    const extractor = getExtractor('http://example.com');
+    // A document is now passed as the third argument; a bare <div /> carries
+    // none of the meta tags the detectors look for, so the fallback is used.
+    const extractor = getExtractor('http://example.com', null, cheerio.load('<div />'));

    assert.equal(extractor.domain, '*');
  });
@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {

    assert.equal(extractor.domain, 'wikipedia.org');
  });
+
+  it('returns a custom extractor based on detectors', () => {
+    const html =
+      '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+
+    const $ = cheerio.load(html);
+    const extractor = getExtractor('http://foo.com', null, $);
+
+    assert.equal(extractor.domain, 'medium.com');
+  });
 });
--- a/src/mercury.js
+++ b/src/mercury.js
@ -31,11 +31,11 @@ const Mercury = {
      return Errors.badUrl;
    }

-    const Extractor = getExtractor(url, parsedUrl);
-    // console.log(`Using extractor for ${Extractor.domain}`);
-
    const $ = await Resource.create(url, html, parsedUrl);

+    const Extractor = getExtractor(url, parsedUrl, $);
+    // console.log(`Using extractor for ${Extractor.domain}`);
+
    // If we found an error creating the resource, return that error
    if ($.failed) {
      return $;