diff --git a/src/extractors/detect-by-html.js b/src/extractors/detect-by-html.js
new file mode 100644
index 00000000..21fae75d
--- /dev/null
+++ b/src/extractors/detect-by-html.js
@@ -0,0 +1,30 @@
+import {
+  MediumExtractor,
+  BloggerExtractor,
+} from './custom/';
+
+// Maps a CSS selector to the extractor used when that selector matches
+// at least one element of the parsed document.
+const Detectors = {
+  'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor,
+  'meta[name="generator"][value="blogger"]': BloggerExtractor,
+};
+
+/**
+ * Picks a custom extractor by probing the parsed document for known markers.
+ *
+ * @param {Function} $ - cheerio-style query function for the loaded document.
+ * @returns {Object|undefined} Extractor registered for the first matching
+ *   selector; undefined when nothing matches or `$` is not callable.
+ */
+export default function detectByHtml($) {
+  // getExtractor only gained its `$` argument in this change, so callers may
+  // still omit it (or pass a non-callable value); there is nothing to query then.
+  if (typeof $ !== 'function') {
+    return undefined;
+  }
+
+  const selector = Object.keys(Detectors).find(s => $(s).length > 0);
+
+  return Detectors[selector];
+}
diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js
new file mode 100644
index 00000000..ae5f28ef
--- /dev/null
+++ b/src/extractors/detect-by-html.test.js
@@ -0,0 +1,28 @@
+import assert from 'assert';
+import cheerio from 'cheerio';
+
+import detectByHtml from './detect-by-html';
+
+describe('detectByHtml', () => {
+  it('detects a medium post from the html', () => {
+    const html =
+      '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+
+    const $ = cheerio.load(html);
+
+    assert.equal(detectByHtml($).domain, 'medium.com');
+  });
+
+  it('returns nothing if no match is found', () => {
+    const html =
+      '';
+
+    const $ = cheerio.load(html);
+
+    assert.equal(detectByHtml($), null);
+  });
+
+  it('returns nothing if no usable $ is provided', () => {
+    assert.equal(detectByHtml(), null);
+  });
+});
diff --git a/src/extractors/get-extractor.js b/src/extractors/get-extractor.js
index 4b01297f..b92be9e6 100644
--- a/src/extractors/get-extractor.js
+++ b/src/extractors/get-extractor.js
@@ -2,11 +2,17 @@ import URL from 'url';
 
 import Extractors from './all';
 import GenericExtractor from './generic';
+import detectByHtml from './detect-by-html';
 
-export default function getExtractor(url, parsedUrl) {
+/**
+ * Resolves the extractor for a URL: exact hostname, then base domain, then
+ * HTML-based detection via `$`, falling back to GenericExtractor.
+ */
+export default function getExtractor(url, parsedUrl, $) {
   parsedUrl = parsedUrl || URL.parse(url);
   const { hostname } = parsedUrl;
   const baseDomain = hostname.split('.').slice(-2).join('.');
 
-  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
+  return Extractors[hostname] || Extractors[baseDomain] ||
+    detectByHtml($) || GenericExtractor;
 }
diff --git a/src/extractors/get-extractor.test.js b/src/extractors/get-extractor.test.js
index 412e8309..3e31e0dd 100644
--- a/src/extractors/get-extractor.test.js
+++ b/src/extractors/get-extractor.test.js
@@ -1,10 +1,11 @@
 import assert from 'assert';
+import cheerio from 'cheerio';
 
 import getExtractor from './get-extractor';
 
 describe('getExtractor(url)', () => {
   it('returns GenericExtractor if no custom extractor is found', () => {
-    const extractor = getExtractor('http://example.com');
+    const extractor = getExtractor('http://example.com', null, cheerio.load(''));
 
     assert.equal(extractor.domain, '*');
   });
@@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {
 
     assert.equal(extractor.domain, 'wikipedia.org');
   });
+
+  it('returns a custom extractor based on detectors', () => {
+    const html =
+      '<head><meta name="al:ios:app_name" value="Medium" /></head>';
+
+    const $ = cheerio.load(html);
+    const extractor = getExtractor('http://foo.com', null, $);
+
+    assert.equal(extractor.domain, 'medium.com');
+  });
 });
diff --git a/src/mercury.js b/src/mercury.js index c79fafc7..c9e3bda4 100644 --- a/src/mercury.js +++ b/src/mercury.js @@ -31,11 +31,11 @@ const Mercury = { return Errors.badUrl; } - const Extractor = getExtractor(url, parsedUrl); - // console.log(`Using extractor for ${Extractor.domain}`); - const $ = await Resource.create(url, html, parsedUrl); + const Extractor = getExtractor(url, parsedUrl, $); + // console.log(`Using extractor for ${Extractor.domain}`); + // If we found an error creating the resource, return that error if ($.failed) { return $;