Feat: detect platforms (#52)

Adds detectors that match a page's HTML to the custom extractor for its publishing platform. Medium and Blogger are currently supported.
pull/53/head
Adam Pash 8 years ago committed by GitHub
parent 64c0fad2fd
commit 2fb47640f2

@ -0,0 +1,15 @@
import {
MediumExtractor,
BloggerExtractor,
} from './custom/';
// Marker selectors for platform-hosted pages, each paired with the custom
// extractor to use when that selector matches the document. Declaration
// order is the order in which detectByHtml probes the selectors.
const MEDIUM_MARKER = 'meta[name="al:ios:app_name"][value="Medium"]';
const BLOGGER_MARKER = 'meta[name="generator"][value="blogger"]';

const Detectors = {
  [MEDIUM_MARKER]: MediumExtractor,
  [BLOGGER_MARKER]: BloggerExtractor,
};
/**
 * Picks a custom extractor by probing the loaded document for
 * platform-specific marker elements.
 *
 * Every key of `Detectors` is treated as a selector; the first one (in
 * declaration order) that matches at least one element wins.
 *
 * @param {Function} $ - Selector function for the loaded document (the
 *   tests pass the result of `cheerio.load`).
 * @returns {Object|undefined} The extractor mapped to the first matching
 *   selector, or `undefined` when nothing matches or `$` is not callable.
 */
export default function detectByHtml($) {
  // Callers may pass something that cannot be queried (nothing at all, or
  // a plain object such as one flagged `failed`). There is no document to
  // probe then, so report "no match" instead of throwing a TypeError.
  if (typeof $ !== 'function') {
    return undefined;
  }

  const selector = Reflect.ownKeys(Detectors).find(s => $(s).length > 0);

  // `selector` is undefined when nothing matched, which makes this lookup
  // yield undefined as well.
  return Detectors[selector];
}

@ -0,0 +1,24 @@
import assert from 'assert';
import cheerio from 'cheerio';
import detectByHtml from './detect-by-html';
describe('detectByHtml', () => {
  // Loads an HTML snippet with cheerio and runs the detector against it.
  const detect = markup => detectByHtml(cheerio.load(markup));

  it('detects a medium post from the html', () => {
    const extractor = detect(
      '<head><meta name="al:ios:app_name" value="Medium" /></head>'
    );

    assert.equal(extractor.domain, 'medium.com');
  });

  it('returns nothing if no match is found', () => {
    // assert.equal compares loosely, so an undefined result also
    // satisfies this check against null.
    assert.equal(detect('<div></div>'), null);
  });
});

@ -2,11 +2,13 @@ import URL from 'url';
import Extractors from './all';
import GenericExtractor from './generic';
import detectByHtml from './detect-by-html';
/**
 * Chooses the extractor to use for a page.
 *
 * Lookup order: exact hostname, then base domain (the last two hostname
 * labels), then HTML-based platform detection, then the generic extractor.
 *
 * @param {string} url - Page URL; parsed only when `parsedUrl` is missing.
 * @param {Object} [parsedUrl] - Pre-parsed URL exposing `hostname`.
 * @param {Function} [$] - Selector function for the loaded document, used
 *   for HTML-based detection.
 * @returns {Object} The selected extractor.
 */
export default function getExtractor(url, parsedUrl, $) {
  const { hostname } = parsedUrl || URL.parse(url);
  const baseDomain = hostname.split('.').slice(-2).join('.');

  const byDomain = Extractors[hostname] || Extractors[baseDomain];
  if (byDomain) {
    return byDomain;
  }

  // HTML detection needs a callable document handle. Skip it otherwise so
  // the two-argument call form still resolves to an extractor.
  const byHtml = typeof $ === 'function' ? detectByHtml($) : undefined;

  return byHtml || GenericExtractor;
}

@ -1,10 +1,11 @@
import assert from 'assert';
import cheerio from 'cheerio';
import getExtractor from './get-extractor';
describe('getExtractor(url)', () => {
it('returns GenericExtractor if no custom extractor is found', () => {
const extractor = getExtractor('http://example.com');
const extractor = getExtractor('http://example.com', null, cheerio.load('<div />'));
assert.equal(extractor.domain, '*');
});
@ -26,4 +27,14 @@ describe('getExtractor(url)', () => {
assert.equal(extractor.domain, 'wikipedia.org');
});
it('returns a custom extractor based on detectors', () => {
const html =
'<head><meta name="al:ios:app_name" value="Medium" /></head>';
const $ = cheerio.load(html);
const extractor = getExtractor('http://foo.com', null, $);
assert.equal(extractor.domain, 'medium.com');
});
});

@ -31,11 +31,11 @@ const Mercury = {
return Errors.badUrl;
}
const Extractor = getExtractor(url, parsedUrl);
// console.log(`Using extractor for ${Extractor.domain}`);
const $ = await Resource.create(url, html, parsedUrl);
const Extractor = getExtractor(url, parsedUrl, $);
// console.log(`Using extractor for ${Extractor.domain}`);
// If we found an error creating the resource, return that error
if ($.failed) {
return $;

Loading…
Cancel
Save