feat: allowing extractors to support multiple domains

pull/17/head
Adam Pash 8 years ago
parent d038a36544
commit de5b120b79

35
dist/mercury.js vendored

@ -10,10 +10,10 @@ var cheerio = _interopDefault(require('cheerio'));
var _Promise = _interopDefault(require('babel-runtime/core-js/promise'));
var request = _interopDefault(require('request'));
var _Reflect$ownKeys = _interopDefault(require('babel-runtime/core-js/reflect/own-keys'));
var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
var _slicedToArray = _interopDefault(require('babel-runtime/helpers/slicedToArray'));
var stringDirection = _interopDefault(require('string-direction'));
var _getIterator = _interopDefault(require('babel-runtime/core-js/get-iterator'));
var _toConsumableArray = _interopDefault(require('babel-runtime/helpers/toConsumableArray'));
var _defineProperty = _interopDefault(require('babel-runtime/helpers/defineProperty'));
var _typeof = _interopDefault(require('babel-runtime/helpers/typeof'));
var validUrl = _interopDefault(require('valid-url'));
@ -348,6 +348,17 @@ var Resource = {
}
};
// Builds a lookup object in which every entry of `domains` becomes a key
// whose value is the very same `extractor` reference.
var merge = function merge(extractor, domains) {
  var byDomain = {};
  domains.forEach(function (name) {
    byDomain[name] = extractor;
  });
  return byDomain;
};
// Returns an object keyed by the extractor's primary `domain` and, when
// `supportedDomains` is set, by each of those domains as well; every key
// maps to the extractor itself.
function mergeSupportedDomains(extractor) {
  var domains = [extractor.domain];
  if (extractor.supportedDomains) {
    domains = domains.concat(_toConsumableArray(extractor.supportedDomains));
  }
  return merge(extractor, domains);
}
var NYMagExtractor = {
domain: 'nymag.com',
content: {
@ -936,6 +947,9 @@ var PoliticoExtractor = {
var DeadspinExtractor = {
domain: 'deadspin.com',
supportedDomains: ['jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com'],
title: {
selectors: ['h1.headline']
},
@ -1101,6 +1115,9 @@ var ApartmentTherapyExtractor = {
var MediumExtractor = {
domain: 'medium.com',
supportedDomains: ['trackchanges.postlight.com'],
title: {
selectors: ['h1']
},
@ -1169,7 +1186,7 @@ var MediumExtractor = {
}
};
var Extractors = {
var Extractors = _extends({
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor,
@ -1183,17 +1200,11 @@ var Extractors = {
'www.buzzfeed.com': BuzzfeedExtractor,
'fandom.wikia.com': WikiaExtractor,
'www.littlethings.com': LittleThingsExtractor,
'www.politico.com': PoliticoExtractor,
'deadspin.com': DeadspinExtractor,
'jezebel.com': DeadspinExtractor,
'lifehacker.com': DeadspinExtractor,
'kotaku.com': DeadspinExtractor,
'gizmodo.com': DeadspinExtractor,
'jalopnik.com': DeadspinExtractor,
'www.politico.com': PoliticoExtractor
}, mergeSupportedDomains(DeadspinExtractor), {
'www.broadwayworld.com': BroadwayWorldExtractor,
'www.apartmenttherapy.com': ApartmentTherapyExtractor,
'medium.com': MediumExtractor
};
'www.apartmenttherapy.com': ApartmentTherapyExtractor
}, mergeSupportedDomains(MediumExtractor));
// Spacer images to be removed
var SPACER_RE = new RegExp('trans|transparent|spacer|blank', 'i');

File diff suppressed because one or more lines are too long

@ -1,3 +1,4 @@
import mergeSupportedDomains from 'utils/merge-supported-domains';
import { NYMagExtractor } from './custom/nymag.com';
import { BloggerExtractor } from './custom/blogspot.com';
import { WikipediaExtractor } from './custom/wikipedia.org';
@ -32,15 +33,10 @@ const Extractors = {
'fandom.wikia.com': WikiaExtractor,
'www.littlethings.com': LittleThingsExtractor,
'www.politico.com': PoliticoExtractor,
'deadspin.com': DeadspinExtractor,
'jezebel.com': DeadspinExtractor,
'lifehacker.com': DeadspinExtractor,
'kotaku.com': DeadspinExtractor,
'gizmodo.com': DeadspinExtractor,
'jalopnik.com': DeadspinExtractor,
...mergeSupportedDomains(DeadspinExtractor),
'www.broadwayworld.com': BroadwayWorldExtractor,
'www.apartmenttherapy.com': ApartmentTherapyExtractor,
'medium.com': MediumExtractor,
...mergeSupportedDomains(MediumExtractor),
};
export default Extractors;

@ -1,5 +1,15 @@
export const DeadspinExtractor = {
domain: 'deadspin.com',
supportedDomains: [
'jezebel.com',
'lifehacker.com',
'kotaku.com',
'gizmodo.com',
'jalopnik.com',
'kinja.com',
],
title: {
selectors: [
'h1.headline',

@ -1,5 +1,10 @@
export const MediumExtractor = {
domain: 'medium.com',
supportedDomains: [
'trackchanges.postlight.com',
],
title: {
selectors: [
'h1',

@ -0,0 +1,13 @@
/**
 * Builds a lookup object in which every entry of `domains` becomes a key
 * whose value is the very same `extractor` reference.
 *
 * @param {Object} extractor - value stored under each key
 * @param {string[]} domains - keys to create
 * @returns {Object} new object mapping each domain to `extractor`
 */
const merge = (extractor, domains) => {
  const byDomain = {};
  domains.forEach((name) => {
    byDomain[name] = extractor;
  });
  return byDomain;
};
/**
 * Produces an object keyed by the extractor's primary `domain` and, when
 * `supportedDomains` is set, by each of those domains as well. Every key
 * maps to the extractor itself.
 *
 * @param {Object} extractor - extractor with a `domain` and an optional
 *   `supportedDomains` list
 * @returns {Object} domain-to-extractor lookup
 */
export default function mergeSupportedDomains(extractor) {
  const { domain, supportedDomains } = extractor;

  // No extra domains declared: only the primary domain is registered.
  if (!supportedDomains) {
    return merge(extractor, [domain]);
  }

  return merge(extractor, [domain, ...supportedDomains]);
}

@ -0,0 +1,31 @@
import assert from 'assert';
import mergeSupportedDomains from './merge-supported-domains';
// Tests for mergeSupportedDomains, which takes a single `extractor` argument.
// deepStrictEqual is used so the comparison does not rely on loose (==)
// equality as the legacy assert.deepEqual does.
describe('mergeSupportedDomains(extractor)', () => {
  it('returns an object w/domains as keys and extractor as value', () => {
    const extractor = {
      domain: 'foo.com',
      supportedDomains: ['example.com'],
    };
    const expected = {
      'foo.com': extractor,
      'example.com': extractor,
    };

    const result = mergeSupportedDomains(extractor);

    assert.deepStrictEqual(result, expected);
  });

  it('returns an object w/single domain if no supportedDomains', () => {
    const extractor = {
      domain: 'foo.com',
    };
    const expected = {
      'foo.com': extractor,
    };

    const result = mergeSupportedDomains(extractor);

    assert.deepStrictEqual(result, expected);
  });
});
Loading…
Cancel
Save