|
|
@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
|
|
|
|
const GenericExtractor = {
|
|
|
|
const GenericExtractor = {
|
|
|
|
// This extractor is the default for all domains
|
|
|
|
// This extractor is the default for all domains
|
|
|
|
domain: '*',
|
|
|
|
domain: '*',
|
|
|
|
|
|
|
|
title: GenericTitleExtractor.extract,
|
|
|
|
|
|
|
|
datePublished : GenericDatePublishedExtractor.extract,
|
|
|
|
|
|
|
|
author: GenericAuthorExtractor.extract,
|
|
|
|
|
|
|
|
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
|
|
|
|
|
|
|
|
leadImageUrl: GenericLeadImageUrlExtractor.extract,
|
|
|
|
|
|
|
|
dek: GenericDekExtractor.extract,
|
|
|
|
|
|
|
|
|
|
|
|
parse: (url, html, $) => {
|
|
|
|
parse: function(url, html, $) {
|
|
|
|
if (html) {
|
|
|
|
if (html) {
|
|
|
|
$ = cheerio.load(html)
|
|
|
|
$ = cheerio.load(html)
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// TODO
|
|
|
|
|
|
|
|
// Fetch link, following redirects
|
|
|
|
|
|
|
|
// to return html and initialize $
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Cached value of every meta name in our document.
|
|
|
|
// Cached value of every meta name in our document.
|
|
|
@ -26,14 +28,12 @@ const GenericExtractor = {
|
|
|
|
return $(node).attr('name')
|
|
|
|
return $(node).attr('name')
|
|
|
|
}).toArray()
|
|
|
|
}).toArray()
|
|
|
|
|
|
|
|
|
|
|
|
const title = GenericTitleExtractor.extract($, url, metaCache)
|
|
|
|
const title = this.title($, url, metaCache)
|
|
|
|
const datePublished =
|
|
|
|
const datePublished = this.datePublished($, url, metaCache)
|
|
|
|
GenericDatePublishedExtractor.extract($, url, metaCache)
|
|
|
|
const author = this.author($, metaCache)
|
|
|
|
const author = GenericAuthorExtractor.extract($, metaCache)
|
|
|
|
const content = this.content($, html)
|
|
|
|
const content = GenericContentExtractor.parse($, html)
|
|
|
|
const leadImageUrl = this.leadImageUrl($, content, metaCache)
|
|
|
|
const leadImageUrl =
|
|
|
|
const dek = this.dek($, content, metaCache)
|
|
|
|
GenericLeadImageUrlExtractor.extract($, content, metaCache)
|
|
|
|
|
|
|
|
const dek = GenericDekExtractor.extract($, metaCache, content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
return {
|
|
|
|
title,
|
|
|
|
title,
|
|
|
|