refactor: preparing for extraction merging

This commit is contained in:
Adam Pash 2016-09-07 14:40:22 -04:00
parent a022252a14
commit 0f45b39ca2
4 changed files with 21 additions and 26 deletions

View File

@ -13,13 +13,6 @@ const GenericContentExtractor = {
cleanConditionally: true,
},
// Entry point for parsing html
parse($, html, title='', opts={}) {
opts = { ...this.defaultOpts, ...opts }
return this.extract($, html, opts, title)
},
// Extract the content for this resource - initially, pass in our
// most restrictive opts which will return the highest quality
// content. On each failure, retry with slightly more lax opts.
@ -39,7 +32,9 @@ const GenericContentExtractor = {
//
// cleanConditionally: Clean superfluous content out of the node
// we return -- things like forms, ads, etc.
extract($, html, opts, title) {
extract($, html, opts, title='') {
opts = { ...this.defaultOpts, ...opts }
$ = $ || cheerio.load(html)
// Cascade through our extraction-specific opts in an ordered fashion,

View File

@ -8,15 +8,15 @@ import GenericContentExtractor from './extractor'
describe('GenericContentExtractor', function() {
this.timeout(1000000)
describe('parse(html, opts)', () => {
it("parses html and returns the article", () => {
describe('extract($, html, opts)', () => {
it("extracts html and returns the article", () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
// Array.from(range(1, 100)).map((i) => {
// console.log(i)
// clean(GenericContentExtractor.parse(null, html))
// clean(GenericContentExtractor.extract(null, html))
// })
const result = clean(GenericContentExtractor.parse(null, html))
const result = clean(GenericContentExtractor.extract(null, html))
// console.log(result)
})
})

View File

@ -17,7 +17,7 @@ import {
// until we have a more robust generic option.
// Below is the original source for this, for reference.
const GenericDekExtractor = {
extract($, cachedMeta, content) {
extract($, content, cachedMeta) {
return null
}
}

View File

@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
const GenericExtractor = {
// This extractor is the default for all domains
domain: '*',
title: GenericTitleExtractor.extract,
datePublished : GenericDatePublishedExtractor.extract,
author: GenericAuthorExtractor.extract,
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract,
parse: (url, html, $) => {
parse: function(url, html, $) {
if (html) {
$ = cheerio.load(html)
} else {
// TODO
// Fetch link, following redirects
// to return html and initialize $
}
// Cached value of every meta name in our document.
@ -26,14 +28,12 @@ const GenericExtractor = {
return $(node).attr('name')
}).toArray()
const title = GenericTitleExtractor.extract($, url, metaCache)
const datePublished =
GenericDatePublishedExtractor.extract($, url, metaCache)
const author = GenericAuthorExtractor.extract($, metaCache)
const content = GenericContentExtractor.parse($, html)
const leadImageUrl =
GenericLeadImageUrlExtractor.extract($, content, metaCache)
const dek = GenericDekExtractor.extract($, metaCache, content)
const title = this.title($, url, metaCache)
const datePublished = this.datePublished($, url, metaCache)
const author = this.author($, metaCache)
const content = this.content($, html)
const leadImageUrl = this.leadImageUrl($, content, metaCache)
const dek = this.dek($, content, metaCache)
return {
title,