refactor: preparing for extraction merging

pull/1/head
Adam Pash 8 years ago
parent a022252a14
commit 0f45b39ca2

@@ -13,13 +13,6 @@ const GenericContentExtractor = {
     cleanConditionally: true,
   },

-  // Entry point for parsing html
-  parse($, html, title='', opts={}) {
-    opts = { ...this.defaultOpts, ...opts }
-    return this.extract($, html, opts, title)
-  },
-
   // Extract the content for this resource - initially, pass in our
   // most restrictive opts which will return the highest quality
   // content. On each failure, retry with slightly more lax opts.
@@ -39,7 +32,9 @@ const GenericContentExtractor = {
   //
   // cleanConditionally: Clean the node to return of some
   //                     superfluous content. Things like forms, ads, etc.
-  extract($, html, opts, title) {
+  extract($, html, opts, title='') {
+    opts = { ...this.defaultOpts, ...opts }
+
     $ = $ || cheerio.load(html)

     // Cascade through our extraction-specific opts in an ordered fashion,

@@ -8,15 +8,15 @@ import GenericContentExtractor from './extractor'
 describe('GenericContentExtractor', function() {
   this.timeout(1000000)

-  describe('parse(html, opts)', () => {
-    it("parses html and returns the article", () => {
+  describe('extract($, html, opts)', () => {
+    it("extracts html and returns the article", () => {
       const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')

       // Array.from(range(1, 100)).map((i) => {
       //   console.log(i)
-      //   clean(GenericContentExtractor.parse(null, html))
+      //   clean(GenericContentExtractor.extract(null, html))
       // })
-      const result = clean(GenericContentExtractor.parse(null, html))
+      const result = clean(GenericContentExtractor.extract(null, html))
       // console.log(result)
     })
   })

@@ -17,7 +17,7 @@ import {
 // until we have a more robust generic option.
 // Below is the original source for this, for reference.
 const GenericDekExtractor = {
-  extract($, cachedMeta, content) {
+  extract($, content, cachedMeta) {
     return null
   }
 }

@@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
 const GenericExtractor = {
   // This extractor is the default for all domains
   domain: '*',
+  title: GenericTitleExtractor.extract,
+  datePublished : GenericDatePublishedExtractor.extract,
+  author: GenericAuthorExtractor.extract,
+  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
+  leadImageUrl: GenericLeadImageUrlExtractor.extract,
+  dek: GenericDekExtractor.extract,

-  parse: (url, html, $) => {
+  parse: function(url, html, $) {
     if (html) {
       $ = cheerio.load(html)
-    } else {
-      // TODO
-      // Fetch link, following redirects
-      // to return html and initialize $
     }

     // Cached value of every meta name in our document.
// Cached value of every meta name in our document. // Cached value of every meta name in our document.
@@ -26,14 +28,12 @@ const GenericExtractor = {
       return $(node).attr('name')
     }).toArray()

-    const title = GenericTitleExtractor.extract($, url, metaCache)
-    const datePublished =
-      GenericDatePublishedExtractor.extract($, url, metaCache)
-    const author = GenericAuthorExtractor.extract($, metaCache)
-    const content = GenericContentExtractor.parse($, html)
-    const leadImageUrl =
-      GenericLeadImageUrlExtractor.extract($, content, metaCache)
-    const dek = GenericDekExtractor.extract($, metaCache, content)
+    const title = this.title($, url, metaCache)
+    const datePublished = this.datePublished($, url, metaCache)
+    const author = this.author($, metaCache)
+    const content = this.content($, html)
+    const leadImageUrl = this.leadImageUrl($, content, metaCache)
+    const dek = this.dek($, content, metaCache)

     return {
       title,

Loading…
Cancel
Save