From 0f45b39ca258eb1cd4f94c05c166be20f81949da Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Wed, 7 Sep 2016 14:40:22 -0400 Subject: [PATCH] refactor: preparing for extraction merging --- src/extractor/generic/content/extractor.js | 11 +++----- .../generic/content/extractor.test.js | 8 +++--- src/extractor/generic/dek/extractor.js | 2 +- src/extractor/generic/index.js | 26 +++++++++---------- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/src/extractor/generic/content/extractor.js b/src/extractor/generic/content/extractor.js index cf1d404a..b1a3b58d 100644 --- a/src/extractor/generic/content/extractor.js +++ b/src/extractor/generic/content/extractor.js @@ -13,13 +13,6 @@ const GenericContentExtractor = { cleanConditionally: true, }, - // Entry point for parsing html - parse($, html, title='', opts={}) { - opts = { ...this.defaultOpts, ...opts } - - return this.extract($, html, opts, title) - }, - // Extract the content for this resource - initially, pass in our // most restrictive opts which will return the highest quality // content. On each failure, retry with slightly more lax opts. @@ -39,7 +32,9 @@ const GenericContentExtractor = { // // cleanConditionally: Clean the node to return of some // superfluous content. Things like forms, ads, etc. - extract($, html, opts, title) { + extract($, html, opts, title='') { + opts = { ...this.defaultOpts, ...opts } + $ = $ || cheerio.load(html) // Cascade through our extraction-specific opts in an ordered fashion, diff --git a/src/extractor/generic/content/extractor.test.js b/src/extractor/generic/content/extractor.test.js index 3163e837..37ca5bd5 100644 --- a/src/extractor/generic/content/extractor.test.js +++ b/src/extractor/generic/content/extractor.test.js @@ -8,15 +8,15 @@ import GenericContentExtractor from './extractor' describe('GenericContentExtractor', function() { this.timeout(1000000) - describe('parse(html, opts)', () => { - it("parses html and returns the article", () => { + describe('extract($, html, opts)', () => { + it("extracts html and returns the article", () => { const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8') // Array.from(range(1, 100)).map((i) => { // console.log(i) - // clean(GenericContentExtractor.parse(null, html)) + // clean(GenericContentExtractor.extract(null, html)) // }) - const result = clean(GenericContentExtractor.parse(null, html)) + const result = clean(GenericContentExtractor.extract(null, html)) // console.log(result) }) }) diff --git a/src/extractor/generic/dek/extractor.js b/src/extractor/generic/dek/extractor.js index 671a10c5..fc4257ed 100644 --- a/src/extractor/generic/dek/extractor.js +++ b/src/extractor/generic/dek/extractor.js @@ -17,7 +17,7 @@ import { // until we have a more robust generic option. // Below is the original source for this, for reference. const GenericDekExtractor = { - extract($, cachedMeta, content) { + extract($, content, cachedMeta) { return null } } diff --git a/src/extractor/generic/index.js b/src/extractor/generic/index.js index 846b6aa1..d196e6d3 100644 --- a/src/extractor/generic/index.js +++ b/src/extractor/generic/index.js @@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor' const GenericExtractor = { // This extractor is the default for all domains domain: '*', + title: GenericTitleExtractor.extract, + datePublished : GenericDatePublishedExtractor.extract, + author: GenericAuthorExtractor.extract, + content: GenericContentExtractor.extract.bind(GenericContentExtractor), + leadImageUrl: GenericLeadImageUrlExtractor.extract, + dek: GenericDekExtractor.extract, - parse: (url, html, $) => { + parse: function(url, html, $) { if (html) { $ = cheerio.load(html) - } else { - // TODO - // Fetch link, following redirects - // to return html and initialize $ } // Cached value of every meta name in our document. @@ -26,14 +28,12 @@ const GenericExtractor = { return $(node).attr('name') }).toArray() - const title = GenericTitleExtractor.extract($, url, metaCache) - const datePublished = - GenericDatePublishedExtractor.extract($, url, metaCache) - const author = GenericAuthorExtractor.extract($, metaCache) - const content = GenericContentExtractor.parse($, html) - const leadImageUrl = - GenericLeadImageUrlExtractor.extract($, content, metaCache) - const dek = GenericDekExtractor.extract($, metaCache, content) + const title = this.title($, url, metaCache) + const datePublished = this.datePublished($, url, metaCache) + const author = this.author($, metaCache) + const content = this.content($, html) + const leadImageUrl = this.leadImageUrl($, content, metaCache) + const dek = this.dek($, content, metaCache) return { title,