refactor: preparing for extraction merging

2024-11-17 03:25:31 +00:00 · 2016-09-07 14:40:22 -04:00 · 2016-09-07 14:40:22 -04:00 · 0f45b39ca2
commit 0f45b39ca2
parent a022252a14
4 changed files with 21 additions and 26 deletions
--- a/src/extractor/generic/content/extractor.js
+++ b/src/extractor/generic/content/extractor.js
@@ -13,13 +13,6 @@ const GenericContentExtractor = {
    cleanConditionally: true,
  },
-  // Entry point for parsing html
-  parse($, html, title='', opts={}) {
-    opts = { ...this.defaultOpts, ...opts }
-    return this.extract($, html, opts, title)
-  },
  // Extract the content for this resource - initially, pass in our
  // most restrictive opts which will return the highest quality
  // content. On each failure, retry with slightly more lax opts.
@@ -39,7 +32,9 @@ const GenericContentExtractor = {
  //
  // cleanConditionally: Clean the node to return of some
  // superfluous content. Things like forms, ads, etc.
-  extract($, html, opts, title) {
+  extract($, html, opts, title='') {
+    opts = { ...this.defaultOpts, ...opts }
    $ = $ || cheerio.load(html)
    // Cascade through our extraction-specific opts in an ordered fashion,
--- a/src/extractor/generic/content/extractor.test.js
+++ b/src/extractor/generic/content/extractor.test.js
@@ -8,15 +8,15 @@ import GenericContentExtractor from './extractor'
 describe('GenericContentExtractor', function() {
  this.timeout(1000000)
-  describe('parse(html, opts)', () => {
+  describe('extract($, html, opts)', () => {
-    it("parses html and returns the article", () => {
+    it("extracts html and returns the article", () => {
      const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
      // Array.from(range(1, 100)).map((i) => {
      //   console.log(i)
-      //   clean(GenericContentExtractor.parse(null, html))
+      //   clean(GenericContentExtractor.extract(null, html))
      // })
-      const result = clean(GenericContentExtractor.parse(null, html))
+      const result = clean(GenericContentExtractor.extract(null, html))
      // console.log(result)
    })
  })
--- a/src/extractor/generic/dek/extractor.js
+++ b/src/extractor/generic/dek/extractor.js
@@ -17,7 +17,7 @@ import {
 // until we have a more robust generic option.
 // Below is the original source for this, for reference.
 const GenericDekExtractor = {
-  extract($, cachedMeta, content) {
+  extract($, content, cachedMeta) {
    return null
  }
 }
--- a/src/extractor/generic/index.js
+++ b/src/extractor/generic/index.js
@@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
 const GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  datePublished : GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  leadImageUrl: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
-  parse: (url, html, $) => {
+  parse: function(url, html, $) {
    if (html) {
      $ = cheerio.load(html)
    } else {
      // TODO
      // Fetch link, following redirects
      // to return html and initialize $
    }
    // Cached value of every meta name in our document.
@@ -26,14 +28,12 @@ const GenericExtractor = {
      return $(node).attr('name')
    }).toArray()
-    const title = GenericTitleExtractor.extract($, url, metaCache)
+    const title = this.title($, url, metaCache)
-    const datePublished =
+    const datePublished = this.datePublished($, url, metaCache)
-      GenericDatePublishedExtractor.extract($, url, metaCache)
+    const author = this.author($, metaCache)
-    const author = GenericAuthorExtractor.extract($, metaCache)
+    const content = this.content($, html)
-    const content = GenericContentExtractor.parse($, html)
+    const leadImageUrl = this.leadImageUrl($, content, metaCache)
-    const leadImageUrl =
+    const dek = this.dek($, content, metaCache)
-      GenericLeadImageUrlExtractor.extract($, content, metaCache)
-    const dek = GenericDekExtractor.extract($, metaCache, content)
    return {
      title,