feat: RootExtractor performs extraction using custom and generic

extraction methods
8 years ago · 7d88fee199
parent 937138c7bb
commit 7d88fee199
27 changed files with 288 additions and 131 deletions
--- a/TODO.md
+++ b/TODO.md
@ -1,4 +1,5 @@
 TODO:
+- change customselector to rootselector. consider other options for generalizing cleaning (use generic cleaners)
 - run makeLinksAbsolute on extracted content before returning
 - remove logic for fetching meta attrs with custom props
 - Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
--- a/rollup.config.js
+++ b/rollup.config.js
@ -3,7 +3,7 @@ import babelrc from 'babelrc-rollup'
 import commonjs from 'rollup-plugin-commonjs'

 export default {
-  entry: 'src/index.js',
+  entry: 'src/iris.js',
  plugins: [
    commonjs(),
    babel(babelrc()),
--- a/src/extractor/custom/extractor.js
+++ b/src/extractor/custom/extractor.js
@ -1,58 +0,0 @@
-import GenericExtractor from '../generic'
-import { stripTags } from '../utils'
-
-const CustomExtractor = {
-  extract(extractor=GenericExtractor, opts) {
-    const { $ } = opts
-    if (extractor.domain === '*') return extractor.parse(opts)
-
-    const title = extract({ ...opts, type: 'title', extractor })
-    const datePublished = extract({ ...opts, type: 'datePublished', extractor })
-    const author = extract({ ...opts, type: 'author', extractor })
-    const content = extract({ ...opts, type: 'content', extractor, html: true })
-    const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', extractor, html: true })
-    const dek = extract({ ...opts, type: 'dek', extractor, html: true })
-
-    return {
-      title,
-      content,
-      datePublished,
-      leadImageUrl,
-      dek,
-    }
-  }
-}
-
-function extract(opts) {
-  const { type, extractor, $ } = opts
-  return select($, extractor[type]) ||
-    GenericExtractor[type](opts)
-}
-
-function select($, selectObj, html=false) {
-  if (!selectObj) return
-  const { selectors } = selectObj
-  if (!selectors) return
-
-  const matchingSelector = selectors.find((selector) => {
-    return $(selector).length === 1
-  })
-  if (!matchingSelector) return
-
-  if (html) {
-    let $content = $(matchingSelector)
-    $content = cleanBySelectors($content, $, selectObj)
-  } else {
-    return stripTags($(matchingSelector).text(), $)
-  }
-}
-
-function cleanBySelectors($content, $, selectObj) {
-  const { clean } = selectObj
-
-  $(clean.join(','), $content).remove()
-
-  return $content
-}
-
-export default CustomExtractor
--- a/src/extractor/custom/extractor.test.js
+++ b/src/extractor/custom/extractor.test.js
@ -1,27 +0,0 @@
-import assert from 'assert'
-import fs from 'fs'
-import cheerio from 'cheerio'
-
-import CustomExtractor from './extractor'
-import GenericExtractor from '../generic'
-import NYMagExtractor from './nymag.com'
-
-describe('CustomExtractor', () => {
-  it('extracts based on custom selectors', () => {
-    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
-    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
-    const $ = cheerio.load(html)
-
-    const {
-      title,
-      content,
-      author,
-      datePublished,
-      leadImageUrl,
-    } = CustomExtractor.extract(
-      NYMagExtractor, { url, html, $, metaCache: [] }
-    )
-
-    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
-  })
-})
--- a/src/extractor/custom/nymag.com/index.js
+++ b/src/extractor/custom/nymag.com/index.js
@ -15,13 +15,13 @@ const NYMagExtractor = {
    ],

    // Array of tranformations to make on matched elements
-    // Each item in the array is an object. They key is the 
+    // Each item in the array is an object. The key is the
    // selector, the value is a tranformation function
    // for the matching node.
    transforms: [
      // Convert h1s to h2s
      {
-        'h1': ($node) => convertNodeTo($node, $, 'h2')
+        'h1': 'h2'
      },

      // Convert lazy-loaded noscript images to figures
@ -29,7 +29,7 @@ const NYMagExtractor = {
        'noscript': ($node) => {
          const $children = $node.children()
          if ($children.length === 1 && $children.get(0).tagName === 'img') {
-            convertNodeTo($node, $, 'figure')
+            return 'figure'
          }
        }
      }
--- a/src/extractor/generic/content/extract-clean-node.js
+++ b/src/extractor/generic/content/extract-clean-node.js
@ -1,5 +1,4 @@
 import {
-  convertNodeTo,
  rewriteTopLevel,
  cleanImages,
  stripJunkTags,
@ -10,6 +9,8 @@ import {
  removeEmpty,
 } from './utils/dom'

+import { convertNodeTo } from '../../utils/dom'
+
 // Clean our article content, returning a new, cleaned node.
 export default function extractCleanNode(article, $, cleanConditionally=true, title='') {
  // do I need to copy/clone?
--- a/src/extractor/generic/content/utils/dom/clean-h-ones.js
+++ b/src/extractor/generic/content/utils/dom/clean-h-ones.js
@ -1,4 +1,4 @@
-import { convertNodeTo } from './index'
+import { convertNodeTo } from '../../../../utils/dom'

 // H1 tags are typically the article title, which should be extracted
 // by the title extractor instead. If there's less than 3 of them (<3),
--- a/src/extractor/generic/content/utils/dom/convert-to-paragraphs.js
+++ b/src/extractor/generic/content/utils/dom/convert-to-paragraphs.js
@ -1,3 +1,5 @@
+import { convertNodeTo } from '../../../../utils/dom'
+
 import { brsToPs } from './index'
 import { DIV_TO_P_BLOCK_TAGS } from '../constants'
 // Loop through the provided doc, and convert any p-like elements to
@ -42,8 +44,3 @@ function convertSpans($) {

  return $
 }
-
-export function convertNodeTo(node, $, tag='p') {
-  $(node).replaceWith(`<${tag}>${$(node).contents()}</${tag}>`)
-  return $
-}
--- a/src/extractor/generic/content/utils/dom/convert-to-paragraphs.test.js
+++ b/src/extractor/generic/content/utils/dom/convert-to-paragraphs.test.js
@ -11,8 +11,6 @@ import {
  convertToParagraphs
 } from './index'

-import { convertNodeTo } from './convert-to-paragraphs'
-
 describe('Generic Extractor Utils', () => {
  describe('convertToParagraphs($)', () => {

@ -22,18 +20,6 @@ describe('Generic Extractor Utils', () => {

  })

-  describe('convertNodeTo(node, $)', () => {
-    it('takes a node with any tag and turns it into a P tag', () => {
-      const $ = cheerio.load(HTML.convertNodeTo.before)
-      const node = $('div').first()
-
-      const result = convertNodeTo(node, $).html()
-
-      assertClean(result, HTML.convertNodeTo.after)
-    })
-
-  })
-
 })


--- a/src/extractor/generic/content/utils/dom/rewrite-top-level.js
+++ b/src/extractor/generic/content/utils/dom/rewrite-top-level.js
@ -1,4 +1,4 @@
-import { convertNodeTo } from './index'
+import { convertNodeTo } from '../../../../utils/dom'

 // Rewrite the tag name to div if it's a top level node like body or
 // html to avoid later complications with multiple body tags.
--- a/src/extractor/generic/content/utils/fixtures/html.js
+++ b/src/extractor/generic/content/utils/fixtures/html.js
@ -219,12 +219,6 @@ const HTML = {
    `,
  },

-  // convertNodeTo
-  convertNodeTo: {
-    before: '<div>Should become a p</div>',
-    after: '<p>Should become a p</p>',
-  },
-
  // linkDensity
  linkDensity5: `
    <div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
--- a/src/extractor/generic/content/utils/scoring/score-content.js
+++ b/src/extractor/generic/content/utils/scoring/score-content.js
@ -7,7 +7,8 @@ import {
  getOrInitScore,
  addScore,
 } from './index'
-import { convertNodeTo } from '../dom'
+
+import { convertNodeTo } from '../../../../utils/dom'

 // score content. Parents get the full value of their children's
 // content score, grandparents half
--- a/src/extractor/generic/dek/clean-dek.js
+++ b/src/extractor/generic/dek/clean-dek.js
@ -1,5 +1,5 @@
 import { TEXT_LINK_RE } from './constants'
-import { stripTags } from '../../utils'
+import { stripTags } from '../../utils/dom'

 // Take a dek HTML fragment, and return the cleaned version of it.
 // Return None if the dek wasn't good enough.
--- a/src/extractor/generic/index.js
+++ b/src/extractor/generic/index.js
@ -17,7 +17,7 @@ const GenericExtractor = {
  leadImageUrl: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,

-  parse: function(options) {
+  extract: function(options) {
    let { html } = options

    if (html) {
--- a/src/extractor/generic/index.test.js
+++ b/src/extractor/generic/index.test.js
@ -6,8 +6,8 @@ import { clean } from './content/utils/dom/test-helpers'
 import GenericExtractor from './index'

 describe('GenericExtractor', () => {
-  describe('parse(html)', () => {
-    it("parses this old LA Times article", () => {
+  describe('extract(opts)', () => {
+    it("extracts this old LA Times article", () => {
      const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')

      const {
@ -16,7 +16,7 @@ describe('GenericExtractor', () => {
        datePublished,
        dek,
        leadImageUrl,
-      } = GenericExtractor.parse(
+      } = GenericExtractor.extract(
        { url: "http://latimes.com", html, metaCache: [] }
      )

@ -33,7 +33,7 @@ describe('GenericExtractor', () => {
      assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
    })

-    it("parses html and returns the article title", () => {
+    it("extracts html and returns the article title", () => {
      const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')

      const {
@ -42,7 +42,7 @@ describe('GenericExtractor', () => {
        datePublished,
        dek,
        leadImageUrl,
-      } = GenericExtractor.parse(
+      } = GenericExtractor.extract(
        { url: "http://wired.com", html, metaCache: [] }
      )

--- a/src/extractor/generic/title/utils/clean-title.js
+++ b/src/extractor/generic/title/utils/clean-title.js
@ -1,6 +1,6 @@
 import { TITLE_SPLITTERS_RE } from '../constants'
 import { resolveSplitTitle } from './index'
-import { stripTags } from '../../../utils'
+import { stripTags } from '../../../utils/dom'

 export default function cleanTitle(title, url, $) {
  // If title has |, :, or - in it, see if
--- a/src/extractor/generic/utils/extract-from-meta.js
+++ b/src/extractor/generic/utils/extract-from-meta.js
@ -1,4 +1,4 @@
-import { stripTags } from '../../utils'
+import { stripTags } from '../../utils/dom'

 // Given a node type to search for, and a list of meta tag names to
 // search for, find a meta tag associated.
--- a/src/extractor/root-extractor.js
+++ b/src/extractor/root-extractor.js
@ -0,0 +1,99 @@
+import 'babel-polyfill'
+
+import GenericExtractor from './generic'
+import { convertNodeTo, stripTags } from './utils/dom'
+
+const RootExtractor = {
+  extract(extractor=GenericExtractor, opts) {
+    const { $ } = opts
+    // This is the generic extractor. Run its extract method
+    if (extractor.domain === '*') return extractor.extract(opts)
+
+    const title = extract({ ...opts, type: 'title', extractor })
+    const datePublished = extract({ ...opts, type: 'datePublished', extractor })
+    const author = extract({ ...opts, type: 'author', extractor })
+    const content = extract({ ...opts, type: 'content', extractor, html: true })
+    const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', extractor, html: true })
+    const dek = extract({ ...opts, type: 'dek', extractor, html: true })
+
+    return {
+      title,
+      content,
+      datePublished,
+      leadImageUrl,
+      dek,
+    }
+  }
+}
+
+function extract(opts) {
+  const { type, extractor, $ } = opts
+
+  // If nothing matches the selector,
+  // run the Generic extraction
+  return select($, extractor[type]) ||
+    GenericExtractor[type](opts)
+}
+
+function select($, extractionOpts, html=false) {
+  // Skip if there's not extraction for this type
+  if (!extractionOpts) return
+
+  const { selectors } = extractionOpts
+
+  const matchingSelector = selectors.find((selector) => {
+    return $(selector).length === 1
+  })
+  if (!matchingSelector) return
+
+  // If the selector type requests html as its return type
+  // clean the element with provided cleaning selectors
+  if (html) {
+    let $content = $(matchingSelector)
+    $content = cleanBySelectors($content, $, extractionOpts)
+    $content = transformElements($content, $, extractionOpts)
+
+    return $content
+  } else {
+    return stripTags($(matchingSelector).text(), $)
+  }
+}
+
+// Remove elements by an array of selectors
+export function cleanBySelectors($content, $, { clean }) {
+  if (!clean) return
+
+  $(clean.join(','), $content).remove()
+
+  return $content
+}
+
+// Transform matching elements
+export function transformElements($content, $, { transforms }) {
+  if (!transforms) return
+
+  Reflect.ownKeys(transforms).forEach((key) => {
+    const $matches = $(key, $content)
+    const value = transforms[key]
+
+    // If value is a string, convert directly
+    if (typeof value === 'string') {
+      $matches.each((index, node) => {
+        convertNodeTo(node, $, transforms[key])
+      })
+    } else if (typeof value === 'function') {
+      // If value is function, apply function to node
+      $matches.each((index, node) => {
+        const result = value($(node))
+        // If function returns a string, convert node to that value
+        if (typeof result === 'string') {
+          convertNodeTo(node, $, result)
+        }
+      })
+    }
+  })
+
+  return $content
+}
+
+export default RootExtractor
--- a/src/extractor/root-extractor.test.js
+++ b/src/extractor/root-extractor.test.js
@ -0,0 +1,135 @@
+import assert from 'assert'
+import fs from 'fs'
+import cheerio from 'cheerio'
+
+import CustomExtractor from './root-extractor'
+import {
+  cleanBySelectors,
+  transformElements
+} from './root-extractor'
+
+import GenericExtractor from './generic'
+import NYMagExtractor from './custom/nymag.com'
+
+describe('CustomExtractor', () => {
+  it('extracts based on custom selectors', () => {
+    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
+    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
+    const $ = cheerio.load(html)
+
+    const {
+      title,
+      content,
+      author,
+      datePublished,
+      leadImageUrl,
+    } = CustomExtractor.extract(
+      NYMagExtractor, { url, html, $, metaCache: [] }
+    )
+
+    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
+  })
+})
+
+describe('cleanBySelectors($content, $, { clean })', () => {
+  it('removes provided selectors from the content', () => {
+    const opts = { clean: ['.ad', '.share'] }
+    const html = `
+      <div>
+        <div class="body">
+          <div class="share">Share this on twitter plz</div>
+          <p>This is some good content</p>
+          <div class="ad">Advertisement!</div>
+        </div>
+    </div>`
+    const $ = cheerio.load(html)
+
+    let $content = $('.body')
+    $content = cleanBySelectors($content, $, opts)
+
+    assert.equal($content.find('.ad').length, 0)
+    assert.equal($content.find('.share').length, 0)
+  })
+})
+
+describe('transformElements($content, $, { transforms })', () => {
+  it('performs a simple transformation on matched elements', () => {
+    const html = `
+    <div>
+      <div class="body">
+        <h1>WOW BIG TITLE</h1>
+        <p>Here are some words</p>
+        <h1>WOW BIG TITLE</h1>
+      </div>
+    </div>
+    `
+    const opts = {
+      transforms: { 'h1': 'h2' }
+    }
+    const $ = cheerio.load(html)
+    let $content = $('.body')
+
+    const after = `
+      <div class="body">
+        <h2>WOW BIG TITLE</h2>
+        <p>Here are some words</p>
+        <h2>WOW BIG TITLE</h2>
+      </div>
+    `
+
+    $content = transformElements($content, $, opts)
+    assertClean($.html($content), after)
+  })
+
+  it('performs a complex transformation on matched elements', () => {
+    const html = `
+    <div>
+      <div class="body">
+        <noscript>
+          <img src="/img.jpg" />
+        </noscript>
+        <noscript>
+          Something else
+        </noscript>
+        <p>Here are some words</p>
+      </div>
+    </div>
+    `
+    const opts = {
+      transforms: {
+        'noscript': ($node) => {
+          const $children = $node.children()
+          if ($children.length === 1 && $children.get(0).tagName === 'img') {
+            return 'figure'
+          }
+        }
+      }
+    }
+    const $ = cheerio.load(html)
+    let $content = $('.body')
+
+    const after = `
+      <div class="body">
+        <figure>
+          <img src="/img.jpg">
+        </figure>
+        <noscript>
+          Something else
+        </noscript>
+        <p>Here are some words</p>
+      </div>
+    `
+
+    $content = transformElements($content, $, opts)
+    assertClean($.html($content), after)
+  })
+})
+
+export function clean(string) {
+  return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ')
+}
+
+export function assertClean(a, b) {
+  assert.equal(clean(a), clean(b))
+}
+
--- a/src/extractor/utils/dom/convert-node-to.js
+++ b/src/extractor/utils/dom/convert-node-to.js
@ -0,0 +1,5 @@
+export default function convertNodeTo(node, $, tag='p') {
+  $(node).replaceWith(`<${tag}>${$(node).contents()}</${tag}>`)
+  return $
+}
+
--- a/src/extractor/utils/dom/convert-node-to.test.js
+++ b/src/extractor/utils/dom/convert-node-to.test.js
@ -0,0 +1,20 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import convertNodeTo from './convert-node-to'
+
+describe('convertNodeTo(node, $)', () => {
+  it('takes a node and converts it to a diff tag', () => {
+    const html = '<div>Should become a p</div>'
+    const $ = cheerio.load(html)
+    const node = $('div').first()
+
+    const result = convertNodeTo(node, $).html()
+    const after = '<p>Should become a p</p>'
+
+    assert.equal(result, after)
+  })
+
+})
+
+
--- a/src/extractor/utils/dom/index.js
+++ b/src/extractor/utils/dom/index.js
@ -1,2 +1,4 @@
 export { default as withinComment } from './within-comment'
+export { default as convertNodeTo } from './convert-node-to'
+export { default as stripTags } from './strip-tags'

--- a/src/extractor/utils/dom/strip-tags.js
+++ b/src/extractor/utils/dom/strip-tags.js
--- a/src/extractor/utils/dom/strip-tags.test.js
+++ b/src/extractor/utils/dom/strip-tags.test.js
@ -3,7 +3,7 @@ import cheerio from 'cheerio'

 import { stripTags } from './index'

-describe('cleanTitle(title, $)', () => {
+describe('stripTags(title, $)', () => {
  it('strips tags from a string of text', () => {
    const $ = cheerio.load('<div></div>')

--- a/src/extractor/utils/index.js
+++ b/src/extractor/utils/index.js
@ -1,2 +1 @@
 export { default as nodeIsSufficient } from './node-is-sufficient'
-export { default as stripTags } from './strip-tags'
--- a/src/index.test.js
+++ b/src/index.test.js
@ -1,4 +1,6 @@
-import Iris from './index'
+import assert from 'assert'
+
+import Iris from './iris'

 describe('Iris', function() {
  describe('parse(url)', function() {
--- a/src/index.js
+++ b/src/index.js
@ -2,7 +2,7 @@ import fs from 'fs'

 import Resource from './resource'
 import getExtractor from './extractor/get-extractor'
-import RootExtractor from './extractor/custom/extractor'
+import RootExtractor from './extractor/root-extractor'

 import fetchResource from './resource/utils/fetch-resource'