basic merging of extracting sources

8 years ago · b3f90c489e
parent 0f45b39ca2
commit b3f90c489e
7 changed files with 3072 additions and 2 deletions
--- a/src/extractor/all.js
+++ b/src/extractor/all.js
@ -1,7 +1,8 @@
 import GenericExtractor from './generic'
 import NYMagExtractor from './custom/nymag.com'
 // Registry of extractors keyed by the hostname they apply to.
 // getExtractor() indexes this object with the hostname parsed from a URL.
 const Extractors = {
-  '*': GenericExtractor
+  'nymag.com': NYMagExtractor,
 }
 export default Extractors
--- a/src/extractor/custom/extractor.js
+++ b/src/extractor/custom/extractor.js
@ -0,0 +1,69 @@
 import GenericExtractor from '../generic'
 import { stripTags } from '../utils'
 /**
  * Extracts article fields using a site-specific extractor definition,
  * falling back field by field to GenericExtractor when the custom
  * selectors yield nothing.
  */
 const CustomExtractor = {
  /**
   * @param {Object} Extractor - extractor definition; defaults to GenericExtractor.
   *   Each optional field (title, datePublished, author, content,
   *   leadImageUrl, dek) is a selector object handed to select().
   * @param {string} url - URL of the page being extracted.
   * @param {string} html - raw HTML of the page.
   * @param {Function} $ - DOM query function for the page (the test passes
   *   a cheerio instance loaded from `html`).
   * @returns {Object} whatever Extractor.parse() returns for a '*' extractor,
   *   otherwise { title, content, author, datePublished, leadImageUrl, dek }.
   */
  extract(Extractor=GenericExtractor, url, html, $) {
    // A '*' extractor carries its own parse(); hand the whole job over.
    if (Extractor.domain === '*') return Extractor.parse(url, html, $)

    // No cached meta names are collected here; the generic extractors
    // receive an empty list.
    const meta = []

    const title =
      select($, Extractor.title) ||
      GenericExtractor.title($, url, meta)

    const datePublished =
      select($, Extractor.datePublished) ||
      GenericExtractor.datePublished($, url, meta)

    const author =
      select($, Extractor.author) ||
      GenericExtractor.author($, meta)

    // `true` asks select() for the HTML path rather than stripped text.
    const content =
      select($, Extractor.content, true) ||
      GenericExtractor.content($, html, {}, title)

    const leadImageUrl =
      select($, Extractor.leadImageUrl) ||
      GenericExtractor.leadImageUrl($, content, meta)

    const dek =
      select($, Extractor.dek) ||
      GenericExtractor.dek($, content, meta)

    // `author` was computed above but previously dropped from the result.
    return {
      title,
      content,
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  }
 }
 /**
  * Resolves one field from a selector object.
  *
  * Walks `selectObj.selectors` in order and uses the first selector that
  * matches exactly one element in the document.
  *
  * @param {Function} $ - DOM query function for the page.
  * @param {Object} [selectObj] - `{ selectors: string[], clean?: string[] }`.
  * @param {boolean} [html=false] - when true, return the matched node's
  *   markup (after cleaning); otherwise return its tag-stripped text.
  * @returns {string|undefined} the extracted value, or undefined when there
  *   is no selector object, no selectors, or no selector matching exactly one
  *   element.
  */
 function select($, selectObj, html=false) {
  if (!selectObj) return
  const { selectors } = selectObj
  if (!selectors) return

  const matchingSelector = selectors.find((selector) => $(selector).length === 1)
  if (!matchingSelector) return

  if (html) {
    const $content = cleanBySelectors($(matchingSelector), $, selectObj)
    // Previously this branch fell through without returning, so custom
    // content selectors never produced a value.
    return $.html($content)
  }

  return stripTags($(matchingSelector).text(), $)
 }
 /**
  * Removes unwanted descendants from an extracted content node.
  *
  * @param {Object} $content - the selected content node(s).
  * @param {Function} $ - DOM query function; called as $(selector, context).
  * @param {Object} selectObj - selector object; `clean` is an optional list
  *   of selectors whose matches are removed from within $content.
  * @returns {Object} the same $content, after removal.
  */
 function cleanBySelectors($content, $, selectObj) {
  const { clean } = selectObj
  // A selector object without a clean list (or with an empty one) has
  // nothing to strip; calling clean.join on undefined would throw.
  if (!clean || clean.length === 0) return $content

  $(clean.join(','), $content).remove()
  return $content
 }
 export default CustomExtractor
--- a/src/extractor/custom/extractor.test.js
+++ b/src/extractor/custom/extractor.test.js
@ -0,0 +1,26 @@
 import assert from 'assert'
 import fs from 'fs'
 import cheerio from 'cheerio'
 import CustomExtractor from './extractor'
 import GenericExtractor from '../generic'
 import NYMagExtractor from './nymag.com'
 // Runs the NYMag extractor against a saved article fixture and checks the
 // field pulled out by its custom selectors.
 describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
    const $ = cheerio.load(html)

    // Only the title is asserted on, so only the title is destructured.
    const { title } = CustomExtractor.extract(NYMagExtractor, url, html, $)

    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
  })
 })
--- a/src/extractor/custom/nymag.com/fixtures/test.html
+++ b/src/extractor/custom/nymag.com/fixtures/test.html
--- a/src/extractor/custom/nymag.com/index.js
+++ b/src/extractor/custom/nymag.com/index.js
@ -0,0 +1,47 @@
 // Site-specific extractor definition for nymag.com.
 // Each field (content, title) is a selector object; `domain` identifies
 // the site this definition applies to.
 const NYMagExtractor = {
  domain: 'nymag.com',
  content: {
    // Ordered by most likely. The custom extractor's select() uses the
    // first selector that matches exactly one element.
    selectors: [
      'div.article-content',
      'section.body',
      'article.article',
    ],
    // Selectors to remove from the extracted content
    clean: [
      '.ad',
      '.single-related-story',
    ],
    // Array of transformations to make on matched elements.
    // Each item in the array is an object. The key is the
    // selector, the value is a transformation function
    // for the matching node.
    // NOTE(review): neither `$` nor `convertNodeTo` is defined or imported
    // in the visible part of this file, so these callbacks would throw a
    // ReferenceError if invoked — confirm before wiring transforms up.
    transforms: [
      // Convert h1s to h2s
      {
        'h1': ($node) => convertNodeTo($node, $, 'h2')
      },
      // Convert lazy-loaded noscript images to figures
      {
        'noscript': ($node) => {
          const $children = $node.children()
          // Only convert when the noscript wraps exactly one <img> child.
          if ($children.length === 1 && $children.get(0).tagName === 'img') {
            convertNodeTo($node, $, 'figure')
          }
        }
      }
    ]
  },
  title: {
    // Most specific headline selector first, then any h1.
    selectors: [
      'h1.headline-primary',
      'h1',
    ]
  }
 }
 export default NYMagExtractor
--- a/src/extractor/get-extractor.js
+++ b/src/extractor/get-extractor.js
@ -1,10 +1,14 @@
 import URL from 'url'
 import Extractors from './all'
 import GenericExtractor from './generic'
 /**
  * Builds the extractor to use for a URL.
  *
  * Parses the hostname from `url` and returns a new object made by
  * spreading GenericExtractor and then Extractors[hostname] over it, so
  * any key a site-specific extractor defines overrides the generic one.
  * When no extractor is registered for the hostname, the second spread
  * adds nothing and the result carries only GenericExtractor's properties.
  *
  * The lookup is an exact hostname key match.
  *
  * @param {string} url - URL of the page to extract.
  * @returns {Object} merged extractor definition.
  */
 export default function getExtractor(url) {
  const parsedUrl = URL.parse(url)
  const { hostname } = parsedUrl
-  return Extractors[hostname] || Extractors['*']
+  return {
    ...GenericExtractor,
    ...Extractors[hostname]
  }
 }
--- a/src/extractor/get-extractor.test.js
+++ b/src/extractor/get-extractor.test.js
@ -8,4 +8,10 @@ describe('getExtractor(url)', () => {
    assert.equal(extractor.domain, '*')
  })
  it('returns a custom extractor if found', () => {
    const extractor = getExtractor('https://nymag.com')
    assert.equal(extractor.domain, 'nymag.com')
  })
 })