Basic merging of extractor sources
parent
0f45b39ca2
commit
b3f90c489e
@ -1,7 +1,8 @@
|
||||
import GenericExtractor from './generic'
|
||||
import NYMagExtractor from './custom/nymag.com'
|
||||
|
||||
// Registry of site-specific extractors, keyed by domain.
// '*' maps to the generic extractor used when no custom
// extractor exists for a page's hostname.
const Extractors = {
  // Fix: the original was missing the comma after this entry,
  // which is a syntax error.
  '*': GenericExtractor,
  'nymag.com': NYMagExtractor,
}

export default Extractors
|
||||
|
@ -0,0 +1,69 @@
|
||||
import GenericExtractor from '../generic'
|
||||
import { stripTags } from '../utils'
|
||||
|
||||
// Extracts article fields using a site-specific Extractor's selector
// config, falling back field-by-field to the GenericExtractor when a
// custom selector is missing or does not match the document.
const CustomExtractor = {
  // Extractor: selector config for the site ('*' means fully generic).
  // url/html: the page being parsed; $: the loaded cheerio document.
  extract(Extractor=GenericExtractor, url, html, $) {
    // The generic extractor parses the whole page itself.
    if (Extractor.domain === '*') return Extractor.parse(url, html, $)

    const meta = []

    const title =
      select($, Extractor.title) ||
      GenericExtractor.title($, url, meta)

    const datePublished =
      select($, Extractor.datePublished) ||
      GenericExtractor.datePublished($, url, meta)

    const author =
      select($, Extractor.author) ||
      GenericExtractor.author($, meta)

    const content =
      select($, Extractor.content, true) ||
      GenericExtractor.content($, html, {}, title)

    const leadImageUrl =
      select($, Extractor.leadImageUrl) ||
      GenericExtractor.leadImageUrl($, content, meta)

    const dek =
      select($, Extractor.dek) ||
      GenericExtractor.dek($, content, meta)

    return {
      title,
      content,
      // Fix: author was computed above but omitted from the result,
      // so callers destructuring `author` always got undefined.
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  }
}
|
||||
|
||||
// Runs a selector config against the document and returns the matched
// value: the element's stripped text by default, or the cleaned cheerio
// selection when `html` is true. Returns undefined when the config is
// absent or no selector matches exactly one element.
function select($, selectObj, html=false) {
  if (!selectObj) return
  const { selectors } = selectObj
  if (!selectors) return

  // Selectors are ordered most-likely-first; require exactly one match
  // so ambiguous selectors never silently pick the wrong node.
  const matchingSelector = selectors.find((selector) => {
    return $(selector).length === 1
  })
  if (!matchingSelector) return

  if (html) {
    let $content = $(matchingSelector)
    $content = cleanBySelectors($content, $, selectObj)
    // Fix: the cleaned content was computed but never returned, so the
    // html branch always yielded undefined and callers fell back to the
    // generic extractor.
    return $content
  } else {
    return stripTags($(matchingSelector).text(), $)
  }
}
|
||||
|
||||
// Removes unwanted nodes (ads, related-story boxes, etc.) from
// $content using the selector list in selectObj.clean.
// Fix: a selector config without a `clean` list (e.g. a title config)
// previously crashed on `clean.join`; it is now a no-op.
function cleanBySelectors($content, $, selectObj) {
  const { clean } = selectObj
  if (!clean) return $content

  $(clean.join(','), $content).remove()

  return $content
}
|
||||
|
||||
export default CustomExtractor
|
@ -0,0 +1,26 @@
|
||||
import assert from 'assert'
|
||||
import fs from 'fs'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import CustomExtractor from './extractor'
|
||||
import GenericExtractor from '../generic'
|
||||
import NYMagExtractor from './nymag.com'
|
||||
|
||||
describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    // Fixture path is relative to the repo root (mocha's cwd).
    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
    const $ = cheerio.load(html)

    const {
      title,
      content,
      author,
      datePublished,
      leadImageUrl,
    } = CustomExtractor.extract(NYMagExtractor, url, html, $)

    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
    // Fix: removed leftover debug `console.log(leadImageUrl)`.
    // TODO(review): the other extracted fields (content, author,
    // datePublished, leadImageUrl) are destructured but never asserted —
    // add expectations once known-good fixture values are confirmed.
  })
})
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,47 @@
|
||||
// Selector configuration for nymag.com, consumed by
// CustomExtractor.extract / select in this package.
const NYMagExtractor = {
  domain: 'nymag.com',
  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: [
      'div.article-content',
      'section.body',
      'article.article',
    ],

    // Selectors to remove from the extracted content
    clean: [
      '.ad',
      '.single-related-story',
    ],

    // Array of transformations to make on matched elements
    // Each item in the array is an object. The key is the
    // selector, the value is a transformation function
    // for the matching node.
    //
    // NOTE(review): `$` and `convertNodeTo` are free names here — neither
    // is imported or defined in this module, so these callbacks will
    // throw ReferenceError when invoked. Presumably the transform runner
    // is meant to supply `$` (and `convertNodeTo` should be imported);
    // verify against the consumer before relying on transforms.
    transforms: [
      // Convert h1s to h2s
      {
        'h1': ($node) => convertNodeTo($node, $, 'h2')
      },

      // Convert lazy-loaded noscript images to figures
      {
        'noscript': ($node) => {
          const $children = $node.children()
          // Only a noscript wrapping a single <img> becomes a figure.
          if ($children.length === 1 && $children.get(0).tagName === 'img') {
            convertNodeTo($node, $, 'figure')
          }
        }
      }
    ]
  },

  title: {
    selectors: [
      'h1.headline-primary',
      'h1',
    ]
  }
}

export default NYMagExtractor
|
@ -1,10 +1,14 @@
|
||||
import URL from 'url'
|
||||
|
||||
import Extractors from './all'
|
||||
import GenericExtractor from './generic'
|
||||
|
||||
// Returns the extractor for a URL: the generic extractor overlaid with
// any site-specific extractor registered for the URL's hostname, so
// custom fields win and missing fields fall back to the generic ones.
export default function getExtractor(url) {
  const parsedUrl = URL.parse(url)
  const { hostname } = parsedUrl

  // Fix: removed the stale `return Extractors[hostname] || Extractors['*']`
  // that preceded this return and made the merge unreachable.
  // Spreading undefined (unknown hostname) is a no-op, leaving the
  // plain GenericExtractor.
  return {
    ...GenericExtractor,
    ...Extractors[hostname]
  }
}
|
||||
|
Loading…
Reference in New Issue