basic merging of extracting sources

pull/1/head
Adam Pash 8 years ago
parent 0f45b39ca2
commit b3f90c489e

@ -1,7 +1,8 @@
import GenericExtractor from './generic'
import NYMagExtractor from './custom/nymag.com'
// Registry of available extractors, looked up by hostname.
// The '*' key maps to the generic extractor.
const Extractors = {
  '*': GenericExtractor,
  'nymag.com': NYMagExtractor,
}
export default Extractors

@ -0,0 +1,69 @@
import GenericExtractor from '../generic'
import { stripTags } from '../utils'
const CustomExtractor = {
  /**
   * Extract article fields from a page, trying the custom extractor's
   * selectors first and falling back to the matching GenericExtractor
   * routine for every field whose selectors produce nothing.
   *
   * @param {Object} Extractor - extractor definition; defaults to
   *   GenericExtractor. When its `domain` is '*', extraction is delegated
   *   entirely to `Extractor.parse`.
   * @param {string} url - URL of the page being extracted
   * @param {string} html - raw HTML of the page
   * @param {Function} $ - loaded DOM query function for the page
   * @returns {Object} `{ title, content, author, datePublished,
   *   leadImageUrl, dek }`, or whatever `Extractor.parse` returns for the
   *   generic ('*') extractor.
   */
  extract(Extractor=GenericExtractor, url, html, $) {
    // The generic extractor carries no selector definitions; hand off to it.
    if (Extractor.domain === '*') return Extractor.parse(url, html, $)

    const meta = []

    const title =
      select($, Extractor.title) ||
      GenericExtractor.title($, url, meta)

    const datePublished =
      select($, Extractor.datePublished) ||
      GenericExtractor.datePublished($, url, meta)

    const author =
      select($, Extractor.author) ||
      GenericExtractor.author($, meta)

    // Content is selected in HTML mode (third argument) rather than as text.
    const content =
      select($, Extractor.content, true) ||
      GenericExtractor.content($, html, {}, title)

    // The lead image and dek fallbacks are given the extracted content.
    const leadImageUrl =
      select($, Extractor.leadImageUrl) ||
      GenericExtractor.leadImageUrl($, content, meta)

    const dek =
      select($, Extractor.dek) ||
      GenericExtractor.dek($, content, meta)

    return {
      title,
      content,
      // Previously computed but dropped from the result.
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  }
}
/**
 * Resolve a single field using an extractor's selector definition.
 *
 * The first selector in `selectObj.selectors` that matches exactly one
 * node wins. In text mode the node's text is passed through `stripTags`;
 * in HTML mode the node is cleaned with `cleanBySelectors` and serialized
 * with `$.html`.
 *
 * @param {Function} $ - loaded DOM query function for the page
 * @param {Object} [selectObj] - field definition holding a `selectors` array
 * @param {boolean} [html=false] - return the node's HTML instead of its text
 * @returns {string|undefined} the extracted value, or undefined when there
 *   is no definition, no selectors, or no selector matches exactly one node
 */
function select($, selectObj, html=false) {
  if (!selectObj) return

  const { selectors } = selectObj
  if (!selectors) return

  // Only accept a selector that is unambiguous (exactly one match).
  const matchingSelector = selectors.find((selector) => $(selector).length === 1)
  if (!matchingSelector) return

  if (html) {
    const $content = cleanBySelectors($(matchingSelector), $, selectObj)
    // The HTML branch previously fell through without returning anything,
    // so custom content selectors were silently ignored.
    return $.html($content)
  }

  return stripTags($(matchingSelector).text(), $)
}
/**
 * Remove unwanted nodes from extracted content.
 *
 * @param {Object} $content - selection holding the extracted content
 * @param {Function} $ - loaded DOM query function for the page
 * @param {Object} selectObj - field definition; its optional `clean` array
 *   lists selectors to remove from within `$content`
 * @returns {Object} the same `$content` selection, after removal
 */
function cleanBySelectors($content, $, selectObj) {
  const { clean } = selectObj

  // Extractors are not required to define `clean`; nothing to remove then.
  if (!clean || clean.length === 0) return $content

  // One combined query, scoped to the content selection.
  $(clean.join(','), $content).remove()

  return $content
}
export default CustomExtractor

@ -0,0 +1,26 @@
import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import CustomExtractor from './extractor'
import GenericExtractor from '../generic'
import NYMagExtractor from './nymag.com'
// Runs the NYMag custom extractor against a saved fixture page and checks
// the title picked up through its custom selectors.
describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
    const $ = cheerio.load(html)

    // Only the title is asserted; the debug console.log and the unused
    // destructured fields were removed.
    const { title } = CustomExtractor.extract(NYMagExtractor, url, html, $)

    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
  })
})

File diff suppressed because one or more lines are too long

@ -0,0 +1,47 @@
// Custom extractor definition for nymag.com.
const NYMagExtractor = {
  domain: 'nymag.com',

  content: {
    // Ordered from most to least likely; the first matching selector is used.
    selectors: ['div.article-content', 'section.body', 'article.article'],

    // Selectors for nodes to strip out of the extracted content.
    clean: ['.ad', '.single-related-story'],

    // Transformations to apply to matched elements. Each entry is an object
    // whose key is a selector and whose value is the transformation function
    // for a node matching it.
    // NOTE(review): neither `$` nor `convertNodeTo` is imported or defined in
    // the visible part of this module - confirm they are in scope before
    // these transforms are ever invoked.
    transforms: [
      // h1 -> h2
      { h1: ($node) => convertNodeTo($node, $, 'h2') },

      // A noscript wrapping a single lazy-loaded img -> figure
      {
        noscript: ($node) => {
          const $kids = $node.children()
          const wrapsLoneImage =
            $kids.length === 1 && $kids.get(0).tagName === 'img'
          if (!wrapsLoneImage) return

          convertNodeTo($node, $, 'figure')
        },
      },
    ],
  },

  title: {
    selectors: ['h1.headline-primary', 'h1'],
  },
}
export default NYMagExtractor

@ -1,10 +1,14 @@
import URL from 'url'
import Extractors from './all'
import GenericExtractor from './generic'
/**
 * Build the extractor to use for a URL.
 *
 * The result is GenericExtractor's properties overlaid with those of the
 * extractor registered for the URL's hostname, so a custom extractor only
 * needs to define what it overrides. With no registered extractor the
 * result is a shallow copy of GenericExtractor.
 *
 * @param {string} url - URL of the page to be extracted
 * @returns {Object} merged extractor definition
 */
export default function getExtractor(url) {
  const { hostname } = URL.parse(url)

  // An early `return Extractors[hostname] || Extractors['*']` used to sit
  // here and made the merge below unreachable.
  return {
    ...GenericExtractor,
    // Spreading undefined (no extractor for this hostname) adds nothing.
    ...Extractors[hostname],
  }
}

@ -8,4 +8,10 @@ describe('getExtractor(url)', () => {
assert.equal(extractor.domain, '*')
})
// A hostname with a registered extractor should surface that extractor's domain.
it('returns a custom extractor if found', () => {
  const { domain } = getExtractor('https://nymag.com')

  assert.equal(domain, 'nymag.com')
})
})

Loading…
Cancel
Save