Basic merging of extractor sources
parent
0f45b39ca2
commit
b3f90c489e
@ -1,7 +1,8 @@
|
||||
import GenericExtractor from './generic'
|
||||
import NYMagExtractor from './custom/nymag.com'
|
||||
|
||||
// Registry of site-specific extractors, keyed by domain.
// '*' maps to the generic extractor used when no custom
// extractor exists for a page's hostname.
const Extractors = {
  // Fix: the original was missing the comma after this entry,
  // which is a syntax error.
  '*': GenericExtractor,
  'nymag.com': NYMagExtractor,
}

export default Extractors
|
||||
|
@ -0,0 +1,69 @@
|
||||
import GenericExtractor from '../generic'
|
||||
import { stripTags } from '../utils'
|
||||
|
||||
// Extracts article fields using a site-specific Extractor's selector
// config, falling back field-by-field to the GenericExtractor when a
// custom selector is missing or does not match the document.
const CustomExtractor = {
  // Extractor: selector config for the site ('*' means fully generic).
  // url/html: the page being parsed; $: the loaded cheerio document.
  extract(Extractor=GenericExtractor, url, html, $) {
    // The generic extractor parses the whole page itself.
    if (Extractor.domain === '*') return Extractor.parse(url, html, $)

    const meta = []

    const title =
      select($, Extractor.title) ||
      GenericExtractor.title($, url, meta)

    const datePublished =
      select($, Extractor.datePublished) ||
      GenericExtractor.datePublished($, url, meta)

    const author =
      select($, Extractor.author) ||
      GenericExtractor.author($, meta)

    const content =
      select($, Extractor.content, true) ||
      GenericExtractor.content($, html, {}, title)

    const leadImageUrl =
      select($, Extractor.leadImageUrl) ||
      GenericExtractor.leadImageUrl($, content, meta)

    const dek =
      select($, Extractor.dek) ||
      GenericExtractor.dek($, content, meta)

    return {
      title,
      content,
      // Fix: author was computed above but omitted from the result,
      // so callers destructuring `author` always got undefined.
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  }
}
|
||||
|
||||
// Runs a selector config against the document and returns the matched
// value: the element's stripped text by default, or the cleaned cheerio
// selection when `html` is true. Returns undefined when the config is
// absent or no selector matches exactly one element.
function select($, selectObj, html=false) {
  if (!selectObj) return
  const { selectors } = selectObj
  if (!selectors) return

  // Selectors are ordered most-likely-first; require exactly one match
  // so ambiguous selectors never silently pick the wrong node.
  const matchingSelector = selectors.find((selector) => {
    return $(selector).length === 1
  })
  if (!matchingSelector) return

  if (html) {
    let $content = $(matchingSelector)
    $content = cleanBySelectors($content, $, selectObj)
    // Fix: the cleaned content was computed but never returned, so the
    // html branch always yielded undefined and callers fell back to the
    // generic extractor.
    return $content
  } else {
    return stripTags($(matchingSelector).text(), $)
  }
}
|
||||
|
||||
// Removes unwanted nodes (ads, related-story boxes, etc.) from
// $content using the selector list in selectObj.clean.
// Fix: a selector config without a `clean` list (e.g. a title config)
// previously crashed on `clean.join`; it is now a no-op.
function cleanBySelectors($content, $, selectObj) {
  const { clean } = selectObj
  if (!clean) return $content

  $(clean.join(','), $content).remove()

  return $content
}
|
||||
|
||||
export default CustomExtractor
|
@ -0,0 +1,26 @@
|
||||
import assert from 'assert'
|
||||
import fs from 'fs'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import CustomExtractor from './extractor'
|
||||
import GenericExtractor from '../generic'
|
||||
import NYMagExtractor from './nymag.com'
|
||||
|
||||
describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    // Fixture path is relative to the repo root (mocha's cwd).
    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
    const $ = cheerio.load(html)

    const {
      title,
      content,
      author,
      datePublished,
      leadImageUrl,
    } = CustomExtractor.extract(NYMagExtractor, url, html, $)

    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
    // Fix: removed leftover debug `console.log(leadImageUrl)`.
    // TODO(review): the other extracted fields (content, author,
    // datePublished, leadImageUrl) are destructured but never asserted —
    // add expectations once known-good fixture values are confirmed.
  })
})
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,47 @@
|
||||
// Selector configuration for nymag.com, consumed by
// CustomExtractor.extract / select in this package.
const NYMagExtractor = {
  domain: 'nymag.com',
  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: [
      'div.article-content',
      'section.body',
      'article.article',
    ],

    // Selectors to remove from the extracted content
    clean: [
      '.ad',
      '.single-related-story',
    ],

    // Array of transformations to make on matched elements
    // Each item in the array is an object. The key is the
    // selector, the value is a transformation function
    // for the matching node.
    //
    // NOTE(review): `$` and `convertNodeTo` are free names here — neither
    // is imported or defined in this module, so these callbacks will
    // throw ReferenceError when invoked. Presumably the transform runner
    // is meant to supply `$` (and `convertNodeTo` should be imported);
    // verify against the consumer before relying on transforms.
    transforms: [
      // Convert h1s to h2s
      {
        'h1': ($node) => convertNodeTo($node, $, 'h2')
      },

      // Convert lazy-loaded noscript images to figures
      {
        'noscript': ($node) => {
          const $children = $node.children()
          // Only a noscript wrapping a single <img> becomes a figure.
          if ($children.length === 1 && $children.get(0).tagName === 'img') {
            convertNodeTo($node, $, 'figure')
          }
        }
      }
    ]
  },

  title: {
    selectors: [
      'h1.headline-primary',
      'h1',
    ]
  }
}

export default NYMagExtractor
|
@ -1,10 +1,14 @@
|
||||
import URL from 'url'
|
||||
|
||||
import Extractors from './all'
|
||||
import GenericExtractor from './generic'
|
||||
|
||||
// Returns the extractor for a URL: the generic extractor overlaid with
// any site-specific extractor registered for the URL's hostname, so
// custom fields win and missing fields fall back to the generic ones.
export default function getExtractor(url) {
  const parsedUrl = URL.parse(url)
  const { hostname } = parsedUrl

  // Fix: removed the stale `return Extractors[hostname] || Extractors['*']`
  // that preceded this return and made the merge unreachable.
  // Spreading undefined (unknown hostname) is a no-op, leaving the
  // plain GenericExtractor.
  return {
    ...GenericExtractor,
    ...Extractors[hostname]
  }
}
|
||||
|
Loading…
Reference in New Issue