basic merging of extracting sources
parent
0f45b39ca2
commit
b3f90c489e
@ -1,7 +1,8 @@
|
|||||||
import GenericExtractor from './generic'
|
import GenericExtractor from './generic'
|
||||||
|
import NYMagExtractor from './custom/nymag.com'
|
||||||
|
|
||||||
// Registry of site-specific extractors, keyed by the hostname they handle.
const Extractors = {
  'nymag.com': NYMagExtractor,
}
|
||||||
|
|
||||||
export default Extractors
|
export default Extractors
|
||||||
|
@ -0,0 +1,69 @@
|
|||||||
|
import GenericExtractor from '../generic'
|
||||||
|
import { stripTags } from '../utils'
|
||||||
|
|
||||||
|
const CustomExtractor = {
  /**
   * Extract article fields using a site-specific extractor, falling back to
   * the generic extractor for any field the custom selectors do not yield.
   *
   * @param {Object} Extractor - Extractor definition. When its `domain` is
   *   '*', the call is delegated to `Extractor.parse(url, html, $)`.
   * @param {string} url - URL of the page being extracted.
   * @param {string} html - Raw HTML of the page.
   * @param {Function} $ - Selector function for the loaded document.
   * @returns {Object} `{ title, content, author, datePublished,
   *   leadImageUrl, dek }`, or whatever `Extractor.parse` returns for the
   *   generic ('*') extractor.
   */
  extract(Extractor=GenericExtractor, url, html, $) {
    if (Extractor.domain === '*') return Extractor.parse(url, html, $)

    const meta = []

    const title =
      select($, Extractor.title) ||
      GenericExtractor.title($, url, meta)

    const datePublished =
      select($, Extractor.datePublished) ||
      GenericExtractor.datePublished($, url, meta)

    const author =
      select($, Extractor.author) ||
      GenericExtractor.author($, meta)

    // Third argument asks select() for the HTML of the match, not its text.
    const content =
      select($, Extractor.content, true) ||
      GenericExtractor.content($, html, {}, title)

    const leadImageUrl =
      select($, Extractor.leadImageUrl) ||
      GenericExtractor.leadImageUrl($, content, meta)

    const dek =
      select($, Extractor.dek) ||
      GenericExtractor.dek($, content, meta)

    return {
      title,
      content,
      // `author` was computed above but previously left out of the result.
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  },
}
|
||||||
|
|
||||||
|
/**
 * Resolve a field from the document using a custom selector definition.
 *
 * The first entry of `selectObj.selectors` that matches exactly one element
 * is used.
 *
 * @param {Function} $ - Selector function for the loaded document.
 * @param {Object} [selectObj] - Definition with a `selectors` array and,
 *   for HTML fields, an optional `clean` array.
 * @param {boolean} [html=false] - When true, return the matched element's
 *   cleaned HTML; otherwise return its tag-stripped text.
 * @returns {string|undefined} The extracted value, or undefined when there
 *   is no definition, no selectors, or no selector matching one element.
 */
function select($, selectObj, html=false) {
  if (!selectObj) return undefined

  const { selectors } = selectObj
  if (!selectors) return undefined

  const matchingSelector = selectors.find(selector => $(selector).length === 1)
  if (!matchingSelector) return undefined

  if (html) {
    const $content = cleanBySelectors($(matchingSelector), $, selectObj)
    // Previously this branch fell off the end and returned undefined, so
    // custom content selectors never took effect.
    return $.html($content)
  }

  return stripTags($(matchingSelector).text(), $)
}
|
||||||
|
|
||||||
|
/**
 * Remove unwanted nodes from extracted content.
 *
 * @param {Object} $content - Selection holding the extracted content.
 * @param {Function} $ - Selector function; called as `$(selector, context)`.
 * @param {Object} selectObj - Selector definition; its optional `clean`
 *   array lists selectors to remove from within `$content`.
 * @returns {Object} The same `$content` that was passed in.
 */
function cleanBySelectors($content, $, selectObj) {
  const { clean } = selectObj

  // A definition without a `clean` list (or with an empty one) has nothing
  // to strip; skip the query instead of calling `.join` on undefined or
  // querying with an empty selector string.
  if (!clean || clean.length === 0) return $content

  $(clean.join(','), $content).remove()

  return $content
}
|
||||||
|
|
||||||
|
export default CustomExtractor
|
@ -0,0 +1,26 @@
|
|||||||
|
import assert from 'assert'
|
||||||
|
import fs from 'fs'
|
||||||
|
import cheerio from 'cheerio'
|
||||||
|
|
||||||
|
import CustomExtractor from './extractor'
|
||||||
|
import GenericExtractor from '../generic'
|
||||||
|
import NYMagExtractor from './nymag.com'
|
||||||
|
|
||||||
|
// Spec for CustomExtractor, run against the saved nymag.com fixture page.
describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
    const $ = cheerio.load(html)

    // Only the title is asserted on, so only the title is pulled out.
    const { title } = CustomExtractor.extract(NYMagExtractor, url, html, $)

    assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
  })
})
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,47 @@
|
|||||||
|
// Custom extraction rules for nymag.com.
//
// NOTE(review): the transform callbacks below reference `convertNodeTo` and
// `$`, neither of which is declared or imported in the visible part of this
// file. Confirm both are in scope before anything invokes `transforms`.
const NYMagExtractor = {
  domain: 'nymag.com',

  content: {
    // Ordered from most to least likely; the extractor stops at the first
    // occurrence.
    selectors: [
      'div.article-content',
      'section.body',
      'article.article',
    ],

    // Selectors to remove from the extracted content.
    clean: [
      '.ad',
      '.single-related-story',
    ],

    // Array of transformations to make on matched elements. Each item is an
    // object whose key is a selector and whose value is the transformation
    // function for the matching node.
    transforms: [
      // Convert h1s to h2s.
      {
        h1: ($node) => {
          return convertNodeTo($node, $, 'h2')
        },
      },

      // Convert lazy-loaded noscript images to figures.
      {
        noscript: ($node) => {
          const $kids = $node.children()
          const wrapsLoneImage = $kids.length === 1 && $kids.get(0).tagName === 'img'

          if (wrapsLoneImage) {
            convertNodeTo($node, $, 'figure')
          }
        },
      },
    ],
  },

  title: {
    selectors: [
      'h1.headline-primary',
      'h1',
    ],
  },
}
|
||||||
|
|
||||||
|
export default NYMagExtractor
|
@ -1,10 +1,14 @@
|
|||||||
import URL from 'url'
|
import URL from 'url'
|
||||||
|
|
||||||
import Extractors from './all'
|
import Extractors from './all'
|
||||||
|
import GenericExtractor from './generic'
|
||||||
|
|
||||||
/**
 * Build the extractor to use for a URL.
 *
 * The result is a fresh object: the generic extractor's properties,
 * overridden by those of the extractor registered for the URL's hostname,
 * if there is one.
 *
 * @param {string} url - URL of the page to extract.
 * @returns {Object} The merged extractor definition.
 */
export default function getExtractor(url) {
  const { hostname } = URL.parse(url)
  const siteExtractor = Extractors[hostname]

  // Object.assign skips an undefined source, so unknown hosts get a plain
  // copy of the generic extractor.
  return Object.assign({}, GenericExtractor, siteExtractor)
}
|
||||||
|
Loading…
Reference in New Issue