feat: RootExtractor performs extraction using custom and generic
extraction methods
pull/1/head
parent
937138c7bb
commit
7d88fee199
@ -1,58 +0,0 @@
|
||||
import GenericExtractor from '../generic'
|
||||
import { stripTags } from '../utils'
|
||||
|
||||
const CustomExtractor = {
|
||||
extract(extractor=GenericExtractor, opts) {
|
||||
const { $ } = opts
|
||||
if (extractor.domain === '*') return extractor.parse(opts)
|
||||
|
||||
const title = extract({ ...opts, type: 'title', extractor })
|
||||
const datePublished = extract({ ...opts, type: 'datePublished', extractor })
|
||||
const author = extract({ ...opts, type: 'author', extractor })
|
||||
const content = extract({ ...opts, type: 'content', extractor, html: true })
|
||||
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', extractor, html: true })
|
||||
const dek = extract({ ...opts, type: 'dek', extractor, html: true })
|
||||
|
||||
return {
|
||||
title,
|
||||
content,
|
||||
datePublished,
|
||||
leadImageUrl,
|
||||
dek,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function extract(opts) {
|
||||
const { type, extractor, $ } = opts
|
||||
return select($, extractor[type]) ||
|
||||
GenericExtractor[type](opts)
|
||||
}
|
||||
|
||||
function select($, selectObj, html=false) {
|
||||
if (!selectObj) return
|
||||
const { selectors } = selectObj
|
||||
if (!selectors) return
|
||||
|
||||
const matchingSelector = selectors.find((selector) => {
|
||||
return $(selector).length === 1
|
||||
})
|
||||
if (!matchingSelector) return
|
||||
|
||||
if (html) {
|
||||
let $content = $(matchingSelector)
|
||||
$content = cleanBySelectors($content, $, selectObj)
|
||||
} else {
|
||||
return stripTags($(matchingSelector).text(), $)
|
||||
}
|
||||
}
|
||||
|
||||
function cleanBySelectors($content, $, selectObj) {
|
||||
const { clean } = selectObj
|
||||
|
||||
$(clean.join(','), $content).remove()
|
||||
|
||||
return $content
|
||||
}
|
||||
|
||||
export default CustomExtractor
|
@ -1,27 +0,0 @@
|
||||
import assert from 'assert'
|
||||
import fs from 'fs'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import CustomExtractor from './extractor'
|
||||
import GenericExtractor from '../generic'
|
||||
import NYMagExtractor from './nymag.com'
|
||||
|
||||
describe('CustomExtractor', () => {
|
||||
it('extracts based on custom selectors', () => {
|
||||
const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
|
||||
const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
|
||||
const $ = cheerio.load(html)
|
||||
|
||||
const {
|
||||
title,
|
||||
content,
|
||||
author,
|
||||
datePublished,
|
||||
leadImageUrl,
|
||||
} = CustomExtractor.extract(
|
||||
NYMagExtractor, { url, html, $, metaCache: [] }
|
||||
)
|
||||
|
||||
assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
|
||||
})
|
||||
})
|
@ -0,0 +1,99 @@
|
||||
import 'babel-polyfill'
|
||||
|
||||
import GenericExtractor from './generic'
|
||||
import { convertNodeTo, stripTags } from './utils/dom'
|
||||
|
||||
const RootExtractor = {
|
||||
extract(extractor=GenericExtractor, opts) {
|
||||
const { $ } = opts
|
||||
// This is the generic extractor. Run its extract method
|
||||
if (extractor.domain === '*') return extractor.extract(opts)
|
||||
|
||||
const title = extract({ ...opts, type: 'title', extractor })
|
||||
const datePublished = extract({ ...opts, type: 'datePublished', extractor })
|
||||
const author = extract({ ...opts, type: 'author', extractor })
|
||||
const content = extract({ ...opts, type: 'content', extractor, html: true })
|
||||
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', extractor, html: true })
|
||||
const dek = extract({ ...opts, type: 'dek', extractor, html: true })
|
||||
|
||||
return {
|
||||
title,
|
||||
content,
|
||||
datePublished,
|
||||
leadImageUrl,
|
||||
dek,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function extract(opts) {
|
||||
const { type, extractor, $ } = opts
|
||||
|
||||
// If nothing matches the selector,
|
||||
// run the Generic extraction
|
||||
return select($, extractor[type]) ||
|
||||
GenericExtractor[type](opts)
|
||||
}
|
||||
|
||||
function select($, extractionOpts, html=false) {
|
||||
// Skip if there's not extraction for this type
|
||||
if (!extractionOpts) return
|
||||
|
||||
const { selectors } = extractionOpts
|
||||
|
||||
const matchingSelector = selectors.find((selector) => {
|
||||
return $(selector).length === 1
|
||||
})
|
||||
if (!matchingSelector) return
|
||||
|
||||
// If the selector type requests html as its return type
|
||||
// clean the element with provided cleaning selectors
|
||||
if (html) {
|
||||
let $content = $(matchingSelector)
|
||||
$content = cleanBySelectors($content, $, extractionOpts)
|
||||
$content = transformElements($content, $, extractionOpts)
|
||||
|
||||
return $content
|
||||
} else {
|
||||
return stripTags($(matchingSelector).text(), $)
|
||||
}
|
||||
}
|
||||
|
||||
// Remove elements by an array of selectors
|
||||
export function cleanBySelectors($content, $, { clean }) {
|
||||
if (!clean) return
|
||||
|
||||
$(clean.join(','), $content).remove()
|
||||
|
||||
return $content
|
||||
}
|
||||
|
||||
// Transform matching elements
|
||||
export function transformElements($content, $, { transforms }) {
|
||||
if (!transforms) return
|
||||
|
||||
Reflect.ownKeys(transforms).forEach((key) => {
|
||||
const $matches = $(key, $content)
|
||||
const value = transforms[key]
|
||||
|
||||
// If value is a string, convert directly
|
||||
if (typeof value === 'string') {
|
||||
$matches.each((index, node) => {
|
||||
convertNodeTo(node, $, transforms[key])
|
||||
})
|
||||
} else if (typeof value === 'function') {
|
||||
// If value is function, apply function to node
|
||||
$matches.each((index, node) => {
|
||||
const result = value($(node))
|
||||
// If function returns a string, convert node to that value
|
||||
if (typeof result === 'string') {
|
||||
convertNodeTo(node, $, result)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
return $content
|
||||
}
|
||||
|
||||
export default RootExtractor
|
@ -0,0 +1,135 @@
|
||||
import assert from 'assert'
|
||||
import fs from 'fs'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import CustomExtractor from './root-extractor'
|
||||
import {
|
||||
cleanBySelectors,
|
||||
transformElements
|
||||
} from './root-extractor'
|
||||
|
||||
import GenericExtractor from './generic'
|
||||
import NYMagExtractor from './custom/nymag.com'
|
||||
|
||||
describe('CustomExtractor', () => {
|
||||
it('extracts based on custom selectors', () => {
|
||||
const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
|
||||
const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
|
||||
const $ = cheerio.load(html)
|
||||
|
||||
const {
|
||||
title,
|
||||
content,
|
||||
author,
|
||||
datePublished,
|
||||
leadImageUrl,
|
||||
} = CustomExtractor.extract(
|
||||
NYMagExtractor, { url, html, $, metaCache: [] }
|
||||
)
|
||||
|
||||
assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation')
|
||||
})
|
||||
})
|
||||
|
||||
describe('cleanBySelectors($content, $, { clean })', () => {
|
||||
it('removes provided selectors from the content', () => {
|
||||
const opts = { clean: ['.ad', '.share'] }
|
||||
const html = `
|
||||
<div>
|
||||
<div class="body">
|
||||
<div class="share">Share this on twitter plz</div>
|
||||
<p>This is some good content</p>
|
||||
<div class="ad">Advertisement!</div>
|
||||
</div>
|
||||
</div>`
|
||||
const $ = cheerio.load(html)
|
||||
|
||||
let $content = $('.body')
|
||||
$content = cleanBySelectors($content, $, opts)
|
||||
|
||||
assert.equal($content.find('.ad').length, 0)
|
||||
assert.equal($content.find('.share').length, 0)
|
||||
})
|
||||
})
|
||||
|
||||
describe('transformElements($content, $, { transforms })', () => {
|
||||
it('performs a simple transformation on matched elements', () => {
|
||||
const html = `
|
||||
<div>
|
||||
<div class="body">
|
||||
<h1>WOW BIG TITLE</h1>
|
||||
<p>Here are some words</p>
|
||||
<h1>WOW BIG TITLE</h1>
|
||||
</div>
|
||||
</div>
|
||||
`
|
||||
const opts = {
|
||||
transforms: { 'h1': 'h2' }
|
||||
}
|
||||
const $ = cheerio.load(html)
|
||||
let $content = $('.body')
|
||||
|
||||
const after = `
|
||||
<div class="body">
|
||||
<h2>WOW BIG TITLE</h2>
|
||||
<p>Here are some words</p>
|
||||
<h2>WOW BIG TITLE</h2>
|
||||
</div>
|
||||
`
|
||||
|
||||
$content = transformElements($content, $, opts)
|
||||
assertClean($.html($content), after)
|
||||
})
|
||||
|
||||
it('performs a complex transformation on matched elements', () => {
|
||||
const html = `
|
||||
<div>
|
||||
<div class="body">
|
||||
<noscript>
|
||||
<img src="/img.jpg" />
|
||||
</noscript>
|
||||
<noscript>
|
||||
Something else
|
||||
</noscript>
|
||||
<p>Here are some words</p>
|
||||
</div>
|
||||
</div>
|
||||
`
|
||||
const opts = {
|
||||
transforms: {
|
||||
'noscript': ($node) => {
|
||||
const $children = $node.children()
|
||||
if ($children.length === 1 && $children.get(0).tagName === 'img') {
|
||||
return 'figure'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
const $ = cheerio.load(html)
|
||||
let $content = $('.body')
|
||||
|
||||
const after = `
|
||||
<div class="body">
|
||||
<figure>
|
||||
<img src="/img.jpg">
|
||||
</figure>
|
||||
<noscript>
|
||||
Something else
|
||||
</noscript>
|
||||
<p>Here are some words</p>
|
||||
</div>
|
||||
`
|
||||
|
||||
$content = transformElements($content, $, opts)
|
||||
assertClean($.html($content), after)
|
||||
})
|
||||
})
|
||||
|
||||
export function clean(string) {
|
||||
return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ')
|
||||
}
|
||||
|
||||
export function assertClean(a, b) {
|
||||
assert.equal(clean(a), clean(b))
|
||||
}
|
||||
|
@ -0,0 +1,5 @@
|
||||
export default function convertNodeTo(node, $, tag='p') {
|
||||
$(node).replaceWith(`<${tag}>${$(node).contents()}</${tag}>`)
|
||||
return $
|
||||
}
|
||||
|
@ -0,0 +1,20 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import convertNodeTo from './convert-node-to'
|
||||
|
||||
describe('convertNodeTo(node, $)', () => {
|
||||
it('takes a node and converts it to a diff tag', () => {
|
||||
const html = '<div>Should become a p</div>'
|
||||
const $ = cheerio.load(html)
|
||||
const node = $('div').first()
|
||||
|
||||
const result = convertNodeTo(node, $).html()
|
||||
const after = '<p>Should become a p</p>'
|
||||
|
||||
assert.equal(result, after)
|
||||
})
|
||||
|
||||
})
|
||||
|
||||
|
@ -1,2 +1,4 @@
|
||||
export { default as withinComment } from './within-comment'
|
||||
export { default as convertNodeTo } from './convert-node-to'
|
||||
export { default as stripTags } from './strip-tags'
|
||||
|
||||
|
Loading…
Reference in New Issue