Stripping unlikely candidates from DOM

pull/1/head
Adam Pash 8 years ago
parent 89a2cfbb82
commit 777e11c25c

@ -942,3 +942,16 @@ export const BLOCK_LEVEL_TAGS = [
'ul',
'video',
]
// Unlikely-candidate removal is implemented as a blacklist tempered by a
// whitelist: a class/id string is "unlikely" when it matches a blacklisted
// fragment and does NOT match any whitelisted fragment.
//
// CANDIDATES_BLACKLIST and CANDIDATES_WHITELIST expose the two halves as
// separate case-insensitive patterns. UNLIKELY_RE folds both checks into a
// single expression so a caller needs only one pass over the string.
//
// NOTE(review): the source lists are presumably arrays of regex fragments
// (they are joined with '|') -- confirm against their definitions.
const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|')
export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i')
const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|')
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i')
// A leading `!` inside a regular expression is a literal character, not a
// negation, so "not whitelisted" has to be written as a negative lookahead:
// from the start of the string, fail if a whitelisted fragment appears
// anywhere, then require a blacklisted fragment somewhere. `[\s\S]` is used
// instead of `.` so attribute values containing line breaks are still
// scanned in full.
export const UNLIKELY_RE = new RegExp(
  `^(?![\\s\\S]*(?:${candidates_whitelist}))[\\s\\S]*(?:${candidates_blacklist})`,
  'i'
)

@ -1,4 +1,5 @@
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
@ -44,6 +45,55 @@ const HTML = {
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
}
export default HTML

@ -1 +1,2 @@
export { default as getWeight } from './getWeight'
export { default as getWeight } from './get-weight'
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'

@ -0,0 +1,37 @@
import {
CANDIDATES_WHITELIST,
CANDIDATES_BLACKLIST,
UNLIKELY_RE,
} from '../constants'
// ## NOTES:
// This is a working first pass. If/when we start optimizing,
// this function is a good candidate for it.
/**
 * Walk the provided document and remove every non-link node whose
 * class/id text marks it as an unlikely candidate for article content.
 *
 * Links are skipped because they are very often flagged as
 * non-body-content while still living inside article-like content.
 *
 * A node is removed only when its combined "class id" text matches
 * CANDIDATES_BLACKLIST and does not match CANDIDATES_WHITELIST; the
 * whitelist is checked first and always wins.
 *
 * @param $ a cheerio object to strip nodes from
 * @returns the same cheerio object, with the unlikely nodes removed
 */
export default function stripUnlikelyCandidates($) {
  $('*').not('a').each((position, node) => {
    const $node = $(node)
    const className = $node.attr('class')
    const idName = $node.attr('id')

    // Without a class or an id there is nothing to classify.
    if (!className && !idName) return

    const signature = [className || '', idName || ''].join(' ')

    // Whitelisted nodes are kept even if they also look blacklisted.
    const isWhitelisted = CANDIDATES_WHITELIST.test(signature)
    if (!isWhitelisted && CANDIDATES_BLACKLIST.test(signature)) {
      $node.remove()
    }
  })

  return $
}

@ -0,0 +1,39 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { clean } from './test-helpers'
import HTML from './fixtures/html'
import {
stripUnlikelyCandidates
} from './index'
// Specs for stripUnlikelyCandidates: each case loads a fixture with
// cheerio, runs the stripper, and inspects the serialized document.
describe('Generic Extractor Utils', () => {
  describe('stripUnlikelyCandidates(node)', () => {
    // The untouched document is compared verbatim (no whitespace cleanup).
    it('returns original doc if no matches found', () => {
      const $ = cheerio.load(HTML.noMatches)
      const result = stripUnlikelyCandidates($)

      assert.equal(result.html(), HTML.noMatches)
    })

    // Before/after fixtures, compared after `clean` strips all whitespace.
    const beforeAfterCases = [
      ['strips unlikely matches from the doc', () => HTML.whitelistMatch],
      [
        'keeps likely matches even when they also match the blacklist',
        () => HTML.whiteAndBlack,
      ],
      [
        'removed likely matches when inside blacklist node',
        () => HTML.whiteInsideBlack,
      ],
    ]

    beforeAfterCases.forEach(([title, getFixture]) => {
      it(title, () => {
        const fixture = getFixture()
        const $ = cheerio.load(fixture.before)
        const actual = clean(stripUnlikelyCandidates($).html())

        assert.equal(actual, clean(fixture.after))
      })
    })
  })
})

@ -0,0 +1,3 @@
/**
 * Collapse a string for whitespace-insensitive comparison by removing
 * every whitespace character (spaces, tabs and line breaks), including
 * any at the start or end.
 *
 * @param string the text to normalize
 * @returns the text with all whitespace removed
 */
export function clean(string) {
  // Splitting on whitespace runs and re-joining drops leading, trailing
  // and interior whitespace in a single pass.
  const pieces = string.split(/\s+/)
  return pieces.join('')
}
Loading…
Cancel
Save