Stripping unlikely candidates from DOM
parent
89a2cfbb82
commit
777e11c25c
@ -1 +1,2 @@
|
||||
export { default as getWeight } from './getWeight'
|
||||
export { default as getWeight } from './get-weight'
|
||||
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
|
||||
|
@ -0,0 +1,37 @@
|
||||
import {
|
||||
CANDIDATES_WHITELIST,
|
||||
CANDIDATES_BLACKLIST,
|
||||
UNLIKELY_RE,
|
||||
} from '../constants'
|
||||
|
||||
// ## NOTES:
|
||||
// This is a working first pass, but if/when we start optimizing
|
||||
// this is a good candidate.
|
||||
|
||||
export default function stripUnlikelyCandidates($) {
  // Walk every non-link node in the document and drop the ones whose
  // class/id text marks them as unlikely to hold article content.
  //
  // Anchors are skipped on purpose: links to real content frequently
  // carry class/id names that look like non-body content, yet they can
  // sit inside article-like markup.
  //
  // :param $: a cheerio object to strip nodes from
  // :return $: the same cheerio object, after removals
  $('*').not('a').each((_, node) => {
    const $node = $(node)
    const classNames = $node.attr('class')
    const nodeId = $node.attr('id')

    // Nothing to match against, so the node is kept as-is.
    if (!classNames && !nodeId) return

    const signature = `${classNames || ''} ${nodeId || ''}`

    // The whitelist is consulted first and always wins; the blacklist is
    // only tested when the whitelist did not match.
    const isUnlikely =
      !CANDIDATES_WHITELIST.test(signature) &&
      CANDIDATES_BLACKLIST.test(signature)

    if (isUnlikely) {
      // Removing the node also takes its whole subtree out of the document.
      $node.remove()
    }
  })

  return $
}
|
@ -0,0 +1,39 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import { clean } from './test-helpers'
|
||||
import HTML from './fixtures/html'
|
||||
import {
|
||||
stripUnlikelyCandidates
|
||||
} from './index'
|
||||
|
||||
describe('Generic Extractor Utils', () => {
  describe('stripUnlikelyCandidates(node)', () => {
    // Loads `before`, runs stripUnlikelyCandidates over it, and compares the
    // result with `after`; both sides go through clean() before comparing.
    const assertStripsTo = ({ before, after }) => {
      const $ = cheerio.load(before)
      const actual = clean(stripUnlikelyCandidates($).html())
      assert.equal(actual, clean(after))
    }

    it("returns original doc if no matches found", () => {
      // Compared without clean(): the output must equal the fixture exactly.
      const $ = cheerio.load(HTML.noMatches)
      assert.equal(stripUnlikelyCandidates($).html(), HTML.noMatches)
    })

    it("strips unlikely matches from the doc", () => {
      assertStripsTo(HTML.whitelistMatch)
    })

    it("keeps likely matches even when they also match the blacklist", () => {
      assertStripsTo(HTML.whiteAndBlack)
    })

    it("removed likely matches when inside blacklist node", () => {
      assertStripsTo(HTML.whiteInsideBlack)
    })
  })
})
|
||||
|
@ -0,0 +1,3 @@
|
||||
// Strip every whitespace character from a string.
//
// The function body calls `.replace` on its argument, so a value without
// that method (e.g. null/undefined) throws a TypeError.
//
// :param string: the text to normalize
// :return: the input with all whitespace (spaces, tabs, line breaks) removed
export function clean(string) {
  // `\s` matches spaces, tabs and every line terminator, so this single
  // pass also covers what a separate trim() or newline replacement would.
  return string.replace(/\s+/g, '')
}
|
Loading…
Reference in New Issue