feat: whitelisting attrs to keep

pull/1/head
Adam Pash 8 years ago
parent 7b97559778
commit 81e9e7a317

@ -9,6 +9,7 @@ TODO:
- Separate constants into activity-specific folders (dom, scoring)
DONE:
x remove all but attributes whitelist. research what attributes are important beyond SRC and href
x remove logic for fetching meta attrs with custom props
x cleaning embed and object nodes
x run makeLinksAbsolute on extracted content before returning

@ -82,9 +82,6 @@ const GenericContentExtractor = {
if (!node) {
return null
}
// Remove our scoring information from our content
node.removeAttr('score')
node.find('[score]').removeAttr('score')
return normalizeSpaces($.html(node))

@ -1,14 +1,34 @@
import 'babel-polyfill'
import {
REMOVE_ATTR_SELECTORS,
REMOVE_ATTR_LIST,
REMOVE_ATTRS,
WHITELIST_ATTRS_RE,
} from './constants'
// Strip every attribute that is not whitelisted (e.g. style or align)
export default function cleanAttributes(article, $) {
/**
 * Entry point for attribute cleaning: delegates to removeAllButWhitelist so
 * that only whitelisted attributes remain on the article's descendants.
 *
 * @param $article - wrapped article node whose descendants are cleaned
 * @param $ - query function for the document; handed back unchanged
 * @returns the same `$` that was passed in
 */
export default function cleanAttributes($article, $) {
  // The helper mutates the nodes in place; its return value is not used.
  removeAllButWhitelist($article, $)

  return $
}
/**
 * Replaces the `attribs` object of every descendant of `$article` with a
 * fresh object that holds only the attributes whose names match
 * WHITELIST_ATTRS_RE. Attribute order and values are preserved.
 *
 * @param $article - wrapped article node; must expose `find(...).each(...)`
 * @param $ - query function for the document (not used here)
 */
function removeAllButWhitelist($article, $) {
  $article.find('*').each((index, node) => {
    const kept = {}

    for (const attr of Reflect.ownKeys(node.attribs)) {
      if (WHITELIST_ATTRS_RE.test(attr)) {
        kept[attr] = node.attribs[attr]
      }
    }

    // Swap in the filtered copy rather than deleting keys one by one.
    node.attribs = kept
  })
}
/**
 * Removes each attribute named in REMOVE_ATTRS from every element under
 * `article` that carries it.
 *
 * @param article - context passed as the second argument to `$`
 * @param $ - query function called as `$(selector, article)`
 * @returns the same `$` that was passed in
 */
function removeAttrs(article, $) {
  for (const attr of REMOVE_ATTRS) {
    // Attribute-presence selector, e.g. "[style]".
    const selector = `[${attr}]`
    $(selector, article).removeAttr(attr)
  }

  return $
}

@ -17,6 +17,8 @@ export const STRIP_OUTPUT_TAGS = [
// Attribute names targeted for explicit removal.
export const REMOVE_ATTRS = [
  'style',
  'align',
]
// The same names as attribute-presence selectors: "[style]", "[align]".
export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(attr => `[${attr}]`)
// The same names joined with commas: "style,align".
export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',')
// Attribute names that are allowed to stay on a node.
export const WHITELIST_ATTRS = [
  'src',
  'href',
  'class',
  'id',
  'score',
]
// Anchored, case-insensitive match for exactly one whitelisted name.
export const WHITELIST_ATTRS_RE = new RegExp(
  `^(${WHITELIST_ATTRS.join('|')})$`,
  'i'
)
// removeEmpty
export const REMOVE_EMPTY_TAGS = ['p']

Loading…
Cancel
Save