From 81e9e7a3177e924c30e9833b247ccbf4ba99249a Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Fri, 9 Sep 2016 14:33:16 -0400 Subject: [PATCH] feat: whitelisting attrs to keep --- TODO.md | 1 + src/extractors/generic/content/extractor.js | 3 --- src/utils/dom/clean-attributes.js | 26 ++++++++++++++++++--- src/utils/dom/constants.js | 2 ++ 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/TODO.md b/TODO.md index 4b068c95..de72c031 100644 --- a/TODO.md +++ b/TODO.md @@ -9,6 +9,7 @@ TODO: - Separate constants into activity-specific folders (dom, scoring) DONE: +x remove all but attributes whitelist. research what attributes are important beyond SRC and href x remove logic for fetching meta attrs with custom props x cleaning embed and object nodes x run makeLinksAbsolute on extracted content before returning diff --git a/src/extractors/generic/content/extractor.js b/src/extractors/generic/content/extractor.js index 3761b613..a0f137f9 100644 --- a/src/extractors/generic/content/extractor.js +++ b/src/extractors/generic/content/extractor.js @@ -82,9 +82,6 @@ const GenericContentExtractor = { if (!node) { return null } - // Remove our scoring information from our content - node.removeAttr('score') - node.find('[score]').removeAttr('score') return normalizeSpaces($.html(node)) diff --git a/src/utils/dom/clean-attributes.js b/src/utils/dom/clean-attributes.js index 5aaafc73..bc6913c7 100644 --- a/src/utils/dom/clean-attributes.js +++ b/src/utils/dom/clean-attributes.js @@ -1,14 +1,34 @@ +import 'babel-polyfill' + import { REMOVE_ATTR_SELECTORS, REMOVE_ATTR_LIST, REMOVE_ATTRS, + WHITELIST_ATTRS_RE, } from './constants' // Remove attributes like style or align -export default function cleanAttributes(article, $) { +export default function cleanAttributes($article, $) { + removeAllButWhitelist($article, $) + + return $ +} + +function removeAllButWhitelist($article, $) { + // $('*', article).each((index, node) => { + $article.find('*').each((index, node) => { + node.attribs = Reflect.ownKeys(node.attribs).reduce((acc, attr) => { + if (WHITELIST_ATTRS_RE.test(attr)) { + return { ...acc, [attr]: node.attribs[attr] } + } else { + return acc + } + }, {}) + }) +} + +function removeAttrs(article, $) { REMOVE_ATTRS.forEach((attr) => { $(`[${attr}]`, article).removeAttr(attr) }) - - return $ } diff --git a/src/utils/dom/constants.js b/src/utils/dom/constants.js index 438adc7c..a47928ae 100644 --- a/src/utils/dom/constants.js +++ b/src/utils/dom/constants.js @@ -17,6 +17,8 @@ export const STRIP_OUTPUT_TAGS = [ export const REMOVE_ATTRS = ['style', 'align'] export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`) export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',') +export const WHITELIST_ATTRS = ['src', 'href', 'class', 'id', 'score'] +export const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i') // removeEmpty export const REMOVE_EMPTY_TAGS = ['p']