fix: bug that stopped proper attr cleaning in certain cases

pull/17/head
Adam Pash 8 years ago
parent 40768fa188
commit 17317823de

16
dist/mercury.js vendored

@ -1498,6 +1498,8 @@ function removeAllButWhitelist($article) {
return acc;
}, {});
});
return $article;
}
// function removeAttrs(article, $) {
@ -1508,9 +1510,11 @@ function removeAllButWhitelist($article) {
// Remove attributes like style or align
function cleanAttributes($article) {
removeAllButWhitelist($article);
return $article;
// Grabbing the parent because at this point
// $article will be wrapped in a div which will
// have a score set on it.
console.log('HMM', $article.parent().length);
return removeAllButWhitelist($article.parent().length ? $article.parent() : $article);
}
function removeEmpty($article, $) {
@ -2603,9 +2607,6 @@ function extractCleanNode(article, _ref) {
// Make links absolute
makeLinksAbsolute(article, $, url);
// Remove unnecessary attributes
cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
@ -2615,6 +2616,9 @@ function extractCleanNode(article, _ref) {
// Remove empty paragraph nodes
removeEmpty(article, $);
// Remove unnecessary attributes
cleanAttributes(article, $);
return article;
}

File diff suppressed because one or more lines are too long

@ -51,9 +51,6 @@ export default function extractCleanNode(
// Make links absolute
makeLinksAbsolute(article, $, url);
// Remove unnecessary attributes
cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them.
@ -63,5 +60,8 @@ export default function extractCleanNode(
// Remove empty paragraph nodes
removeEmpty(article, $);
// Remove unnecessary attributes
cleanAttributes(article, $);
return article;
}

@ -18,7 +18,16 @@ describe('Mercury', () => {
});
it('does the whole thing', async function() {
const result = await Mercury.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220');
const result =
await Mercury.parse('http://deadspin.com/remember-when-donald-trump-got-booed-for-butchering-ta-1788216229');
assert.equal(typeof result, 'object');
assert.equal(result.content.indexOf('score="') === -1, true);
});
it('does blogger', async function() {
const result =
await Mercury.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html');
assert.equal(typeof result, 'object');
});

@ -1,7 +1,6 @@
import { WHITELIST_ATTRS_RE } from './constants';
function removeAllButWhitelist($article) {
// $('*', article).each((index, node) => {
$article.find('*').each((index, node) => {
node.attribs = Reflect.ownKeys(node.attribs).reduce((acc, attr) => {
if (WHITELIST_ATTRS_RE.test(attr)) {
@ -11,6 +10,8 @@ function removeAllButWhitelist($article) {
return acc;
}, {});
});
return $article;
}
// function removeAttrs(article, $) {
@ -21,7 +22,11 @@ function removeAllButWhitelist($article) {
// Remove attributes like style or align
export default function cleanAttributes($article) {
removeAllButWhitelist($article);
return $article;
// Grabbing the parent because at this point
// $article will be wrapped in a div which will
// have a score set on it.
return removeAllButWhitelist(
$article.parent().length ?
$article.parent() : $article
);
}

Loading…
Cancel
Save