fix: bug that stopped proper attr cleaning in certain cases

pull/17/head
Adam Pash 8 years ago
parent 40768fa188
commit 17317823de

16
dist/mercury.js vendored

@ -1498,6 +1498,8 @@ function removeAllButWhitelist($article) {
return acc; return acc;
}, {}); }, {});
}); });
return $article;
} }
// function removeAttrs(article, $) { // function removeAttrs(article, $) {
@ -1508,9 +1510,11 @@ function removeAllButWhitelist($article) {
// Remove attributes like style or align // Remove attributes like style or align
function cleanAttributes($article) { function cleanAttributes($article) {
removeAllButWhitelist($article); // Grabbing the parent because at this point
// $article will be wrapped in a div which will
return $article; // have a score set on it.
console.log('HMM', $article.parent().length);
return removeAllButWhitelist($article.parent().length ? $article.parent() : $article);
} }
function removeEmpty($article, $) { function removeEmpty($article, $) {
@ -2603,9 +2607,6 @@ function extractCleanNode(article, _ref) {
// Make links absolute // Make links absolute
makeLinksAbsolute(article, $, url); makeLinksAbsolute(article, $, url);
// Remove unnecessary attributes
cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better // too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them. // way to detect menus particularly and remove them.
@ -2615,6 +2616,9 @@ function extractCleanNode(article, _ref) {
// Remove empty paragraph nodes // Remove empty paragraph nodes
removeEmpty(article, $); removeEmpty(article, $);
// Remove unnecessary attributes
cleanAttributes(article, $);
return article; return article;
} }

File diff suppressed because one or more lines are too long

@ -51,9 +51,6 @@ export default function extractCleanNode(
// Make links absolute // Make links absolute
makeLinksAbsolute(article, $, url); makeLinksAbsolute(article, $, url);
// Remove unnecessary attributes
cleanAttributes(article);
// We used to clean UL's and OL's here, but it was leading to // We used to clean UL's and OL's here, but it was leading to
// too many in-article lists being removed. Consider a better // too many in-article lists being removed. Consider a better
// way to detect menus particularly and remove them. // way to detect menus particularly and remove them.
@ -63,5 +60,8 @@ export default function extractCleanNode(
// Remove empty paragraph nodes // Remove empty paragraph nodes
removeEmpty(article, $); removeEmpty(article, $);
// Remove unnecessary attributes
cleanAttributes(article, $);
return article; return article;
} }

@ -18,7 +18,16 @@ describe('Mercury', () => {
}); });
it('does the whole thing', async function() { it('does the whole thing', async function() {
const result = await Mercury.parse('http://theconcourse.deadspin.com/phyllis-schlafly-finally-croaks-1786219220'); const result =
await Mercury.parse('http://deadspin.com/remember-when-donald-trump-got-booed-for-butchering-ta-1788216229');
assert.equal(typeof result, 'object');
assert.equal(result.content.indexOf('score="') === -1, true);
});
it('does blogger', async function() {
const result =
await Mercury.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html');
assert.equal(typeof result, 'object'); assert.equal(typeof result, 'object');
}); });

@ -1,7 +1,6 @@
import { WHITELIST_ATTRS_RE } from './constants'; import { WHITELIST_ATTRS_RE } from './constants';
function removeAllButWhitelist($article) { function removeAllButWhitelist($article) {
// $('*', article).each((index, node) => {
$article.find('*').each((index, node) => { $article.find('*').each((index, node) => {
node.attribs = Reflect.ownKeys(node.attribs).reduce((acc, attr) => { node.attribs = Reflect.ownKeys(node.attribs).reduce((acc, attr) => {
if (WHITELIST_ATTRS_RE.test(attr)) { if (WHITELIST_ATTRS_RE.test(attr)) {
@ -11,6 +10,8 @@ function removeAllButWhitelist($article) {
return acc; return acc;
}, {}); }, {});
}); });
return $article;
} }
// function removeAttrs(article, $) { // function removeAttrs(article, $) {
@ -21,7 +22,11 @@ function removeAllButWhitelist($article) {
// Remove attributes like style or align // Remove attributes like style or align
export default function cleanAttributes($article) { export default function cleanAttributes($article) {
removeAllButWhitelist($article); // Grabbing the parent because at this point
// $article will be wrapped in a div which will
return $article; // have a score set on it.
return removeAllButWhitelist(
$article.parent().length ?
$article.parent() : $article
);
} }

Loading…
Cancel
Save