Use a dedicated method and backward iteration for removing nodes (#300)

This improves compat with "real" DOMs that provide a live NodeList as the return value of getElementsByTagName.
pull/301/head
Ivan Persidsky 8 years ago committed by Gijs
parent 140d4c4aca
commit fd11f92adb

@ -143,6 +143,28 @@ Readability.prototype = {
this._fixRelativeUris(articleContent);
},
/**
* Iterates over a NodeList, calls `filterFn` for each node and removes node
* if function returned `true`.
*
* If function is not passed, removes all the nodes in node list.
*
* @param NodeList nodeList The nodes to operate on.
* @param Function filterFn
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i];
var parentNode = node.parentNode;
if (parentNode) {
if(!filterFn || filterFn.call(this, node, i, nodeList)) {
parentNode.removeChild(node);
}
}
}
},
/**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
@ -152,10 +174,11 @@ Readability.prototype = {
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @param Boolean backward Whether to use backward iteration.
* @return void
*/
_forEachNode: function(nodeList, fn) {
return Array.prototype.forEach.call(nodeList, fn, this);
_forEachNode: function(nodeList, fn, backward) {
// Borrows Array.prototype.forEach so array-like lists can be iterated;
// `fn` is invoked with this object as `this`.
// NOTE(review): `backward` is accepted and documented but never read here,
// so iteration always runs front to back - confirm whether that is intended.
Array.prototype.forEach.call(nodeList, fn, this);
},
/**
@ -327,9 +350,7 @@ Readability.prototype = {
var doc = this._doc;
// Remove all style tags in head
this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
styleNode.parentNode.removeChild(styleNode);
});
this._removeNodes(doc.getElementsByTagName("style"));
if (doc.body) {
this._replaceBrs(doc.body);
@ -363,7 +384,7 @@ Readability.prototype = {
* <div>foo<br>bar<p>abc</p></div>
*/
_replaceBrs: function (elem) {
this._forEachNode(elem.getElementsByTagName("br"), function(br) {
this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a
@ -459,7 +480,7 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs
this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
var imgCount = paragraph.getElementsByTagName('img').length;
var embedCount = paragraph.getElementsByTagName('embed').length;
var objectCount = paragraph.getElementsByTagName('object').length;
@ -467,11 +488,10 @@ Readability.prototype = {
var iframeCount = paragraph.getElementsByTagName('iframe').length;
var totalCount = imgCount + embedCount + objectCount + iframeCount;
if (totalCount === 0 && !this._getInnerText(paragraph, false))
paragraph.parentNode.removeChild(paragraph);
return totalCount === 0 && !this._getInnerText(paragraph, false);
});
this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
var next = this._nextElement(br.nextSibling);
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
@ -1035,17 +1055,12 @@ Readability.prototype = {
* @param Element
**/
_removeScripts: function(doc) {
// Strips every <script> found under `doc`: its nodeValue is blanked and its
// `src` attribute dropped before the node is detached from its parent.
this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
scriptNode.nodeValue = "";
scriptNode.removeAttribute('src');
if (scriptNode.parentNode)
scriptNode.parentNode.removeChild(scriptNode);
});
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
if (noscriptNode.parentNode)
noscriptNode.parentNode.removeChild(noscriptNode);
// A truthy return asks _removeNodes to detach the node it was called with.
return true;
});
// <noscript> elements are removed unconditionally (no filter function given).
this._removeNodes(doc.getElementsByTagName('noscript'));
},
/**
@ -1574,7 +1589,7 @@ Readability.prototype = {
_clean: function(e, tag) {
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
this._forEachNode(e.getElementsByTagName(tag), function(element) {
this._removeNodes(e.getElementsByTagName(tag), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) {
var attributeValues = [].map.call(element.attributes, function(attr) {
@ -1583,14 +1598,14 @@ Readability.prototype = {
// First, check the elements attributes to see if any of them contain youtube or vimeo
if (this.REGEXPS.videos.test(attributeValues))
return;
return false;
// Then check the elements inside this element for the same.
if (this.REGEXPS.videos.test(element.innerHTML))
return;
return false;
}
element.parentNode.removeChild(element);
return true;
});
},
@ -1627,8 +1642,6 @@ Readability.prototype = {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return;
var tagsList = e.getElementsByTagName(tag);
var curTagsLength = tagsList.length;
var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within.
@ -1636,54 +1649,48 @@ Readability.prototype = {
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
for (var i = curTagsLength-1; i >= 0; i -= 1) {
var weight = this._getClassWeight(tagsList[i]);
this._removeNodes(e.getElementsByTagName(tag), function(node) {
var weight = this._getClassWeight(node);
var contentScore = 0;
this.log("Cleaning Conditionally", tagsList[i]);
this.log("Cleaning Conditionally", node);
if (weight + contentScore < 0) {
tagsList[i].parentNode.removeChild(tagsList[i]);
} else if (this._getCharCount(tagsList[i],',') < 10) {
return true;
}
if (this._getCharCount(node,',') < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
var p = tagsList[i].getElementsByTagName("p").length;
var img = tagsList[i].getElementsByTagName("img").length;
var li = tagsList[i].getElementsByTagName("li").length-100;
var input = tagsList[i].getElementsByTagName("input").length;
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
var li = node.getElementsByTagName("li").length-100;
var input = node.getElementsByTagName("input").length;
var embedCount = 0;
var embeds = tagsList[i].getElementsByTagName("embed");
var embeds = node.getElementsByTagName("embed");
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}
var linkDensity = this._getLinkDensity(tagsList[i]);
var contentLength = this._getInnerText(tagsList[i]).length;
var toRemove = false;
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
toRemove = true;
} else if (!isList && li > p) {
toRemove = true;
} else if (input > Math.floor(p/3)) {
toRemove = true;
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
toRemove = true;
} else if (!isList && weight < 25 && linkDensity > 0.2) {
toRemove = true;
} else if (weight >= 25 && linkDensity > 0.5) {
toRemove = true;
} else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
toRemove = true;
}
if (toRemove) {
tagsList[i].parentNode.removeChild(tagsList[i]);
}
var linkDensity = this._getLinkDensity(node);
var contentLength = this._getInnerText(node).length;
var haveToRemove =
// Make an exception for elements with no p's and exactly 1 img.
(img > p && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2)) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
}
}
return false
});
},
/**
@ -1694,11 +1701,9 @@ Readability.prototype = {
**/
_cleanHeaders: function(e) {
// Only <h1> and <h2> are examined: headerIndex takes the values 1 and 2.
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
var headers = e.getElementsByTagName('h' + headerIndex);
for (var i = headers.length - 1; i >= 0; i -= 1) {
if (this._getClassWeight(headers[i]) < 0)
headers[i].parentNode.removeChild(headers[i]);
}
// A header is removed when its class weight is negative.
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
return this._getClassWeight(header) < 0;
});
}
},

Loading…
Cancel
Save