Use a dedicated method and backward iteration for removing nodes (#300)

This improves compatibility with "real" DOM implementations, which return a live NodeList from getElementsByTagName.
This commit is contained in:
Ivan Persidsky 2016-07-18 15:56:51 +04:00 committed by Gijs
parent 140d4c4aca
commit fd11f92adb

View File

@ -143,6 +143,28 @@ Readability.prototype = {
this._fixRelativeUris(articleContent); this._fixRelativeUris(articleContent);
}, },
/**
* Iterates over a NodeList, calls `filterFn` for each node and removes node
* if function returned `true`.
*
* If function is not passed, removes all the nodes in node list.
*
* @param NodeList nodeList The no
* @param Function filterFn
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i];
var parentNode = node.parentNode;
if (parentNode) {
if(!filterFn || filterFn.call(this, node, i, nodeList)) {
parentNode.removeChild(node);
}
}
}
},
/** /**
* Iterate over a NodeList, which doesn't natively fully implement the Array * Iterate over a NodeList, which doesn't natively fully implement the Array
* interface. * interface.
@ -152,10 +174,11 @@ Readability.prototype = {
* *
* @param NodeList nodeList The NodeList. * @param NodeList nodeList The NodeList.
* @param Function fn The iterate function. * @param Function fn The iterate function.
* @param Boolean backward Whether to use backward iteration.
* @return void * @return void
*/ */
_forEachNode: function(nodeList, fn) { _forEachNode: function(nodeList, fn, backward) {
return Array.prototype.forEach.call(nodeList, fn, this); Array.prototype.forEach.call(nodeList, fn, this);
}, },
/** /**
@ -327,9 +350,7 @@ Readability.prototype = {
var doc = this._doc; var doc = this._doc;
// Remove all style tags in head // Remove all style tags in head
this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) { this._removeNodes(doc.getElementsByTagName("style"));
styleNode.parentNode.removeChild(styleNode);
});
if (doc.body) { if (doc.body) {
this._replaceBrs(doc.body); this._replaceBrs(doc.body);
@ -363,7 +384,7 @@ Readability.prototype = {
* <div>foo<br>bar<p>abc</p></div> * <div>foo<br>bar<p>abc</p></div>
*/ */
_replaceBrs: function (elem) { _replaceBrs: function (elem) {
this._forEachNode(elem.getElementsByTagName("br"), function(br) { this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
var next = br.nextSibling; var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a // Whether 2 or more <br> elements have been found and replaced with a
@ -459,7 +480,7 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div"); this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs // Remove extra paragraphs
this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) { this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
var imgCount = paragraph.getElementsByTagName('img').length; var imgCount = paragraph.getElementsByTagName('img').length;
var embedCount = paragraph.getElementsByTagName('embed').length; var embedCount = paragraph.getElementsByTagName('embed').length;
var objectCount = paragraph.getElementsByTagName('object').length; var objectCount = paragraph.getElementsByTagName('object').length;
@ -467,11 +488,10 @@ Readability.prototype = {
var iframeCount = paragraph.getElementsByTagName('iframe').length; var iframeCount = paragraph.getElementsByTagName('iframe').length;
var totalCount = imgCount + embedCount + objectCount + iframeCount; var totalCount = imgCount + embedCount + objectCount + iframeCount;
if (totalCount === 0 && !this._getInnerText(paragraph, false)) return totalCount === 0 && !this._getInnerText(paragraph, false);
paragraph.parentNode.removeChild(paragraph);
}); });
this._forEachNode(articleContent.getElementsByTagName("br"), function(br) { this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
var next = this._nextElement(br.nextSibling); var next = this._nextElement(br.nextSibling);
if (next && next.tagName == "P") if (next && next.tagName == "P")
br.parentNode.removeChild(br); br.parentNode.removeChild(br);
@ -1035,17 +1055,12 @@ Readability.prototype = {
* @param Element * @param Element
**/ **/
_removeScripts: function(doc) { _removeScripts: function(doc) {
this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) { this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
scriptNode.nodeValue = ""; scriptNode.nodeValue = "";
scriptNode.removeAttribute('src'); scriptNode.removeAttribute('src');
return true;
if (scriptNode.parentNode)
scriptNode.parentNode.removeChild(scriptNode);
});
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
if (noscriptNode.parentNode)
noscriptNode.parentNode.removeChild(noscriptNode);
}); });
this._removeNodes(doc.getElementsByTagName('noscript'));
}, },
/** /**
@ -1574,7 +1589,7 @@ Readability.prototype = {
_clean: function(e, tag) { _clean: function(e, tag) {
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
this._forEachNode(e.getElementsByTagName(tag), function(element) { this._removeNodes(e.getElementsByTagName(tag), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those. // Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) { if (isEmbed) {
var attributeValues = [].map.call(element.attributes, function(attr) { var attributeValues = [].map.call(element.attributes, function(attr) {
@ -1583,14 +1598,14 @@ Readability.prototype = {
// First, check the elements attributes to see if any of them contain youtube or vimeo // First, check the elements attributes to see if any of them contain youtube or vimeo
if (this.REGEXPS.videos.test(attributeValues)) if (this.REGEXPS.videos.test(attributeValues))
return; return false;
// Then check the elements inside this element for the same. // Then check the elements inside this element for the same.
if (this.REGEXPS.videos.test(element.innerHTML)) if (this.REGEXPS.videos.test(element.innerHTML))
return; return false;
} }
element.parentNode.removeChild(element); return true;
}); });
}, },
@ -1627,8 +1642,6 @@ Readability.prototype = {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return; return;
var tagsList = e.getElementsByTagName(tag);
var curTagsLength = tagsList.length;
var isList = tag === "ul" || tag === "ol"; var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within. // Gather counts for other typical elements embedded within.
@ -1636,54 +1649,48 @@ Readability.prototype = {
// without effecting the traversal. // without effecting the traversal.
// //
// TODO: Consider taking into account original contentScore here. // TODO: Consider taking into account original contentScore here.
for (var i = curTagsLength-1; i >= 0; i -= 1) { this._removeNodes(e.getElementsByTagName(tag), function(node) {
var weight = this._getClassWeight(tagsList[i]); var weight = this._getClassWeight(node);
var contentScore = 0; var contentScore = 0;
this.log("Cleaning Conditionally", tagsList[i]); this.log("Cleaning Conditionally", node);
if (weight + contentScore < 0) { if (weight + contentScore < 0) {
tagsList[i].parentNode.removeChild(tagsList[i]); return true;
} else if (this._getCharCount(tagsList[i],',') < 10) { }
if (this._getCharCount(node,',') < 10) {
// If there are not very many commas, and the number of // If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other // non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element. // ominous signs, remove the element.
var p = tagsList[i].getElementsByTagName("p").length; var p = node.getElementsByTagName("p").length;
var img = tagsList[i].getElementsByTagName("img").length; var img = node.getElementsByTagName("img").length;
var li = tagsList[i].getElementsByTagName("li").length-100; var li = node.getElementsByTagName("li").length-100;
var input = tagsList[i].getElementsByTagName("input").length; var input = node.getElementsByTagName("input").length;
var embedCount = 0; var embedCount = 0;
var embeds = tagsList[i].getElementsByTagName("embed"); var embeds = node.getElementsByTagName("embed");
for (var ei = 0, il = embeds.length; ei < il; ei += 1) { for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (!this.REGEXPS.videos.test(embeds[ei].src)) if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1; embedCount += 1;
} }
var linkDensity = this._getLinkDensity(tagsList[i]); var linkDensity = this._getLinkDensity(node);
var contentLength = this._getInnerText(tagsList[i]).length; var contentLength = this._getInnerText(node).length;
var toRemove = false;
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
toRemove = true;
} else if (!isList && li > p) {
toRemove = true;
} else if (input > Math.floor(p/3)) {
toRemove = true;
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
toRemove = true;
} else if (!isList && weight < 25 && linkDensity > 0.2) {
toRemove = true;
} else if (weight >= 25 && linkDensity > 0.5) {
toRemove = true;
} else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
toRemove = true;
}
if (toRemove) { var haveToRemove =
tagsList[i].parentNode.removeChild(tagsList[i]); // Make an exception for elements with no p's and exactly 1 img.
} (img > p && !this._hasAncestorTag(node, "figure")) ||
} (!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2)) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
} }
return false
});
}, },
/** /**
@ -1694,11 +1701,9 @@ Readability.prototype = {
**/ **/
_cleanHeaders: function(e) { _cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
var headers = e.getElementsByTagName('h' + headerIndex); this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
for (var i = headers.length - 1; i >= 0; i -= 1) { return this._getClassWeight(header) < 0;
if (this._getClassWeight(headers[i]) < 0) });
headers[i].parentNode.removeChild(headers[i]);
}
} }
}, },