Use a dedicated method and backward iteration for removing nodes (#300)
This improves compatibility with "real" DOMs, where getElementsByTagName returns a live NodeList that changes while nodes are being removed from it.
This commit is contained in:
parent
140d4c4aca
commit
fd11f92adb
127
Readability.js
127
Readability.js
@ -143,6 +143,28 @@ Readability.prototype = {
|
|||||||
this._fixRelativeUris(articleContent);
|
this._fixRelativeUris(articleContent);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Iterates over a NodeList, calls `filterFn` for each node and removes node
|
||||||
|
* if function returned `true`.
|
||||||
|
*
|
||||||
|
* If function is not passed, removes all the nodes in node list.
|
||||||
|
*
|
||||||
|
* @param NodeList nodeList The nodes to operate on
|
||||||
|
* @param Function filterFn The function to use as a filter; a node is removed when it returns `true`
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
_removeNodes: function(nodeList, filterFn) {
|
||||||
|
for (var i = nodeList.length - 1; i >= 0; i--) {
|
||||||
|
var node = nodeList[i];
|
||||||
|
var parentNode = node.parentNode;
|
||||||
|
if (parentNode) {
|
||||||
|
if(!filterFn || filterFn.call(this, node, i, nodeList)) {
|
||||||
|
parentNode.removeChild(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Iterate over a NodeList, which doesn't natively fully implement the Array
|
* Iterate over a NodeList, which doesn't natively fully implement the Array
|
||||||
* interface.
|
* interface.
|
||||||
@ -152,10 +174,11 @@ Readability.prototype = {
|
|||||||
*
|
*
|
||||||
* @param NodeList nodeList The NodeList.
|
* @param NodeList nodeList The NodeList.
|
||||||
* @param Function fn The iterate function.
|
* @param Function fn The iterate function.
|
||||||
|
* @param Boolean backward Whether to use backward iteration (not honored yet: the body below always iterates forward).
|
||||||
* @return void
|
* @return void
|
||||||
*/
|
*/
|
||||||
_forEachNode: function(nodeList, fn) {
|
_forEachNode: function(nodeList, fn, backward) {
|
||||||
return Array.prototype.forEach.call(nodeList, fn, this);
|
Array.prototype.forEach.call(nodeList, fn, this);
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -327,9 +350,7 @@ Readability.prototype = {
|
|||||||
var doc = this._doc;
|
var doc = this._doc;
|
||||||
|
|
||||||
// Remove all style tags in head
|
// Remove all style tags in head
|
||||||
this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
|
this._removeNodes(doc.getElementsByTagName("style"));
|
||||||
styleNode.parentNode.removeChild(styleNode);
|
|
||||||
});
|
|
||||||
|
|
||||||
if (doc.body) {
|
if (doc.body) {
|
||||||
this._replaceBrs(doc.body);
|
this._replaceBrs(doc.body);
|
||||||
@ -363,7 +384,7 @@ Readability.prototype = {
|
|||||||
* <div>foo<br>bar<p>abc</p></div>
|
* <div>foo<br>bar<p>abc</p></div>
|
||||||
*/
|
*/
|
||||||
_replaceBrs: function (elem) {
|
_replaceBrs: function (elem) {
|
||||||
this._forEachNode(elem.getElementsByTagName("br"), function(br) {
|
this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
|
||||||
var next = br.nextSibling;
|
var next = br.nextSibling;
|
||||||
|
|
||||||
// Whether 2 or more <br> elements have been found and replaced with a
|
// Whether 2 or more <br> elements have been found and replaced with a
|
||||||
@ -459,7 +480,7 @@ Readability.prototype = {
|
|||||||
this._cleanConditionally(articleContent, "div");
|
this._cleanConditionally(articleContent, "div");
|
||||||
|
|
||||||
// Remove extra paragraphs
|
// Remove extra paragraphs
|
||||||
this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
|
this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
|
||||||
var imgCount = paragraph.getElementsByTagName('img').length;
|
var imgCount = paragraph.getElementsByTagName('img').length;
|
||||||
var embedCount = paragraph.getElementsByTagName('embed').length;
|
var embedCount = paragraph.getElementsByTagName('embed').length;
|
||||||
var objectCount = paragraph.getElementsByTagName('object').length;
|
var objectCount = paragraph.getElementsByTagName('object').length;
|
||||||
@ -467,11 +488,10 @@ Readability.prototype = {
|
|||||||
var iframeCount = paragraph.getElementsByTagName('iframe').length;
|
var iframeCount = paragraph.getElementsByTagName('iframe').length;
|
||||||
var totalCount = imgCount + embedCount + objectCount + iframeCount;
|
var totalCount = imgCount + embedCount + objectCount + iframeCount;
|
||||||
|
|
||||||
if (totalCount === 0 && !this._getInnerText(paragraph, false))
|
return totalCount === 0 && !this._getInnerText(paragraph, false);
|
||||||
paragraph.parentNode.removeChild(paragraph);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
|
this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
|
||||||
var next = this._nextElement(br.nextSibling);
|
var next = this._nextElement(br.nextSibling);
|
||||||
if (next && next.tagName == "P")
|
if (next && next.tagName == "P")
|
||||||
br.parentNode.removeChild(br);
|
br.parentNode.removeChild(br);
|
||||||
@ -1035,17 +1055,12 @@ Readability.prototype = {
|
|||||||
* @param Element
|
* @param Element
|
||||||
**/
|
**/
|
||||||
_removeScripts: function(doc) {
|
_removeScripts: function(doc) {
|
||||||
this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
|
this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
|
||||||
scriptNode.nodeValue = "";
|
scriptNode.nodeValue = "";
|
||||||
scriptNode.removeAttribute('src');
|
scriptNode.removeAttribute('src');
|
||||||
|
return true;
|
||||||
if (scriptNode.parentNode)
|
|
||||||
scriptNode.parentNode.removeChild(scriptNode);
|
|
||||||
});
|
|
||||||
this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
|
|
||||||
if (noscriptNode.parentNode)
|
|
||||||
noscriptNode.parentNode.removeChild(noscriptNode);
|
|
||||||
});
|
});
|
||||||
|
this._removeNodes(doc.getElementsByTagName('noscript'));
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1574,7 +1589,7 @@ Readability.prototype = {
|
|||||||
_clean: function(e, tag) {
|
_clean: function(e, tag) {
|
||||||
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
|
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
|
||||||
|
|
||||||
this._forEachNode(e.getElementsByTagName(tag), function(element) {
|
this._removeNodes(e.getElementsByTagName(tag), function(element) {
|
||||||
// Allow youtube and vimeo videos through as people usually want to see those.
|
// Allow youtube and vimeo videos through as people usually want to see those.
|
||||||
if (isEmbed) {
|
if (isEmbed) {
|
||||||
var attributeValues = [].map.call(element.attributes, function(attr) {
|
var attributeValues = [].map.call(element.attributes, function(attr) {
|
||||||
@ -1583,14 +1598,14 @@ Readability.prototype = {
|
|||||||
|
|
||||||
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
||||||
if (this.REGEXPS.videos.test(attributeValues))
|
if (this.REGEXPS.videos.test(attributeValues))
|
||||||
return;
|
return false;
|
||||||
|
|
||||||
// Then check the elements inside this element for the same.
|
// Then check the elements inside this element for the same.
|
||||||
if (this.REGEXPS.videos.test(element.innerHTML))
|
if (this.REGEXPS.videos.test(element.innerHTML))
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
element.parentNode.removeChild(element);
|
return true;
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
@ -1627,8 +1642,6 @@ Readability.prototype = {
|
|||||||
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
|
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
var tagsList = e.getElementsByTagName(tag);
|
|
||||||
var curTagsLength = tagsList.length;
|
|
||||||
var isList = tag === "ul" || tag === "ol";
|
var isList = tag === "ul" || tag === "ol";
|
||||||
|
|
||||||
// Gather counts for other typical elements embedded within.
|
// Gather counts for other typical elements embedded within.
|
||||||
@ -1636,54 +1649,48 @@ Readability.prototype = {
|
|||||||
// without affecting the traversal.
|
// without affecting the traversal.
|
||||||
//
|
//
|
||||||
// TODO: Consider taking into account original contentScore here.
|
// TODO: Consider taking into account original contentScore here.
|
||||||
for (var i = curTagsLength-1; i >= 0; i -= 1) {
|
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
||||||
var weight = this._getClassWeight(tagsList[i]);
|
var weight = this._getClassWeight(node);
|
||||||
var contentScore = 0;
|
var contentScore = 0;
|
||||||
|
|
||||||
this.log("Cleaning Conditionally", tagsList[i]);
|
this.log("Cleaning Conditionally", node);
|
||||||
|
|
||||||
if (weight + contentScore < 0) {
|
if (weight + contentScore < 0) {
|
||||||
tagsList[i].parentNode.removeChild(tagsList[i]);
|
return true;
|
||||||
} else if (this._getCharCount(tagsList[i],',') < 10) {
|
}
|
||||||
|
|
||||||
|
if (this._getCharCount(node,',') < 10) {
|
||||||
// If there are not very many commas, and the number of
|
// If there are not very many commas, and the number of
|
||||||
// non-paragraph elements is more than paragraphs or other
|
// non-paragraph elements is more than paragraphs or other
|
||||||
// ominous signs, remove the element.
|
// ominous signs, remove the element.
|
||||||
var p = tagsList[i].getElementsByTagName("p").length;
|
var p = node.getElementsByTagName("p").length;
|
||||||
var img = tagsList[i].getElementsByTagName("img").length;
|
var img = node.getElementsByTagName("img").length;
|
||||||
var li = tagsList[i].getElementsByTagName("li").length-100;
|
var li = node.getElementsByTagName("li").length-100;
|
||||||
var input = tagsList[i].getElementsByTagName("input").length;
|
var input = node.getElementsByTagName("input").length;
|
||||||
|
|
||||||
var embedCount = 0;
|
var embedCount = 0;
|
||||||
var embeds = tagsList[i].getElementsByTagName("embed");
|
var embeds = node.getElementsByTagName("embed");
|
||||||
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
|
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
|
||||||
if (!this.REGEXPS.videos.test(embeds[ei].src))
|
if (!this.REGEXPS.videos.test(embeds[ei].src))
|
||||||
embedCount += 1;
|
embedCount += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
var linkDensity = this._getLinkDensity(tagsList[i]);
|
var linkDensity = this._getLinkDensity(node);
|
||||||
var contentLength = this._getInnerText(tagsList[i]).length;
|
var contentLength = this._getInnerText(node).length;
|
||||||
var toRemove = false;
|
|
||||||
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
|
|
||||||
toRemove = true;
|
|
||||||
} else if (!isList && li > p) {
|
|
||||||
toRemove = true;
|
|
||||||
} else if (input > Math.floor(p/3)) {
|
|
||||||
toRemove = true;
|
|
||||||
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
|
|
||||||
toRemove = true;
|
|
||||||
} else if (!isList && weight < 25 && linkDensity > 0.2) {
|
|
||||||
toRemove = true;
|
|
||||||
} else if (weight >= 25 && linkDensity > 0.5) {
|
|
||||||
toRemove = true;
|
|
||||||
} else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
|
|
||||||
toRemove = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (toRemove) {
|
var haveToRemove =
|
||||||
tagsList[i].parentNode.removeChild(tagsList[i]);
|
// Make an exception for elements with no p's and exactly 1 img.
|
||||||
}
|
(img > p && !this._hasAncestorTag(node, "figure")) ||
|
||||||
}
|
(!isList && li > p) ||
|
||||||
|
(input > Math.floor(p/3)) ||
|
||||||
|
(!isList && contentLength < 25 && (img === 0 || img > 2)) ||
|
||||||
|
(!isList && weight < 25 && linkDensity > 0.2) ||
|
||||||
|
(weight >= 25 && linkDensity > 0.5) ||
|
||||||
|
((embedCount === 1 && contentLength < 75) || embedCount > 1);
|
||||||
|
return haveToRemove;
|
||||||
}
|
}
|
||||||
|
return false
|
||||||
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1694,11 +1701,9 @@ Readability.prototype = {
|
|||||||
**/
|
**/
|
||||||
_cleanHeaders: function(e) {
|
_cleanHeaders: function(e) {
|
||||||
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
||||||
var headers = e.getElementsByTagName('h' + headerIndex);
|
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
|
||||||
for (var i = headers.length - 1; i >= 0; i -= 1) {
|
return this._getClassWeight(header) < 0;
|
||||||
if (this._getClassWeight(headers[i]) < 0)
|
});
|
||||||
headers[i].parentNode.removeChild(headers[i]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user