From eb81444946f75f115acc39b800efc8ec88955307 Mon Sep 17 00:00:00 2001 From: Gijs Kruitbosch Date: Fri, 20 Mar 2015 14:50:49 -0700 Subject: [PATCH] Improve logic to rely on children instead of childNodes --- JSDOMParser.js | 10 ++-- Readability.js | 153 +++++++++++++++++++++++-------------------------- 2 files changed, 77 insertions(+), 86 deletions(-) diff --git a/JSDOMParser.js b/JSDOMParser.js index 28df4be..ab606a0 100644 --- a/JSDOMParser.js +++ b/JSDOMParser.js @@ -262,11 +262,9 @@ var elems = []; var allTags = (tag === "*"); function getElems(node) { - var length = node.childNodes.length; + var length = node.children.length; for (var i = 0; i < length; i++) { - var child = node.childNodes[i]; - if (child.nodeType !== 1) - continue; + var child = node.children[i]; if (allTags || (child.tagName === tag)) elems.push(child); getElems(child); @@ -494,11 +492,11 @@ getElementById: function (id) { function getElem(node) { - var length = node.childNodes.length; + var length = node.children.length; if (node.id === id) return node; for (var i = 0; i < length; i++) { - var el = getElem(node.childNodes[i]); + var el = getElem(node.children[i]); if (el) return el; } diff --git a/Readability.js b/Readability.js index 6175deb..6a7354b 100644 --- a/Readability.js +++ b/Readability.js @@ -99,7 +99,8 @@ Readability.prototype = { videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, - whitespace: /^\s*$/ + whitespace: /^\s*$/, + hasContent: /\S$/, }, DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], @@ -480,7 +481,7 @@ Readability.prototype = { // First, node prepping. Trash nodes that look cruddy (like ones with the // class name "comment", etc), and turn divs into P tags where they have been // used inappropriately (as in, where they contain no other block level elements.) 
- var nodesToScore = []; + var elementsToScore = []; var node = this._doc.documentElement; while (node) { @@ -504,7 +505,7 @@ Readability.prototype = { } if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") - nodesToScore.push(node); + elementsToScore.push(node); // Turn all divs that don't have children block level elements into p's if (node.tagName === "DIV") { @@ -512,15 +513,13 @@ Readability.prototype = { // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. - var pIndex = this._getSinglePIndexInsideDiv(node); - - if (pIndex >= 0) { - var newNode = node.childNodes[pIndex]; + if (this._hasSinglePInsideElement(node)) { + var newNode = node.firstElementChild; node.parentNode.replaceChild(newNode, node); node = newNode; } else if (!this._hasChildBlockElement(node)) { this._setNodeTag(node, "P"); - nodesToScore.push(node); + elementsToScore.push(node); } else { // EXPERIMENTAL for (var i = 0, il = node.childNodes.length; i < il; i += 1) { @@ -545,10 +544,10 @@ Readability.prototype = { * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. **/ var candidates = []; - for (var pt = 0; pt < nodesToScore.length; pt += 1) { - var parentNode = nodesToScore[pt].parentNode; + for (var pt = 0; pt < elementsToScore.length; pt += 1) { + var parentNode = elementsToScore[pt].parentNode; var grandParentNode = parentNode ? 
parentNode.parentNode : null; - var innerText = this._getInnerText(nodesToScore[pt]); + var innerText = this._getInnerText(elementsToScore[pt]); if (!parentNode || typeof(parentNode.tagName) === 'undefined') continue; @@ -624,10 +623,12 @@ Readability.prototype = { // Move all of the page's children into topCandidate topCandidate = doc.createElement("DIV"); neededToCreateTopCandidate = true; - var children = page.childNodes; - while (children.length) { - this.log("Moving child out:", children[0]); - topCandidate.appendChild(children[0]); + // Move everything (not just elements, also text nodes etc.) into the container + // so we even include text directly in the body: + var kids = page.childNodes; + while (kids.length) { + this.log("Moving child out:", kids[0]); + topCandidate.appendChild(kids[0]); } page.appendChild(topCandidate); @@ -643,72 +644,71 @@ Readability.prototype = { articleContent.id = "readability-content"; var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); - var siblingNodes = topCandidate.parentNode.childNodes; + var siblings = topCandidate.parentNode.children; - for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) { - var siblingNode = siblingNodes[s]; + for (var s = 0, sl = siblings.length; s < sl; s++) { + var sibling = siblings[s]; var append = false; - this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : '')); - this.log("Sibling has score " + (siblingNode.readability ? 
siblingNode.readability.contentScore : 'Unknown')); - - if (siblingNode === topCandidate) - append = true; - - var contentBonus = 0; - - // Give a bonus if sibling nodes and top candidates have the example same classname - if (siblingNode.className === topCandidate.className && topCandidate.className !== "") - contentBonus += topCandidate.readability.contentScore * 0.2; + this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ''); + this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown'); - if (typeof siblingNode.readability !== 'undefined' && - (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) + if (sibling === topCandidate) { append = true; + } else { + var contentBonus = 0; - if (siblingNode.nodeName === "P") { - var linkDensity = this._getLinkDensity(siblingNode); - var nodeContent = this._getInnerText(siblingNode); - var nodeLength = nodeContent.length; + // Give a bonus if sibling nodes and top candidates have the exact same classname + if (sibling.className === topCandidate.className && topCandidate.className !== "") + contentBonus += topCandidate.readability.contentScore * 0.2; - if (nodeLength > 80 && linkDensity < 0.25) { - append = true; - } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { + if (sibling.readability && + ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { append = true; + } else if (sibling.nodeName === "P") { + var linkDensity = this._getLinkDensity(sibling); + var nodeContent = this._getInnerText(sibling); + var nodeLength = nodeContent.length; + + if (nodeLength > 80 && linkDensity < 0.25) { + append = true; + } else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { + append = true; + } } } if (append) { - this.log("Appending node:", siblingNode); - - // siblingNodes is a reference to the childNodes 
array, and - // siblingNode is removed from the array when we call appendChild() - // below. As a result, we must revisit this index since the nodes - // have been shifted. - s -= 1; - sl -= 1; + this.log("Appending node:", sibling); - if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(siblingNode.nodeName) === -1) { + if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { // We have a node that isn't a common block level element, like a form or td tag. - // Turn it into a div so it doesn't get filtered out later by accident. */ - this.log("Altering siblingNode:", siblingNode, 'to div.'); + // Turn it into a div so it doesn't get filtered out later by accident. + this.log("Altering sibling:", sibling, 'to div.'); - this._setNodeTag(siblingNode, "DIV"); + this._setNodeTag(sibling, "DIV"); } // To ensure a node does not interfere with readability styles, // remove its classnames. - siblingNode.removeAttribute("class"); + sibling.removeAttribute("class"); - // Append sibling and subtract from our list because it removes - // the node when you append to another node. - articleContent.appendChild(siblingNode); + articleContent.appendChild(sibling); + // siblings is a reference to the children array, and + // sibling is removed from the array when we call appendChild(). + // As a result, we must revisit this index since the nodes + // have been shifted. + s -= 1; + sl -= 1; } } - this.log("Article content pre-prep: " + articleContent.innerHTML); + if (this.ENABLE_LOGGING) + this.log("Article content pre-prep: " + articleContent.innerHTML); // So we have all of the content that we need. Now we clean it up for presentation. 
this._prepArticle(articleContent); - this.log("Article content post-prep: " + articleContent.innerHTML); + if (this.ENABLE_LOGGING) + this.log("Article content post-prep: " + articleContent.innerHTML); if (this._curPageNum === 1) { if (neededToCreateTopCandidate) { @@ -730,7 +730,8 @@ Readability.prototype = { } } - this.log("Article content after paging: " + articleContent.innerHTML); + if (this.ENABLE_LOGGING) + this.log("Article content after paging: " + articleContent.innerHTML); // Now that we've gone through the full algorithm, check to see if // we got any meaningful content. If we didn't, we may need to re-run @@ -847,33 +848,28 @@ Readability.prototype = { }, /** - * Get child index of the only P element inside a DIV with no - * text content. Returns -1 if the DIV node contains non-empty - * text nodes or if it contains other element nodes. + * Check if this node has only whitespace and a single P element + * Returns false if the DIV node contains non-empty text nodes + * or if it contains no P or more than 1 element. 
* * @param Element **/ - _getSinglePIndexInsideDiv: function(e) { + _hasSinglePInsideElement: function(e) { + // There should be exactly 1 element child which is a P: + if (e.children.length != 1 || e.firstElementChild.tagName !== "P") { + return false; + } + // And there should be no text nodes with real content var childNodes = e.childNodes; - var pIndex = -1; - for (var i = childNodes.length; --i >= 0;) { var node = childNodes[i]; - - if (node.nodeType === Node.ELEMENT_NODE) { - if (node.tagName !== "P") - return -1; - - if (pIndex >= 0) - return -1; - - pIndex = i; - } else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) { - return -1; + if (node.nodeType == Node.TEXT_NODE && + this.REGEXPS.hasContent.test(node.textContent)) { + return false; } } - return pIndex; + return true; }, /** @@ -882,12 +878,9 @@ Readability.prototype = { * @param Element */ _hasChildBlockElement: function (e) { - var length = e.childNodes.length; + var length = e.children.length; for (var i = 0; i < length; i++) { - var child = e.childNodes[i]; - if (child.nodeType != Node.ELEMENT_NODE) - continue; - + var child = e.children[i]; if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child)) return true; }