From 4f9615cb9af8f3b27bc1448c445bb1c7c5aabd23 Mon Sep 17 00:00:00 2001 From: Nicolas Perriault Date: Sat, 21 Mar 2015 10:37:56 +0100 Subject: [PATCH 1/2] Use forEach when it makes sense. --- Readability.js | 194 +++++++++--------- .../expected-metadata.json | 5 + .../basic-tags-cleaning/expected.html | 19 ++ .../basic-tags-cleaning/source.html | 35 ++++ .../remove-extra-brs/expected-metadata.json | 5 + .../test-pages/remove-extra-brs/expected.html | 21 ++ test/test-pages/remove-extra-brs/source.html | 32 +++ .../expected-metadata.json | 5 + .../remove-extra-paragraphs/expected.html | 19 ++ .../remove-extra-paragraphs/source.html | 41 ++++ .../remove-script-tags/expected-metadata.json | 5 + .../remove-script-tags/expected.html | 19 ++ .../test-pages/remove-script-tags/source.html | 43 ++++ .../replace-brs/expected-metadata.json | 5 + test/test-pages/replace-brs/expected.html | 20 ++ test/test-pages/replace-brs/source.html | 28 +++ .../replace-font-tags/expected-metadata.json | 5 + .../replace-font-tags/expected.html | 17 ++ test/test-pages/replace-font-tags/source.html | 28 +++ .../style-tags-removal/expected-metadata.json | 5 + .../style-tags-removal/expected.html | 15 ++ .../test-pages/style-tags-removal/source.html | 42 ++++ 22 files changed, 515 insertions(+), 93 deletions(-) create mode 100644 test/test-pages/basic-tags-cleaning/expected-metadata.json create mode 100644 test/test-pages/basic-tags-cleaning/expected.html create mode 100644 test/test-pages/basic-tags-cleaning/source.html create mode 100644 test/test-pages/remove-extra-brs/expected-metadata.json create mode 100644 test/test-pages/remove-extra-brs/expected.html create mode 100644 test/test-pages/remove-extra-brs/source.html create mode 100644 test/test-pages/remove-extra-paragraphs/expected-metadata.json create mode 100644 test/test-pages/remove-extra-paragraphs/expected.html create mode 100644 test/test-pages/remove-extra-paragraphs/source.html create mode 100644 test/test-pages/remove-script-tags/expected-metadata.json create mode 100644 test/test-pages/remove-script-tags/expected.html create mode 100644 test/test-pages/remove-script-tags/source.html create mode 100644 test/test-pages/replace-brs/expected-metadata.json create mode 100644 test/test-pages/replace-brs/expected.html create mode 100644 test/test-pages/replace-brs/source.html create mode 100644 test/test-pages/replace-font-tags/expected-metadata.json create mode 100644 test/test-pages/replace-font-tags/expected.html create mode 100644 test/test-pages/replace-font-tags/source.html create mode 100644 test/test-pages/style-tags-removal/expected-metadata.json create mode 100644 test/test-pages/style-tags-removal/expected.html create mode 100644 test/test-pages/style-tags-removal/source.html diff --git a/Readability.js b/Readability.js index dec5975..4c8789e 100644 --- a/Readability.js +++ b/Readability.js @@ -118,6 +118,36 @@ Readability.prototype = { this._fixRelativeUris(articleContent); }, + /** + * Iterate over a NodeList, which doesn't natively fully implement the Array + * interface. + * + * For convenience, the current object context is applied to the provided + * iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return void + */ + _forEachNode: function(nodeList, fn) { + return Array.prototype.forEach.call(nodeList, fn, this); + }, + + /** + * Iterate over a NodeList, return true if any of the provided iterate + * function calls returns true, false otherwise. + * + * For convenience, the current object context is applied to the + * provided iterate function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The iterate function. + * @return Boolean + */ + _someNode: function(nodeList, fn) { + return Array.prototype.some.call(nodeList, fn, this); + }, + /** * Converts each and uri in the given element to an absolute URI. * @@ -149,19 +179,18 @@ Readability.prototype = { function convertRelativeURIs(tagName, propName) { var elems = articleContent.getElementsByTagName(tagName); - for (var i = elems.length; --i >= 0;) { - var elem = elems[i]; + this._forEachNode(elems, function(elem) { var relativeURI = elem.getAttribute(propName); if (relativeURI != null) - elems[i].setAttribute(propName, toAbsoluteURI(relativeURI)); - } + elem.setAttribute(propName, toAbsoluteURI(relativeURI)); + }); } // Fix links. - convertRelativeURIs("a", "href"); + convertRelativeURIs.call(this, "a", "href"); // Fix images. - convertRelativeURIs("img", "src"); + convertRelativeURIs.call(this, "img", "src"); }, /** @@ -217,19 +246,17 @@ Readability.prototype = { var doc = this._doc; // Remove all style tags in head - var styleTags = doc.getElementsByTagName("style"); - for (var st = styleTags.length - 1; st >= 0; st -= 1) { - styleTags[st].parentNode.removeChild(styleTags[st]); - } + this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) { + styleNode.parentNode.removeChild(styleNode); + }); if (doc.body) { this._replaceBrs(doc.body); } - var fonts = doc.getElementsByTagName("FONT"); - for (var i = fonts.length; --i >=0;) { - this._setNodeTag(fonts[i], "SPAN"); - } + this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) { + this._setNodeTag(fontNode, "SPAN"); + }); }, /** @@ -255,9 +282,7 @@ Readability.prototype = { *
foo
bar

abc

*/ _replaceBrs: function (elem) { - var brs = elem.getElementsByTagName("br"); - for (var i = 0; i < brs.length; i++) { - var br = brs[i]; + this._forEachNode(elem.getElementsByTagName("br"), function(br) { var next = br.nextSibling; // Whether 2 or more
elements have been found and replaced with a @@ -296,7 +321,7 @@ Readability.prototype = { next = sibling; } } - } + }); }, _setNodeTag: function (node, tag) { @@ -336,26 +361,21 @@ Readability.prototype = { this._cleanConditionally(articleContent, "div"); // Remove extra paragraphs - var articleParagraphs = articleContent.getElementsByTagName('p'); - for (var i = articleParagraphs.length - 1; i >= 0; i -= 1) { - var imgCount = articleParagraphs[i].getElementsByTagName('img').length; - var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; - var objectCount = articleParagraphs[i].getElementsByTagName('object').length; + this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) { + var imgCount = paragraph.getElementsByTagName('img').length; + var embedCount = paragraph.getElementsByTagName('embed').length; + var objectCount = paragraph.getElementsByTagName('object').length; + var totalCount = imgCount + embedCount + objectCount; - if (imgCount === 0 && - embedCount === 0 && - objectCount === 0 && - this._getInnerText(articleParagraphs[i], false) === '') - articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); - } + if (totalCount === 0 && !this._getInnerText(paragraph, false)) + paragraph.parentNode.removeChild(paragraph); + }); - var brs = articleContent.getElementsByTagName("BR"); - for (var i = brs.length; --i >= 0;) { - var br = brs[i]; + this._forEachNode(articleContent.getElementsByTagName("br"), function(br) { var next = this._nextElement(br.nextSibling); if (next && next.tagName == "P") br.parentNode.removeChild(br); - } + }); }, /** @@ -522,8 +542,7 @@ Readability.prototype = { elementsToScore.push(node); } else { // EXPERIMENTAL - for (var i = 0, il = node.childNodes.length; i < il; i += 1) { - var childNode = node.childNodes[i]; + this._forEachNode(node.childNodes, function(childNode) { if (childNode.nodeType === Node.TEXT_NODE) { var p = doc.createElement('p'); p.textContent = childNode.textContent; @@ -531,7 +550,7 @@ Readability.prototype = { p.className = 'readability-styled'; node.replaceChild(p, childNode); } - } + }); } } node = this._getNextNode(node); @@ -544,17 +563,17 @@ Readability.prototype = { * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. **/ var candidates = []; - for (var pt = 0; pt < elementsToScore.length; pt += 1) { - var parentNode = elementsToScore[pt].parentNode; + this._forEachNode(elementsToScore, function(elementToScore) { + var parentNode = elementToScore.parentNode; var grandParentNode = parentNode ? parentNode.parentNode : null; - var innerText = this._getInnerText(elementsToScore[pt]); + var innerText = this._getInnerText(elementToScore); if (!parentNode || typeof(parentNode.tagName) === 'undefined') - continue; + return; // If this paragraph is less than 25 characters, don't even count it. if (innerText.length < 25) - continue; + return; // Initialize readability data for the parent. if (typeof parentNode.readability === 'undefined') { @@ -586,7 +605,7 @@ Readability.prototype = { if (grandParentNode) grandParentNode.readability.contentScore += contentScore / 2; - } + }); // After we've calculated scores, loop through all of the possible // candidate nodes we found and find the one with the highest score. @@ -797,7 +816,7 @@ Readability.prototype = { /** * Attempts to get excerpt and byline metadata for the article. - * + * * @return Object with optional "excerpt" and "byline" properties */ _getArticleMetadata: function() { @@ -813,14 +832,13 @@ Readability.prototype = { var propertyPattern = /^\s*og\s*:\s*description\s*$/gi; // Find description tags. - for (var i = 0; i < metaElements.length; i++) { - var element = metaElements[i]; + this._forEachNode(metaElements, function(element) { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); if (elementName === "author") { metadata.byline = element.getAttribute("content"); - continue; + return; } var name = null; @@ -839,7 +857,7 @@ Readability.prototype = { values[name] = content.trim(); } } - } + }); if ("description" in values) { metadata.excerpt = values["description"]; @@ -860,14 +878,13 @@ Readability.prototype = { * @param Element **/ _removeScripts: function(doc) { - var scripts = doc.getElementsByTagName('script'); - for (var i = scripts.length - 1; i >= 0; i -= 1) { - scripts[i].nodeValue=""; - scripts[i].removeAttribute('src'); + this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) { + scriptNode.nodeValue = ""; + scriptNode.removeAttribute('src'); - if (scripts[i].parentNode) - scripts[i].parentNode.removeChild(scripts[i]); - } + if (scriptNode.parentNode) + scriptNode.parentNode.removeChild(scriptNode); + }); }, /** @@ -877,22 +894,17 @@ Readability.prototype = { * * @param Element **/ - _hasSinglePInsideElement: function(e) { + _hasSinglePInsideElement: function(element) { // There should be exactly 1 element child which is a P: - if (e.children.length != 1 || e.firstElementChild.tagName !== "P") { + if (element.children.length != 1 || element.firstElementChild.tagName !== "P") { return false; } - // And there should be no text nodes with real content - var childNodes = e.childNodes; - for (var i = childNodes.length; --i >= 0;) { - var node = childNodes[i]; - if (node.nodeType == Node.TEXT_NODE && - this.REGEXPS.hasContent.test(node.textContent)) { - return false; - } - } - return true; + // And there should be no text nodes with real content + return !this._someNode(element.childNodes, function(node) { + return node.nodeType === Node.TEXT_NODE && + this.REGEXPS.hasContent.test(node.textContent); + }); }, /** @@ -900,14 +912,11 @@ Readability.prototype = { * * @param Element */ - _hasChildBlockElement: function (e) { - var length = e.children.length; - for (var i = 0; i < length; i++) { - var child = e.children[i]; - if (this.DIV_TO_P_ELEMS.indexOf(child.tagName) !== -1 || this._hasChildBlockElement(child)) - return true; - } - return false; + _hasChildBlockElement: function (element) { + return this._someNode(element.childNodes, function(node) { + return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || + this._hasChildBlockElement(node) + }); }, /** @@ -915,13 +924,13 @@ Readability.prototype = { * This also strips out any excess whitespace to be found. * * @param Element + * @param Boolean normalizeSpaces (default: true) * @return string **/ _getInnerText: function(e, normalizeSpaces) { var textContent = e.textContent.trim(); - normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; - if (normalizeSpaces) { + if (!Boolean(normalizeSpaces)) { return textContent.replace(this.REGEXPS.normalize, " "); } else { return textContent; @@ -978,16 +987,16 @@ Readability.prototype = { * @param Element * @return number (float) **/ - _getLinkDensity: function(e) { - var links = e.getElementsByTagName("a"); - var textLength = this._getInnerText(e).length; + _getLinkDensity: function(element) { + var textLength = this._getInnerText(element).length; var linkLength = 0; - for (var i = 0, il = links.length; i < il; i += 1) { - linkLength += this._getInnerText(links[i]).length; - } + // XXX implement _reduceNodeList? + this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { + linkLength += this._getInnerText(linkNode).length; + }); - return linkLength / textLength; + return textLength !== 0 ? linkLength / textLength : 0; }, /** @@ -1398,28 +1407,27 @@ Readability.prototype = { * @return void **/ _clean: function(e, tag) { - var targetList = e.getElementsByTagName(tag); var isEmbed = (tag === 'object' || tag === 'embed'); - for (var y = targetList.length - 1; y >= 0; y -= 1) { + this._forEachNode(e.getElementsByTagName(tag), function(element) { // Allow youtube and vimeo videos through as people usually want to see those. if (isEmbed) { var attributeValues = ""; - for (var i = 0, il = targetList[y].attributes.length; i < il; i += 1) { - attributeValues += targetList[y].attributes[i].value + '|'; + for (var i = 0, il = element.attributes.length; i < il; i += 1) { + attributeValues += element.attributes[i].value + '|'; } // First, check the elements attributes to see if any of them contain youtube or vimeo if (this.REGEXPS.videos.test(attributeValues)) - continue; + return; // Then check the elements inside this element for the same. - if (this.REGEXPS.videos.test(targetList[y].innerHTML)) - continue; + if (this.REGEXPS.videos.test(element.innerHTML)) + return; } - targetList[y].parentNode.removeChild(targetList[y]); - } + element.parentNode.removeChild(element); + }); }, /** @@ -1571,7 +1579,7 @@ Readability.prototype = { if (!metadata.excerpt) { var paragraphs = articleContent.getElementsByTagName("p"); if (paragraphs.length > 0) { - metadata.excerpt = paragraphs[0].textContent; + metadata.excerpt = paragraphs[0].textContent.trim(); } } diff --git a/test/test-pages/basic-tags-cleaning/expected-metadata.json b/test/test-pages/basic-tags-cleaning/expected-metadata.json new file mode 100644 index 0000000..4fd25ab --- /dev/null +++ b/test/test-pages/basic-tags-cleaning/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Basic tag cleaning test", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." +} diff --git a/test/test-pages/basic-tags-cleaning/expected.html b/test/test-pages/basic-tags-cleaning/expected.html new file mode 100644 index 0000000..5fb9089 --- /dev/null +++ b/test/test-pages/basic-tags-cleaning/expected.html @@ -0,0 +1,19 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi + ut aliquip ex ea commodo consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/basic-tags-cleaning/source.html b/test/test-pages/basic-tags-cleaning/source.html new file mode 100644 index 0000000..3a3b51d --- /dev/null +++ b/test/test-pages/basic-tags-cleaning/source.html @@ -0,0 +1,35 @@ + + + + + Basic tag cleaning test + + +
+

Lorem

+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+ +

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+

Foo

+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+ + + +

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+ + diff --git a/test/test-pages/remove-extra-brs/expected-metadata.json b/test/test-pages/remove-extra-brs/expected-metadata.json new file mode 100644 index 0000000..3eb3ebb --- /dev/null +++ b/test/test-pages/remove-extra-brs/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Remove trailing brs test", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." +} diff --git a/test/test-pages/remove-extra-brs/expected.html b/test/test-pages/remove-extra-brs/expected.html new file mode 100644 index 0000000..652531d --- /dev/null +++ b/test/test-pages/remove-extra-brs/expected.html @@ -0,0 +1,21 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

+

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi + ut aliquip ex ea commodo consequat.

+

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/remove-extra-brs/source.html b/test/test-pages/remove-extra-brs/source.html new file mode 100644 index 0000000..44c4fcf --- /dev/null +++ b/test/test-pages/remove-extra-brs/source.html @@ -0,0 +1,32 @@ + + + + + Remove trailing brs test + + +
+

Lorem

+
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.


+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+

Foo

+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+ + diff --git a/test/test-pages/remove-extra-paragraphs/expected-metadata.json b/test/test-pages/remove-extra-paragraphs/expected-metadata.json new file mode 100644 index 0000000..662b7a2 --- /dev/null +++ b/test/test-pages/remove-extra-paragraphs/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Replace font tags test", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." +} diff --git a/test/test-pages/remove-extra-paragraphs/expected.html b/test/test-pages/remove-extra-paragraphs/expected.html new file mode 100644 index 0000000..5fb9089 --- /dev/null +++ b/test/test-pages/remove-extra-paragraphs/expected.html @@ -0,0 +1,19 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi + ut aliquip ex ea commodo consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/remove-extra-paragraphs/source.html b/test/test-pages/remove-extra-paragraphs/source.html new file mode 100644 index 0000000..ff49d48 --- /dev/null +++ b/test/test-pages/remove-extra-paragraphs/source.html @@ -0,0 +1,41 @@ + + + + + Replace font tags test + + +
+

Lorem

+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

+

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

+
+

Foo

+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

+

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

+ + +

+
+
+ + diff --git a/test/test-pages/remove-script-tags/expected-metadata.json b/test/test-pages/remove-script-tags/expected-metadata.json new file mode 100644 index 0000000..707383d --- /dev/null +++ b/test/test-pages/remove-script-tags/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Remove script tags test", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." +} diff --git a/test/test-pages/remove-script-tags/expected.html b/test/test-pages/remove-script-tags/expected.html new file mode 100644 index 0000000..5fb9089 --- /dev/null +++ b/test/test-pages/remove-script-tags/expected.html @@ -0,0 +1,19 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi + ut aliquip ex ea commodo consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse cillum + dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/remove-script-tags/source.html b/test/test-pages/remove-script-tags/source.html new file mode 100644 index 0000000..fbfdec3 --- /dev/null +++ b/test/test-pages/remove-script-tags/source.html @@ -0,0 +1,43 @@ + + + + + Remove script tags test + + + +
+

Lorem

+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+ +

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+ +

Foo

+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+ +

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. + + Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+ + diff --git a/test/test-pages/replace-brs/expected-metadata.json b/test/test-pages/replace-brs/expected-metadata.json new file mode 100644 index 0000000..1da4929 --- /dev/null +++ b/test/test-pages/replace-brs/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Replace brs test", + "byline": null, + "excerpt": "Lorem ipsum" +} diff --git a/test/test-pages/replace-brs/expected.html b/test/test-pages/replace-brs/expected.html new file mode 100644 index 0000000..d3fd73b --- /dev/null +++ b/test/test-pages/replace-brs/expected.html @@ -0,0 +1,20 @@ +
+
+

Lorem ipsum

+

dolor sit

+

amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut + labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation + ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure + dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat + nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in + culpa qui officia deserunt mollit anim id est laborum.

+
+
+

Tempor

+

incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/replace-brs/source.html b/test/test-pages/replace-brs/source.html new file mode 100644 index 0000000..cabff66 --- /dev/null +++ b/test/test-pages/replace-brs/source.html @@ -0,0 +1,28 @@ + + + + + Replace brs test + + +
+

Lorem

+
+ Lorem ipsum
dolor sit


amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+

Foo

+
+ Tempor

incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + diff --git a/test/test-pages/replace-font-tags/expected-metadata.json b/test/test-pages/replace-font-tags/expected-metadata.json new file mode 100644 index 0000000..501704f --- /dev/null +++ b/test/test-pages/replace-font-tags/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Replace font tags test", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." +} diff --git a/test/test-pages/replace-font-tags/expected.html b/test/test-pages/replace-font-tags/expected.html new file mode 100644 index 0000000..a27d741 --- /dev/null +++ b/test/test-pages/replace-font-tags/expected.html @@ -0,0 +1,17 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur + adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore + magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco + laboris nisi ut aliquip ex ea commodo consequat. Duis aute + irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat + nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in + culpa qui officia deserunt mollit anim id est laborum.

+

Tempor incididunt ut labore et dolore magna + aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris + nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit + in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in + culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/replace-font-tags/source.html b/test/test-pages/replace-font-tags/source.html new file mode 100644 index 0000000..6658079 --- /dev/null +++ b/test/test-pages/replace-font-tags/source.html @@ -0,0 +1,28 @@ + + + + + Replace font tags test + + +
+

Lorem

+
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+

Foo

+
+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + diff --git a/test/test-pages/style-tags-removal/expected-metadata.json b/test/test-pages/style-tags-removal/expected-metadata.json new file mode 100644 index 0000000..35b0908 --- /dev/null +++ b/test/test-pages/style-tags-removal/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Style tags removal", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." +} diff --git a/test/test-pages/style-tags-removal/expected.html b/test/test-pages/style-tags-removal/expected.html new file mode 100644 index 0000000..1c2a88e --- /dev/null +++ b/test/test-pages/style-tags-removal/expected.html @@ -0,0 +1,15 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/style-tags-removal/source.html b/test/test-pages/style-tags-removal/source.html new file mode 100644 index 0000000..8a26266 --- /dev/null +++ b/test/test-pages/style-tags-removal/source.html @@ -0,0 +1,42 @@ + + + + + Style tags removal + + + +
+

Lorem

+ +
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+ +

Foo

+
+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + + From eee224560b582f1476119de25fb9e62b1efe6043 Mon Sep 17 00:00:00 2001 From: Nicolas Perriault Date: Sat, 21 Mar 2015 18:02:42 +0100 Subject: [PATCH 2/2] Addressed review comments from @Gijsk. --- Readability.js | 10 ++++-- .../normalize-spaces/expected-metadata.json | 5 +++ .../test-pages/normalize-spaces/expected.html | 16 +++++++++ test/test-pages/normalize-spaces/source.html | 35 +++++++++++++++++++ .../test-pages/style-tags-removal/source.html | 2 +- 5 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 test/test-pages/normalize-spaces/expected-metadata.json create mode 100644 test/test-pages/normalize-spaces/expected.html create mode 100644 test/test-pages/normalize-spaces/source.html diff --git a/Readability.js b/Readability.js index 4c8789e..6414216 100644 --- a/Readability.js +++ b/Readability.js @@ -915,7 +915,7 @@ Readability.prototype = { _hasChildBlockElement: function (element) { return this._someNode(element.childNodes, function(node) { return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 || - this._hasChildBlockElement(node) + this._hasChildBlockElement(node); }); }, @@ -928,9 +928,10 @@ Readability.prototype = { * @return string **/ _getInnerText: function(e, normalizeSpaces) { + normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; var textContent = e.textContent.trim(); - if (!Boolean(normalizeSpaces)) { + if (normalizeSpaces) { return textContent.replace(this.REGEXPS.normalize, " "); } else { return textContent; @@ -989,6 +990,9 @@ Readability.prototype = { **/ _getLinkDensity: function(element) { var textLength = this._getInnerText(element).length; + if (textLength === 0) + return; + var linkLength = 0; // XXX implement _reduceNodeList? @@ -996,7 +1000,7 @@ Readability.prototype = { linkLength += this._getInnerText(linkNode).length; }); - return textLength !== 0 ? linkLength / textLength : 0; + return linkLength / textLength; }, /** diff --git a/test/test-pages/normalize-spaces/expected-metadata.json b/test/test-pages/normalize-spaces/expected-metadata.json new file mode 100644 index 0000000..7300185 --- /dev/null +++ b/test/test-pages/normalize-spaces/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Normalize space test", + "byline": null, + "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n\ttab here\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." +} diff --git a/test/test-pages/normalize-spaces/expected.html b/test/test-pages/normalize-spaces/expected.html new file mode 100644 index 0000000..55e5350 --- /dev/null +++ b/test/test-pages/normalize-spaces/expected.html @@ -0,0 +1,16 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tab here incididunt ut labore et dolore magna aliqua. Ut enim ad minim + veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea + commodo consequat. Duis aute irure dolor in reprehenderit in voluptate + velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat + cupidatat non proident, sunt in culpa qui officia deserunt mollit anim + id est laborum.

+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/normalize-spaces/source.html b/test/test-pages/normalize-spaces/source.html new file mode 100644 index 0000000..f19992b --- /dev/null +++ b/test/test-pages/normalize-spaces/source.html @@ -0,0 +1,35 @@ + + + + + Normalize space test + + +
+

Lorem

+
+ Lorem + ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tab here + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+

Foo

+
+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation + + + + + ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + diff --git a/test/test-pages/style-tags-removal/source.html b/test/test-pages/style-tags-removal/source.html index 8a26266..4c6426d 100644 --- a/test/test-pages/style-tags-removal/source.html +++ b/test/test-pages/style-tags-removal/source.html @@ -35,7 +35,7 @@