diff --git a/Readability.js b/Readability.js index 4e78402..733a76a 100644 --- a/Readability.js +++ b/Readability.js @@ -31,7 +31,6 @@ function Readability(uri, doc, options) { this._uri = uri; this._doc = doc; - this._biggestFrame = false; this._articleTitle = null; this._articleByline = null; this._articleDir = null; @@ -40,7 +39,6 @@ function Readability(uri, doc, options) { this._debug = !!options.debug; this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; - this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES; this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD; // Start with all flags set @@ -48,17 +46,6 @@ function Readability(uri, doc, options) { this.FLAG_WEIGHT_CLASSES | this.FLAG_CLEAN_CONDITIONALLY; - // The list of pages we've parsed in this call of readability, - // for autopaging. As a key store for easier searching. - this._parsedPages = {}; - - // A list of the ETag headers of pages we've parsed, in case they happen to match, - // we'll know it's a duplicate. - this._pageETags = {}; - - // Make an AJAX request for each page and append it to the document. - this._curPageNum = 1; - var logEl; // Control whether log messages are sent to the console @@ -104,10 +91,6 @@ Readability.prototype = { // tight the competition is among candidates. DEFAULT_N_TOP_CANDIDATES: 5, - // The maximum number of pages to loop through before we call - // it quits and just show a link. - DEFAULT_MAX_PAGES: 5, - // Element tags to score by default. DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), @@ -1037,24 +1020,22 @@ Readability.prototype = { if (this._debug) this.log("Article content post-prep: " + articleContent.innerHTML); - if (this._curPageNum === 1) { - if (neededToCreateTopCandidate) { - // We already created a fake div thing, and there wouldn't have been any siblings left - // for the previous loop, so there's no point trying to create a new div, and then - // move all the children over. Just assign IDs and class names here. No need to append - // because that already happened anyway. - topCandidate.id = "readability-page-1"; - topCandidate.className = "page"; - } else { - var div = doc.createElement("DIV"); - div.id = "readability-page-1"; - div.className = "page"; - var children = articleContent.childNodes; - while (children.length) { - div.appendChild(children[0]); - } - articleContent.appendChild(div); + if (neededToCreateTopCandidate) { + // We already created a fake div thing, and there wouldn't have been any siblings left + // for the previous loop, so there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class names here. No need to append + // because that already happened anyway. + topCandidate.id = "readability-page-1"; + topCandidate.className = "page"; + } else { + var div = doc.createElement("DIV"); + div.id = "readability-page-1"; + div.className = "page"; + var children = articleContent.childNodes; + while (children.length) { + div.appendChild(children[0]); } + articleContent.appendChild(div); } if (this._debug) @@ -1315,363 +1296,6 @@ Readability.prototype = { return linkLength / textLength; }, - /** - * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 
- * - * @author Dan Lacy - * @return string the base url - **/ - _findBaseUrl: function() { - var uri = this._uri; - var noUrlParams = uri.path.split("?")[0]; - var urlSlashes = noUrlParams.split("/").reverse(); - var cleanedSegments = []; - var possibleType = ""; - - for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) { - var segment = urlSlashes[i]; - - // Split off and save anything that looks like a file type. - if (segment.indexOf(".") !== -1) { - possibleType = segment.split(".")[1]; - - // If the type isn't alpha-only, it's probably not actually a file extension. - if (!possibleType.match(/[^a-zA-Z]/)) - segment = segment.split(".")[0]; - } - - // If our first or second segment has anything looking like a page number, remove it. - if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) - segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); - - var del = false; - - // If this is purely a number, and it's the first or second segment, - // it's probably a page number. Remove it. - if (i < 2 && segment.match(/^\d{1,2}$/)) - del = true; - - // If this is the first segment and it's just "index", remove it. - if (i === 0 && segment.toLowerCase() === "index") - del = true; - - // If our first or second segment is smaller than 3 characters, - // and the first segment was purely alphas, remove it. - if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) - del = true; - - // If it's not marked for deletion, push it to cleanedSegments. - if (!del) - cleanedSegments.push(segment); - } - - // This is our final, cleaned, base article URL. - return uri.scheme + "://" + uri.host + cleanedSegments.reverse().join("/"); - }, - - /** - * Look for any paging links that may occur within the document. - * - * @param body - * @return object (array) - **/ - _findNextPageLink: function(elem) { - var uri = this._uri; - var possiblePages = {}; - var allLinks = elem.getElementsByTagName('a'); - var articleBaseUrl = this._findBaseUrl(); - - // Loop through all links, looking for hints that they may be next-page links. - // Things like having "page" in their textContent, className or id, or being a child - // of a node with a page-y className or id. - // - // Also possible: levenshtein distance? longest common subsequence? - // - // After we do that, assign each page a score, and - for (var i = 0, il = allLinks.length; i < il; i += 1) { - var link = allLinks[i]; - var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); - - // If we've already seen this page, ignore it. - if (linkHref === "" || - linkHref === articleBaseUrl || - linkHref === uri.spec || - linkHref in this._parsedPages) { - continue; - } - - // If it's on a different domain, skip it. - if (uri.host !== linkHref.split(/\/+/g)[1]) - continue; - - var linkText = this._getInnerText(link); - - // If the linkText looks like it's not the next page, skip it. - if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25) - continue; - - // If the leftovers of the URL after removing the base URL don't contain - // any digits, it's certainly not a next page link. 
-      var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
-      if (!linkHrefLeftover.match(/\d/))
-        continue;
-
-      if (!(linkHref in possiblePages)) {
-        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
-      } else {
-        possiblePages[linkHref].linkText += ' | ' + linkText;
-      }
-
-      var linkObj = possiblePages[linkHref];
-
-      // If the articleBaseUrl isn't part of this URL, penalize this link. It could
-      // still be the link, but the odds are lower.
-      // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
-      if (linkHref.indexOf(articleBaseUrl) !== 0)
-        linkObj.score -= 25;
-
-      var linkData = linkText + ' ' + link.className + ' ' + link.id;
-      if (linkData.match(this.REGEXPS.nextLink))
-        linkObj.score += 50;
-
-      if (linkData.match(/pag(e|ing|inat)/i))
-        linkObj.score += 25;
-
-      if (linkData.match(/(first|last)/i)) {
-        // -65 is enough to negate any bonuses gotten from a > or » in the text,
-        // If we already matched on "next", last is probably fine.
-        // If we didn't, then it's bad. Penalize.
-        if (!linkObj.linkText.match(this.REGEXPS.nextLink))
-          linkObj.score -= 65;
-      }
-
-      if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
-        linkObj.score -= 50;
-
-      if (linkData.match(this.REGEXPS.prevLink))
-        linkObj.score -= 200;
-
-      // If a parentNode contains page or paging or paginat
-      var parentNode = link.parentNode;
-      var positiveNodeMatch = false;
-      var negativeNodeMatch = false;
-
-      while (parentNode) {
-        var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
-
-        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
-          positiveNodeMatch = true;
-          linkObj.score += 25;
-        }
-
-        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
-          // If this is just something like "footer", give it a negative.
-          // If it's something like "body-and-footer", leave it be.
-          if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
-            linkObj.score -= 25;
-            negativeNodeMatch = true;
-          }
-        }
-
-        parentNode = parentNode.parentNode;
-      }
-
-      // If the URL looks like it has paging in it, add to the score.
-      // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
-      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
-        linkObj.score += 25;
-
-      // If the URL contains negative values, give a slight decrease.
-      if (linkHref.match(this.REGEXPS.extraneous))
-        linkObj.score -= 15;
-
-      /**
-       * Minor punishment to anything that doesn't match our current URL.
-       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
-       * Dan, can you show me a counterexample where this is necessary?
-       * if (linkHref.indexOf(window.location.href) !== 0) {
-       *   linkObj.score -= 1;
-       * }
-       **/
-
-      // If the link text can be parsed as a number, give it a minor bonus, with a slight
-      // bias towards lower numbered pages. This is so that pages that might not have 'next'
-      // in their text can still get scored, and sorted properly by score.
-      var linkTextAsNumber = parseInt(linkText, 10);
-      if (linkTextAsNumber) {
-        // Punish 1 since we're either already there, or it's probably
-        // before what we want anyways.
-        if (linkTextAsNumber === 1) {
-          linkObj.score -= 10;
-        } else {
-          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
-        }
-      }
-    }
-
-    // Loop through all of our possible pages from above and find our top
-    // candidate for the next page URL. Require at least a score of 50, which
-    // is a relatively high confidence that this page is the next link.
-    var topPage = null;
-    for (var page in possiblePages) {
-      if (possiblePages.hasOwnProperty(page)) {
-        if (possiblePages[page].score >= 50 &&
-          (!topPage || topPage.score < possiblePages[page].score))
-          topPage = possiblePages[page];
-      }
-    }
-
-    var nextHref = null;
-    if (topPage) {
-      nextHref = topPage.href.replace(/\/$/, '');
-
-      this.log('NEXT PAGE IS ' + nextHref);
-      this._parsedPages[nextHref] = true;
-    }
-    return nextHref;
-  },
-
-  _successfulRequest: function(request) {
-    return (request.status >= 200 && request.status < 300) ||
-      request.status === 304 ||
-      (request.status === 0 && request.responseText);
-  },
-
-  _ajax: function(url, options) {
-    var request = new XMLHttpRequest();
-
-    function respondToReadyState(readyState) {
-      if (request.readyState === 4) {
-        if (this._successfulRequest(request)) {
-          if (options.success)
-            options.success(request);
-        } else if (options.error) {
-          options.error(request);
-        }
-      }
-    }
-
-    if (typeof options === 'undefined')
-      options = {};
-
-    request.onreadystatechange = respondToReadyState;
-
-    request.open('get', url, true);
-    request.setRequestHeader('Accept', 'text/html');
-
-    try {
-      request.send(options.postBody);
-    } catch (e) {
-      if (options.error)
-        options.error();
-    }
-
-    return request;
-  },
-
-  _appendNextPage: function(nextPageLink) {
-    var doc = this._doc;
-    this._curPageNum += 1;
-
-    var articlePage = doc.createElement("DIV");
-    articlePage.id = 'readability-page-' + this._curPageNum;
-    articlePage.className = 'page';
-    articlePage.innerHTML = '<p class="page-separator" title="Page ' + this._curPageNum + '">&sect;</p>';
-
-    doc.getElementById("readability-content").appendChild(articlePage);
-
-    if (this._curPageNum > this._maxPages) {
-      var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
-      articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
-      return;
-    }
-
-    // Now that we've built the article page DOM element, get the page content
-    // asynchronously and load the cleaned content into the div we created for it.
-    (function(pageUrl, thisPage) {
-      this._ajax(pageUrl, {
-        success: function(r) {
-
-          // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
-          var eTag = r.getResponseHeader('ETag');
-          if (eTag) {
-            if (eTag in this._pageETags) {
-              this.log("Exact duplicate page found via ETag. Aborting.");
-              articlePage.style.display = 'none';
-              return;
-            }
-            this._pageETags[eTag] = 1;
-          }
-
-          // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
-          var page = doc.createElement("DIV");
-
-          // Do some preprocessing to our HTML to make it ready for appending.
-          // - Remove any script tags. Swap and reswap newlines with a unicode
-          //   character because multiline regex doesn't work in javascript.
-          // - Turn any noscript tags into divs so that we can parse them. This
-          //   allows us to find any next page links hidden via javascript.
-          // - Turn all double br's into p's - was handled by prepDocument in the original view.
-          //   Maybe in the future abstract out prepDocument to work for both the original document
-          //   and AJAX-added pages.
-          var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
-          responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
-
-          page.innerHTML = responseHtml;
-          this._replaceBrs(page);
-
-          // Reset all flags for the next page, as they will search through it and
-          // disable as necessary at the end of grabArticle.
-          this._flags = 0x1 | 0x2 | 0x4;
-
-          var secondNextPageLink = this._findNextPageLink(page);
-
-          // NOTE: if we end up supporting _appendNextPage(), we'll need to
-          // change this call to be async
-          var content = this._grabArticle(page);
-
-          if (!content) {
-            this.log("No content found in page to append. Aborting.");
-            return;
-          }
-
-          // Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
-          // Compare it against all of the previous documents we've gotten. If the previous
-          // document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
-          var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
-          if (firstP && firstP.innerHTML.length > 100) {
-            for (var i = 1; i <= this._curPageNum; i += 1) {
-              var rPage = doc.getElementById('readability-page-' + i);
-              if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
-                this.log('Duplicate of page ' + i + ' - skipping.');
-                articlePage.style.display = 'none';
-                this._parsedPages[pageUrl] = true;
-                return;
-              }
-            }
-          }
-
-          this._removeScripts(content);
-
-          thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
-
-          // After the page has rendered, post process the content. This delay is necessary because,
-          // in webkit at least, offsetWidth is not set in time to determine image width. We have to
-          // wait a little bit for reflow to finish before we can fix floating images.
- setTimeout((function() { - this._postProcessContent(thisPage); - }).bind(this), 500); - - - if (secondNextPageLink) - this._appendNextPage(secondNextPageLink); - } - }); - }).bind(this)(nextPageLink, articlePage); - }, - /** * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. @@ -1954,10 +1578,6 @@ Readability.prototype = { return (this._flags & flag) > 0; }, - _addFlag: function(flag) { - this._flags = this._flags | flag; - }, - _removeFlag: function(flag) { this._flags = this._flags & ~flag; }, @@ -2048,16 +1668,6 @@ Readability.prototype = { // Remove script tags from the document. this._removeScripts(this._doc); - // FIXME: Disabled multi-page article support for now as it - // needs more work on infrastructure. - - // Make sure this document is added to the list of parsed pages first, - // so we don't double up on the first page. - // this._parsedPages[uri.spec.replace(/\/$/, '')] = true; - - // Pull out any possible next page link first. - // var nextPageLink = this._findNextPageLink(doc.body); - this._prepDocument(); var metadata = this._getArticleMetadata(); @@ -2071,14 +1681,6 @@ Readability.prototype = { this._postProcessContent(articleContent); - // if (nextPageLink) { - // // Append any additional pages after a small timeout so that people - // // can start reading without having to wait for this to finish processing. - // setTimeout((function() { - // this._appendNextPage(nextPageLink); - // }).bind(this), 500); - // } - // If we haven't found an excerpt in the article's metadata, use the article's // first paragraph as the excerpt. This is used for displaying a preview of // the article's content. diff --git a/test/test-readability.js b/test/test-readability.js index fa5aa95..3414467 100644 --- a/test/test-readability.js +++ b/test/test-readability.js @@ -198,11 +198,6 @@ describe("Readability API", function() { expect(new Readability({}, {}, {nbTopCandidates: 42})._nbTopCandidates).eql(42); }); - it("should accept a maxPages option", function() { - expect(new Readability({}, {})._maxPages).eql(5); - expect(new Readability({}, {}, {maxPages: 42})._maxPages).eql(42); - }); - it("should accept a maxElemsToParse option", function() { expect(new Readability({}, {})._maxElemsToParse).eql(0); expect(new Readability({}, {}, {maxElemsToParse: 42})._maxElemsToParse).eql(42);
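With the autopaging code gone, the constructor options that remain are the ones the updated tests exercise: debug, maxElemsToParse, nbTopCandidates, and wordThreshold. A minimal usage sketch under that assumption; the uri and doc arguments stand in for a parsed URL object and a DOM document as in the constructor signature above, and the option values shown are illustrative, not part of this patch:

    // Hypothetical caller after this change. Passing maxPages is now
    // simply ignored rather than stored on the instance.
    var reader = new Readability(uri, doc, {
      debug: false,          // console/log-element logging
      maxElemsToParse: 0,    // 0 is the default (no element cap)
      nbTopCandidates: 5,    // matches DEFAULT_N_TOP_CANDIDATES
      wordThreshold: 500     // illustrative; omit to use DEFAULT_WORD_THRESHOLD
    });
    var article = reader.parse();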