Make isProbablyReaderable include <pre>, and deal with long <br>-separated paragraphs and/or shorter-than-5-paragraph text and such.

This commit is contained in:
Gijs Kruitbosch 2015-04-25 00:19:34 +01:00
parent d9a475e8d4
commit 5f184053cd
7 changed files with 2423 additions and 18 deletions

View File

@ -182,6 +182,15 @@ Readability.prototype = {
return Array.prototype.concat.apply([], nodeLists);
},
_getAllNodesWithTag: function(node, tagNames) {
if (node.querySelectorAll) {
return node.querySelectorAll(tagNames.join(','));
}
return [].concat.apply([], tagNames.map(function(tag) {
return node.getElementsByTagName(tag);
}));
},
/**
* Converts each <a> and <img> uri in the given element to an absolute URI.
*
@ -1678,32 +1687,42 @@ Readability.prototype = {
*
* @return boolean Whether or not we suspect parse() will succeed at returning an article object.
*/
// NOTE(review): this hunk appears to interleave the removed and the added
// version of the method (the +/- diff markers are missing), so it does not
// parse as written - confirm against the actual file before editing.
isProbablyReaderable: function() {
var nodes = this._doc.getElementsByTagName("p");
if (nodes.length < 5) {
return false;
}
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
var possibleParagraphs = 0;
for (var i = 0; i < nodes.length; i++) {
var node = nodes[i];
// FIXME we should have a fallback for helperIsVisible, but this is
// problematic because of jsdom's elem.style handling - see
// https://github.com/mozilla/readability/pull/186 for context.
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return this._someNode(nodes, function(node) {
// A caller-supplied visibility predicate, when given, vetoes any node it
// reports as not visible.
if (helperIsVisible && !helperIsVisible(node))
return false;
var matchString = node.className + " " + node.id;
// Drop nodes whose class/id string matches the unlikely-candidate pattern,
// unless the same string also matches the "maybe a candidate" pattern.
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
continue;
return false;
}
if (node.textContent.trim().length < 100) {
continue;
// Skip paragraphs that sit inside a list item.
if (node.matches && node.matches("li p")) {
return false;
}
possibleParagraphs++;
if (possibleParagraphs >= 5) {
// Nodes with fewer than 140 characters of trimmed text add nothing to the score.
var textContentLength = node.textContent.trim().length;
if (textContentLength < 140) {
return false;
}
// The square root makes the contribution grow sub-linearly with length; a
// single node needs more than 540 characters to exceed the threshold of 20
// on its own, otherwise several nodes must add up to it.
score += Math.sqrt(textContentLength - 140);
if (score > 20) {
return true;
}
}
return false;
return false;
});
},
/**

View File

@ -0,0 +1,5 @@
{
"title": "draft-dejong-remotestorage-04 - remoteStorage",
"byline": "AUTHORING",
"readerable": true
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -2,5 +2,5 @@
"title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt",
"byline": "Henri Sivonen",
"excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet.",
"readerable": false
"readerable": true
}

View File

@ -3,5 +3,5 @@
"byline": null,
"dir": "ltr",
"excerpt": "Get to know the features that make it the most complete browser for building the Web.",
"readerable": true
"readerable": false
}

View File

@ -2,5 +2,5 @@
"title": "",
"byline": null,
"excerpt": "Regarding item# 11111, under sufficiently extreme conditions, quarks may\n become deconfined and exist as free particles. In the course of asymptotic\n freedom, the strong interaction becomes weaker at higher temperatures.\n Eventually, color confinement would be lost and an extremely hot plasma\n of freely moving quarks and gluons would be formed. This theoretical phase\n of matter is called quark-gluon plasma.[81] The exact conditions needed\n to give rise to this state are unknown and have been the subject of a great\n deal of speculation and experimentation.",
"readerable": false
"readerable": true
}