Make isProbablyReaderable include <pre>, and deal with long <br>-separated paragraphs and/or shorter-than-5-paragraph text and such.
This commit is contained in:
parent
d9a475e8d4
commit
5f184053cd
@ -182,6 +182,15 @@ Readability.prototype = {
|
||||
return Array.prototype.concat.apply([], nodeLists);
|
||||
},
|
||||
|
||||
_getAllNodesWithTag: function(node, tagNames) {
|
||||
if (node.querySelectorAll) {
|
||||
return node.querySelectorAll(tagNames.join(','));
|
||||
}
|
||||
return [].concat.apply([], tagNames.map(function(tag) {
|
||||
return node.getElementsByTagName(tag);
|
||||
}));
|
||||
},
|
||||
|
||||
/**
|
||||
* Converts each <a> and <img> uri in the given element to an absolute URI.
|
||||
*
|
||||
@ -1678,32 +1687,42 @@ Readability.prototype = {
|
||||
*
|
||||
* @return boolean Whether or not we suspect parse() will succeed at returning an article object.
|
||||
*/
|
||||
isProbablyReaderable: function() {
|
||||
var nodes = this._doc.getElementsByTagName("p");
|
||||
if (nodes.length < 5) {
|
||||
return false;
|
||||
}
|
||||
isProbablyReaderable: function(helperIsVisible) {
|
||||
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
|
||||
|
||||
var possibleParagraphs = 0;
|
||||
for (var i = 0; i < nodes.length; i++) {
|
||||
var node = nodes[i];
|
||||
// FIXME we should have a fallback for helperIsVisible, but this is
|
||||
// problematic because of jsdom's elem.style handling - see
|
||||
// https://github.com/mozilla/readability/pull/186 for context.
|
||||
|
||||
var score = 0;
|
||||
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
|
||||
// this callback:
|
||||
return this._someNode(nodes, function(node) {
|
||||
if (helperIsVisible && !helperIsVisible(node))
|
||||
return false;
|
||||
var matchString = node.className + " " + node.id;
|
||||
|
||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
|
||||
continue;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node.textContent.trim().length < 100) {
|
||||
continue;
|
||||
if (node.matches && node.matches("li p")) {
|
||||
return false;
|
||||
}
|
||||
|
||||
possibleParagraphs++;
|
||||
if (possibleParagraphs >= 5) {
|
||||
var textContentLength = node.textContent.trim().length;
|
||||
if (textContentLength < 140) {
|
||||
return false;
|
||||
}
|
||||
|
||||
score += Math.sqrt(textContentLength - 140);
|
||||
|
||||
if (score > 20) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return false;
|
||||
});
|
||||
},
|
||||
|
||||
/**
|
||||
|
5
test/test-pages/ietf-1/expected-metadata.json
Normal file
5
test/test-pages/ietf-1/expected-metadata.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"title": "draft-dejong-remotestorage-04 - remoteStorage",
|
||||
"byline": "AUTHORING",
|
||||
"readerable": true
|
||||
}
|
1112
test/test-pages/ietf-1/expected.html
Normal file
1112
test/test-pages/ietf-1/expected.html
Normal file
File diff suppressed because it is too large
Load Diff
1269
test/test-pages/ietf-1/source.html
Normal file
1269
test/test-pages/ietf-1/source.html
Normal file
File diff suppressed because it is too large
Load Diff
@ -2,5 +2,5 @@
|
||||
"title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt",
|
||||
"byline": "Henri Sivonen",
|
||||
"excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet.",
|
||||
"readerable": false
|
||||
"readerable": true
|
||||
}
|
||||
|
@ -3,5 +3,5 @@
|
||||
"byline": null,
|
||||
"dir": "ltr",
|
||||
"excerpt": "Get to know the features that make it the most complete browser for building the Web.",
|
||||
"readerable": true
|
||||
"readerable": false
|
||||
}
|
||||
|
@ -2,5 +2,5 @@
|
||||
"title": "",
|
||||
"byline": null,
|
||||
"excerpt": "Regarding item# 11111, under sufficiently extreme conditions, quarks may\n become deconfined and exist as free particles. In the course of asymptotic\n freedom, the strong interaction becomes weaker at higher temperatures.\n Eventually, color confinement would be lost and an extremely hot plasma\n of freely moving quarks and gluons would be formed. This theoretical phase\n of matter is called quark-gluon plasma.[81] The exact conditions needed\n to give rise to this state are unknown and have been the subject of a great\n deal of speculation and experimentation.",
|
||||
"readerable": false
|
||||
"readerable": true
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user