|
|
|
@ -35,11 +35,12 @@ function Readability(uri, doc, options) {
|
|
|
|
|
this._articleByline = null;
|
|
|
|
|
this._articleDir = null;
|
|
|
|
|
|
|
|
|
|
// Configurable options
|
|
|
|
|
// Configurable options
|
|
|
|
|
this._debug = !!options.debug;
|
|
|
|
|
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
|
|
|
|
|
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
|
|
|
|
|
this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
|
|
|
|
|
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
|
|
|
|
|
|
|
|
|
|
// Start with all flags set
|
|
|
|
|
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
|
|
|
@ -123,6 +124,10 @@ Readability.prototype = {
|
|
|
|
|
|
|
|
|
|
// Upper-case tag names.
// NOTE(review): presumably the elements whose deprecated size attributes are
// stripped during cleanup; the code that uses this list is not visible here,
// so confirm against its consumer.
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
|
|
|
|
|
|
|
|
|
|
// These are the IDs and classes that readability sets itself. They act as the
// built-in preserve lists for _cleanIDsAndClasses: an element keeps its id
// only if it appears in IDS_TO_PRESERVE, and CLASSES_TO_PRESERVE is
// concatenated with options.classesToPreserve in the constructor to form
// this._classesToPreserve, the list of class names that are kept.
|
|
|
|
|
IDS_TO_PRESERVE: [ "readability-content", "readability-page-1" ],
|
|
|
|
|
CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Run any post-process modifications to article content as necessary.
|
|
|
|
|
*
|
|
|
|
@ -132,6 +137,9 @@ Readability.prototype = {
|
|
|
|
|
_postProcessContent: function(articleContent) {
|
|
|
|
|
// Readability cannot open relative uris so we convert them to absolute uris.
|
|
|
|
|
this._fixRelativeUris(articleContent);
|
|
|
|
|
|
|
|
|
|
// Remove IDs and classes.
|
|
|
|
|
this._cleanIDsAndClasses(articleContent);
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
@ -225,6 +233,38 @@ Readability.prototype = {
|
|
|
|
|
}));
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Removes the id="" and class="" attribute from every element in the given
|
|
|
|
|
* subtree, except those that match IDS_TO_PRESERVE, CLASSES_TO_PRESERVE and
|
|
|
|
|
* the classesToPreserve array from the options object.
|
|
|
|
|
*
|
|
|
|
|
* @param Element
|
|
|
|
|
* @return void
|
|
|
|
|
*/
|
|
|
|
|
_cleanIDsAndClasses: function(node) {
|
|
|
|
|
if (this.IDS_TO_PRESERVE.indexOf(node.id) == -1) {
|
|
|
|
|
node.removeAttribute("id");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var classesToPreserve = this._classesToPreserve;
|
|
|
|
|
var className = node.className
|
|
|
|
|
.split(/\s+/)
|
|
|
|
|
.filter(function(cls) {
|
|
|
|
|
return classesToPreserve.indexOf(cls) != -1;
|
|
|
|
|
})
|
|
|
|
|
.join(" ");
|
|
|
|
|
|
|
|
|
|
if (className) {
|
|
|
|
|
node.className = className;
|
|
|
|
|
} else {
|
|
|
|
|
node.removeAttribute("class");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (node = node.firstElementChild; node; node = node.nextElementSibling) {
|
|
|
|
|
this._cleanIDsAndClasses(node);
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Converts each <a> and <img> uri in the given element to an absolute URI,
|
|
|
|
|
* ignoring #ref URIs.
|
|
|
|
|