Allow the word threshold parameter to be configurable

7 years ago · b3fde168cb
parent b7c32feb25
commit b3fde168cb
1 changed file with 5 additions and 1 deletion
--- a/Readability.js
+++ b/Readability.js
@@ -41,6 +41,7 @@ function Readability(uri, doc, options) {
  this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
+  this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;

  // Start with all flags set
  this._flags = this.FLAG_STRIP_UNLIKELYS |
@@ -110,6 +111,9 @@ Readability.prototype = {
  // Element tags to score by default.
  DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),

+  // The default minimum length (in characters, despite the name) an article's text must have in order to return a result
+  DEFAULT_WORD_THRESHOLD: 500,
+
  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
@@ -1061,7 +1065,7 @@ Readability.prototype = {
      // grabArticle with different flags set. This gives us a higher likelihood of
      // finding the content, and the sieve approach gives us a higher likelihood of
      // finding the -right- content.
-      if (this._getInnerText(articleContent, true).length < 500) {
+      if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
        page.innerHTML = pageCacheHtml;

        if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {