Quick port of constants file

// TODO: It would be great if we could merge the meta and selector lists into
// a list of objects, because we could then rank them better. For example,
// .hentry .entry-title is far better suited than <meta title>.
// An ordered list of meta tag names that denote likely article titles. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
export const STRONG_TITLE_META_TAGS = [
'tweetmeme-title',
'dc.title',
'rbtitle',
'headline',
'title',
]
// og:title is weak because it typically contains context that we don't like,
// for example the source site's name. Gotta get that brand into Facebook!
export const WEAK_TITLE_META_TAGS = [
'og:title',
]
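// Usage sketch (illustrative only, not part of this port): with a
// cheerio-style `$` handle (an assumed consumer API), the strong list could be
// checked first, falling back to the weak list only when nothing matches:
//
//   function titleFromMetaTags($) {
//     for (const name of [...STRONG_TITLE_META_TAGS, ...WEAK_TITLE_META_TAGS]) {
//       const content = $(`meta[name="${name}"]`).attr('content')
//       if (content) return content.trim()
//     }
//     return null
//   }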
// An ordered list of XPath Selectors to find likely article titles. From
// most explicit to least explicit.
//
// Note - this does not match classes the way CSS does. It checks whether the
// string exists anywhere in the className, which is less accurate than a true
// .className match (which splits on spaces/newlines), but for our purposes
// it's close enough. The speed gain is worth the accuracy hit.
export const STRONG_TITLE_SELECTORS = [
{
//selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "entry-title")]'),
must_exist: {
classes: ['hentry', 'entry-title'],
}
},
{
//selector: XPath('//*[id="articleHeader"]//h1'),
must_exist: {
'ids': ['articleHeader']
}
},
{
//selector: XPath('//*[contains(@class, "articleHeader")]/h1'),
must_exist: {
classes: ['articleHeader'],
}
},
{
//selector: XPath('//*[contains(@class, "article")]/h1'),
must_exist: {
classes: ['article'],
}
},
{
//selector: XPath('//*[contains(@class, "instapaper_title")]'),
must_exist: {
classes: ['instapaper_title'],
}
},
{
//selector: XPath('//*[@id="meebo-title"]'),
must_exist: {
'ids': ['meebo-title']
}
},
]
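// Usage sketch (illustrative only): one plausible reading of must_exist (an
// assumption, since the extractor code isn't shown here) is a cheap
// document-level precondition -- every listed class and id must appear
// somewhere in the markup before the commented selector is worth running.
// The substring check mirrors the contains(@class, ...) note above:
//
//   function mustExistSatisfied(html, { classes = [], ids = [] }) {
//     return classes.every(cls => html.includes(cls)) &&
//            ids.every(id => html.includes(`id="${id}"`))
//   }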
export const WEAK_TITLE_SELECTORS = [
{
//selector: XPath('//article//h1'),
must_exist: {
classes: [],
'ids': []
}
},
{
//selector: XPath('//*[id="entry-title"]'),
must_exist: {
'ids': ['entry-title']
}
},
{
//selector: XPath('//*[contains(@class, "entry-title")]'),
must_exist: {
classes: ['entry-title'],
}
},
{
//selector: XPath('//*[id="entryTitle" or id="entrytitle"]'),
must_exist: {
'ids': ['entryTitle'],
}
},
{
//selector: XPath('//*[id="entrytitle"]'),
must_exist: {
'ids': ['entrytitle'],
}
},
{
//selector: XPath('//*[contains(@class, "entryTitle")]'),
must_exist: {
classes: ['entryTitle'],
}
},
{
//selector: XPath('//*[contains(@class, "entrytitle")]'),
must_exist: {
classes: ['entrytitle'],
}
},
{
//selector: XPath('//*[id="articleTitle"]'),
must_exist: {
'ids': ['articleTitle'],
}
},
{
//selector: XPath('//*[contains(@class, "articleTitle")]'),
must_exist: {
classes: ['articleTitle'],
}
},
{
//selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post-title")]'),
must_exist: {
classes: ['post', 'post-title'],
}
},
{
//selector: XPath('//h1[contains(@class, "title")]'),
},
{
//selector: XPath('//*[contains(@class, "article")]//h2'),
must_exist: {
classes: ['article'],
'ids': []
}
},
{
//selector: XPath('//h1'),
},
{
//selector: XPath('//html//head//title'),
},
{
//selector: XPath('//title'),
},
]
// A regular expression that matches separator characters in a title, which
// usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = new RegExp('(: | - | \\| )')
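// Usage sketch (illustrative only): splitting keeps the separator because of
// the capture group, so the longest piece can be treated as the real title:
//
//   'Breaking News - Example Site'.split(TITLE_SPLITTERS_RE)
//   // => ['Breaking News', ' - ', 'Example Site']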
// An ordered list of meta tag names that denote likely article authors. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
//
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
export const AUTHOR_META_TAGS = [
'byl',
'clmst',
'dc.author',
'dcsext.author',
'dc.creator',
'rbauthors',
'authors',
]
export const AUTHOR_MAX_LENGTH = 300
// An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not match classes the way CSS does. It checks whether the
// string exists anywhere in the className, which is less accurate than a true
// .className match (which splits on spaces/newlines), but for our purposes
// it's close enough. The speed gain is worth the accuracy hit.
export const AUTHOR_SELECTORS = [
{
//selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry-author")]'),
must_exist: {
classes: ['entry', 'entry-author'],
}
},
{
//selector: XPath('//*[contains(@class, "author") and contains(@class, "vcard")]//*[contains(@class, "fn")]'),
must_exist: {
classes: ['author', 'vcard', 'fn'],
}
},
{
//selector: XPath('//*[contains(@class, "author")]//*[contains(@class, "vcard")]//*[contains(@class, "fn")]'),
must_exist: {
classes: ['author', 'vcard', 'fn'],
}
},
{
//selector: XPath('//*[contains(@class, "byline") and contains(@class, "vcard")]//*[contains(@class, "fn")]'),
must_exist: {
classes: ['byline', 'vcard', 'fn'],
}
},
{
//selector: XPath('//*[contains(@class, "byline")]//*[contains(@class, "vcard")]//*[contains(@class, "fn")]'),
must_exist: {
classes: ['byline', 'vcard', 'fn'],
}
},
{
//selector: XPath('//*[contains(@class, "byline")]//*[contains(@class, "by")]//*[contains(@class, "author")]'),
must_exist: {
classes: ['byline', 'by', 'author'],
}
},
{
//selector: XPath('//*[contains(@class, "byline")]//*[contains(@class, "by")]'),
must_exist: {
classes: ['byline', 'by'],
}
},
{
//selector: XPath('//*[contains(@class, "byline")]//*[contains(@class, "author")]'),
must_exist: {
classes: ['byline', 'author'],
}
},
{
//selector: XPath('//*[contains(@class, "post-author") and contains(@class, "vcard")]'),
must_exist: {
classes: ['post-author', 'vcard'],
}
},
{
//selector: XPath('//*[contains(@class, "post-author")]//*[contains(@class, "vcard")]'),
must_exist: {
classes: ['post-author', 'vcard'],
}
},
{
//selector: XPath('//a[contains(@rel, "author")]'),
},
{
//selector: XPath('//*[@id="by_author"]'),
must_exist: {
'ids': ['by_author']
}
},
{
//selector: XPath('//*[contains(@class, "by_author")]'),
must_exist: {
classes: ['by_author'],
}
},
{
//selector: XPath('//*[@id="entryAuthor"]'),
must_exist: {
'ids': ['entryAuthor']
}
},
{
//selector: XPath('//*[contains(@class, "entryAuthor")]'),
must_exist: {
classes: ['entryAuthor'],
}
},
{
//selector: XPath('//*[contains(@class, "byline")]//a[contains(@href, "author")]'),
must_exist: {
classes: ['byline'],
}
},
{
//selector: XPath('//*[@id="author"]//*[contains(@class, "authorname")]'),
must_exist: {
classes: ['authorname'],
'ids': ['author']
}
},
{
//selector: XPath('//*[contains(@class, "author")]//*[contains(@class, "authorname")]'),
must_exist: {
classes: ['author', 'authorname'],
}
},
{
//selector: XPath('//*[@id="author"]'),
must_exist: {
'ids': ['author']
}
},
{
//selector: XPath('//*[contains(@class, "author")]'),
must_exist: {
classes: ['author'],
}
},
{
//selector: XPath('//*[contains(@class, "articleauthor")]'),
must_exist: {
classes: ['articleauthor'],
}
},
{
//selector: XPath('//*[contains(@class, "ArticleAuthor")]'),
must_exist: {
classes: ['ArticleAuthor'],
}
},
]
// An ordered list of XPath Selectors to find likely article authors, with
// regular expression namespaces enabled. Its own list for performance
// considerations.
export const AUTHOR_SELECTORS_RE = [
'//*[@id="byline"][re:test(., "^\s*By", "i")]',
'//*[contains(@class, "byline")][re:test(., "^\s*By", "i")]',
]
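// Usage sketch (illustrative only): these expressions assume an XPath engine
// that supports the `re:` regex namespace (as in the lxml-based original). A
// plain-JS approximation of the same intent might look like:
//
//   const STARTS_WITH_BY_RE = /^\s*By/i // hypothetical helper, not exported
//   function looksLikeByline(node) {
//     const classAndId = `${node.className || ''} ${node.id || ''}`
//     return classAndId.includes('byline') &&
//            STARTS_WITH_BY_RE.test(node.textContent || '')
//   }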
// An ordered list of meta tag names that denote likely date published dates.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const DATE_PUBLISHED_META_TAGS = [
'article:published_time',
'displaydate',
'dc.date',
'dc.date.issued',
'rbpubdate',
'publish_date',
'pub_date',
'pagedate',
'pubdate',
'revision_date',
'doc_date',
'date_created',
'content_create_date',
'lastmodified',
'created',
'date'
]
// An ordered list of XPath Selectors to find likely date published dates. From
// most explicit to least explicit.
//
// Note - this does not match classes the way CSS does. It checks whether the
// string exists anywhere in the className, which is less accurate than a true
// .className match (which splits on spaces/newlines), but for our purposes
// it's close enough. The speed gain is worth the accuracy hit.
export const DATE_PUBLISHED_SELECTORS = [
{
//selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "dtstamp.published")]'),
must_exist: {
classes: ['hentry', 'dtstamp.published'],
}
},
{
//selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "published")]'),
must_exist: {
classes: ['hentry', 'published'],
}
},
{
//selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "dtstamp.updated")]'),
must_exist: {
classes: ['hentry', 'dtstamp.updated'],
}
},
{
//selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "updated")]'),
must_exist: {
classes: ['hentry', 'updated'],
}
},
{
//selector: XPath('//*[contains(@class, "single")]//*[contains(@class, "published")]'),
must_exist: {
classes: ['single', 'published'],
}
},
{
//selector: XPath('//*[contains(@class, "meta")]//*[contains(@class, "published")]'),
must_exist: {
classes: ['meta', 'published'],
}
},
{
//selector: XPath('//*[contains(@class, "meta")]//*[contains(@class, "postDate")]'),
must_exist: {
classes: ['meta', 'postDate'],
}
},
{
//selector: XPath('//*[contains(@class, "entry-date")]'),
must_exist: {
classes: ['entry-date'],
}
},
{
//selector: XPath('//*[contains(@class, "byline")]//*[contains(@class, "date")]'),
must_exist: {
classes: ['byline', 'date'],
}
},
{
//selector: XPath('//*[contains(@class, "postmetadata")]//*[contains(@class, "date")]'),
must_exist: {
classes: ['postmetadata', 'date'],
}
},
{
//selector: XPath('//*[contains(@class, "article_datetime")]'),
must_exist: {
classes: ['article_datetime'],
}
},
{
//selector: XPath('//*[contains(@class, "date-header")]'),
must_exist: {
classes: ['date-header'],
}
},
{
//selector: XPath('//*[contains(@class, "story-date")]'),
must_exist: {
classes: ['story-date'],
}
},
{
//selector: XPath('//*[contains(@class, "dateStamp")]'),
must_exist: {
classes: ['dateStamp'],
}
},
{
//selector: XPath('//*[@id="story"]//*[contains(@class, "datetime")]'),
must_exist: {
classes: ['datetime'],
'ids': ['story']
}
},
{
//selector: XPath('//*[contains(@class, "dateline")]'),
must_exist: {
classes: ['dateline'],
}
},
{
//selector: XPath('//*[contains(@class, "pubdate")]'),
must_exist: {
classes: ['pubdate'],
}
},
]
// An ordered list of compiled regular expressions to find likely date
// published dates from the URL. Each pattern should capture the date string
// in its first group, in a form that can be parsed into a date.
const _abbrev_months_str = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
export const DATE_PUBLISHED_URL_RES = [
new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // /2012/01/27/ but not /2012/01/293
new RegExp('[^0-9](20\\d{2}[01]\\d[0-3]\\d)([^0-9]|$)', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733
new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // 2012-01-27
new RegExp(`/(20\\d{2}/${_abbrev_months_str}/[0-3]\\d)/`, 'i'), // /2012/jan/27/
]
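// Usage sketch (illustrative only): each pattern captures the date portion in
// its first group, so the first pattern that matches wins:
//
//   function dateFromUrl(url) {
//     for (const re of DATE_PUBLISHED_URL_RES) {
//       const match = url.match(re)
//       if (match) return match[1] // e.g. '2012/01/27'
//     }
//     return null
//   }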
// An ordered list of meta tag names that denote likely article deks. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
//
// NOTE: There are currently no meta tags that seem to provide the right
// content consistently enough. Two options were:
// - og:description
// - dc.description
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
]
// An ordered list of XPath Selectors to find likely article deks. From
// most explicit to least explicit.
//
// Note - this does not match classes the way CSS does. It checks whether the
// string exists anywhere in the className, which is less accurate than a true
// .className match (which splits on spaces/newlines), but for our purposes
// it's close enough. The speed gain is worth the accuracy hit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
{
//selector: XPath('//*[contains(@class, "entry-summary")]'),
must_exist: {
classes: ['entry-summary'],
}
}, // hentry microformat
]
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [
//'og:image',
'image_src',
]
// An ordered list of XPath Selectors to find likely article lead images. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a bad lead image can be pretty
// detrimental to the aesthetics of an article.
export const LEAD_IMAGE_URL_SELECTORS = [
{
//selector: '//link[@rel="image_src"]',
},
]
//// CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so they may contain simple,
// pipe-free regular expression fragments if necessary.
export const UNLIKELY_CANDIDATES_BLACKLIST = [
'ad-break',
'adbox',
'advert',
'addthis',
'agegate',
'aux',
'blogger-labels',
'combx',
'comment',
'conversation',
'disqus',
'entry-unrelated',
'extra',
'foot',
'form',
'header',
'hidden',
'loader',
'login', // Note: This can hit 'blogindex'.
'menu',
'meta',
'nav',
'pager',
'pagination',
'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup',
'printfriendly',
'remove',
'remark',
'rss',
'shoutbox',
'sidebar',
'sociable',
'sponsor',
'tools'
]
// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so they may contain simple, pipe-free regular expression fragments
// if necessary.
export const UNLIKELY_CANDIDATES_WHITELIST = [
'and',
'article',
'body',
'blogindex',
'column',
'content',
'entry-content-asset',
'format', // misuse of form
'hfeed',
'hentry',
'hatom',
'main',
'page',
'posts',
'shadow'
]
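// Usage sketch (illustrative only, the names below are hypothetical): both
// lists are meant to be OR-joined into regular expressions, and a blacklist
// hit only removes a node when the whitelist does not also hit (so a
// className of "rss-content entry-content" survives):
//
//   const CANDIDATES_BLACKLIST_RE = new RegExp(UNLIKELY_CANDIDATES_BLACKLIST.join('|'), 'i')
//   const CANDIDATES_WHITELIST_RE = new RegExp(UNLIKELY_CANDIDATES_WHITELIST.join('|'), 'i')
//   function isUnlikelyCandidate(classAndId) {
//     return CANDIDATES_BLACKLIST_RE.test(classAndId) &&
//            !CANDIDATES_WHITELIST_RE.test(classAndId)
//   }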
// A list of tags which, if found inside a <div />, should prevent that div
// from being turned into a paragraph tag. Shallow div tags without any of
// these elements should be turned into <p /> tags.
export const DIV_TO_P_BLOCK_TAGS = [
'a',
'blockquote',
'dl',
'div',
'img',
'p',
'pre',
'table',
]
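// Usage sketch (illustrative only, cheerio-style `$div` assumed): a div with
// none of these block-level children is "shallow" and can be rewritten as a
// <p>:
//
//   function divShouldBecomeParagraph($div) {
//     return $div.find(DIV_TO_P_BLOCK_TAGS.join(',')).length === 0
//   }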
// A list of tags that should be ignored when trying to find the top candidate
// for a document.
export const NON_TOP_CANDIDATE_TAGS = [
'br',
'b',
'i',
'label',
'hr',
'area',
'base',
'basefont',
'input',
'img',
'link',
'meta',
]
// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
export const HNEWS_CONTENT_SELECTORS = [
{
//selector: XPath('//*[contains(@class, "hentry")]//*[contains(@class, "entry-content")]'),
must_exist: {
classes: ['hentry', 'entry-content'],
}
},
{
//selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry-content")]'),
must_exist: {
classes: ['entry', 'entry-content'],
}
},
{
//selector: XPath('//*[contains(@class, "entry")]//*[contains(@class, "entry_content")]'),
must_exist: {
classes: ['entry', 'entry_content'],
}
},
{
//selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post-body")]'),
must_exist: {
classes: ['post', 'post-body'],
}
},
{
//selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "post_body")]'),
must_exist: {
classes: ['post', 'post_body'],
}
},
{
//selector: XPath('//*[contains(@class, "post")]//*[contains(@class, "postbody")]'),
must_exist: {
classes: ['post', 'postbody'],
}
},
]
export const PHOTO_HINTS = [
'figure',
'photo',
'image',
'caption'
]
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload',
'wp-content',
'large',
'photo',
'wp-image',
]
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer',
'sprite',
'blank',
'throbber',
'gradient',
'tile',
'bg',
'background',
'icon',
'social',
'header',
'hdr',
'advert',
'spinner',
'loader',
'loading',
'default',
'rating',
'share',
'facebook',
'twitter',
'theme',
'promo',
'ads',
'wp-includes',
]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
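// Usage sketch (illustrative only, weights are arbitrary): the two URL hint
// regexes lend themselves to a simple +/- score on a candidate image src:
//
//   function scoreLeadImageUrl(src) {
//     let score = 0
//     if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(src)) score += 20
//     if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(src)) score -= 20
//     return score
//   }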
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const POSITIVE_SCORE_HINTS = [
'article',
'articlecontent',
'instapaper_body',
'blog',
'body',
'content',
'entry-content-asset',
'entry',
'hentry',
'main',
'Normal',
'page',
'pagination',
'permalink',
'post',
'story',
'text',
'[-_]copy', //usatoday
'\\Bcopy'
]
// The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const NEGATIVE_SCORE_HINTS = [
'adbox',
'advert',
'author',
'bio',
'bookmark',
'bottom',
'byline',
'clear',
'com-',
'combx',
'comment',
'comment\\B',
'contact',
'copy',
'credit',
'crumb',
'date',
'deck',
'excerpt',
'featured', //tnr.com has a featured_content which throws us off
'foot',
'footer',
'footnote',
'graf',
'head',
'info',
'infotext', //newscientist.com copyright
'instapaper_ignore',
'jump',
'linebreak',
'link',
'masthead',
'media',
'meta',
'modal',
'outbrain', //slate.com junk
'promo',
'pr_', // autoblog - press release
'related',
'respond',
'roundcontent', //lifehacker restricted content warning
'scroll',
'secondary',
'share',
'shopping',
'shoutbox',
'side',
'sidebar',
'sponsor',
'stamp',
'sub',
'summary',
'tags',
'tools',
'widget'
]
// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
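// Usage sketch (illustrative only, weights are placeholders rather than the
// port's real scoring values): both regexes are checked against a node's
// className and id:
//
//   function scoreNodeHints(node) {
//     const classAndId = `${node.className || ''} ${node.id || ''}`
//     let score = 0
//     if (POSITIVE_SCORE_RE.test(classAndId)) score += 25
//     if (NEGATIVE_SCORE_RE.test(classAndId)) score -= 25
//     return score
//   }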
// A list of tags to strip from the output if we encounter them.
export const STRIP_OUTPUT_TAGS = [
'title',
'script',
'noscript',
'link',
'style',
'hr',
]
// XPath used to determine whether a page was generated by WordPress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated'
]
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
// An expression that tries to find the page number within a URL, if one
// exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|/)(?<pagenum>[0-9]{1,2})))', 'i')
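// Usage sketch (illustrative only): for a fragment like 'pg=2' the named
// group carries the page number:
//
//   const m = 'http://example.com/story?pg=2'.match(PAGE_IN_HREF_RE)
//   const pageNum = m && m.groups.pagenum // '2'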
// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')
// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i')
// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')
// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
// Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i')
// Match 1 BR tag.
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i')
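// Usage sketch (illustrative only): a consumer might treat a run of two or
// more <br> tags as a paragraph break, e.g.:
//
//   'one<br /><br />two'.replace(BR_TAGS_RE, '</p><p>')
//   // => 'one</p><p>two'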
// A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT
export const BLOCK_LEVEL_TAGS = [
'article',
'aside',
'blockquote',
'body',
'br',
'button',
'canvas',
'caption',
'col',
'colgroup',
'dd',
'div',
'dl',
'dt',
'embed',
'fieldset',
'figcaption',
'figure',
'footer',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'header',
'hgroup',
'hr',
'li',
'map',
'object',
'ol',
'output',
'p',
'pre',
'progress',
'section',
'table',
'tbody',
'textarea',
'tfoot',
'th',
'thead',
'tr',
'ul',
'video',
]