//
// CONTENT FETCHING CONSTANTS
//

// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
export const UNLIKELY_CANDIDATES_BLACKLIST = [
  'ad-break',
  'adbox',
  'advert',
  'addthis',
  'agegate',
  'aux',
  'blogger-labels',
  'combx',
  'comment',
  'conversation',
  'disqus',
  'entry-unrelated',
  'extra',
  'foot',
  'form',
  'header',
  'hidden',
  'loader',
  'login', // Note: This can hit 'blogindex'.
  'menu',
  'meta',
  'nav',
  'pager',
  'pagination',
  'predicta', // readwriteweb inline ad box
  'presence_control_external', // lifehacker.com container full of false positives
  'popup',
  'printfriendly',
  'related',
  'remove',
  'remark',
  'rss',
  'share',
  'shoutbox',
  'sidebar',
  'sociable',
  'sponsor',
  'tools',
];

// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
export const UNLIKELY_CANDIDATES_WHITELIST = [
  'and',
  'article',
  'body',
  'blogindex',
  'column',
  'content',
  'entry-content-asset',
  'format', // misuse of form
  'hfeed',
  'hentry',
  'hatom',
  'main',
  'page',
  'posts',
  'shadow',
];

// A list of tags which, if found inside, should cause a
// <div> to NOT be turned into a paragraph tag. Shallow div tags without
// these elements should be turned into <p> tags.
// Exported as a single comma-joined string rather than an array.
export const DIV_TO_P_BLOCK_TAGS = [
  'a',
  'blockquote',
  'dl',
  'div',
  'img',
  'p',
  'pre',
  'table',
].join(',');

// A list of tags that should be ignored when trying to find the top candidate
// for a document.
export const NON_TOP_CANDIDATE_TAGS = [
  'br',
  'b',
  'i',
  'label',
  'hr',
  'area',
  'base',
  'basefont',
  'input',
  'img',
  'link',
  'meta',
];

// Anchored (^...$), case-insensitive match against one whole tag name, so
// 'b' matches only "b"/"B" and not e.g. "body".
export const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(
  `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,
  'i'
);

// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
//
// NOTE(review): 'entry' below has no leading '.', unlike its neighbours; if
// these are consumed as CSS selectors it targets an <entry> element rather
// than a class. Left unchanged - confirm whether that is intended.
export const HNEWS_CONTENT_SELECTORS = [
  ['.hentry', '.entry-content'],
  ['entry', '.entry-content'],
  ['.entry', '.entry_content'],
  ['.post', '.postbody'],
  ['.post', '.post_body'],
  ['.post', '.post-body'],
];

export const PHOTO_HINTS = [
  'figure',
  'photo',
  'image',
  'caption',
];

// Unanchored, case-insensitive: matches when any photo hint appears anywhere
// in the tested string.
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');

// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const POSITIVE_SCORE_HINTS = [
  'article',
  'articlecontent',
  'instapaper_body',
  'blog',
  'body',
  'content',
  'entry-content-asset',
  'entry',
  'hentry',
  'main',
  'Normal',
  'page',
  'pagination',
  'permalink',
  'post',
  'story',
  'text',
  '[-_]copy', // usatoday
  // The backslash is doubled so the RegExp receives the `\B` non-word-boundary
  // assertion; a single backslash in a string literal collapses '\B' to 'B'.
  '\\Bcopy',
];

// The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');

// Readability publisher-specific guidelines
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');

// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const NEGATIVE_SCORE_HINTS = [
  'adbox',
  'advert',
  'author',
  'bio',
  'bookmark',
  'bottom',
  'byline',
  'clear',
  'com-',
  'combx',
  'comment',
  // The backslash is doubled so the RegExp receives the `\B` non-word-boundary
  // assertion; a single backslash in a string literal collapses '\B' to 'B'.
  'comment\\B',
  'contact',
  'copy',
  'credit',
  'crumb',
  'date',
  'deck',
  'excerpt',
  'featured', // tnr.com has a featured_content which throws us off
  'foot',
  'footer',
  'footnote',
  'graf',
  'head',
  'info',
  'infotext', // newscientist.com copyright
  'instapaper_ignore',
  'jump',
  'linebreak',
  'link',
  'masthead',
  'media',
  'meta',
  'modal',
  'outbrain', // slate.com junk
  'promo',
  'pr_', // autoblog - press release
  'related',
  'respond',
  'roundcontent', // lifehacker restricted content warning
  'scroll',
  'secondary',
  'share',
  'shopping',
  'shoutbox',
  'side',
  'sidebar',
  'sponsor',
  'stamp',
  'sub',
  'summary',
  'tags',
  'tools',
  'widget',
];

// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');

// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]');

// Match 2 or more consecutive