{"version":3,"file":null,"sources":["mercury.js","../scripts/templates/insert-values.js","../scripts/templates/index.js","../scripts/templates/custom-extractor.js","../scripts/templates/custom-extractor-test.js","../scripts/generate-custom-parser.js"],"sourcesContent":["'use strict';\n\nfunction _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; }\n\nvar URL = _interopDefault(require('url'));\nvar babelPolyfill = require('babel-polyfill');\nvar cheerio = _interopDefault(require('cheerio'));\nvar request = _interopDefault(require('request'));\nvar stringDirection = _interopDefault(require('string-direction'));\nvar validUrl = _interopDefault(require('valid-url'));\nvar moment = _interopDefault(require('moment'));\nvar wuzzy = _interopDefault(require('wuzzy'));\nvar difflib = _interopDefault(require('difflib'));\nvar ellipsize = _interopDefault(require('ellipsize'));\n\nvar _marked = [range].map(regeneratorRuntime.mark);\n\nfunction range() {\n var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];\n var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];\n return regeneratorRuntime.wrap(function range$(_context) {\n while (1) {\n switch (_context.prev = _context.next) {\n case 0:\n if (!(start <= end)) {\n _context.next = 5;\n break;\n }\n\n _context.next = 3;\n return start += 1;\n\n case 3:\n _context.next = 0;\n break;\n\n case 5:\n case \"end\":\n return _context.stop();\n }\n }\n }, _marked[0], this);\n}\n\n// extremely simple url validation as a first step\nfunction validateUrl(_ref) {\n var hostname = _ref.hostname;\n\n // If this isn't a valid url, return an error message\n return !!hostname;\n}\n\nvar Errors = {\n badUrl: {\n error: true,\n messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'\n }\n};\n\nvar REQUEST_HEADERS = {\n 'User-Agent': 'Readability - http://readability.com/about/'\n};\n\n// The number of milliseconds to attempt to fetch a resource before timing out.\nvar FETCH_TIMEOUT = 10000;\n\n// Content types that we do not extract content from\nvar BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];\n\nvar BAD_CONTENT_TYPES_RE = new RegExp('^(' + BAD_CONTENT_TYPES.join('|') + ')$', 'i');\n\n// Use this setting as the maximum size an article can be\n// for us to attempt parsing. Defaults to 5 MB.\nvar MAX_CONTENT_LENGTH = 5242880;\n\nvar _typeof = typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\" ? function (obj) {\n return typeof obj;\n} : function (obj) {\n return obj && typeof Symbol === \"function\" && obj.constructor === Symbol ? 
\"symbol\" : typeof obj;\n};\n\nvar asyncToGenerator = function (fn) {\n return function () {\n var gen = fn.apply(this, arguments);\n return new Promise(function (resolve, reject) {\n function step(key, arg) {\n try {\n var info = gen[key](arg);\n var value = info.value;\n } catch (error) {\n reject(error);\n return;\n }\n\n if (info.done) {\n resolve(value);\n } else {\n return Promise.resolve(value).then(function (value) {\n return step(\"next\", value);\n }, function (err) {\n return step(\"throw\", err);\n });\n }\n }\n\n return step(\"next\");\n });\n };\n};\n\nvar defineProperty = function (obj, key, value) {\n if (key in obj) {\n Object.defineProperty(obj, key, {\n value: value,\n enumerable: true,\n configurable: true,\n writable: true\n });\n } else {\n obj[key] = value;\n }\n\n return obj;\n};\n\nvar _extends = Object.assign || function (target) {\n for (var i = 1; i < arguments.length; i++) {\n var source = arguments[i];\n\n for (var key in source) {\n if (Object.prototype.hasOwnProperty.call(source, key)) {\n target[key] = source[key];\n }\n }\n }\n\n return target;\n};\n\nvar slicedToArray = function () {\n function sliceIterator(arr, i) {\n var _arr = [];\n var _n = true;\n var _d = false;\n var _e = undefined;\n\n try {\n for (var _i = arr[Symbol.iterator](), _s; !(_n = (_s = _i.next()).done); _n = true) {\n _arr.push(_s.value);\n\n if (i && _arr.length === i) break;\n }\n } catch (err) {\n _d = true;\n _e = err;\n } finally {\n try {\n if (!_n && _i[\"return\"]) _i[\"return\"]();\n } finally {\n if (_d) throw _e;\n }\n }\n\n return _arr;\n }\n\n return function (arr, i) {\n if (Array.isArray(arr)) {\n return arr;\n } else if (Symbol.iterator in Object(arr)) {\n return sliceIterator(arr, i);\n } else {\n throw new TypeError(\"Invalid attempt to destructure non-iterable instance\");\n }\n };\n}();\n\nfunction get(options) {\n return new Promise(function (resolve, reject) {\n request(options, function (err, response, body) {\n if (err) {\n reject(err);\n } else {\n resolve({ body: body, response: response });\n }\n });\n });\n}\n\n// Evaluate a response to ensure it's something we should be keeping.\n// This does not validate in the sense of a response being 200 level or\n// not. Validation here means that we haven't found reason to bail from\n// further processing of this url.\n\nfunction validateResponse(response) {\n var parseNon2xx = arguments.length <= 1 || arguments[1] === undefined ? false : arguments[1];\n\n // Check if we got a valid status code\n if (response.statusMessage !== 'OK') {\n if (!response.statusCode) {\n throw new Error('Unable to fetch content. Original exception was ' + response.error);\n } else if (!parseNon2xx) {\n throw new Error('Resource returned a response status code of ' + response.statusCode + ' and resource was instructed to reject non-2xx level status codes.');\n }\n }\n\n var _response$headers = response.headers;\n var contentType = _response$headers['content-type'];\n var contentLength = _response$headers['content-length'];\n\n // Check that the content is not in BAD_CONTENT_TYPES\n\n if (BAD_CONTENT_TYPES_RE.test(contentType)) {\n throw new Error('Content-type for this resource was ' + contentType + ' and is not allowed.');\n }\n\n // Check that the content length is below maximum\n if (contentLength > MAX_CONTENT_LENGTH) {\n throw new Error('Content for this resource was too large. 
Maximum content length is ' + MAX_CONTENT_LENGTH + '.');\n }\n\n return true;\n}\n\n// Set our response attribute to the result of fetching our URL.\n// TODO: This should gracefully handle timeouts and raise the\n// proper exceptions on the many failure cases of HTTP.\n// TODO: Ensure we are not fetching something enormous. Always return\n// unicode content for HTML, with charset conversion.\n\nvar fetchResource = (function () {\n var _ref2 = asyncToGenerator(regeneratorRuntime.mark(function _callee(url, parsedUrl) {\n var options, _ref3, response, body;\n\n return regeneratorRuntime.wrap(function _callee$(_context) {\n while (1) {\n switch (_context.prev = _context.next) {\n case 0:\n parsedUrl = parsedUrl || URL.parse(encodeURI(url));\n\n options = {\n url: parsedUrl,\n headers: _extends({}, REQUEST_HEADERS),\n timeout: FETCH_TIMEOUT,\n // Don't set encoding; fixes issues\n // w/gzipped responses\n encoding: null,\n // Accept cookies\n jar: true,\n // Accept and decode gzip\n gzip: true,\n // Follow any redirect\n followAllRedirects: true\n };\n _context.next = 4;\n return get(options);\n\n case 4:\n _ref3 = _context.sent;\n response = _ref3.response;\n body = _ref3.body;\n _context.prev = 7;\n\n validateResponse(response);\n return _context.abrupt('return', { body: body, response: response });\n\n case 12:\n _context.prev = 12;\n _context.t0 = _context['catch'](7);\n return _context.abrupt('return', Errors.badUrl);\n\n case 15:\n case 'end':\n return _context.stop();\n }\n }\n }, _callee, this, [[7, 12]]);\n }));\n\n function fetchResource(_x2, _x3) {\n return _ref2.apply(this, arguments);\n }\n\n return fetchResource;\n})();\n\nfunction convertMetaProp($, from, to) {\n $('meta[' + from + ']').each(function (_, node) {\n var $node = $(node);\n\n var value = $node.attr(from);\n $node.attr(to, value);\n $node.removeAttr(from);\n });\n\n return $;\n}\n\n// For ease of use in extracting from meta tags,\n// replace the \"content\" attribute on meta tags with the\n// \"value\" attribute.\n//\n// In addition, normalize 'property' attributes to 'name' for ease of\n// querying later. See, e.g., og or twitter meta tags.\n\nfunction normalizeMetaTags($) {\n $ = convertMetaProp($, 'content', 'value');\n $ = convertMetaProp($, 'property', 'name');\n return $;\n}\n\nvar IS_LINK = new RegExp('https?://', 'i');\nvar IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');\n\nvar TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');\n\n// Convert all instances of images with potentially\n// lazy loaded images into normal images.\n// Many sites will have img tags with no source, or an image tag with a src\n// attribute that is a placeholder. 
We need to be able to properly fill in\n// the src attribute so the images are no longer lazy loaded.\nfunction convertLazyLoadedImages($) {\n $('img').each(function (_, img) {\n Reflect.ownKeys(img.attribs).forEach(function (attr) {\n var value = img.attribs[attr];\n\n if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) {\n $(img).attr('src', value);\n }\n });\n });\n\n return $;\n}\n\nfunction isComment(index, node) {\n return node.type === 'comment';\n}\n\nfunction cleanComments($) {\n $.root().find('*').contents().filter(isComment).remove();\n\n return $;\n}\n\nfunction clean($) {\n $(TAGS_TO_REMOVE).remove();\n\n $ = cleanComments($);\n return $;\n}\n\nvar Resource = {\n\n // Create a Resource.\n //\n // :param url: The URL for the document we should retrieve.\n // :param response: If set, use as the response rather than\n // attempting to fetch it ourselves. Expects a\n // string.\n create: function create(url, preparedResponse, parsedUrl) {\n var _this = this;\n\n return asyncToGenerator(regeneratorRuntime.mark(function _callee() {\n var result, validResponse;\n return regeneratorRuntime.wrap(function _callee$(_context) {\n while (1) {\n switch (_context.prev = _context.next) {\n case 0:\n result = void 0;\n\n if (!preparedResponse) {\n _context.next = 6;\n break;\n }\n\n validResponse = {\n statusMessage: 'OK',\n statusCode: 200,\n headers: {\n 'content-type': 'text/html',\n 'content-length': 500\n }\n };\n\n\n result = { body: preparedResponse, response: validResponse };\n _context.next = 9;\n break;\n\n case 6:\n _context.next = 8;\n return fetchResource(url, parsedUrl);\n\n case 8:\n result = _context.sent;\n\n case 9:\n if (!result.error) {\n _context.next = 11;\n break;\n }\n\n return _context.abrupt('return', result);\n\n case 11:\n return _context.abrupt('return', _this.generateDoc(result));\n\n case 12:\n case 'end':\n return _context.stop();\n }\n }\n }, _callee, _this);\n }))();\n },\n generateDoc: function generateDoc(_ref) {\n var content = _ref.body;\n var response = _ref.response;\n var contentType = response.headers['content-type'];\n\n // TODO: Implement is_text function from\n // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57\n\n if (!contentType.includes('html') && !contentType.includes('text')) {\n throw new Error('Content does not appear to be text.');\n }\n\n var $ = cheerio.load(content, { normalizeWhitespace: true });\n\n if ($.root().children().length === 0) {\n throw new Error('No children, likely a bad parse.');\n }\n\n $ = normalizeMetaTags($);\n $ = convertLazyLoadedImages($);\n $ = clean($);\n\n return $;\n }\n};\n\nvar NYMagExtractor = {\n domain: 'nymag.com',\n content: {\n // Order by most likely. 
Extractor will stop on first occurrence\n selectors: ['div.article-content', 'section.body', 'article.article'],\n\n // Selectors to remove from the extracted content\n clean: ['.ad', '.single-related-story'],\n\n // Object of transformations to make on matched elements\n // Each key is the selector, each value is the tag to\n // transform to.\n // If a function is given, it should return a string\n // to convert to or nothing (in which case it will not perform\n // the transformation).\n transforms: {\n // Convert h1s to h2s\n h1: 'h2',\n\n // Convert lazy-loaded noscript images to figures\n noscript: function noscript($node) {\n var $children = $node.children();\n if ($children.length === 1 && $children.get(0).tagName === 'img') {\n return 'figure';\n }\n\n return null;\n }\n }\n },\n\n title: {\n selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1']\n },\n\n author: {\n selectors: ['.by-authors', '.lede-feature-author']\n },\n\n dek: {\n selectors: ['.lede-feature-teaser']\n },\n\n date_published: {\n selectors: ['time.article-timestamp[datetime]', 'time.article-timestamp']\n }\n};\n\nvar BloggerExtractor = {\n domain: 'blogspot.com',\n content: {\n // Blogger is insane and does not load its content\n // initially in the page, but it's all there\n // in noscript\n selectors: ['.post-content noscript'],\n\n // Selectors to remove from the extracted content\n clean: [],\n\n // Convert the noscript tag to a div\n transforms: {\n noscript: 'div'\n }\n },\n\n author: {\n selectors: ['.post-author-name']\n },\n\n title: {\n selectors: ['h2.title']\n },\n\n date_published: {\n selectors: ['span.publishdate']\n }\n};\n\nvar WikipediaExtractor = {\n domain: 'wikipedia.org',\n content: {\n selectors: ['#mw-content-text'],\n\n defaultCleaner: false,\n\n // transform top infobox to an image with caption\n transforms: {\n '.infobox img': function infoboxImg($node) {\n var $parent = $node.parents('.infobox');\n // Only prepend the first image in .infobox\n if ($parent.children('img').length === 0) {\n $parent.prepend($node);\n }\n },\n '.infobox caption': 'figcaption',\n '.infobox': 'figure'\n },\n\n // Selectors to remove from the extracted content\n clean: ['.mw-editsection', 'figure tr, figure td, figure tbody', '#toc', '.navbox']\n\n },\n\n author: 'Wikipedia Contributors',\n\n title: {\n selectors: ['h2.title']\n },\n\n date_published: {\n selectors: ['#footer-info-lastmod']\n }\n\n};\n\nvar TwitterExtractor = {\n domain: 'twitter.com',\n\n content: {\n transforms: {\n // We're transforming essentially the whole page here.\n // Twitter doesn't have nice selectors, so our initial\n // selector grabs the whole page, then we're re-writing\n // it to fit our needs before we clean it up.\n '.permalink[role=main]': function permalinkRoleMain($node, $) {\n var tweets = $node.find('.tweet');\n var $tweetContainer = $('<div></div>');
\n $tweetContainer.append(tweets);\n $node.replaceWith($tweetContainer);\n },\n\n // Twitter wraps @ with <s>, which\n // renders as a strikethrough\n s: 'span'\n },\n\n selectors: ['.permalink[role=main]'],\n\n defaultCleaner: false,\n\n clean: ['.stream-item-footer', 'button', '.tweet-details-fixer']\n },\n\n author: {\n selectors: ['.tweet.permalink-tweet .username']\n },\n\n date_published: {\n selectors: ['.permalink-tweet ._timestamp[data-time-ms]']\n }\n\n};\n\nvar NYTimesExtractor = {\n title: {\n selectors: ['.g-headline', 'h1.headline']\n },\n\n author: {\n selectors: ['.g-byline', '.byline']\n },\n\n content: {\n selectors: ['div.g-blocks', 'article#story'],\n\n defaultCleaner: false,\n\n transforms: {\n 'img.g-lazy': function imgGLazy($node) {\n var src = $node.attr('src');\n // const widths = $node.attr('data-widths')\n // .slice(1)\n // .slice(0, -1)\n // .split(',');\n // if (widths.length) {\n // width = widths.slice(-1);\n // } else {\n // width = '900';\n // }\n var width = 640;\n\n src = src.replace('{{size}}', width);\n $node.attr('src', src);\n }\n },\n\n clean: ['.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden']\n },\n\n date_published: null,\n\n lead_image_url: null,\n\n dek: null,\n\n next_page_url: null,\n\n excerpt: null\n};\n\nvar Extractors = {\n 'nymag.com': NYMagExtractor,\n 'blogspot.com': BloggerExtractor,\n 'wikipedia.org': WikipediaExtractor,\n 'twitter.com': TwitterExtractor,\n 'www.nytimes.com': NYTimesExtractor\n};\n\n// Spacer images to be removed\nvar SPACER_RE = new RegExp('trans|transparent|spacer|blank', 'i');\n\n// A list of tags to strip from the output if we encounter them.\nvar STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object'];\n\n// cleanAttributes\nvar REMOVE_ATTRS = ['style', 'align'];\nvar REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(function (selector) {\n return '[' + selector + ']';\n});\nvar REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nvar WHITELIST_ATTRS = ['src', 'href', 'class', 'id', 'score'];\nvar WHITELIST_ATTRS_RE = new RegExp('^(' + WHITELIST_ATTRS.join('|') + ')$', 'i');\n\n// removeEmpty\nvar REMOVE_EMPTY_TAGS = ['p'];\nvar REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(function (tag) {\n return tag + ':empty';\n}).join(',');\n\n// cleanTags\nvar CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');\n\n// cleanHeaders\nvar HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nvar HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. 
These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nvar UNLIKELY_CANDIDATES_BLACKLIST = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot',\n// 'form', // This is too generic, has too many false positives\n'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.\n'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box\n'presence_control_external', // lifehacker.com container full of false positives\n'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools'];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nvar UNLIKELY_CANDIDATES_WHITELIST = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form\n'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow'];\n\n// A list of tags which, if found inside, should cause a <div /> to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into <p /> tags.\nvar DIV_TO_P_BLOCK_TAGS = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nvar NON_TOP_CANDIDATE_TAGS = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];\n\nvar NON_TOP_CANDIDATE_TAGS_RE = new RegExp('^(' + NON_TOP_CANDIDATE_TAGS.join('|') + ')$', 'i');\n\nvar PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];\nvar PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nvar POSITIVE_SCORE_HINTS = ['article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday\n'\\\\Bcopy'];\n\n// The above list, joined into a matching regular expression\nvar POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. 
Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nvar NEGATIVE_SCORE_HINTS = ['adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off\n'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright\n'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk\n'promo', 'pr_', // autoblog - press release\n'related', 'respond', 'roundcontent', // lifehacker restricted content warning\n'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget'];\n// The above list, joined into a matching regular expression\nvar NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');\n\n// CSS selector to try to determine if a page is WordPress. Not always successful.\nvar IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';\n\n// A list of words that, if found in link text or URLs, likely mean that\n// this link is not a next page link.\nvar EXTRANEOUS_LINK_HINTS = ['print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share', 'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated'];\nvar EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');\n\n// Match any phrase that looks like it could be page, or paging, or pagination\nvar PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');\n\n// A list of all of the block level tags known in HTML5 and below. Taken from\n// http://bit.ly/qneNIT\nvar BLOCK_LEVEL_TAGS = ['article', 'aside', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'map', 'object', 'ol', 'output', 'p', 'pre', 'progress', 'section', 'table', 'tbody', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video'];\nvar BLOCK_LEVEL_TAGS_RE = new RegExp('^(' + BLOCK_LEVEL_TAGS.join('|') + ')$', 'i');\n\n// The removal is implemented as a blacklist and whitelist; this test finds\n// blacklisted elements that aren't whitelisted. 
We do this all in one\n// expression, both because it's only one pass, and because this skips the\n// serialization for whitelisted nodes.\nvar candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');\nvar CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i');\n\nvar candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|');\nvar CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i');\n\nfunction stripUnlikelyCandidates($) {\n // Loop through the provided document and remove any non-link nodes\n // that are unlikely candidates for article content.\n //\n // Links are ignored because there are very often links to content\n // that are identified as non-body-content, but may be inside\n // article-like content.\n //\n // :param $: a cheerio object to strip nodes from\n // :return $: the cleaned cheerio object\n $('*').not('a').each(function (index, node) {\n var $node = $(node);\n var classes = $node.attr('class');\n var id = $node.attr('id');\n if (!id && !classes) return;\n\n var classAndId = (classes || '') + ' ' + (id || '');\n if (CANDIDATES_WHITELIST.test(classAndId)) {\n return;\n } else if (CANDIDATES_BLACKLIST.test(classAndId)) {\n $node.remove();\n }\n });\n\n return $;\n}\n\n// ## NOTES:\n// Another good candidate for refactoring/optimizing.\n// Very imperative code, I don't love it. - AP\n\n\n// Given cheerio object, convert consecutive