'use strict'; function _interopDefault(ex) { return ex && typeof ex === 'object' && 'default' in ex ? ex['default'] : ex; } var _regeneratorRuntime = _interopDefault( require('@babel/runtime-corejs2/regenerator') ); var _objectSpread = _interopDefault( require('@babel/runtime-corejs2/helpers/objectSpread') ); var _asyncToGenerator = _interopDefault( require('@babel/runtime-corejs2/helpers/asyncToGenerator') ); var URL = _interopDefault(require('url')); var cheerio = _interopDefault(require('cheerio')); var iconv = _interopDefault(require('iconv-lite')); var _parseInt = _interopDefault( require('@babel/runtime-corejs2/core-js/parse-int') ); var _slicedToArray = _interopDefault( require('@babel/runtime-corejs2/helpers/slicedToArray') ); var _Promise = _interopDefault( require('@babel/runtime-corejs2/core-js/promise') ); var request = _interopDefault(require('request')); var _Reflect$ownKeys = _interopDefault( require('@babel/runtime-corejs2/core-js/reflect/own-keys') ); var _toConsumableArray = _interopDefault( require('@babel/runtime-corejs2/helpers/toConsumableArray') ); var _defineProperty = _interopDefault( require('@babel/runtime-corejs2/helpers/defineProperty') ); var _parseFloat = _interopDefault( require('@babel/runtime-corejs2/core-js/parse-float') ); var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof')); var _getIterator = _interopDefault( require('@babel/runtime-corejs2/core-js/get-iterator') ); var _Object$keys = _interopDefault( require('@babel/runtime-corejs2/core-js/object/keys') ); var stringDirection = _interopDefault(require('string-direction')); var validUrl = _interopDefault(require('valid-url')); var moment = _interopDefault(require('moment-timezone')); var parseFormat = _interopDefault(require('moment-parseformat')); var wuzzy = _interopDefault(require('wuzzy')); var difflib = _interopDefault(require('difflib')); var _Array$from = _interopDefault( require('@babel/runtime-corejs2/core-js/array/from') ); var ellipsize = _interopDefault(require('ellipsize')); var _Array$isArray = _interopDefault( require('@babel/runtime-corejs2/core-js/array/is-array') ); var NORMALIZE_RE = /\s{2,}/g; function normalizeSpaces(text) { return text.replace(NORMALIZE_RE, ' ').trim(); } // Given a node type to search for, and a list of regular expressions, // look to see if this extraction can be found in the URL. Expects // that each expression in r_list will return group(1) as the proper // string to be cleaned. // Only used for date_published currently. function extractFromUrl(url, regexList) { var matchRe = regexList.find(function(re) { return re.test(url); }); if (matchRe) { return matchRe.exec(url)[1]; } return null; } // An expression that looks to try to find the page digit within a URL, if // it exists. // Matches: // page=1 // pg=1 // p=1 // paging=12 // pag=7 // pagination/1 // paging/88 // pa/83 // p/11 // // Does not match: // pg=102 // page:2 var PAGE_IN_HREF_RE = new RegExp( '(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|/)([0-9]{1,3})', 'i' ); var HAS_ALPHA_RE = /[a-z]/i; var IS_ALPHA_RE = /^[a-z]+$/i; var IS_DIGIT_RE = /^[0-9]+$/i; var ENCODING_RE = /charset=([\w-]+)\b/; var DEFAULT_ENCODING = 'utf-8'; function pageNumFromUrl(url) { var matches = url.match(PAGE_IN_HREF_RE); if (!matches) return null; var pageNum = _parseInt(matches[6], 10); // Return pageNum < 100, otherwise // return null return pageNum < 100 ? 
pageNum : null; } function removeAnchor(url) { return url.split('#')[0].replace(/\/$/, ''); } function isGoodSegment(segment, index, firstSegmentHasLetters) { var goodSegment = true; // If this is purely a number, and it's the first or second // url_segment, it's probably a page number. Remove it. if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) { goodSegment = true; } // If this is the first url_segment and it's just "index", // remove it if (index === 0 && segment.toLowerCase() === 'index') { goodSegment = false; } // If our first or second url_segment is smaller than 3 characters, // and the first url_segment had no alphas, remove it. if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) { goodSegment = false; } return goodSegment; } // Take a URL, and return the article base of said URL. That is, no // pagination data exists in it. Useful for comparing to other links // that might have pagination data within them. function articleBaseUrl(url, parsed) { var parsedUrl = parsed || URL.parse(url); var protocol = parsedUrl.protocol, host = parsedUrl.host, path = parsedUrl.path; var firstSegmentHasLetters = false; var cleanedSegments = path .split('/') .reverse() .reduce(function(acc, rawSegment, index) { var segment = rawSegment; // Split off and save anything that looks like a file type. if (segment.includes('.')) { var _segment$split = segment.split('.'), _segment$split2 = _slicedToArray(_segment$split, 2), possibleSegment = _segment$split2[0], fileExt = _segment$split2[1]; if (IS_ALPHA_RE.test(fileExt)) { segment = possibleSegment; } } // If our first or second segment has anything looking like a page // number, remove it. if (PAGE_IN_HREF_RE.test(segment) && index < 2) { segment = segment.replace(PAGE_IN_HREF_RE, ''); } // If we're on the first segment, check to see if we have any // characters in it. The first segment is actually the last bit of // the URL, and this will be helpful to determine if we're on a URL // segment that looks like "/2/" for example. if (index === 0) { firstSegmentHasLetters = HAS_ALPHA_RE.test(segment); } // If it's not marked for deletion, push it to cleaned_segments. if (isGoodSegment(segment, index, firstSegmentHasLetters)) { acc.push(segment); } return acc; }, []); return '' .concat(protocol, '//') .concat(host) .concat(cleanedSegments.reverse().join('/')); } // Given a string, return True if it appears to have an ending sentence // within it, false otherwise. var SENTENCE_END_RE = new RegExp('.( |$)'); function hasSentenceEnd(text) { return SENTENCE_END_RE.test(text); } function excerptContent(content) { var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10; return content .trim() .split(/\s+/) .slice(0, words) .join(' '); } // used in our fetchResource function to // ensure correctly encoded responses function getEncoding(str) { var encoding = DEFAULT_ENCODING; if (ENCODING_RE.test(str)) { var testEncode = ENCODING_RE.exec(str)[1]; if (iconv.encodingExists(testEncode)) { encoding = testEncode; } } return encoding; } var _marked = /*#__PURE__*/ _regeneratorRuntime.mark(range); function range() { var start, end, _args = arguments; return _regeneratorRuntime.wrap( function range$(_context) { while (1) { switch ((_context.prev = _context.next)) { case 0: start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1; end = _args.length > 1 && _args[1] !== undefined ? 
_args[1] : 1; case 2: if (!(start <= end)) { _context.next = 7; break; } _context.next = 5; return (start += 1); case 5: _context.next = 2; break; case 7: case 'end': return _context.stop(); } } }, _marked, this ); } // extremely simple url validation as a first step function validateUrl(_ref) { var hostname = _ref.hostname; // If this isn't a valid url, return an error message return !!hostname; } var Errors = { badUrl: { error: true, messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.', }, }; var REQUEST_HEADERS = cheerio.browser ? {} : { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', }; // The number of milliseconds to attempt to fetch a resource before timing out. var FETCH_TIMEOUT = 10000; // Content types that we do not extract content from var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg']; var BAD_CONTENT_TYPES_RE = new RegExp( '^('.concat(BAD_CONTENT_TYPES.join('|'), ')$'), 'i' ); // Use this setting as the maximum size an article can be // for us to attempt parsing. Defaults to 5 MB. var MAX_CONTENT_LENGTH = 5242880; // Turn the global proxy on or off function get(options) { return new _Promise(function(resolve, reject) { request(options, function(err, response, body) { if (err) { reject(err); } else { resolve({ body: body, response: response, }); } }); }); } // Evaluate a response to ensure it's something we should be keeping. // This does not validate in the sense of a response being 200 level or // not. Validation here means that we haven't found reason to bail from // further processing of this url. function validateResponse(response) { var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false; // Check if we got a valid status code // This isn't great, but I'm requiring a statusMessage to be set // before short circuiting b/c nock doesn't set it in tests // statusMessage only not set in nock response, in which case // I check statusCode, which is currently only 200 for OK responses // in tests if ( (response.statusMessage && response.statusMessage !== 'OK') || response.statusCode !== 200 ) { if (!response.statusCode) { throw new Error( 'Unable to fetch content. Original exception was '.concat( response.error ) ); } else if (!parseNon2xx) { throw new Error( 'Resource returned a response status code of '.concat( response.statusCode, ' and resource was instructed to reject non-2xx level status codes.' ) ); } } var _response$headers = response.headers, contentType = _response$headers['content-type'], contentLength = _response$headers['content-length']; // Check that the content is not in BAD_CONTENT_TYPES if (BAD_CONTENT_TYPES_RE.test(contentType)) { throw new Error( 'Content-type for this resource was '.concat( contentType, ' and is not allowed.' ) ); } // Check that the content length is below maximum if (contentLength > MAX_CONTENT_LENGTH) { throw new Error( 'Content for this resource was too large. Maximum content length is '.concat( MAX_CONTENT_LENGTH, '.' ) ); } return true; } // Grabs the last two pieces of the URL and joins them back together // TODO: This should gracefully handle timeouts and raise the // proper exceptions on the many failure cases of HTTP. // TODO: Ensure we are not fetching something enormous. Always return // unicode content for HTML, with charset conversion. 
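// Illustrative sketch only: the helper below is hypothetical and nothing in this
// bundle calls it; it simply shows how a caller might consume fetchResource and
// handle the Errors.badUrl object it resolves with on failure.
function exampleFetchBody(url) {
  return fetchResource(url).then(function(result) {
    if (result.error) {
      // fetchResource resolved with Errors.badUrl rather than throwing
      return null;
    }
    // body is a Buffer because fetchResource requests with encoding: null
    return result.body;
  });
}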
function fetchResource(_x, _x2) { return _fetchResource.apply(this, arguments); } function _fetchResource() { _fetchResource = _asyncToGenerator( /*#__PURE__*/ _regeneratorRuntime.mark(function _callee(url, parsedUrl) { var options, _ref2, response, body; return _regeneratorRuntime.wrap( function _callee$(_context) { while (1) { switch ((_context.prev = _context.next)) { case 0: parsedUrl = parsedUrl || URL.parse(encodeURI(url)); options = { url: parsedUrl.href, headers: _objectSpread({}, REQUEST_HEADERS), timeout: FETCH_TIMEOUT, // Accept cookies jar: true, // Set to null so the response returns as binary and body as buffer // https://github.com/request/request#requestoptions-callback encoding: null, // Accept and decode gzip gzip: true, // Follow any redirect followAllRedirects: true, }; _context.next = 4; return get(options); case 4: _ref2 = _context.sent; response = _ref2.response; body = _ref2.body; _context.prev = 7; validateResponse(response); return _context.abrupt('return', { body: body, response: response, }); case 12: _context.prev = 12; _context.t0 = _context['catch'](7); return _context.abrupt('return', Errors.badUrl); case 15: case 'end': return _context.stop(); } } }, _callee, this, [[7, 12]] ); }) ); return _fetchResource.apply(this, arguments); } function convertMetaProp($, from, to) { $('meta['.concat(from, ']')).each(function(_, node) { var $node = $(node); var value = $node.attr(from); $node.attr(to, value); $node.removeAttr(from); }); return $; } // For ease of use in extracting from meta tags, // replace the "content" attribute on meta tags with the // "value" attribute. // // In addition, normalize 'property' attributes to 'name' for ease of // querying later. See, e.g., og or twitter meta tags. function normalizeMetaTags($) { $ = convertMetaProp($, 'content', 'value'); $ = convertMetaProp($, 'property', 'name'); return $; } // Spacer images to be removed var SPACER_RE = new RegExp('transparent|spacer|blank', 'i'); // The class we will use to mark elements we want to keep // but would normally remove var KEEP_CLASS = 'mercury-parser-keep'; var KEEP_SELECTORS = [ 'iframe[src^="https://www.youtube.com"]', 'iframe[src^="https://www.youtube-nocookie.com"]', 'iframe[src^="http://www.youtube.com"]', 'iframe[src^="https://player.vimeo"]', 'iframe[src^="http://player.vimeo"]', ]; // A list of tags to strip from the output if we encounter them. var STRIP_OUTPUT_TAGS = [ 'title', 'script', 'noscript', 'link', 'style', 'hr', 'embed', 'iframe', 'object', ]; // cleanAttributes var WHITELIST_ATTRS = [ 'src', 'srcset', 'href', 'class', 'id', 'alt', 'xlink:href', 'width', 'height', ]; var WHITELIST_ATTRS_RE = new RegExp( '^('.concat(WHITELIST_ATTRS.join('|'), ')$'), 'i' ); // removeEmpty var CLEAN_CONDITIONALLY_TAGS = [ 'ul', 'ol', 'table', 'div', 'button', 'form', ].join(','); // cleanHeaders var HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']; var HEADER_TAG_LIST = HEADER_TAGS.join(','); // // CONTENT FETCHING CONSTANTS //// // A list of strings that can be considered unlikely candidates when // extracting content from a resource. These strings are joined together // and then tested for existence using re:test, so may contain simple, // non-pipe style regular expression queries if necessary. 
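// Minimal sketch of how these candidate lists are used (exampleIsUnlikelyCandidate
// is hypothetical and unused; the real logic lives in stripUnlikelyCandidates below):
// each list is joined with '|' into a case-insensitive RegExp and tested against a
// node's combined class and id string, with whitelist matches winning.
function exampleIsUnlikelyCandidate(classAndId) {
  var blacklistRe = new RegExp(UNLIKELY_CANDIDATES_BLACKLIST.join('|'), 'i');
  var whitelistRe = new RegExp(UNLIKELY_CANDIDATES_WHITELIST.join('|'), 'i');
  // a whitelist hit keeps the node even if the blacklist also matches
  return !whitelistRe.test(classAndId) && blacklistRe.test(classAndId);
}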
var UNLIKELY_CANDIDATES_BLACKLIST = [ 'ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot', // 'form', // This is too generic, has too many false positives 'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'. 'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box 'presence_control_external', // lifehacker.com container full of false positives 'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools', ]; // A list of strings that can be considered LIKELY candidates when // extracting content from a resource. Essentially, the inverse of the // blacklist above - if something matches both blacklist and whitelist, // it is kept. This is useful, for example, if something has a className // of "rss-content entry-content". It matched 'rss', so it would normally // be removed, however, it's also the entry content, so it should be left // alone. // // These strings are joined together and then tested for existence using // re:test, so may contain simple, non-pipe style regular expression queries // if necessary. var UNLIKELY_CANDIDATES_WHITELIST = [ 'and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form 'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow', ]; // A list of tags which, if found inside, should cause a
<div /> to NOT // be turned into a paragraph tag. Shallow div tags without these elements // should be turned into <p />
tags. var DIV_TO_P_BLOCK_TAGS = [ 'a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table', ].join(','); // A list of tags that should be ignored when trying to find the top candidate // an article container. Checked against className and id. // // TODO: Perhaps have these scale based on their odds of being quality? var POSITIVE_SCORE_HINTS = [ 'article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday '\\Bcopy', ]; // The above list, joined into a matching regular expression var POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i'); // Readability publisher-specific guidelines // an article container. Checked against className and id. // // TODO: Perhaps have these scale based on their odds of being quality? var NEGATIVE_SCORE_HINTS = [ 'adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off 'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright 'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk 'promo', 'pr_', // autoblog - press release 'related', 'respond', 'roundcontent', // lifehacker restricted content warning 'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget', ]; // The above list, joined into a matching regular expression var NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i'); // XPath to try to determine if a page is wordpress. Not always successful. var IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'; // Match a digit. Pretty clear. var PAGE_RE = new RegExp('pag(e|ing|inat)', 'i'); // Match any link text/classname/id that looks like it could mean the next // http://bit.ly/qneNIT var BLOCK_LEVEL_TAGS = [ 'article', 'aside', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'map', 'object', 'ol', 'output', 'p', 'pre', 'progress', 'section', 'table', 'tbody', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video', ]; var BLOCK_LEVEL_TAGS_RE = new RegExp( '^('.concat(BLOCK_LEVEL_TAGS.join('|'), ')$'), 'i' ); // The removal is implemented as a blacklist and whitelist, this test finds // blacklisted elements that aren't whitelisted. We do this all in one // expression-both because it's only one pass, and because this skips the // serialization for whitelisted nodes. var candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|'); var CANDIDATES_BLACKLIST = new RegExp(candidatesBlacklist, 'i'); var candidatesWhitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|'); var CANDIDATES_WHITELIST = new RegExp(candidatesWhitelist, 'i'); function stripUnlikelyCandidates($) { // Loop through the provided document and remove any non-link nodes // that are unlikely candidates for article content. // // Links are ignored because there are very often links to content // that are identified as non-body-content, but may be inside // article-like content. 
// // :param $: a cheerio object to strip nodes from // :return $: the cleaned cheerio object $('*') .not('a') .each(function(index, node) { var $node = $(node); var classes = $node.attr('class'); var id = $node.attr('id'); if (!id && !classes) return; var classAndId = ''.concat(classes || '', ' ').concat(id || ''); if (CANDIDATES_WHITELIST.test(classAndId)) { return; } if (CANDIDATES_BLACKLIST.test(classAndId)) { $node.remove(); } }); return $; } // Another good candidate for refactoring/optimizing. // Very imperative code, I don't love it. - AP // Given cheerio object, convert consecutive
<br /> tags into // <p />
tags instead. // // :param $: A cheerio object function brsToPs$$1($) { var collapsing = false; $('br').each(function(index, element) { var $element = $(element); var nextElement = $element.next().get(0); if (nextElement && nextElement.tagName.toLowerCase() === 'br') { collapsing = true; $element.remove(); } else if (collapsing) { collapsing = false; // $(element).replaceWith('
<p></p>
') paragraphize(element, $, true); } }); return $; } // make sure it conforms to the constraints of a P tag (I.E. does // not contain any other block tags.) // // If the node is a <br />, it treats the following inline siblings // as if they were its children. // // :param node: The node to paragraphize; this is a raw node // :param $: The cheerio object to handle dom manipulation // :param br: Whether or not the passed node is a br function paragraphize(node, $) { var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false; var $node = $(node); if (br) { var sibling = node.nextSibling; var p = $('<p></p>
'); // while the next node is text or not a block level element // append it to a new p node while ( sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName)) ) { var _sibling = sibling, nextSibling = _sibling.nextSibling; $(sibling).appendTo(p); sibling = nextSibling; } $node.replaceWith(p); $node.remove(); return $; } return $; } function convertDivs($) { $('div').each(function(index, div) { var $div = $(div); var convertable = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0; if (convertable) { convertNodeTo$$1($div, $, 'p'); } }); return $; } function convertSpans($) { $('span').each(function(index, span) { var $span = $(span); var convertable = $span.parents('p, div').length === 0; if (convertable) { convertNodeTo$$1($span, $, 'p'); } }); return $; } // Loop through the provided doc, and convert any p-like elements to // actual paragraph tags. // // Things fitting this criteria: // * Multiple consecutive
<br /> tags. // * <div /> tags without block level elements inside of them // * <span /> tags who are not children of <p /> or <div />
tags. // // :param $: A cheerio object to search // :return cheerio object with new p elements // (By-reference mutation, though. Returned just for convenience.) function convertToParagraphs$$1($) { $ = brsToPs$$1($); $ = convertDivs($); $ = convertSpans($); return $; } function convertNodeTo$$1($node, $) { var tag = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p'; var node = $node.get(0); if (!node) { return $; } var attrs = getAttrs(node) || {}; // console.log(attrs) var attribString = _Reflect$ownKeys(attrs) .map(function(key) { return ''.concat(key, '=').concat(attrs[key]); }) .join(' '); var html; if ($.browser) { // In the browser, the contents of noscript tags aren't rendered, therefore // transforms on the noscript tag (commonly used for lazy-loading) don't work // as expected. This test case handles that html = node.tagName.toLowerCase() === 'noscript' ? $node.text() : $node.html(); } else { html = $node.contents(); } $node.replaceWith( '<' .concat(tag, ' ') .concat(attribString, '>') .concat(html, '') ); return $; } function cleanForHeight($img, $) { var height = _parseInt($img.attr('height'), 10); var width = _parseInt($img.attr('width'), 10) || 20; // Remove images that explicitly have very small heights or // widths, because they are most likely shims or icons, // which aren't very useful for reading. if ((height || 20) < 10 || width < 10) { $img.remove(); } else if (height) { // Don't ever specify a height on images, so that we can // scale with respect to width without screwing up the // aspect ratio. $img.removeAttr('height'); } return $; } // Cleans out images where the source string matches transparent/spacer/etc // TODO This seems very aggressive - AP function removeSpacers($img, $) { if (SPACER_RE.test($img.attr('src'))) { $img.remove(); } return $; } function cleanImages($article, $) { $article.find('img').each(function(index, img) { var $img = $(img); cleanForHeight($img, $); removeSpacers($img, $); }); return $; } function markToKeep(article, $, url) { var tags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : []; if (tags.length === 0) { tags = KEEP_SELECTORS; } if (url) { var _URL$parse = URL.parse(url), protocol = _URL$parse.protocol, hostname = _URL$parse.hostname; tags = [].concat(_toConsumableArray(tags), [ 'iframe[src^="'.concat(protocol, '//').concat(hostname, '"]'), ]); } $(tags.join(','), article).addClass(KEEP_CLASS); return $; } function stripJunkTags(article, $) { var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : []; if (tags.length === 0) { tags = STRIP_OUTPUT_TAGS; } // Remove matching elements, but ignore // any element with a class of mercury-parser-keep $(tags.join(','), article) .not('.'.concat(KEEP_CLASS)) .remove(); return $; } // by the title extractor instead. If there's less than 3 of them (<3), // strip them. Otherwise, turn 'em into H2s. 
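// For illustration only (exampleCleanHOnes is a made-up helper, not referenced
// elsewhere): loading a fragment and applying the h1 cleanup described above.
function exampleCleanHOnes(html) {
  var $ = cheerio.load(html);
  // With fewer than three h1s they are treated as stray titles and removed;
  // with three or more, each h1 is converted to an h2 instead.
  return cleanHOnes$$1($('body'), $);
}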
function cleanHOnes$$1(article, $) { var $hOnes = $('h1', article); if ($hOnes.length < 3) { $hOnes.each(function(index, node) { return $(node).remove(); }); } else { $hOnes.each(function(index, node) { convertNodeTo$$1($(node), $, 'h2'); }); } return $; } function removeAllButWhitelist($article, $) { $article.find('*').each(function(index, node) { var attrs = getAttrs(node); setAttrs( node, _Reflect$ownKeys(attrs).reduce(function(acc, attr) { if (WHITELIST_ATTRS_RE.test(attr)) { return _objectSpread({}, acc, _defineProperty({}, attr, attrs[attr])); } return acc; }, {}) ); }); // Remove the mercury-parser-keep class from result $('.'.concat(KEEP_CLASS), $article).removeClass(KEEP_CLASS); return $article; } // function removeAttrs(article, $) { // REMOVE_ATTRS.forEach((attr) => { // $(`[${attr}]`, article).removeAttr(attr); // }); // } // Remove attributes like style or align function cleanAttributes$$1($article, $) { // Grabbing the parent because at this point // $article will be wrapped in a div which will // have a score set on it. return removeAllButWhitelist( $article.parent().length ? $article.parent() : $article, $ ); } function removeEmpty($article, $) { $article.find('p').each(function(index, p) { var $p = $(p); if ($p.find('iframe, img').length === 0 && $p.text().trim() === '') $p.remove(); }); return $; } // // CONTENT FETCHING CONSTANTS //// // for a document. var NON_TOP_CANDIDATE_TAGS$1 = [ 'br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta', ]; var NON_TOP_CANDIDATE_TAGS_RE$1 = new RegExp( '^('.concat(NON_TOP_CANDIDATE_TAGS$1.join('|'), ')$'), 'i' ); // A list of selectors that specify, very clearly, either hNews or other // very content-specific style content, like Blogger templates. // More examples here: http://microformats.org/wiki/blog-post-formats var HNEWS_CONTENT_SELECTORS$1 = [ ['.hentry', '.entry-content'], ['entry', '.entry-content'], ['.entry', '.entry_content'], ['.post', '.postbody'], ['.post', '.post_body'], ['.post', '.post-body'], ]; var PHOTO_HINTS$1 = ['figure', 'photo', 'image', 'caption']; var PHOTO_HINTS_RE$1 = new RegExp(PHOTO_HINTS$1.join('|'), 'i'); // A list of strings that denote a positive scoring for this content as being // an article container. Checked against className and id. // // TODO: Perhaps have these scale based on their odds of being quality? var POSITIVE_SCORE_HINTS$1 = [ 'article', 'articlecontent', 'instapaper_body', 'blog', 'body', 'content', 'entry-content-asset', 'entry', 'hentry', 'main', 'Normal', 'page', 'pagination', 'permalink', 'post', 'story', 'text', '[-_]copy', // usatoday '\\Bcopy', ]; // The above list, joined into a matching regular expression var POSITIVE_SCORE_RE$1 = new RegExp(POSITIVE_SCORE_HINTS$1.join('|'), 'i'); // Readability publisher-specific guidelines var READABILITY_ASSET$1 = new RegExp('entry-content-asset', 'i'); // A list of strings that denote a negative scoring for this content as being // an article container. Checked against className and id. // // TODO: Perhaps have these scale based on their odds of being quality? 
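// Simplified illustration (exampleHintScore is hypothetical; getWeight below
// checks id and class separately and adds photo/Readability bonuses, but the
// core arithmetic is the same):
function exampleHintScore(classAndId) {
  var score = 0;
  if (POSITIVE_SCORE_RE$1.test(classAndId)) score += 25;
  if (NEGATIVE_SCORE_RE$1.test(classAndId)) score -= 25;
  return score;
}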
var NEGATIVE_SCORE_HINTS$1 = [ 'adbox', 'advert', 'author', 'bio', 'bookmark', 'bottom', 'byline', 'clear', 'com-', 'combx', 'comment', 'comment\\B', 'contact', 'copy', 'credit', 'crumb', 'date', 'deck', 'excerpt', 'featured', // tnr.com has a featured_content which throws us off 'foot', 'footer', 'footnote', 'graf', 'head', 'info', 'infotext', // newscientist.com copyright 'instapaper_ignore', 'jump', 'linebreak', 'link', 'masthead', 'media', 'meta', 'modal', 'outbrain', // slate.com junk 'promo', 'pr_', // autoblog - press release 'related', 'respond', 'roundcontent', // lifehacker restricted content warning 'scroll', 'secondary', 'share', 'shopping', 'shoutbox', 'side', 'sidebar', 'sponsor', 'stamp', 'sub', 'summary', 'tags', 'tools', 'widget', ]; // The above list, joined into a matching regular expression var NEGATIVE_SCORE_RE$1 = new RegExp(NEGATIVE_SCORE_HINTS$1.join('|'), 'i'); // Match a digit. Pretty clear. var PARAGRAPH_SCORE_TAGS$1 = new RegExp('^(p|li|span|pre)$', 'i'); var CHILD_CONTENT_TAGS$1 = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i'); var BAD_TAGS$1 = new RegExp('^(address|form)$', 'i'); function getWeight(node) { var classes = node.attr('class'); var id = node.attr('id'); var score = 0; if (id) { // if id exists, try to score on both positive and negative if (POSITIVE_SCORE_RE$1.test(id)) { score += 25; } if (NEGATIVE_SCORE_RE$1.test(id)) { score -= 25; } } if (classes) { if (score === 0) { // if classes exist and id did not contribute to score // try to score on both positive and negative if (POSITIVE_SCORE_RE$1.test(classes)) { score += 25; } if (NEGATIVE_SCORE_RE$1.test(classes)) { score -= 25; } } // even if score has been set by id, add score for // possible photo matches // "try to keep photos if we can" if (PHOTO_HINTS_RE$1.test(classes)) { score += 10; } // add 25 if class matches entry-content-asset, // a class apparently instructed for use in the // Readability publisher guidelines // https://www.readability.com/developers/guidelines if (READABILITY_ASSET$1.test(classes)) { score += 25; } } return score; } // returns the score of a node based on // the node's score attribute // returns null if no score set function getScore($node) { return _parseFloat($node.attr('score')) || null; } // return 1 for every comma in text function scoreCommas(text) { return (text.match(/,/g) || []).length; } var idkRe = new RegExp('^(p|pre)$', 'i'); function scoreLength(textLength) { var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p'; var chunks = textLength / 50; if (chunks > 0) { var lengthBonus; // No idea why p or pre are being tamped down here // but just following the source for now // Not even sure why tagName is included here, // since this is only being called from the context // of scoreParagraph if (idkRe.test(tagName)) { lengthBonus = chunks - 2; } else { lengthBonus = chunks - 1.25; } return Math.min(Math.max(lengthBonus, 0), 3); } return 0; } // commas, etc. Higher is better. function scoreParagraph$$1(node) { var score = 1; var text = node.text().trim(); var textLength = text.length; // If this paragraph is less than 25 characters, don't count it. if (textLength < 25) { return 0; } // Add points for any commas within this paragraph score += scoreCommas(text); // For every 50 characters in this paragraph, add another point. Up // to 3 points. 
score += scoreLength(textLength); // Articles can end with short paragraphs when people are being clever // but they can also end with short paragraphs setting up lists of junk // that we strip. This negative tweaks junk setup paragraphs just below // the cutoff threshold. if (text.slice(-1) === ':') { score -= 1; } return score; } function setScore($node, $, score) { $node.attr('score', score); return $node; } function addScore$$1($node, $, amount) { try { var score = getOrInitScore$$1($node, $) + amount; setScore($node, $, score); } catch (e) { // Ignoring; error occurs in scoreNode } return $node; } function addToParent$$1(node, $, score) { var parent = node.parent(); if (parent) { addScore$$1(parent, $, score * 0.25); } return node; } // if not, initializes a score based on // the node's tag type function getOrInitScore$$1($node, $) { var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true; var score = getScore($node); if (score) { return score; } score = scoreNode$$1($node); if (weightNodes) { score += getWeight($node); } addToParent$$1($node, $, score); return score; } // just scores based on tag. function scoreNode$$1($node) { var _$node$get = $node.get(0), tagName = _$node$get.tagName; // TODO: Consider ordering by most likely. // E.g., if divs are a more common tag on a page, // Could save doing that regex test on every node – AP if (PARAGRAPH_SCORE_TAGS$1.test(tagName)) { return scoreParagraph$$1($node); } if (tagName.toLowerCase() === 'div') { return 5; } if (CHILD_CONTENT_TAGS$1.test(tagName)) { return 3; } if (BAD_TAGS$1.test(tagName)) { return -3; } if (tagName.toLowerCase() === 'th') { return -5; } return 0; } function convertSpans$1($node, $) { if ($node.get(0)) { var _$node$get = $node.get(0), tagName = _$node$get.tagName; if (tagName === 'span') { // convert spans to divs convertNodeTo$$1($node, $, 'div'); } } } function addScoreTo($node, $, score) { if ($node) { convertSpans$1($node, $); addScore$$1($node, $, score); } } function scorePs($, weightNodes) { $('p, pre') .not('[score]') .each(function(index, node) { // The raw score for this paragraph, before we add any parent/child // scores. var $node = $(node); $node = setScore($node, $, getOrInitScore$$1($node, $, weightNodes)); var $parent = $node.parent(); var rawScore = scoreNode$$1($node); addScoreTo($parent, $, rawScore, weightNodes); if ($parent) { // Add half of the individual content score to the // grandparent addScoreTo($parent.parent(), $, rawScore / 2, weightNodes); } }); return $; } // score content. Parents get the full value of their children's // content score, grandparents half function scoreContent$$1($) { var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true; // First, look for special hNews based selectors and give them a big // boost, if they exist HNEWS_CONTENT_SELECTORS$1.forEach(function(_ref) { var _ref2 = _slicedToArray(_ref, 2), parentSelector = _ref2[0], childSelector = _ref2[1]; $(''.concat(parentSelector, ' ').concat(childSelector)).each(function( index, node ) { addScore$$1($(node).parent(parentSelector), $, 80); }); }); // Doubling this again // Previous solution caused a bug // in which parents weren't retaining // scores. This is not ideal, and // should be fixed. scorePs($, weightNodes); scorePs($, weightNodes); return $; } // it to see if any of them are decently scored. If they are, they // may be split parts of the content (Like two divs, a preamble and // a body.) 
Example: // http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14 function mergeSiblings($candidate, topScore, $) { if (!$candidate.parent().length) { return $candidate; } var siblingScoreThreshold = Math.max(10, topScore * 0.25); var wrappingDiv = $('<div></div>
'); $candidate .parent() .children() .each(function(index, sibling) { var $sibling = $(sibling); // Ignore tags like BR, HR, etc if (NON_TOP_CANDIDATE_TAGS_RE$1.test(sibling.tagName)) { return null; } var siblingScore = getScore($sibling); if (siblingScore) { if ($sibling.get(0) === $candidate.get(0)) { wrappingDiv.append($sibling); } else { var contentBonus = 0; var density = linkDensity($sibling); // If sibling has a very low link density, // give it a small bonus if (density < 0.05) { contentBonus += 20; } // If sibling has a high link density, // give it a penalty if (density >= 0.5) { contentBonus -= 20; } // If sibling node has the same class as // candidate, give it a bonus if ($sibling.attr('class') === $candidate.attr('class')) { contentBonus += topScore * 0.2; } var newScore = siblingScore + contentBonus; if (newScore >= siblingScoreThreshold) { return wrappingDiv.append($sibling); } if (sibling.tagName === 'p') { var siblingContent = $sibling.text(); var siblingContentLength = textLength(siblingContent); if (siblingContentLength > 80 && density < 0.25) { return wrappingDiv.append($sibling); } if ( siblingContentLength <= 80 && density === 0 && hasSentenceEnd(siblingContent) ) { return wrappingDiv.append($sibling); } } } } return null; }); if ( wrappingDiv.children().length === 1 && wrappingDiv .children() .first() .get(0) === $candidate.get(0) ) { return $candidate; } return wrappingDiv; } // candidate nodes we found and find the one with the highest score. function findTopCandidate$$1($) { var $candidate; var topScore = 0; $('[score]').each(function(index, node) { // Ignore tags like BR, HR, etc if (NON_TOP_CANDIDATE_TAGS_RE$1.test(node.tagName)) { return; } var $node = $(node); var score = getScore($node); if (score > topScore) { topScore = score; $candidate = $node; } }); // If we don't have a candidate, return the body // or whatever the first element is if (!$candidate) { return $('body') || $('*').first(); } $candidate = mergeSiblings($candidate, topScore, $); return $candidate; } // Scoring function removeUnlessContent($node, $, weight) { // Explicitly save entry-content-asset tags, which are // noted as valuable in the Publisher guidelines. For now // this works everywhere. We may want to consider making // this less of a sure-thing later. if ($node.hasClass('entry-content-asset')) { return; } var content = normalizeSpaces($node.text()); if (scoreCommas(content) < 10) { var pCount = $('p', $node).length; var inputCount = $('input', $node).length; // Looks like a form, too many inputs. if (inputCount > pCount / 3) { $node.remove(); return; } var contentLength = content.length; var imgCount = $('img', $node).length; // Content is too short, and there are no images, so // this is probably junk content. if (contentLength < 25 && imgCount === 0) { $node.remove(); return; } var density = linkDensity($node); // Too high of link density, is probably a menu or // something similar. // console.log(weight, density, contentLength) if (weight < 25 && density > 0.2 && contentLength > 75) { $node.remove(); return; } // Too high of a link density, despite the score being // high. if (weight >= 25 && density > 0.5) { // Don't remove the node if it's a list and the // previous sibling starts with a colon though. That // means it's probably content. 
var tagName = $node.get(0).tagName.toLowerCase(); var nodeIsList = tagName === 'ol' || tagName === 'ul'; if (nodeIsList) { var previousNode = $node.prev(); if ( previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':' ) { return; } } $node.remove(); return; } var scriptCount = $('script', $node).length; // Too many script tags, not enough content. if (scriptCount > 0 && contentLength < 150) { $node.remove(); } } } // Given an article, clean it of some superfluous content specified by // tags. Things like forms, ads, etc. // // Tags is an array of tag name's to search through. (like div, form, // etc) // // Return this same doc. function cleanTags$$1($article, $) { $(CLEAN_CONDITIONALLY_TAGS, $article).each(function(index, node) { var $node = $(node); // If marked to keep, skip it if ( $node.hasClass(KEEP_CLASS) || $node.find('.'.concat(KEEP_CLASS)).length > 0 ) return; var weight = getScore($node); if (!weight) { weight = getOrInitScore$$1($node, $); setScore($node, $, weight); } // drop node if its weight is < 0 if (weight < 0) { $node.remove(); } else { // deteremine if node seems like content removeUnlessContent($node, $, weight); } }); return $; } function cleanHeaders($article, $) { var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : ''; $(HEADER_TAG_LIST, $article).each(function(index, header) { var $header = $(header); // Remove any headers that appear before all other p tags in the // document. This probably means that it was part of the title, a // subtitle or something else extraneous like a datestamp or byline, // all of which should be handled by other metadata handling. if ($($header, $article).prevAll('p').length === 0) { return $header.remove(); } // Remove any headers that match the title exactly. if (normalizeSpaces($(header).text()) === title) { return $header.remove(); } // If this header has a negative weight, it's probably junk. // Get rid of it. if (getWeight($(header)) < 0) { return $header.remove(); } return $header; }); return $; } // html to avoid later complications with multiple body tags. function rewriteTopLevel$$1(article, $) { // I'm not using context here because // it's problematic when converting the // top-level/root node - AP $ = convertNodeTo$$1($('html'), $, 'div'); $ = convertNodeTo$$1($('body'), $, 'div'); return $; } function absolutize($, rootUrl, attr, $content) { $('['.concat(attr, ']'), $content).each(function(_, node) { var attrs = getAttrs(node); var url = attrs[attr]; if (url) { var absoluteUrl = URL.resolve(rootUrl, url); setAttr(node, attr, absoluteUrl); } }); } function makeLinksAbsolute$$1($content, $, url) { ['href', 'src'].forEach(function(attr) { return absolutize($, url, attr, $content); }); return $content; } function textLength(text) { return text.trim().replace(/\s+/g, ' ').length; } // Determines what percentage of the text // in a node is link text // Takes a node, returns a float function linkDensity($node) { var totalTextLength = textLength($node.text()); var linkText = $node.find('a').text(); var linkLength = textLength(linkText); if (totalTextLength > 0) { return linkLength / totalTextLength; } if (totalTextLength === 0 && linkLength > 0) { return 1; } return 0; } // search for, find a meta tag associated. function extractFromMeta$$1($, metaNames, cachedNames) { var cleanTags = arguments.length > 3 && arguments[3] !== undefined ? 
arguments[3] : true; var foundNames = metaNames.filter(function(name) { return cachedNames.indexOf(name) !== -1; }); // eslint-disable-next-line no-restricted-syntax var _iteratorNormalCompletion = true; var _didIteratorError = false; var _iteratorError = undefined; try { var _loop = function _loop() { var name = _step.value; var type = 'name'; var value = 'value'; var nodes = $('meta['.concat(type, '="').concat(name, '"]')); // Get the unique value of every matching node, in case there // are two meta tags with the same name and value. // Remove empty values. var values = nodes .map(function(index, node) { return $(node).attr(value); }) .toArray() .filter(function(text) { return text !== ''; }); // If we have more than one value for the same name, we have a // conflict and can't trust any of them. Skip this name. If we have // zero, that means our meta tags had no values. Skip this name // also. if (values.length === 1) { var metaValue; // Meta values that contain HTML should be stripped, as they // weren't subject to cleaning previously. if (cleanTags) { metaValue = stripTags(values[0], $); } else { var _values = _slicedToArray(values, 1); metaValue = _values[0]; } return { v: metaValue, }; } }; for ( var _iterator = _getIterator(foundNames), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true ) { var _ret = _loop(); if (_typeof(_ret) === 'object') return _ret.v; } // If nothing is found, return null } catch (err) { _didIteratorError = true; _iteratorError = err; } finally { try { if (!_iteratorNormalCompletion && _iterator.return != null) { _iterator.return(); } } finally { if (_didIteratorError) { throw _iteratorError; } } } return null; } function isGoodNode($node, maxChildren) { // If it has a number of children, it's more likely a container // element. Skip it. if ($node.children().length > maxChildren) { return false; } // If it looks to be within a comment, skip it. if (withinComment$$1($node)) { return false; } return true; } // Given a a list of selectors find content that may // be extractable from the document. This is for flat // meta-information, like author, title, date published, etc. function extractFromSelectors$$1($, selectors) { var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1; var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true; // eslint-disable-next-line no-restricted-syntax var _iteratorNormalCompletion = true; var _didIteratorError = false; var _iteratorError = undefined; try { for ( var _iterator = _getIterator(selectors), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true ) { var selector = _step.value; var nodes = $(selector); // If we didn't get exactly one of this selector, this may be // a list of articles or comments. Skip it. 
if (nodes.length === 1) { var $node = $(nodes[0]); if (isGoodNode($node, maxChildren)) { var content = void 0; if (textOnly) { content = $node.text(); } else { content = $node.html(); } if (content) { return content; } } } } } catch (err) { _didIteratorError = true; _iteratorError = err; } finally { try { if (!_iteratorNormalCompletion && _iterator.return != null) { _iterator.return(); } } finally { if (_didIteratorError) { throw _iteratorError; } } } return null; } // strips all tags from a string of text function stripTags(text, $) { // Wrapping text in html element prevents errors when text // has no html var cleanText = $(''.concat(text, '')).text(); return cleanText === '' ? text : cleanText; } function withinComment$$1($node) { var parents = $node.parents().toArray(); var commentParent = parents.find(function(parent) { var attrs = getAttrs(parent); var nodeClass = attrs.class, id = attrs.id; var classAndId = ''.concat(nodeClass, ' ').concat(id); return classAndId.includes('comment'); }); return commentParent !== undefined; } // Given a node, determine if it's article-like enough to return // param: node (a cheerio node) // return: boolean function nodeIsSufficient($node) { return $node.text().trim().length >= 100; } function isWordpress($) { return $(IS_WP_SELECTOR).length > 0; } function getAttrs(node) { var attribs = node.attribs, attributes = node.attributes; if (!attribs && attributes) { var attrs = _Reflect$ownKeys(attributes).reduce(function(acc, index) { var attr = attributes[index]; if (!attr.name || !attr.value) return acc; acc[attr.name] = attr.value; return acc; }, {}); return attrs; } return attribs; } function setAttr(node, attr, val) { if (node.attribs) { node.attribs[attr] = val; } else if (node.attributes) { node.setAttribute(attr, val); } return node; } function setAttrs(node, attrs) { if (node.attribs) { node.attribs = attrs; } else if (node.attributes) { while (node.attributes.length > 0) { node.removeAttribute(node.attributes[0].name); } _Reflect$ownKeys(attrs).forEach(function(key) { node.setAttribute(key, attrs[key]); }); } return node; } // DOM manipulation var IS_LINK = new RegExp('https?://', 'i'); var IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i'); var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(','); // lazy loaded images into normal images. // Many sites will have img tags with no source, or an image tag with a src // attribute that a is a placeholer. We need to be able to properly fill in // the src attribute so the images are no longer lazy loaded. function convertLazyLoadedImages($) { $('img').each(function(_, img) { var attrs = getAttrs(img); _Reflect$ownKeys(attrs).forEach(function(attr) { var value = attrs[attr]; if (attr !== 'src' && IS_LINK.test(value) && IS_IMAGE.test(value)) { $(img).attr('src', value); } }); }); return $; } function isComment(index, node) { return node.type === 'comment'; } function cleanComments($) { $.root() .find('*') .contents() .filter(isComment) .remove(); return $; } function clean($) { $(TAGS_TO_REMOVE).remove(); $ = cleanComments($); return $; } var Resource = { // Create a Resource. // // :param url: The URL for the document we should retrieve. // :param response: If set, use as the response rather than // attempting to fetch it ourselves. Expects a // string. 
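// Illustrative call shape (the URL is a placeholder):
//
//   Resource.create('https://example.com/some-article').then(function($) {
//     // on success, $ is a cheerio instance with meta tags normalized,
//     // lazy-loaded images fixed up, and scripts/styles/comments removed;
//     // on a failed fetch the resolved value carries { failed: true } instead
//   });
//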
create: function create(url, preparedResponse, parsedUrl) { var _this = this; return _asyncToGenerator( /*#__PURE__*/ _regeneratorRuntime.mark(function _callee() { var result, validResponse; return _regeneratorRuntime.wrap( function _callee$(_context) { while (1) { switch ((_context.prev = _context.next)) { case 0: if (!preparedResponse) { _context.next = 5; break; } validResponse = { statusMessage: 'OK', statusCode: 200, headers: { 'content-type': 'text/html', 'content-length': 500, }, }; result = { body: preparedResponse, response: validResponse, }; _context.next = 8; break; case 5: _context.next = 7; return fetchResource(url, parsedUrl); case 7: result = _context.sent; case 8: if (!result.error) { _context.next = 11; break; } result.failed = true; return _context.abrupt('return', result); case 11: return _context.abrupt('return', _this.generateDoc(result)); case 12: case 'end': return _context.stop(); } } }, _callee, this ); }) )(); }, generateDoc: function generateDoc(_ref) { var content = _ref.body, response = _ref.response; var contentType = response.headers['content-type']; // TODO: Implement is_text function from // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57 if (!contentType.includes('html') && !contentType.includes('text')) { throw new Error('Content does not appear to be text.'); } var $ = this.encodeDoc({ content: content, contentType: contentType, }); if ($.root().children().length === 0) { throw new Error('No children, likely a bad parse.'); } $ = normalizeMetaTags($); $ = convertLazyLoadedImages($); $ = clean($); return $; }, encodeDoc: function encodeDoc(_ref2) { var content = _ref2.content, contentType = _ref2.contentType; var encoding = getEncoding(contentType); var decodedContent = iconv.decode(content, encoding); var $ = cheerio.load(decodedContent); // after first cheerio.load, check to see if encoding matches var metaContentType = $('meta[http-equiv=content-type]').attr('content'); var properEncoding = getEncoding(metaContentType); // if encodings in the header/body dont match, use the one in the body if (properEncoding !== encoding) { decodedContent = iconv.decode(content, properEncoding); $ = cheerio.load(decodedContent); } return $; }, }; var merge = function merge(extractor, domains) { return domains.reduce(function(acc, domain) { acc[domain] = extractor; return acc; }, {}); }; function mergeSupportedDomains(extractor) { return extractor.supportedDomains ? merge( extractor, [extractor.domain].concat( _toConsumableArray(extractor.supportedDomains) ) ) : merge(extractor, [extractor.domain]); } var BloggerExtractor = { domain: 'blogspot.com', content: { // Blogger is insane and does not load its content // initially in the page, but it's all there // in noscript selectors: ['.post-content noscript'], // Selectors to remove from the extracted content clean: [], // Convert the noscript tag to a div transforms: { noscript: 'div', }, }, author: { selectors: ['.post-author-name'], }, title: { selectors: ['.post h2.title'], }, date_published: { selectors: ['span.publishdate'], }, }; var NYMagExtractor = { domain: 'nymag.com', content: { // Order by most likely. Extractor will stop on first occurrence selectors: ['div.article-content', 'section.body', 'article.article'], // Selectors to remove from the extracted content clean: ['.ad', '.single-related-story'], // Object of tranformations to make on matched elements // Each key is the selector, each value is the tag to // transform to. 
// If a function is given, it should return a string // to convert to or nothing (in which case it will not perform // the transformation. transforms: { // Convert h1s to h2s h1: 'h2', // Convert lazy-loaded noscript images to figures noscript: function noscript($node, $) { var $children = $.browser ? $($node.text()) : $node.children(); if ( $children.length === 1 && $children.get(0) !== undefined && $children.get(0).tagName.toLowerCase() === 'img' ) { return 'figure'; } return null; }, }, }, title: { selectors: ['h1.lede-feature-title', 'h1.headline-primary', 'h1'], }, author: { selectors: ['.by-authors', '.lede-feature-author'], }, dek: { selectors: ['.lede-feature-teaser'], }, date_published: { selectors: [ ['time.article-timestamp[datetime]', 'datetime'], 'time.article-timestamp', ], }, }; var WikipediaExtractor = { domain: 'wikipedia.org', content: { selectors: ['#mw-content-text'], defaultCleaner: false, // transform top infobox to an image with caption transforms: { '.infobox img': function infoboxImg($node) { var $parent = $node.parents('.infobox'); // Only prepend the first image in .infobox if ($parent.children('img').length === 0) { $parent.prepend($node); } }, '.infobox caption': 'figcaption', '.infobox': 'figure', }, // Selectors to remove from the extracted content clean: [ '.mw-editsection', 'figure tr, figure td, figure tbody', '#toc', '.navbox', ], }, author: 'Wikipedia Contributors', title: { selectors: ['h2.title'], }, date_published: { selectors: ['#footer-info-lastmod'], }, }; var TwitterExtractor = { domain: 'twitter.com', content: { transforms: { // We're transforming essentially the whole page here. // Twitter doesn't have nice selectors, so our initial // selector grabs the whole page, then we're re-writing // it to fit our needs before we clean it up. '.permalink[role=main]': function permalinkRoleMain($node, $) { var tweets = $node.find('.tweet'); var $tweetContainer = $('
<div></div>
'); $tweetContainer.append(tweets); $node.replaceWith($tweetContainer); }, // Twitter wraps @ with s, which // renders as a strikethrough s: 'span', }, selectors: ['.permalink[role=main]'], defaultCleaner: false, clean: ['.stream-item-footer', 'button', '.tweet-details-fixer'], }, author: { selectors: ['.tweet.permalink-tweet .username'], }, date_published: { selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']], }, }; var NYTimesExtractor = { domain: 'www.nytimes.com', title: { selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline'], }, author: { selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline'], }, content: { selectors: ['div.g-blocks', 'article#story'], transforms: { 'img.g-lazy': function imgGLazy($node) { var src = $node.attr('src'); // const widths = $node.attr('data-widths') // .slice(1) // .slice(0, -1) // .split(','); // if (widths.length) { // width = widths.slice(-1); // } else { // width = '900'; // } var width = 640; src = src.replace('{{size}}', width); $node.attr('src', src); }, }, clean: [ '.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden', '.comments', '.supplemental', '.nocontent', '.story-footer-links', ], }, date_published: { selectors: [['meta[name="article:published"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: null, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication var TheAtlanticExtractor = { domain: 'www.theatlantic.com', title: { selectors: ['h1.hed'], }, author: { selectors: ['article#article .article-cover-extra .metadata .byline a'], }, content: { selectors: [ ['.article-cover figure.lead-img', '.article-body'], '.article-body', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.partner-box', '.callout'], }, date_published: { selectors: [['time[itemProp="datePublished"]', 'datetime']], }, lead_image_url: null, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var NewYorkerExtractor = { domain: 'www.newyorker.com', title: { selectors: ['h1.title'], }, author: { selectors: ['.contributors'], }, content: { selectors: ['div#articleBody', 'div.articleBody'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: [], }, date_published: { selectors: [ ['meta[name="article:published_time"]', 'value'], ['time[itemProp="datePublished"]', 'content'], ], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: ['.dek', 'h2.dek'], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var WiredExtractor = { domain: 'www.wired.com', title: { selectors: ['h1.post-title'], }, author: { selectors: ['a[rel="author"]'], }, content: { selectors: ['article.content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.visually-hidden', 'figcaption img.photo'], }, date_published: { selectors: [['meta[itemprop="datePublished"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var MSNExtractor = { domain: 'www.msn.com', title: { selectors: ['h1'], }, author: { selectors: ['span.authorname-txt'], }, content: { selectors: ['div.richtext'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['span.caption'], }, date_published: { selectors: ['span.time'], }, lead_image_url: { selectors: [], }, dek: { selectors: [], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var YahooExtractor = { domain: 'www.yahoo.com', title: { selectors: ['header.canvas-header'], }, author: { selectors: ['span.provider-name'], }, content: { selectors: [ // enter content selectors '.content-canvas', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.figure-caption'], }, date_published: { selectors: [['time.date[datetime]', 'datetime']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [ // enter dek selectors ], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var BuzzfeedExtractor = { domain: 'www.buzzfeed.com', title: { selectors: ['h1[id="post-title"]'], }, author: { selectors: ['a[data-action="user/username"]', 'byline__author'], }, content: { selectors: [ ['.longform_custom_header_media', '#buzz_sub_buzz'], '#buzz_sub_buzz', ], defaultCleaner: false, // Is there anything in the content you selected that needs transformed // before it's consumable content? 
E.g., unusual lazy loaded images transforms: { h2: 'b', 'div.longform_custom_header_media': function divLongform_custom_header_media( $node ) { if ($node.has('img') && $node.has('.longform_header_image_source')) { return 'figure'; } return null; }, 'figure.longform_custom_header_media .longform_header_image_source': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [ '.instapaper_ignore', '.suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline', '.share-box', '.print', ], }, date_published: { selectors: ['.buzz-datetime'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var WikiaExtractor = { domain: 'fandom.wikia.com', title: { selectors: ['h1.entry-title'], }, author: { selectors: ['.author vcard', '.fn'], }, content: { selectors: ['.grid-content', '.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var LittleThingsExtractor = { domain: 'www.littlethings.com', title: { selectors: ['h1.post-title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, content: { selectors: [ // enter content selectors '.mainContentIntro', '.content-wrapper', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, next_page_url: null, excerpt: null, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var PoliticoExtractor = { domain: 'www.politico.com', title: { selectors: [ // enter title selectors ['meta[name="og:title"]', 'value'], ], }, author: { selectors: ['.story-main-content .byline .vcard'], }, content: { selectors: [ // enter content selectors '.story-main-content', '.content-group', '.story-core', '.story-text', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? 
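// A rough sketch of how a `transforms` entry such as Buzzfeed's above is
// consumed (the `applyTransform` helper is hypothetical, not part of this
// bundle): a string value renames the matched element's tag, while a function
// receives the cheerio-wrapped node and may either mutate it in place or
// return a new tag name (returning null leaves the node untouched).
function applyTransform($, node, transform) {
  var $node = $(node);
  var tag = typeof transform === 'string' ? transform : transform($node, $);
  if (typeof tag === 'string') {
    // e.g. `h2: 'b'` -- swap the tag, keep the inner markup
    $node.replaceWith('<' + tag + '>' + $node.html() + '</' + tag + '>');
  }
}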
// The clean selectors will remove anything that matches from // the result clean: ['figcaption'], }, date_published: { selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']], }, lead_image_url: { selectors: [ // enter lead_image_url selectors ['meta[name="og:image"]', 'value'], ], }, dek: { selectors: [], }, next_page_url: null, excerpt: null, }; var DeadspinExtractor = { domain: 'deadspin.com', supportedDomains: [ 'jezebel.com', 'lifehacker.com', 'kotaku.com', 'gizmodo.com', 'jalopnik.com', 'kinja.com', ], title: { selectors: ['h1.headline'], }, author: { selectors: ['.author'], }, content: { selectors: ['.post-content', '.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { 'iframe.lazyload[data-recommend-id^="youtube://"]': function iframeLazyloadDataRecommendIdYoutube( $node ) { var youtubeId = $node.attr('id').split('youtube-')[1]; $node.attr('src', 'https://www.youtube.com/embed/'.concat(youtubeId)); }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.magnifier', '.lightbox'], }, date_published: { selectors: [['time.updated[datetime]', 'datetime']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [ // enter selectors ], }, next_page_url: { selectors: [ // enter selectors ], }, excerpt: { selectors: [ // enter selectors ], }, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var BroadwayWorldExtractor = { domain: 'www.broadwayworld.com', title: { selectors: ['h1.article-title'], }, author: { selectors: ['span[itemprop=author]'], }, content: { selectors: ['div[itemprop=articlebody]'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, date_published: { selectors: [['meta[itemprop=datePublished]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [], }, next_page_url: { selectors: [ // enter selectors ], }, excerpt: { selectors: [ // enter selectors ], }, }; // Rename CustomExtractor // to fit your publication // (e.g., NYTimesExtractor) var ApartmentTherapyExtractor = { domain: 'www.apartmenttherapy.com', title: { selectors: ['h1.headline'], }, author: { selectors: ['.PostByline__name'], }, content: { selectors: ['div.post__content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { 'div[data-render-react-id="images/LazyPicture"]': function divDataRenderReactIdImagesLazyPicture( $node, $ ) { var data = JSON.parse($node.attr('data-props')); var src = data.sources[0].src; var $img = $('').attr('src', src); $node.replaceWith($img); }, }, // Is there anything that is in the result that shouldn't be? 
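// `supportedDomains`, as on the Deadspin/Kinja extractor above, lets one
// extractor serve several hostnames. A registry keyed by hostname might be
// assembled roughly like this (illustrative helper, assumed shape):
function registerExtractor(registry, extractor) {
  [extractor.domain]
    .concat(extractor.supportedDomains || [])
    .forEach(function(domain) {
      registry[domain] = extractor;
    });
  return registry;
}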
// The clean selectors will remove anything that matches from // the result clean: [], }, date_published: { selectors: [['.PostByline__timestamp[datetime]', 'datetime']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [], }, next_page_url: { selectors: [ // enter selectors ], }, excerpt: { selectors: [ // enter selectors ], }, }; var MediumExtractor = { domain: 'medium.com', supportedDomains: ['trackchanges.postlight.com'], title: { selectors: ['h1'], }, author: { selectors: [['meta[name="author"]', 'value']], }, content: { selectors: [ ['.section-content'], '.section-content', 'article > div > section', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { // Re-write lazy-loaded youtube videos iframe: function iframe($node) { var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//; var thumb = decodeURIComponent($node.attr('data-thumbnail')); if (ytRe.test(thumb)) { var _thumb$match = thumb.match(ytRe), _thumb$match2 = _slicedToArray(_thumb$match, 2), _ = _thumb$match2[0], youtubeId = _thumb$match2[1]; // eslint-disable-line $node.attr('src', 'https://www.youtube.com/embed/'.concat(youtubeId)); var $parent = $node.parents('figure'); var $caption = $parent.find('figcaption'); $parent.empty().append([$node, $caption]); } }, // rewrite figures to pull out image and caption, remove rest figure: function figure($node) { // ignore if figure has an iframe if ($node.find('iframe').length > 0) return; var $img = $node.find('img').slice(-1)[0]; var $caption = $node.find('figcaption'); $node.empty().append([$img, $caption]); }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, date_published: { selectors: [['time[datetime]', 'datetime']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, dek: { selectors: [ // enter selectors ], }, next_page_url: { selectors: [ // enter selectors ], }, excerpt: { selectors: [ // enter selectors ], }, }; var WwwTmzComExtractor = { domain: 'www.tmz.com', title: { selectors: ['.post-title-breadcrumb', 'h1', '.headline'], }, author: 'TMZ STAFF', date_published: { selectors: ['.article-posted-date'], timezone: 'America/Los_Angeles', }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article-content', '.all-post-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.lightbox-link'], }, }; var WwwWashingtonpostComExtractor = { domain: 'www.washingtonpost.com', title: { selectors: ['h1', '#topper-headline-wrapper'], }, author: { selectors: ['.pb-byline'], }, date_published: { selectors: [['.pb-timestamp[itemprop="datePublished"]', 'content']], }, dek: { selectors: [], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? 
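// The Medium `figure` transform above strips Medium's wrapper markup by
// keeping only the last <img> and the <figcaption>. The same pattern in
// isolation, assuming a cheerio-wrapped figure (illustrative only):
function simplifyFigure($figure) {
  if ($figure.find('iframe').length > 0) return; // leave embeds untouched
  var img = $figure.find('img').slice(-1)[0];
  var $caption = $figure.find('figcaption');
  $figure.empty().append([img, $caption]);
}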
E.g., unusual lazy loaded images transforms: { 'div.inline-content': function divInlineContent($node) { if ($node.has('img,iframe,video').length > 0) { return 'figure'; } $node.remove(); return null; }, '.pb-caption': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.interstitial-link', '.newsletter-inline-unit'], }, }; var WwwHuffingtonpostComExtractor = { domain: 'www.huffingtonpost.com', title: { selectors: ['h1.headline__title'], }, author: { selectors: ['span.author-card__details__name'], }, date_published: { selectors: [ ['meta[name="article:modified_time"]', 'value'], ['meta[name="article:published_time"]', 'value'], ], }, dek: { selectors: ['h2.headline__subtitle'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.entry__body'], defaultCleaner: false, // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { // 'div.top-media': ($node) => { // const $figure = $node.children('figure'); // $node.replaceWith($figure); // }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [ '.pull-quote', '.tag-cloud', '.embed-asset', '.below-entry', '.entry-corrections', '#suggested-story', ], }, }; var NewrepublicComExtractor = { domain: 'newrepublic.com', title: { selectors: ['h1.article-headline', '.minutes-primary h1.minute-title'], }, author: { selectors: ['div.author-list', '.minutes-primary h3.minute-byline'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], timezone: 'America/New_York', }, dek: { selectors: ['h2.article-subhead'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['.article-cover', 'div.content-body'], ['.minute-image', '.minutes-primary div.content-body'], ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['aside'], }, }; var MoneyCnnComExtractor = { domain: 'money.cnn.com', title: { selectors: ['.article-title'], }, author: { selectors: ['.byline a'], }, date_published: { selectors: [['meta[name="date"]', 'value']], timezone: 'GMT', }, dek: { selectors: ['#storytext h2'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['#storytext'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: ['.inStoryHeading'], }, }; var WwwThevergeComExtractor = { domain: 'www.theverge.com', supportedDomains: ['www.polygon.com'], title: { selectors: ['h1'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['h2.p-dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ // feature template multi-match ['.c-entry-hero .e-image', '.c-entry-intro', '.c-entry-content'], // regular post multi-match ['.e-image--hero', '.c-entry-content'], // feature template fallback '.l-wrapper .l-feature', // regular post fallback 'div.c-entry-content', ], // Transform lazy-loaded images transforms: { noscript: function noscript($node) { var $children = $node.children(); if ($children.length === 1 && $children.get(0).tagName === 'img') { return 'span'; } return null; }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.aside', 'img.c-dynamic-image'], }, }; var WwwCnnComExtractor = { domain: 'www.cnn.com', title: { selectors: ['h1.pg-headline', 'h1'], }, author: { selectors: ['.metadata__byline__author'], }, date_published: { selectors: [['meta[name="pubdate"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ // a more specific selector to grab the lead image and the body ['.media__video--thumbnail', '.zn-body-text'], // a fallback for the above '.zn-body-text', 'div[itemprop="articleBody"]', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.zn-body__paragraph, .el__leafmedia--sourced-paragraph': function znBody__paragraphEl__leafmediaSourcedParagraph( $node ) { var $text = $node.html(); if ($text) { return 'p'; } return null; }, // this transform cleans the short, all-link sections linking // to related content but not marked as such in any way. '.zn-body__paragraph': function znBody__paragraph($node) { if ($node.has('a')) { if ( $node.text().trim() === $node .find('a') .text() .trim() ) { $node.remove(); } } }, '.media__video--thumbnail': 'figure', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwAolComExtractor = { domain: 'www.aol.com', title: { selectors: ['h1.p-article__title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: ['.p-article__byline__date'], timezone: 'America/New_York', }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
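// The CNN '.zn-body__paragraph' transform above removes paragraphs that are
// nothing but links to related stories: when the paragraph's text equals the
// combined text of its anchors, it is dropped. Stated on its own
// (illustrative, assuming a cheerio node):
function dropLinkOnlyParagraph($node) {
  if ($node.find('a').length === 0) return;
  var paragraphText = $node.text().trim();
  var anchorText = $node
    .find('a')
    .text()
    .trim();
  if (paragraphText === anchorText) {
    $node.remove();
  }
}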
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwYoutubeComExtractor = { domain: 'www.youtube.com', title: { selectors: ['.watch-title', 'h1.watch-title-container'], }, author: { selectors: ['.yt-user-info'], }, date_published: { selectors: [['meta[itemProp="datePublished"]', 'value']], timezone: 'GMT', }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { defaultCleaner: false, selectors: [['#player-api', '#eow-description']], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '#player-api': function playerApi($node, $) { var videoId = $('meta[itemProp="videoId"]').attr('value'); $node.html( '\n    <iframe src="https://www.youtube.com/embed/'.concat( videoId, '" frameborder="0" allowfullscreen></iframe>' ) ); }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwTheguardianComExtractor = { domain: 'www.theguardian.com', title: { selectors: ['.content__headline'], }, author: { selectors: ['p.byline'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['.content__standfirst'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.content__article-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.hide-on-mobile', '.inline-icon'], }, }; var WwwSbnationComExtractor = { domain: 'www.sbnation.com', title: { selectors: ['h1.c-page-title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['h2.c-entry-summary.p-dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.c-entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwBloombergComExtractor = { domain: 'www.bloomberg.com', title: { selectors: [ // normal articles '.lede-headline', // /graphics/ template 'h1.article-title', // /news/ template 'h1.lede-text-only__hed', ], }, author: { selectors: [ ['meta[name="parsely-author"]', 'value'], '.byline-details__link', // /graphics/ template '.bydek', // /news/ template '.author', ], }, date_published: { selectors: [ ['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value'], ], }, dek: { selectors: [], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ '.article-body__content', // /graphics/ template ['section.copy-block'], // /news/ template '.body-copy', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: ['.inline-newsletter', '.page-ad'], }, }; var WwwBustleComExtractor = { domain: 'www.bustle.com', title: { selectors: ['h1.post-page__title'], }, author: { selectors: ['div.content-meta__author'], }, date_published: { selectors: [['time.content-meta__published-date[datetime]', 'datetime']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.post-page__body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwNprOrgExtractor = { domain: 'www.npr.org', title: { selectors: ['h1', '.storytitle'], }, author: { selectors: ['p.byline__name.byline__name--block'], }, date_published: { selectors: [ ['.dateblock time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ], }, lead_image_url: { selectors: [ ['meta[name="og:image"]', 'value'], ['meta[name="twitter:image:src"]', 'value'], ], }, content: { selectors: ['.storytext'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.bucketwrap.image': 'figure', '.bucketwrap.image .credit-caption': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['div.enlarge_measure'], }, }; var WwwRecodeNetExtractor = { domain: 'www.recode.net', title: { selectors: ['h1.c-page-title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['h2.c-entry-summary.p-dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['figure.e-image--hero', '.c-entry-content'], '.c-entry-content', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var QzComExtractor = { domain: 'qz.com', title: { selectors: ['header.item-header.content-width-responsive'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: ['.timestamp'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['figure.featured-image', '.item-body'], '.item-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
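// The recurring `clean` arrays are selectors whose matches are removed from
// the extracted content after selection, e.g. NPR's 'div.enlarge_measure'
// above. A minimal sketch of that pass (hypothetical helper, not from this
// bundle):
function applyCleanSelectors($content, cleanSelectors) {
  (cleanSelectors || []).forEach(function(selector) {
    $content.find(selector).remove();
  });
  return $content;
}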
// The clean selectors will remove anything that matches from // the result clean: ['.article-aside', '.progressive-image-thumbnail'], }, }; var WwwDmagazineComExtractor = { domain: 'www.dmagazine.com', title: { selectors: ['h1.story__title'], }, author: { selectors: ['.story__info .story__info__item:first-child'], }, date_published: { selectors: [ // enter selectors '.story__info', ], timezone: 'America/Chicago', }, dek: { selectors: ['.story__subhead'], }, lead_image_url: { selectors: [['article figure a:first-child', 'href']], }, content: { selectors: ['.story__content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwReutersComExtractor = { domain: 'www.reuters.com', title: { selectors: ['h1.article-headline'], }, author: { selectors: ['.author'], }, date_published: { selectors: [['meta[name="og:article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['#article-text'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.article-subtitle': 'h4', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['#article-byline .author'], }, }; var MashableComExtractor = { domain: 'mashable.com', title: { selectors: ['h1.title'], }, author: { selectors: ['span.author_name a'], }, date_published: { selectors: [['meta[name="og:article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['section.article-content.blueprint'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.image-credit': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwChicagotribuneComExtractor = { domain: 'www.chicagotribune.com', title: { selectors: ['h1.trb_ar_hl_t'], }, author: { selectors: ['span.trb_ar_by_nm_au'], }, date_published: { selectors: [['meta[itemprop="datePublished"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.trb_ar_page'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwVoxComExtractor = { domain: 'www.vox.com', title: { selectors: ['h1.c-page-title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['.p-dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['figure.e-image--hero', '.c-entry-content'], '.c-entry-content', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? 
E.g., unusual lazy loaded images transforms: { 'figure .e-image__image noscript': function figureEImage__imageNoscript( $node ) { var imgHtml = $node.html(); $node .parents('.e-image__image') .find('.c-dynamic-image') .replaceWith(imgHtml); }, 'figure .e-image__meta': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var NewsNationalgeographicComExtractor = { domain: 'news.nationalgeographic.com', title: { selectors: ['h1', 'h1.main-title'], }, author: { selectors: ['.byline-component__contributors b span'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], format: 'ddd MMM DD HH:mm:ss zz YYYY', timezone: 'EST', }, dek: { selectors: ['.article__deck'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.parsys.content', '.__image-lead__'], '.content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.parsys.content': function parsysContent($node, $) { var $imgSrc = $node .find('.image.parbase.section') .find('.picturefill') .first() .data('platform-src'); if ($imgSrc) { $node.prepend( $('')) ); } }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.pull-quote.pull-quote--large'], }, }; var WwwNationalgeographicComExtractor = { domain: 'www.nationalgeographic.com', title: { selectors: ['h1', 'h1.main-title'], }, author: { selectors: ['.byline-component__contributors b span'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['.article__deck'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.parsys.content', '.__image-lead__'], '.content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.parsys.content': function parsysContent($node, $) { var $imageParent = $node.children().first(); if ($imageParent.hasClass('imageGroup')) { var $dataAttrContainer = $imageParent .find('.media--medium__container') .children() .first(); var imgPath1 = $dataAttrContainer.data('platform-image1-path'); var imgPath2 = $dataAttrContainer.data('platform-image2-path'); if (imgPath2 && imgPath1) { $node.prepend( $( '
<div class="__image-lead__">\n              <img src="'.concat(imgPath1, '"/>\n              <img src="').concat(imgPath2, '"/>\n            </div>
') ) ); } } else { var $imgSrc = $node .find('.image.parbase.section') .find('.picturefill') .first() .data('platform-src'); if ($imgSrc) { $node.prepend( $('')) ); } } }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.pull-quote.pull-quote--small'], }, }; var WwwLatimesComExtractor = { domain: 'www.latimes.com', title: { selectors: ['.trb_ar_hl'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[itemprop="datePublished"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.trb_ar_main'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.trb_ar_la': function trb_ar_la($node) { var $figure = $node.find('figure'); $node.replaceWith($figure); }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.trb_ar_by', '.trb_ar_cr'], }, }; var PagesixComExtractor = { domain: 'pagesix.com', supportedDomains: ['nypost.com'], title: { selectors: ['h1 a'], }, author: { selectors: ['.byline'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: [['meta[name="description"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['#featured-image-wrapper', '.entry-content'], '.entry-content', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '#featured-image-wrapper': 'figure', '.wp-caption-text': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.modal-trigger'], }, }; var ThefederalistpapersOrgExtractor = { domain: 'thefederalistpapers.org', title: { selectors: ['h1.entry-title'], }, author: { selectors: ['main span.entry-author-name'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [['p[style]']], }, }; var WwwCbssportsComExtractor = { domain: 'www.cbssports.com', title: { selectors: ['.article-headline'], }, author: { selectors: ['.author-name'], }, date_published: { selectors: [['.date-original-reading-time time', 'datetime']], timezone: 'UTC', }, dek: { selectors: ['.article-subline'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
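// The National Geographic transforms above recover lazy-loaded lead images
// from data attributes (e.g. 'platform-src') and prepend a plain <img> so the
// image survives later cleaning. The underlying pattern, shown standalone
// (illustrative; the selector and data key are parameters here):
function prependDataSrcImage($node, selector, dataKey) {
  var src = $node
    .find(selector)
    .first()
    .data(dataKey);
  if (src) {
    $node.prepend('<img class="__image-lead__" src="' + src + '" />');
  }
}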
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwMsnbcComExtractor = { domain: 'www.msnbc.com', title: { selectors: ['h1', 'h1.is-title-pane'], }, author: { selectors: ['.author'], }, date_published: { selectors: [['meta[name="DC.date.issued"]', 'value']], }, dek: { selectors: [['meta[name="description"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.pane-node-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.pane-node-body': function paneNodeBody($node, $) { var _WwwMsnbcComExtractor = _slicedToArray( WwwMsnbcComExtractor.lead_image_url.selectors[0], 2 ), selector = _WwwMsnbcComExtractor[0], attr = _WwwMsnbcComExtractor[1]; var src = $(selector).attr(attr); if (src) { $node.prepend('<img src="'.concat(src, '" />')); } }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwThepoliticalinsiderComExtractor = { domain: 'www.thepoliticalinsider.com', title: { selectors: [['meta[name="sailthru.title"]', 'value']], }, author: { selectors: [['meta[name="sailthru.author"]', 'value']], }, date_published: { selectors: [['meta[name="sailthru.date"]', 'value']], timezone: 'America/New_York', }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div#article-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwMentalflossComExtractor = { domain: 'www.mentalfloss.com', title: { selectors: ['h1.title', '.title-group', '.inner'], }, author: { selectors: ['.field-name-field-enhanced-authors'], }, date_published: { selectors: ['.date-display-single'], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.field.field-name-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var AbcnewsGoComExtractor = { domain: 'abcnews.go.com', title: { selectors: ['.article-header h1'], }, author: { selectors: ['.authors'], clean: ['.author-overlay', '.by-text'], }, date_published: { selectors: ['.timestamp'], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article-copy'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
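// The MSNBC '.pane-node-body' transform above reuses the extractor's own
// lead_image_url selector pair to inject the lead image at the top of the
// article body. The same idea written generically (illustrative helper name):
function prependLeadImage($, $body, leadImageSelectors) {
  var pair = leadImageSelectors[0]; // e.g. ['meta[name="og:image"]', 'value']
  var src = $(pair[0]).attr(pair[1]);
  if (src) {
    $body.prepend('<img src="' + src + '" />');
  }
}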
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwNydailynewsComExtractor = { domain: 'www.nydailynews.com', title: { selectors: ['h1#ra-headline'], }, author: { selectors: [['meta[name="parsely-author"]', 'value']], }, date_published: { selectors: [['meta[name="sailthru.date"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['article#ra-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['dl#ra-tags', '.ra-related', 'a.ra-editor', 'dl#ra-share-bottom'], }, }; var WwwCnbcComExtractor = { domain: 'www.cnbc.com', title: { selectors: ['h1.title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div#article_body.content', 'div.story'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwPopsugarComExtractor = { domain: 'www.popsugar.com', title: { selectors: ['h2.post-title', 'title-text'], }, author: { selectors: [['meta[name="article:author"]', 'value']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['#content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.share-copy-title', '.post-tags', '.reactions'], }, }; var ObserverComExtractor = { domain: 'observer.com', title: { selectors: ['h1.entry-title'], }, author: { selectors: ['.author', '.vcard'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['h2.dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var PeopleComExtractor = { domain: 'people.com', title: { selectors: [['meta[name="og:title"]', 'value']], }, author: { selectors: ['a.author.url.fn'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.article-body__inner'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwUsmagazineComExtractor = { domain: 'www.usmagazine.com', title: { selectors: ['header h1'], }, author: { selectors: ['a.article-byline.tracked-offpage'], }, date_published: { timezone: 'America/New_York', selectors: ['time.article-published-date'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.article-body-inner'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.module-related'], }, }; var WwwRollingstoneComExtractor = { domain: 'www.rollingstone.com', title: { selectors: ['h1.content-title'], }, author: { selectors: ['a.content-author.tracked-offpage'], }, date_published: { selectors: ['time.content-published-date'], timezone: 'America/New_York', }, dek: { selectors: ['.content-description'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.lead-container', '.article-content'], '.article-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.module-related'], }, }; var twofortysevensportsComExtractor = { domain: '247sports.com', title: { selectors: ['title', 'article header h1'], }, author: { selectors: ['.author'], }, date_published: { selectors: [['time[data-published]', 'data-published']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['section.body.article'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var UproxxComExtractor = { domain: 'uproxx.com', title: { selectors: ['div.post-top h1'], }, author: { selectors: ['.post-top .authorname'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.post-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { 'div.image': 'figure', 'div.image .wp-media-credit': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwEonlineComExtractor = { domain: 'www.eonline.com', title: { selectors: ['h1.article__title'], }, author: { selectors: ['.entry-meta__author a'], }, date_published: { selectors: [['meta[itemprop="datePublished"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['.post-content section, .post-content div.post-content__image'], ], // Is there anything in the content you selected that needs transformed // before it's consumable content? 
E.g., unusual lazy loaded images transforms: { 'div.post-content__image': 'figure', 'div.post-content__image .image__credits': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwMiamiheraldComExtractor = { domain: 'www.miamiherald.com', title: { selectors: ['h1.title'], }, date_published: { selectors: ['p.published-date'], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.dateline-storybody'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwRefinery29ComExtractor = { domain: 'www.refinery29.com', title: { selectors: ['h1.title'], }, author: { selectors: ['.contributor'], }, date_published: { selectors: [['meta[name="sailthru.date"]', 'value']], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['.full-width-opener', '.article-content'], '.article-content', '.body', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { 'div.loading noscript': function divLoadingNoscript($node) { var imgHtml = $node.html(); $node.parents('.loading').replaceWith(imgHtml); }, '.section-image': 'figure', '.section-image .content-caption': 'figcaption', '.section-text': 'p', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.story-share'], }, }; var WwwMacrumorsComExtractor = { domain: 'www.macrumors.com', title: { selectors: ['h1', 'h1.title'], }, author: { selectors: ['.author-url'], }, date_published: { selectors: ['.article .byline'], // Wednesday January 18, 2017 11:44 am PST format: 'dddd MMMM D, YYYY h:mm A zz', timezone: 'America/Los_Angeles', }, dek: { selectors: [['meta[name="description"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwAndroidcentralComExtractor = { domain: 'www.androidcentral.com', title: { selectors: ['h1', 'h1.main-title'], }, author: { selectors: ['.meta-by'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: [['meta[name="og:description"]', 'value']], }, lead_image_url: { selectors: [['.image-large', 'src']], }, content: { selectors: ['.article-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
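// When a date_published entry carries `format` and `timezone`, as on the
// MacRumors extractor above, the raw byline text is parsed against that
// format in that zone. Roughly, using the moment-timezone dependency this
// bundle already requires (a sketch of the idea, not the library's exact
// code path):
function parseDateWithFormat(rawDate, format, timezone) {
  // e.g. parseDateWithFormat(
  //   'Wednesday January 18, 2017 11:44 am PST',
  //   'dddd MMMM D, YYYY h:mm A zz',
  //   'America/Los_Angeles'
  // )
  return moment.tz(rawDate, format, timezone).toISOString();
}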
// The clean selectors will remove anything that matches from // the result clean: ['.intro', 'blockquote'], }, }; var WwwSiComExtractor = { domain: 'www.si.com', title: { selectors: ['h1', 'h1.headline'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: ['.timestamp'], timezone: 'America/New_York', }, dek: { selectors: ['.quick-hit ul'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['p', '.marquee_large_2x', '.component.image']], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { noscript: function noscript($node) { var $children = $node.children(); if ($children.length === 1 && $children.get(0).tagName === 'img') { return 'figure'; } return null; }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [ ['.inline-thumb', '.primary-message', '.description', '.instructions'], ], }, }; var WwwRawstoryComExtractor = { domain: 'www.rawstory.com', title: { selectors: ['.blog-title'], }, author: { selectors: ['.blog-author a:first-of-type'], }, date_published: { selectors: ['.blog-author a:last-of-type'], timezone: 'EST', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.blog-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwCnetComExtractor = { domain: 'www.cnet.com', title: { selectors: [['meta[name="og:title"]', 'value']], }, author: { selectors: ['a.author'], }, date_published: { selectors: ['time'], timezone: 'America/Los_Angeles', }, dek: { selectors: ['.article-dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['img.__image-lead__', '.article-main-body'], '.article-main-body', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { 'figure.image': function figureImage($node) { var $img = $node.find('img'); $img.attr('width', '100%'); $img.attr('height', '100%'); $img.addClass('__image-lead__'); $node.remove('.imgContainer').prepend($img); }, }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwCinemablendComExtractor = { domain: 'www.cinemablend.com', title: { selectors: ['.story_title'], }, author: { selectors: ['.author'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], timezone: 'EST', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div#wrap_left_content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
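// Several extractors (The Verge and SI above) handle lazy-loaded images by
// promoting the <noscript> fallback: when a <noscript> wraps a single <img>,
// its contents replace the lazy-loading wrapper. A compact version of that
// check (illustrative only):
function unwrapNoscriptImage($noscript, $lazyWrapper) {
  var $children = $noscript.children();
  if ($children.length === 1 && $children.get(0).tagName === 'img') {
    $lazyWrapper.replaceWith($noscript.html());
  }
}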
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwTodayComExtractor = { domain: 'www.today.com', title: { selectors: ['h1.entry-headline'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="DC.date.issued"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry-container'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.label-comment'], }, }; var WwwHowtogeekComExtractor = { domain: 'www.howtogeek.com', title: { selectors: ['title'], }, author: { selectors: ['#authorinfobox a'], }, date_published: { selectors: ['#authorinfobox + div li'], timezone: 'GMT', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.thecontent'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwAlComExtractor = { domain: 'www.al.com', title: { selectors: [['meta[name="title"]', 'value']], }, author: { selectors: [['meta[name="article_author"]', 'value']], }, date_published: { selectors: [['meta[name="article_date_original"]', 'value']], timezone: 'EST', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwThepennyhoarderComExtractor = { domain: 'www.thepennyhoarder.com', title: { selectors: [['meta[name="dcterms.title"]', 'value']], }, author: { selectors: [['link[rel="author"]', 'title']], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.post-img', '.post-text'], '.post-text'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwWesternjournalismComExtractor = { domain: 'www.westernjournalism.com', title: { selectors: ['title', 'h1.entry-title'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: [['meta[name="DC.date.issued"]', 'value']], }, dek: { selectors: ['.subtitle'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.article-sharing.top + div'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: ['.ad-notice-small'], }, }; var FusionNetExtractor = { domain: 'fusion.net', title: { selectors: ['.post-title', '.single-title', '.headline'], }, author: { selectors: ['.show-for-medium .byline'], }, date_published: { selectors: [['time.local-time', 'datetime']], }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ ['.post-featured-media', '.article-content'], '.article-content', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { '.fusion-youtube-oembed': 'figure', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwAmericanowComExtractor = { domain: 'www.americanow.com', title: { selectors: ['.title', ['meta[name="title"]', 'value']], }, author: { selectors: ['.byline'], }, date_published: { selectors: [['meta[name="publish_date"]', 'value']], }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.article-content', '.image', '.body'], '.body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.article-video-wrapper', '.show-for-small-only'], }, }; var ScienceflyComExtractor = { domain: 'sciencefly.com', title: { selectors: ['.entry-title', '.cb-entry-title', '.cb-single-title'], }, author: { selectors: ['div.cb-author', 'div.cb-author-title'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['div.theiaPostSlider_slides img', 'src']], }, content: { selectors: ['div.theiaPostSlider_slides'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var HellogigglesComExtractor = { domain: 'hellogiggles.com', title: { selectors: ['.title'], }, author: { selectors: ['.author-link'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var ThoughtcatalogComExtractor = { domain: 'thoughtcatalog.com', title: { selectors: ['h1.title', ['meta[name="og:title"]', 'value']], }, author: { selectors: [ 'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name', 'h1.writer-name', ], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry.post'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.tc_mark'], }, }; var WwwNjComExtractor = { domain: 'www.nj.com', title: { selectors: [['meta[name="title"]', 'value']], }, author: { selectors: [['meta[name="article_author"]', 'value']], }, date_published: { selectors: [['meta[name="article_date_original"]', 'value']], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwInquisitrComExtractor = { domain: 'www.inquisitr.com', title: { selectors: ['h1.entry-title.story--header--title'], }, author: { selectors: ['div.story--header--author'], }, date_published: { selectors: [['meta[name="datePublished"]', 'value']], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['article.story', '.entry-content.'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [ '.post-category', '.story--header--socials', '.story--header--content', ], }, }; var WwwNbcnewsComExtractor = { domain: 'www.nbcnews.com', title: { selectors: ['div.article-hed h1'], }, author: { selectors: ['span.byline_author'], }, date_published: { selectors: [ ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], '.flag_article-wrapper time', ], timezone: 'America/New_York', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['div.article-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
// The clean selectors will remove anything that matches from // the result clean: [], }, }; var FortuneComExtractor = { domain: 'fortune.com', title: { selectors: ['h1'], }, author: { selectors: [['meta[name="author"]', 'value']], }, date_published: { selectors: ['.MblGHNMJ'], timezone: 'UTC', }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['picture', 'article.row'], 'article.row'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var WwwLinkedinComExtractor = { domain: 'www.linkedin.com', title: { selectors: ['.article-title', 'h1'], }, author: { selectors: [ ['meta[name="article:author"]', 'value'], '.entity-name a[rel=author]', ], }, date_published: { selectors: [['time[itemprop="datePublished"]', 'datetime']], timezone: 'America/Los_Angeles', }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['header figure', '.prose'], '.prose'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.entity-image'], }, }; var ObamawhitehouseArchivesGovExtractor = { domain: 'obamawhitehouse.archives.gov', supportedDomains: ['whitehouse.gov'], title: { selectors: ['h1', '.pane-node-title'], }, author: { selectors: ['.blog-author-link', '.node-person-name-link'], }, date_published: { selectors: [['meta[name="article:published_time"]', 'value']], }, dek: { selectors: ['.field-name-field-forall-summary'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { defaultCleaner: false, selectors: ['div#content-start', '.pane-node-field-forall-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.pane-node-title', '.pane-custom.pane-1'], }, }; var WwwOpposingviewsComExtractor = { domain: 'www.opposingviews.com', title: { selectors: ['h1.title'], }, author: { selectors: ['div.date span span a'], }, date_published: { selectors: [['meta[name="publish_date"]', 'value']], }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? 
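// Two optional flags appear in the extractor above, illustrated here as a hedged
// sketch. `supportedDomains` registers the same config for sister domains when the
// domain-keyed Extractors map is assembled further below via mergeSupportedDomains,
// and `defaultCleaner: false` tells select() (near the end of this file) to skip the
// generic cleaner for that field. Assuming the merge keys each supported domain to
// the extractor:
// Extractors['obamawhitehouse.archives.gov']; // => ObamawhitehouseArchivesGovExtractor
// Extractors['whitehouse.gov'];               // => the same extractor object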
// The clean selectors will remove anything that matches from // the result clean: ['.show-for-small-only'], }, }; var WwwProspectmagazineCoUkExtractor = { domain: 'www.prospectmagazine.co.uk', title: { selectors: ['.page-title'], }, author: { selectors: ['.aside_author .title'], }, date_published: { selectors: ['.post-info'], timezone: 'Europe/London', }, dek: { selectors: ['.page-subtitle'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [ // ['article.type-post div.post_content p'], 'article .post_content', ], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var ForwardComExtractor = { domain: 'forward.com', title: { selectors: [['meta[name="og:title"]', 'value']], }, author: { selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']], }, date_published: { selectors: [['meta[name="date"]', 'value']], }, dek: { selectors: [ // enter selectors ], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.post-item-media-wrap', '.post-item p']], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.donate-box', '.message', '.subtitle'], }, }; var WwwQdailyComExtractor = { domain: 'www.qdaily.com', title: { selectors: ['h2', 'h2.title'], }, author: { selectors: ['.name'], }, date_published: { selectors: [['.date.smart-date', 'data-origindate']], }, dek: { selectors: ['.excerpt'], }, lead_image_url: { selectors: [['.article-detail-hd img', 'src']], }, content: { selectors: ['.detail'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['.lazyload', '.lazylad', '.lazylood'], }, }; var GothamistComExtractor = { domain: 'gothamist.com', supportedDomains: [ 'chicagoist.com', 'laist.com', 'sfist.com', 'shanghaiist.com', 'dcist.com', ], title: { selectors: ['h1', '.entry-header h1'], }, author: { selectors: ['.author'], }, date_published: { selectors: ['abbr', 'abbr.published'], timezone: 'America/New_York', }, dek: { selectors: [null], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.entry-body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { 'div.image-none': 'figure', '.image-none i': 'figcaption', 'div.image-left': 'figure', '.image-left i': 'figcaption', 'div.image-right': 'figure', '.image-right i': 'figcaption', }, // Is there anything that is in the result that shouldn't be? 
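// Transforms can also be functions that receive the matched cheerio node (the
// www.fool.com extractor just below uses this form). A hedged, hypothetical sketch
// for the "unusual lazy loaded images" case mentioned in these comments -- the
// 'img.lazy' selector and 'data-src' attribute are assumptions, not taken from any
// extractor here:
// transforms: {
//   'img.lazy': function($node) {
//     var realSrc = $node.attr('data-src');
//     if (realSrc) $node.attr('src', realSrc); // promote the lazy-load URL to src
//   },
// },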
// The clean selectors will remove anything that matches from // the result clean: [ '.image-none br', '.image-left br', '.image-right br', '.galleryEase', ], }, }; var WwwFoolComExtractor = { domain: 'www.fool.com', title: { selectors: ['h1'], }, author: { selectors: ['.author-inline .author-name'], }, date_published: { selectors: [['meta[name="date"]', 'value']], }, dek: { selectors: ['header h2'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.article-content'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {
// Rewrap the caption image in a figure element (the markup string below is a reconstruction; the original literal was lost in this copy).
'.caption img': function captionImg($node) { var src = $node.attr('src'); $node .parent() .replaceWith('<figure><img src="'.concat(src, '" /></figure>
')); }, '.caption': 'figcaption', }, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: ['#pitch'], }, }; var WwwSlateComExtractor = { domain: 'www.slate.com', title: { selectors: ['.hed', 'h1'], }, author: { selectors: ['a[rel=author]'], }, date_published: { selectors: ['.pub-date'], timezone: 'America/New_York', }, dek: { selectors: ['.dek'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: ['.body'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [ '.about-the-author', '.pullquote', '.newsletter-signup-component', '.top-comment', ], }, }; var IciRadioCanadaCaExtractor = { domain: 'ici.radio-canada.ca', title: { selectors: ['h1'], }, author: { selectors: [['meta[name="dc.creator"]', 'value']], }, date_published: { selectors: [['meta[name="dc.date.created"]', 'value']], timezone: 'America/New_York', }, dek: { selectors: ['.bunker-component.lead'], }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']], }, content: { selectors: [['.main-multimedia-item', '.news-story-content']], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: {}, // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result clean: [], }, }; var CustomExtractors = /*#__PURE__*/ Object.freeze({ BloggerExtractor: BloggerExtractor, NYMagExtractor: NYMagExtractor, WikipediaExtractor: WikipediaExtractor, TwitterExtractor: TwitterExtractor, NYTimesExtractor: NYTimesExtractor, TheAtlanticExtractor: TheAtlanticExtractor, NewYorkerExtractor: NewYorkerExtractor, WiredExtractor: WiredExtractor, MSNExtractor: MSNExtractor, YahooExtractor: YahooExtractor, BuzzfeedExtractor: BuzzfeedExtractor, WikiaExtractor: WikiaExtractor, LittleThingsExtractor: LittleThingsExtractor, PoliticoExtractor: PoliticoExtractor, DeadspinExtractor: DeadspinExtractor, BroadwayWorldExtractor: BroadwayWorldExtractor, ApartmentTherapyExtractor: ApartmentTherapyExtractor, MediumExtractor: MediumExtractor, WwwTmzComExtractor: WwwTmzComExtractor, WwwWashingtonpostComExtractor: WwwWashingtonpostComExtractor, WwwHuffingtonpostComExtractor: WwwHuffingtonpostComExtractor, NewrepublicComExtractor: NewrepublicComExtractor, MoneyCnnComExtractor: MoneyCnnComExtractor, WwwThevergeComExtractor: WwwThevergeComExtractor, WwwCnnComExtractor: WwwCnnComExtractor, WwwAolComExtractor: WwwAolComExtractor, WwwYoutubeComExtractor: WwwYoutubeComExtractor, WwwTheguardianComExtractor: WwwTheguardianComExtractor, WwwSbnationComExtractor: WwwSbnationComExtractor, WwwBloombergComExtractor: WwwBloombergComExtractor, WwwBustleComExtractor: WwwBustleComExtractor, WwwNprOrgExtractor: WwwNprOrgExtractor, WwwRecodeNetExtractor: WwwRecodeNetExtractor, QzComExtractor: QzComExtractor, WwwDmagazineComExtractor: WwwDmagazineComExtractor, WwwReutersComExtractor: WwwReutersComExtractor, MashableComExtractor: MashableComExtractor, WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor, WwwVoxComExtractor: WwwVoxComExtractor, NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor, 
WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor, WwwLatimesComExtractor: WwwLatimesComExtractor, PagesixComExtractor: PagesixComExtractor, ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor, WwwCbssportsComExtractor: WwwCbssportsComExtractor, WwwMsnbcComExtractor: WwwMsnbcComExtractor, WwwThepoliticalinsiderComExtractor: WwwThepoliticalinsiderComExtractor, WwwMentalflossComExtractor: WwwMentalflossComExtractor, AbcnewsGoComExtractor: AbcnewsGoComExtractor, WwwNydailynewsComExtractor: WwwNydailynewsComExtractor, WwwCnbcComExtractor: WwwCnbcComExtractor, WwwPopsugarComExtractor: WwwPopsugarComExtractor, ObserverComExtractor: ObserverComExtractor, PeopleComExtractor: PeopleComExtractor, WwwUsmagazineComExtractor: WwwUsmagazineComExtractor, WwwRollingstoneComExtractor: WwwRollingstoneComExtractor, twofortysevensportsComExtractor: twofortysevensportsComExtractor, UproxxComExtractor: UproxxComExtractor, WwwEonlineComExtractor: WwwEonlineComExtractor, WwwMiamiheraldComExtractor: WwwMiamiheraldComExtractor, WwwRefinery29ComExtractor: WwwRefinery29ComExtractor, WwwMacrumorsComExtractor: WwwMacrumorsComExtractor, WwwAndroidcentralComExtractor: WwwAndroidcentralComExtractor, WwwSiComExtractor: WwwSiComExtractor, WwwRawstoryComExtractor: WwwRawstoryComExtractor, WwwCnetComExtractor: WwwCnetComExtractor, WwwCinemablendComExtractor: WwwCinemablendComExtractor, WwwTodayComExtractor: WwwTodayComExtractor, WwwHowtogeekComExtractor: WwwHowtogeekComExtractor, WwwAlComExtractor: WwwAlComExtractor, WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor, WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor, FusionNetExtractor: FusionNetExtractor, WwwAmericanowComExtractor: WwwAmericanowComExtractor, ScienceflyComExtractor: ScienceflyComExtractor, HellogigglesComExtractor: HellogigglesComExtractor, ThoughtcatalogComExtractor: ThoughtcatalogComExtractor, WwwNjComExtractor: WwwNjComExtractor, WwwInquisitrComExtractor: WwwInquisitrComExtractor, WwwNbcnewsComExtractor: WwwNbcnewsComExtractor, FortuneComExtractor: FortuneComExtractor, WwwLinkedinComExtractor: WwwLinkedinComExtractor, ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor, WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor, WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor, ForwardComExtractor: ForwardComExtractor, WwwQdailyComExtractor: WwwQdailyComExtractor, GothamistComExtractor: GothamistComExtractor, WwwFoolComExtractor: WwwFoolComExtractor, WwwSlateComExtractor: WwwSlateComExtractor, IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor, }); var Extractors = _Object$keys(CustomExtractors).reduce(function(acc, key) { var extractor = CustomExtractors[key]; return _objectSpread({}, acc, mergeSupportedDomains(extractor)); }, {}); // CLEAN AUTHOR CONSTANTS var CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i; // author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)', // CLEAN DEK CONSTANTS var TEXT_LINK_RE = new RegExp('http(s)?://', 'i'); // An ordered list of meta tag names that denote likely article deks. 
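// A worked example of what CLEAN_AUTHOR_RE above is for (the cleanAuthor function
// below applies it): bylines often arrive with a "By"/"Posted by" prefix, and
// capture group 2 keeps just the name. The sample strings are illustrative.
// 'Posted by: David Smith'.replace(CLEAN_AUTHOR_RE, '$2'); // => 'David Smith'
// 'by Jane Doe'.replace(CLEAN_AUTHOR_RE, '$2');            // => 'Jane Doe'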
var MS_DATE_STRING = /^\d{13}$/i; var SEC_DATE_STRING = /^\d{10}$/i; var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i; var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i; var TIME_MERIDIAN_DOTS_RE = /\.m\./i; var months = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', ]; var allMonths = months.join('|'); var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?'; var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}'; var timestamp3 = '-[0-9]{3,4}$'; var SPLIT_DATE_STRING = new RegExp( '(' .concat(timestamp1, ')|(') .concat(timestamp2, ')|(') .concat(timestamp3, ')|([0-9]{1,4})|(') .concat(allMonths, ')'), 'ig' ); // 2016-11-22T08:57-500 // Check if datetime string has an offset at the end var TIME_WITH_OFFSET_RE = /-\d{3,4}$/; // CLEAN TITLE CONSTANTS // A regular expression that will match separating characters on a // title, that usually denote breadcrumbs or something similar. var TITLE_SPLITTERS_RE = /(: | - | \| )/g; var DOMAIN_ENDINGS_RE = new RegExp('.com$|.net$|.org$|.co.uk$', 'g'); // just the name(s): 'David Smith'. function cleanAuthor(author) { return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim()); } function clean$1(leadImageUrl) { leadImageUrl = leadImageUrl.trim(); if (validUrl.isWebUri(leadImageUrl)) { return leadImageUrl; } return null; } // Return None if the dek wasn't good enough. function cleanDek(dek, _ref) { var $ = _ref.$, excerpt = _ref.excerpt; // Sanity check that we didn't get too short or long of a dek. if (dek.length > 1000 || dek.length < 5) return null; // Check that dek isn't the same as excerpt if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null; var dekText = stripTags(dek, $); // Plain text links shouldn't exist in the dek. If we have some, it's // not a good dek - bail. if (TEXT_LINK_RE.test(dekText)) return null; return normalizeSpaces(dekText.trim()); } function cleanDateString(dateString) { return (dateString.match(SPLIT_DATE_STRING) || []) .join(' ') .replace(TIME_MERIDIAN_DOTS_RE, 'm') .replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3') .replace(CLEAN_DATE_STRING_RE, '$1') .trim(); } function createDate(dateString, timezone, format) { if (TIME_WITH_OFFSET_RE.test(dateString)) { return moment(new Date(dateString)); } return timezone ? moment.tz(dateString, format || parseFormat(dateString), timezone) : moment(dateString, format || parseFormat(dateString)); } // Take a date published string, and hopefully return a date out of // it. Return none if we fail. function cleanDatePublished(dateString) { var _ref = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {}, timezone = _ref.timezone, format = _ref.format; // If string is in milliseconds or seconds, convert to int and return if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) { return new Date(_parseInt(dateString, 10)).toISOString(); } var date = createDate(dateString, timezone, format); if (!date.isValid()) { dateString = cleanDateString(dateString); date = createDate(dateString, timezone, format); } return date.isValid() ? date.toISOString() : null; } function extractCleanNode(article, _ref) { var $ = _ref.$, _ref$cleanConditional = _ref.cleanConditionally, cleanConditionally = _ref$cleanConditional === void 0 ? true : _ref$cleanConditional, _ref$title = _ref.title, title = _ref$title === void 0 ? '' : _ref$title, _ref$url = _ref.url, url = _ref$url === void 0 ? '' : _ref$url, _ref$defaultCleaner = _ref.defaultCleaner, defaultCleaner = _ref$defaultCleaner === void 0 ? 
true : _ref$defaultCleaner; // Rewrite the tag name to div if it's a top level node like body or // html to avoid later complications with multiple body tags. rewriteTopLevel$$1(article, $); // Drop small images and spacer images // Only do this is defaultCleaner is set to true; // this can sometimes be too aggressive. if (defaultCleaner) cleanImages(article, $); // Make links absolute makeLinksAbsolute$$1(article, $, url); // Mark elements to keep that would normally be removed. // E.g., stripJunkTags will remove iframes, so we're going to mark // YouTube/Vimeo videos as elements we want to keep. markToKeep(article, $, url); // Drop certain tags like , etc // This is -mostly- for cleanliness, not security. stripJunkTags(article, $); // H1 tags are typically the article title, which should be extracted // by the title extractor instead. If there's less than 3 of them (<3), // strip them. Otherwise, turn 'em into H2s. cleanHOnes$$1(article, $); // Clean headers cleanHeaders(article, $, title); // We used to clean UL's and OL's here, but it was leading to // too many in-article lists being removed. Consider a better // way to detect menus particularly and remove them. // Also optionally running, since it can be overly aggressive. if (defaultCleaner) cleanTags$$1(article, $, cleanConditionally); // Remove empty paragraph nodes removeEmpty(article, $); // Remove unnecessary attributes cleanAttributes$$1(article, $); return article; } function cleanTitle$$1(title, _ref) { var url = _ref.url, $ = _ref.$; // If title has |, :, or - in it, see if // we can clean it up. if (TITLE_SPLITTERS_RE.test(title)) { title = resolveSplitTitle(title, url); } // Final sanity check that we didn't get a crazy title. // if (title.length > 150 || title.length < 15) { if (title.length > 150) { // If we did, return h1 from the document if it exists var h1 = $('h1'); if (h1.length === 1) { title = h1.text(); } } // strip any html tags in the title text return normalizeSpaces(stripTags(title, $).trim()); } function extractBreadcrumbTitle(splitTitle, text) { // This must be a very breadcrumbed title, like: // The Best Gadgets on Earth : Bits : Blogs : NYTimes.com // NYTimes - Blogs - Bits - The Best Gadgets on Earth if (splitTitle.length >= 6) { // Look to see if we can find a breadcrumb splitter that happens // more than once. If we can, we'll be able to better pull out // the title. var termCounts = splitTitle.reduce(function(acc, titleText) { acc[titleText] = acc[titleText] ? acc[titleText] + 1 : 1; return acc; }, {}); var _Reflect$ownKeys$redu = _Reflect$ownKeys(termCounts).reduce( function(acc, key) { if (acc[1] < termCounts[key]) { return [key, termCounts[key]]; } return acc; }, [0, 0] ), _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2), maxTerm = _Reflect$ownKeys$redu2[0], termCount = _Reflect$ownKeys$redu2[1]; // We found a splitter that was used more than once, so it // is probably the breadcrumber. Split our title on that instead. // Note: max_term should be <= 4 characters, so that " >> " // will match, but nothing longer than that. if (termCount >= 2 && maxTerm.length <= 4) { splitTitle = text.split(maxTerm); } var splitEnds = [splitTitle[0], splitTitle.slice(-1)]; var longestEnd = splitEnds.reduce(function(acc, end) { return acc.length > end.length ? acc : end; }, ''); if (longestEnd.length > 10) { return longestEnd; } return text; } return null; } function cleanDomainFromTitle(splitTitle, url) { // Search the ends of the title, looking for bits that fuzzy match // the URL too closely. 
If one is found, discard it and return the // rest. // // Strip out the big TLDs - it just makes the matching a bit more // accurate. Not the end of the world if it doesn't strip right. var _URL$parse = URL.parse(url), host = _URL$parse.host; var nakedDomain = host.replace(DOMAIN_ENDINGS_RE, ''); var startSlug = splitTitle[0].toLowerCase().replace(' ', ''); var startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain); if (startSlugRatio > 0.4 && startSlug.length > 5) { return splitTitle.slice(2).join(''); } var endSlug = splitTitle .slice(-1)[0] .toLowerCase() .replace(' ', ''); var endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain); if (endSlugRatio > 0.4 && endSlug.length >= 5) { return splitTitle.slice(0, -2).join(''); } return null; } // Given a title with separators in it (colons, dashes, etc), // resolve whether any of the segments should be removed. function resolveSplitTitle(title) { var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : ''; // Splits while preserving splitters, like: // ['The New New York', ' - ', 'The Washington Post'] var splitTitle = title.split(TITLE_SPLITTERS_RE); if (splitTitle.length === 1) { return title; } var newTitle = extractBreadcrumbTitle(splitTitle, title); if (newTitle) return newTitle; newTitle = cleanDomainFromTitle(splitTitle, url); if (newTitle) return newTitle; // Fuzzy ratio didn't find anything, so this title is probably legit. // Just return it all. return title; } var Cleaners = { author: cleanAuthor, lead_image_url: clean$1, dek: cleanDek, date_published: cleanDatePublished, content: extractCleanNode, title: cleanTitle$$1, }; // likely to be article text. // // If strip_unlikely_candidates is True, remove any elements that // match certain criteria first. (Like, does this element have a // classname of "comment") // // If weight_nodes is True, use classNames and IDs to determine the // worthiness of nodes. // // Returns a cheerio object $ function extractBestNode($, opts) { // clone the node so we can get back to our // initial parsed state if needed // TODO Do I need this? – AP // let $root = $.root().clone() if (opts.stripUnlikelyCandidates) { $ = stripUnlikelyCandidates($); } $ = convertToParagraphs$$1($); $ = scoreContent$$1($, opts.weightNodes); var $topCandidate = findTopCandidate$$1($); return $topCandidate; } var GenericContentExtractor = { defaultOpts: { stripUnlikelyCandidates: true, weightNodes: true, cleanConditionally: true, }, // Extract the content for this resource - initially, pass in our // most restrictive opts which will return the highest quality // content. On each failure, retry with slightly more lax opts. // // :param return_type: string. If "node", should return the content // as a cheerio node rather than as an HTML string. // // Opts: // stripUnlikelyCandidates: Remove any elements that match // non-article-like criteria first.(Like, does this element // have a classname of "comment") // // weightNodes: Modify an elements score based on whether it has // certain classNames or IDs. Examples: Subtract if a node has // a className of 'comment', Add if a node has an ID of // 'entry-content'. // // cleanConditionally: Clean the node to return of some // superfluous content. Things like forms, ads, etc. 
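// A hedged usage sketch of the cascade described above -- the html, title, and url
// values are placeholders. Extraction starts with every opt enabled and, when
// nodeIsSufficient() rejects the result, re-parses the html and disables the opts
// one by one before retrying.
// var contentHtml = GenericContentExtractor.extract(
//   { html: '<html>...</html>', url: 'https://example.com/post', title: 'A Post' },
//   { stripUnlikelyCandidates: true, weightNodes: true, cleanConditionally: true }
// );
// // => an HTML string for the best-scoring content node, or null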
extract: function extract(_ref, opts) { var $ = _ref.$, html = _ref.html, title = _ref.title, url = _ref.url; opts = _objectSpread({}, this.defaultOpts, opts); $ = $ || cheerio.load(html); // Cascade through our extraction-specific opts in an ordered fashion, // turning them off as we try to extract content. var node = this.getContentNode($, title, url, opts); if (nodeIsSufficient(node)) { return this.cleanAndReturnNode(node, $); } // We didn't succeed on first pass, one by one disable our // extraction opts and try again. // eslint-disable-next-line no-restricted-syntax var _iteratorNormalCompletion = true; var _didIteratorError = false; var _iteratorError = undefined; try { for ( var _iterator = _getIterator( _Reflect$ownKeys(opts).filter(function(k) { return opts[k] === true; }) ), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true ) { var key = _step.value; opts[key] = false; $ = cheerio.load(html); node = this.getContentNode($, title, url, opts); if (nodeIsSufficient(node)) { break; } } } catch (err) { _didIteratorError = true; _iteratorError = err; } finally { try { if (!_iteratorNormalCompletion && _iterator.return != null) { _iterator.return(); } } finally { if (_didIteratorError) { throw _iteratorError; } } } return this.cleanAndReturnNode(node, $); }, // Get node given current options getContentNode: function getContentNode($, title, url, opts) { return extractCleanNode(extractBestNode($, opts), { $: $, cleanConditionally: opts.cleanConditionally, title: title, url: url, }); }, // Once we got here, either we're at our last-resort node, or // we broke early. Make sure we at least have -something- before we // move forward. cleanAndReturnNode: function cleanAndReturnNode(node, $) { if (!node) { return null; } return normalizeSpaces($.html(node)); // if return_type == "html": // return normalize_spaces(node_to_html(node)) // else: // return node }, }; // TODO: It would be great if we could merge the meta and selector lists into // a list of objects, because we could then rank them better. For example, // .hentry .entry-title is far better suited than <meta title>. // An ordered list of meta tag names that denote likely article titles. All // attributes should be lowercase for faster case-insensitive matching. From // most distinct to least distinct. var STRONG_TITLE_META_TAGS = [ 'tweetmeme-title', 'dc.title', 'rbtitle', 'headline', 'title', ]; // og:title is weak because it typically contains context that we don't like, // for example the source site's name. Gotta get that brand into facebook! var WEAK_TITLE_META_TAGS = ['og:title']; // An ordered list of XPath Selectors to find likely article titles. From // most explicit to least explicit. // // Note - this does not use classes like CSS. This checks to see if the string // exists in the className, which is not as accurate as .className (which // splits on spaces/endlines), but for our purposes it's close enough. The // speed tradeoff is worth the accuracy hit. 
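// A worked example of the title cleanup defined above. resolveSplitTitle() splits on
// ": ", " - ", or " | " (keeping the separators), then either keeps the longest end
// of a breadcrumbed title or strips a segment that fuzzy-matches the site's domain.
// The input is the breadcrumb example quoted in the comments above.
// resolveSplitTitle(
//   'The Best Gadgets on Earth : Bits : Blogs : NYTimes.com',
//   'http://www.nytimes.com/bits/gadgets'
// );
// // => 'The Best Gadgets on Earth' (whitespace aside) -- the longest breadcrumb end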
var STRONG_TITLE_SELECTORS = [ '.hentry .entry-title', 'h1#articleHeader', 'h1.articleHeader', 'h1.article', '.instapaper_title', '#meebo-title', ]; var WEAK_TITLE_SELECTORS = [ 'article h1', '#entry-title', '.entry-title', '#entryTitle', '#entrytitle', '.entryTitle', '.entrytitle', '#articleTitle', '.articleTitle', 'post post-title', 'h1.title', 'h2.article', 'h1', 'html head title', 'title', ]; var GenericTitleExtractor = { extract: function extract(_ref) { var $ = _ref.$, url = _ref.url, metaCache = _ref.metaCache; // First, check to see if we have a matching meta tag that we can make // use of that is strongly associated with the headline. var title; title = extractFromMeta$$1($, STRONG_TITLE_META_TAGS, metaCache); if (title) return cleanTitle$$1(title, { url: url, $: $, }); // Second, look through our content selectors for the most likely // article title that is strongly associated with the headline. title = extractFromSelectors$$1($, STRONG_TITLE_SELECTORS); if (title) return cleanTitle$$1(title, { url: url, $: $, }); // Third, check for weaker meta tags that may match. title = extractFromMeta$$1($, WEAK_TITLE_META_TAGS, metaCache); if (title) return cleanTitle$$1(title, { url: url, $: $, }); // Last, look for weaker selector tags that may match. title = extractFromSelectors$$1($, WEAK_TITLE_SELECTORS); if (title) return cleanTitle$$1(title, { url: url, $: $, }); // If no matches, return an empty string return ''; }, }; // An ordered list of meta tag names that denote likely article authors. All // attributes should be lowercase for faster case-insensitive matching. From // most distinct to least distinct. // // Note: "author" is too often the -developer- of the page, so it is not // added here. var AUTHOR_META_TAGS = [ 'byl', 'clmst', 'dc.author', 'dcsext.author', 'dc.creator', 'rbauthors', 'authors', ]; var AUTHOR_MAX_LENGTH = 300; // An ordered list of XPath Selectors to find likely article authors. From // most explicit to least explicit. // // Note - this does not use classes like CSS. This checks to see if the string // exists in the className, which is not as accurate as .className (which // splits on spaces/endlines), but for our purposes it's close enough. The // speed tradeoff is worth the accuracy hit. var AUTHOR_SELECTORS = [ '.entry .entry-author', '.author.vcard .fn', '.author .vcard .fn', '.byline.vcard .fn', '.byline .vcard .fn', '.byline .by .author', '.byline .by', '.byline .author', '.post-author.vcard', '.post-author .vcard', 'a[rel=author]', '#by_author', '.by_author', '#entryAuthor', '.entryAuthor', '.byline a[href*=author]', '#author .authorname', '.author .authorname', '#author', '.author', '.articleauthor', '.ArticleAuthor', '.byline', ]; // An ordered list of Selectors to find likely article authors, with // regular expression for content. var bylineRe = /^[\n\s]*By/i; var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]]; var GenericAuthorExtractor = { extract: function extract(_ref) { var $ = _ref.$, metaCache = _ref.metaCache; var author; // First, check to see if we have a matching // meta tag that we can make use of. author = extractFromMeta$$1($, AUTHOR_META_TAGS, metaCache); if (author && author.length < AUTHOR_MAX_LENGTH) { return cleanAuthor(author); } // Second, look through our selectors looking for potential authors. 
author = extractFromSelectors$$1($, AUTHOR_SELECTORS, 2); if (author && author.length < AUTHOR_MAX_LENGTH) { return cleanAuthor(author); } // Last, use our looser regular-expression based selectors for // potential authors. // eslint-disable-next-line no-restricted-syntax var _iteratorNormalCompletion = true; var _didIteratorError = false; var _iteratorError = undefined; try { for ( var _iterator = _getIterator(BYLINE_SELECTORS_RE), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true ) { var _ref4 = _step.value; var _ref3 = _slicedToArray(_ref4, 2); var selector = _ref3[0]; var regex = _ref3[1]; var node = $(selector); if (node.length === 1) { var text = node.text(); if (regex.test(text)) { return cleanAuthor(text); } } } } catch (err) { _didIteratorError = true; _iteratorError = err; } finally { try { if (!_iteratorNormalCompletion && _iterator.return != null) { _iterator.return(); } } finally { if (_didIteratorError) { throw _iteratorError; } } } return null; }, }; // An ordered list of meta tag names that denote // likely date published dates. All attributes // should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. var DATE_PUBLISHED_META_TAGS = [ 'article:published_time', 'displaydate', 'dc.date', 'dc.date.issued', 'rbpubdate', 'publish_date', 'pub_date', 'pagedate', 'pubdate', 'revision_date', 'doc_date', 'date_created', 'content_create_date', 'lastmodified', 'created', 'date', ]; // An ordered list of XPath Selectors to find // likely date published dates. From most explicit // to least explicit. var DATE_PUBLISHED_SELECTORS = [ '.hentry .dtstamp.published', '.hentry .published', '.hentry .dtstamp.updated', '.hentry .updated', '.single .published', '.meta .published', '.meta .postDate', '.entry-date', '.byline .date', '.postmetadata .date', '.article_datetime', '.date-header', '.story-date', '.dateStamp', '#story .datetime', '.dateline', '.pubdate', ]; // An ordered list of compiled regular expressions to find likely date // published dates from the URL. These should always have the first // reference be a date string that is parseable by dateutil.parser.parse var abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'; var DATE_PUBLISHED_URL_RES = [ // /2012/01/27/ but not /2012/01/293 new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733 // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i, // 2012-01-27 new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/ new RegExp('/(20\\d{2}/'.concat(abbrevMonthsStr, '/[0-3]\\d)/'), 'i'), ]; var GenericDatePublishedExtractor = { extract: function extract(_ref) { var $ = _ref.$, url = _ref.url, metaCache = _ref.metaCache; var datePublished; // First, check to see if we have a matching meta tag // that we can make use of. // Don't try cleaning tags from this string datePublished = extractFromMeta$$1( $, DATE_PUBLISHED_META_TAGS, metaCache, false ); if (datePublished) return cleanDatePublished(datePublished); // Second, look through our selectors looking for potential // date_published's. 
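// A hedged worked example of the cleanup these date selectors feed into
// (cleanDatePublished above): a messy byline date is reduced to recognizable tokens
// by SPLIT_DATE_STRING and then parsed via moment. The input string and the exact
// output are illustrative.
// cleanDatePublished('Published: Jan 17, 2017 at 5:30 p.m.', {
//   timezone: 'America/New_York',
// });
// // => an ISO 8601 string, roughly '2017-01-17T22:30:00.000Z'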
datePublished = extractFromSelectors$$1($, DATE_PUBLISHED_SELECTORS); if (datePublished) return cleanDatePublished(datePublished); // Lastly, look to see if a dately string exists in the URL datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES); if (datePublished) return cleanDatePublished(datePublished); return null; }, }; // import { // DEK_META_TAGS, // DEK_SELECTORS, // DEK_URL_RES, // } from './constants'; // import { cleanDek } from 'cleaners'; // import { // extractFromMeta, // extractFromSelectors, // } from 'utils/dom'; // Currently there is only one selector for // deks. We should simply return null here // until we have a more robust generic option. // Below is the original source for this, for reference. var GenericDekExtractor = { // extract({ $, content, metaCache }) { extract: function extract() { return null; }, }; // # First, check to see if we have a matching meta tag that we can make // # use of. // dek = self.extract_from_meta('dek', constants.DEK_META_TAGS) // if not dek: // # Second, look through our CSS/XPath selectors. This may return // # an HTML fragment. // dek = self.extract_from_selectors('dek', // constants.DEK_SELECTORS, // text_only=False) // // if dek: // # Make sure our dek isn't in the first few thousand characters // # of the content, otherwise it's just the start of the article // # and not a true dek. // content = self.extract_content() // content_chunk = normalize_spaces(strip_tags(content[:2000])) // dek_chunk = normalize_spaces(dek[:100]) # Already has no tags. // // # 80% or greater similarity means the dek was very similar to some // # of the starting content, so we skip it. // if fuzz.partial_ratio(content_chunk, dek_chunk) < 80: // return dek // // return None // An ordered list of meta tag names that denote likely article leading images. // All attributes should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src']; var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]']; var POSITIVE_LEAD_IMAGE_URL_HINTS = [ 'upload', 'wp-content', 'large', 'photo', 'wp-image', ]; var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp( POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i' ); var NEGATIVE_LEAD_IMAGE_URL_HINTS = [ 'spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg', 'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner', 'loader', 'loading', 'default', 'rating', 'share', 'facebook', 'twitter', 'theme', 'promo', 'ads', 'wp-includes', ]; var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp( NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i' ); var GIF_RE = /\.gif(\?.*)?$/i; var JPG_RE = /\.jpe?g(\?.*)?$/i; function getSig($node) { return '' .concat($node.attr('class') || '', ' ') .concat($node.attr('id') || ''); } // Scores image urls based on a variety of heuristics. function scoreImageUrl(url) { url = url.trim(); var score = 0; if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { score += 20; } if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) { score -= 20; } // TODO: We might want to consider removing this as // gifs are much more common/popular than they once were if (GIF_RE.test(url)) { score -= 10; } if (JPG_RE.test(url)) { score += 10; } // PNGs are neutral. return score; } // Alt attribute usually means non-presentational image. 
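// A worked example of the URL heuristics above -- the image URLs are illustrative:
// scoreImageUrl('http://example.com/wp-content/images/2016/photo-large.jpg');
// // => 30  (positive hint such as 'wp-content' +20, .jpg +10)
// scoreImageUrl('http://example.com/assets/social-share-icon.gif');
// // => -30 (negative hint such as 'social' -20, .gif -10)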
function scoreAttr($img) { if ($img.attr('alt')) { return 5; } return 0; } // Look through our parent and grandparent for figure-like // container elements, give a bonus if we find them function scoreByParents($img) { var score = 0; var $figParent = $img.parents('figure').first(); if ($figParent.length === 1) { score += 25; } var $parent = $img.parent(); var $gParent; if ($parent.length === 1) { $gParent = $parent.parent(); } [$parent, $gParent].forEach(function($node) { if (PHOTO_HINTS_RE$1.test(getSig($node))) { score += 15; } }); return score; } // Look at our immediate sibling and see if it looks like it's a // caption. Bonus if so. function scoreBySibling($img) { var score = 0; var $sibling = $img.next(); var sibling = $sibling.get(0); if (sibling && sibling.tagName.toLowerCase() === 'figcaption') { score += 25; } if (PHOTO_HINTS_RE$1.test(getSig($sibling))) { score += 15; } return score; } function scoreByDimensions($img) { var score = 0; var width = _parseFloat($img.attr('width')); var height = _parseFloat($img.attr('height')); var src = $img.attr('src'); // Penalty for skinny images if (width && width <= 50) { score -= 50; } // Penalty for short images if (height && height <= 50) { score -= 50; } if (width && height && !src.includes('sprite')) { var area = width * height; if (area < 5000) { // Smaller than 50 x 100 score -= 100; } else { score += Math.round(area / 1000); } } return score; } function scoreByPosition($imgs, index) { return $imgs.length / 2 - index; } // it. Like content and next page extraction, uses a scoring system // to determine what the most likely image may be. Short circuits // on really probable things like og:image meta tags. // // Potential signals to still take advantage of: // * domain // * weird aspect ratio var GenericLeadImageUrlExtractor = { extract: function extract(_ref) { var $ = _ref.$, content = _ref.content, metaCache = _ref.metaCache, html = _ref.html; var cleanUrl; if (!$.browser && $('head').length === 0) { $('*') .first() .prepend(html); } // Check to see if we have a matching meta tag that we can make use of. // Moving this higher because common practice is now to use large // images on things like Open Graph or Twitter cards. // images usually have for things like Open Graph. var imageUrl = extractFromMeta$$1( $, LEAD_IMAGE_URL_META_TAGS, metaCache, false ); if (imageUrl) { cleanUrl = clean$1(imageUrl); if (cleanUrl) return cleanUrl; } // Next, try to find the "best" image via the content. // We'd rather not have to fetch each image and check dimensions, // so try to do some analysis and determine them instead. var $content = $(content); var imgs = $('img', $content).toArray(); var imgScores = {}; imgs.forEach(function(img, index) { var $img = $(img); var src = $img.attr('src'); if (!src) return; var score = scoreImageUrl(src); score += scoreAttr($img); score += scoreByParents($img); score += scoreBySibling($img); score += scoreByDimensions($img); score += scoreByPosition(imgs, index); imgScores[src] = score; }); var _Reflect$ownKeys$redu = _Reflect$ownKeys(imgScores).reduce( function(acc, key) { return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc; }, [null, 0] ), _Reflect$ownKeys$redu2 = _slicedToArray(_Reflect$ownKeys$redu, 2), topUrl = _Reflect$ownKeys$redu2[0], topScore = _Reflect$ownKeys$redu2[1]; if (topScore > 0) { cleanUrl = clean$1(topUrl); if (cleanUrl) return cleanUrl; } // If nothing else worked, check to see if there are any really // probable nodes in the doc, like <link rel="image_src" />. 
// eslint-disable-next-line no-restricted-syntax var _iteratorNormalCompletion = true; var _didIteratorError = false; var _iteratorError = undefined; try { for ( var _iterator = _getIterator(LEAD_IMAGE_URL_SELECTORS), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true ) { var selector = _step.value; var $node = $(selector).first(); var src = $node.attr('src'); if (src) { cleanUrl = clean$1(src); if (cleanUrl) return cleanUrl; } var href = $node.attr('href'); if (href) { cleanUrl = clean$1(href); if (cleanUrl) return cleanUrl; } var value = $node.attr('value'); if (value) { cleanUrl = clean$1(value); if (cleanUrl) return cleanUrl; } } } catch (err) { _didIteratorError = true; _iteratorError = err; } finally { try { if (!_iteratorNormalCompletion && _iterator.return != null) { _iterator.return(); } } finally { if (_didIteratorError) { throw _iteratorError; } } } return null; }, }; // """ // # First, try to find the "best" image via the content. // # We'd rather not have to fetch each image and check dimensions, // # so try to do some analysis and determine them instead. // content = self.extractor.extract_content(return_type="node") // imgs = content.xpath('.//img') // img_scores = defaultdict(int) // logger.debug('Scoring %d images from content', len(imgs)) // for (i, img) in enumerate(imgs): // img_score = 0 // // if not 'src' in img.attrib: // logger.debug('No src attribute found') // continue // // try: // parsed_img = urlparse(img.attrib['src']) // img_path = parsed_img.path.lower() // except ValueError: // logger.debug('ValueError getting img path.') // continue // logger.debug('Image path is %s', img_path) // // if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path): // logger.debug('Positive URL hints match. Adding 20.') // img_score += 20 // // if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path): // logger.debug('Negative URL hints match. Subtracting 20.') // img_score -= 20 // // # Gifs are more often structure than photos // if img_path.endswith('gif'): // logger.debug('gif found. Subtracting 10.') // img_score -= 10 // // # JPGs are more often photographs // if img_path.endswith('jpg'): // logger.debug('jpg found. Adding 10.') // img_score += 10 // // # PNGs are neutral. // // # Alt attribute usually means non-presentational image. // if 'alt' in img.attrib and len(img.attrib['alt']) > 5: // logger.debug('alt attribute found. Adding 5.') // img_score += 5 // // # Look through our parent and grandparent for figure-like // # container elements, give a bonus if we find them // parents = [img.getparent()] // if parents[0] is not None and parents[0].getparent() is not None: // parents.append(parents[0].getparent()) // for p in parents: // if p.tag == 'figure': // logger.debug('Parent with <figure> tag found. Adding 25.') // img_score += 25 // // p_sig = ' '.join([p.get('id', ''), p.get('class', '')]) // if constants.PHOTO_HINTS_RE.search(p_sig): // logger.debug('Photo hints regex match. Adding 15.') // img_score += 15 // // # Look at our immediate sibling and see if it looks like it's a // # caption. Bonus if so. // sibling = img.getnext() // if sibling is not None: // if sibling.tag == 'figcaption': // img_score += 25 // // sib_sig = ' '.join([sibling.get('id', ''), // sibling.get('class', '')]).lower() // if 'caption' in sib_sig: // img_score += 15 // // # Pull out width/height if they were set. 
// img_width = None // img_height = None // if 'width' in img.attrib: // try: // img_width = float(img.get('width')) // except ValueError: // pass // if 'height' in img.attrib: // try: // img_height = float(img.get('height')) // except ValueError: // pass // // # Penalty for skinny images // if img_width and img_width <= 50: // logger.debug('Skinny image found. Subtracting 50.') // img_score -= 50 // // # Penalty for short images // if img_height and img_height <= 50: // # Wide, short images are more common than narrow, tall ones // logger.debug('Short image found. Subtracting 25.') // img_score -= 25 // // if img_width and img_height and not 'sprite' in img_path: // area = img_width * img_height // // if area < 5000: # Smaller than 50x100 // logger.debug('Image with small area found. Subtracting 100.') // img_score -= 100 // else: // img_score += round(area/1000.0) // // # If the image is higher on the page than other images, // # it gets a bonus. Penalty if lower. // logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i) // img_score += len(imgs)/2 - i // // # Use the raw src here because we munged img_path for case // # insensitivity // logger.debug('Final score is %d.', img_score) // img_scores[img.attrib['src']] += img_score // // top_score = 0 // top_url = None // for (url, score) in img_scores.items(): // if score > top_score: // top_url = url // top_score = score // // if top_score > 0: // logger.debug('Using top score image from content. Score was %d', top_score) // return top_url // // // # If nothing else worked, check to see if there are any really // # probable nodes in the doc, like <link rel="image_src" />. // logger.debug('Trying to find lead image in probable nodes') // for selector in constants.LEAD_IMAGE_URL_SELECTORS: // nodes = self.resource.extract_by_selector(selector) // for node in nodes: // clean_value = None // if node.attrib.get('src'): // clean_value = self.clean(node.attrib['src']) // // if not clean_value and node.attrib.get('href'): // clean_value = self.clean(node.attrib['href']) // // if not clean_value and node.attrib.get('value'): // clean_value = self.clean(node.attrib['value']) // // if clean_value: // logger.debug('Found lead image in probable nodes.') // logger.debug('Node was: %s', node) // return clean_value // // return None function scoreSimilarity(score, articleUrl, href) { // Do this last and only if we have a real candidate, because it's // potentially expensive computationally. Compare the link to this // URL using difflib to get the % similarity of these URLs. On a // sliding scale, subtract points from this link based on // similarity. if (score > 0) { var similarity = new difflib.SequenceMatcher( null, articleUrl, href ).ratio(); // Subtract .1 from diff_percent when calculating modifier, // which means that if it's less than 10% different, we give a // bonus instead. Ex: // 3% different = +17.5 points // 10% different = 0 points // 20% different = -25 points var diffPercent = 1.0 - similarity; var diffModifier = -(250 * (diffPercent - 0.2)); return score + diffModifier; } return 0; } function scoreLinkText(linkText, pageNum) { // If the link text can be parsed as a number, give it a minor // bonus, with a slight bias towards lower numbered pages. This is // so that pages that might not have 'next' in their text can still // get scored, and sorted properly by score. 
var score = 0; if (IS_DIGIT_RE.test(linkText.trim())) { var linkTextAsNum = _parseInt(linkText, 10); // If it's the first page, we already got it on the first call. // Give it a negative score. Otherwise, up to page 10, give a // small bonus. if (linkTextAsNum < 2) { score = -30; } else { score = Math.max(0, 10 - linkTextAsNum); } // If it appears that the current page number is greater than // this links page number, it's a very bad sign. Give it a big // penalty. if (pageNum && pageNum >= linkTextAsNum) { score -= 50; } } return score; } function scorePageInLink(pageNum, isWp) { // page in the link = bonus. Intentionally ignore wordpress because // their ?p=123 link style gets caught by this even though it means // separate documents entirely. if (pageNum && !isWp) { return 50; } return 0; } var DIGIT_RE$2 = /\d/; // A list of words that, if found in link text or URLs, likely mean that // this link is not a next page link. var EXTRANEOUS_LINK_HINTS$1 = [ 'print', 'archive', 'comment', 'discuss', 'e-mail', 'email', 'share', 'reply', 'all', 'login', 'sign', 'single', 'adx', 'entry-unrelated', ]; var EXTRANEOUS_LINK_HINTS_RE$1 = new RegExp( EXTRANEOUS_LINK_HINTS$1.join('|'), 'i' ); // Match any link text/classname/id that looks like it could mean the next // page. Things like: next, continue, >, >>, » but not >|, »| as those can // mean last page. var NEXT_LINK_TEXT_RE$1 = new RegExp( '(next|weiter|continue|>([^|]|$)|»([^|]|$))', 'i' ); // Match any link text/classname/id that looks like it is an end link: things // like "first", "last", "end", etc. var CAP_LINK_TEXT_RE$1 = new RegExp('(first|last|end)', 'i'); // Match any link text/classname/id that looks like it means the previous // page. var PREV_LINK_TEXT_RE$1 = new RegExp('(prev|earl|old|new|<|«)', 'i'); // Match any phrase that looks like it could be page, or paging, or pagination function scoreExtraneousLinks(href) { // If the URL itself contains extraneous values, give a penalty. if (EXTRANEOUS_LINK_HINTS_RE$1.test(href)) { return -25; } return 0; } function makeSig($link) { return '' .concat($link.attr('class') || '', ' ') .concat($link.attr('id') || ''); } function scoreByParents$1($link) { // If a parent node contains paging-like classname or id, give a // bonus. Additionally, if a parent_node contains bad content // (like 'sponsor'), give a penalty. var $parent = $link.parent(); var positiveMatch = false; var negativeMatch = false; var score = 0; _Array$from(range(0, 4)).forEach(function() { if ($parent.length === 0) { return; } var parentData = makeSig($parent, ' '); // If we have 'page' or 'paging' in our data, that's a good // sign. Add a bonus. if (!positiveMatch && PAGE_RE.test(parentData)) { positiveMatch = true; score += 25; } // If we have 'comment' or something in our data, and // we don't have something like 'content' as well, that's // a bad sign. Give a penalty. if ( !negativeMatch && NEGATIVE_SCORE_RE.test(parentData) && EXTRANEOUS_LINK_HINTS_RE$1.test(parentData) ) { if (!POSITIVE_SCORE_RE.test(parentData)) { negativeMatch = true; score -= 25; } } $parent = $parent.parent(); }); return score; } function scorePrevLink(linkData) { // If the link has something like "previous", its definitely // an old link, skip it. 
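// A worked example of the similarity modifier defined in scoreSimilarity() above.
// With diffModifier = -(250 * (diffPercent - 0.2)), candidate links whose URL is
// less than 20% different from the article URL get a bonus and more-different links
// are penalized; note the code subtracts 0.2, so its break-even sits at 20%
// difference rather than the 10% described in its inline comment.
//  5% different: -(250 * (0.05 - 0.2)) = +37.5
// 20% different: -(250 * (0.20 - 0.2)) =   0
// 30% different: -(250 * (0.30 - 0.2)) = -25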
if (PREV_LINK_TEXT_RE$1.test(linkData)) { return -200; } return 0; } function shouldScore( href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls ) { // skip if we've already fetched this url if ( previousUrls.find(function(url) { return href === url; }) !== undefined ) { return false; } // If we've already parsed this URL, or the URL matches the base // URL, or is empty, skip it. if (!href || href === articleUrl || href === baseUrl) { return false; } var hostname = parsedUrl.hostname; var _URL$parse = URL.parse(href), linkHost = _URL$parse.hostname; // Domain mismatch. if (linkHost !== hostname) { return false; } // If href doesn't contain a digit after removing the base URL, // it's certainly not the next page. var fragment = href.replace(baseUrl, ''); if (!DIGIT_RE$2.test(fragment)) { return false; } // This link has extraneous content (like "comment") in its link // text, so we skip it. if (EXTRANEOUS_LINK_HINTS_RE$1.test(linkText)) { return false; } // Next page link text is never long, skip if it is too long. if (linkText.length > 25) { return false; } return true; } function scoreBaseUrl(href, baseRegex) { // If the baseUrl isn't part of this URL, penalize this // link. It could still be the link, but the odds are lower. // Example: // http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html if (!baseRegex.test(href)) { return -25; } return 0; } function scoreNextLinkText(linkData) { // Things like "next", ">>", etc. if (NEXT_LINK_TEXT_RE$1.test(linkData)) { return 50; } return 0; } function scoreCapLinks(linkData) { // Cap links are links like "last", etc. if (CAP_LINK_TEXT_RE$1.test(linkData)) { // If we found a link like "last", but we've already seen that // this link is also "next", it's fine. If it's not been // previously marked as "next", then it's probably bad. // Penalize. if (NEXT_LINK_TEXT_RE$1.test(linkData)) { return -65; } } return 0; } function makeBaseRegex(baseUrl) { return new RegExp('^'.concat(baseUrl), 'i'); } function makeSig$1($link, linkText) { return '' .concat(linkText || $link.text(), ' ') .concat($link.attr('class') || '', ' ') .concat($link.attr('id') || ''); } function scoreLinks(_ref) { var links = _ref.links, articleUrl = _ref.articleUrl, baseUrl = _ref.baseUrl, parsedUrl = _ref.parsedUrl, $ = _ref.$, _ref$previousUrls = _ref.previousUrls, previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls; parsedUrl = parsedUrl || URL.parse(articleUrl); var baseRegex = makeBaseRegex(baseUrl); var isWp = isWordpress($); // Loop through all links, looking for hints that they may be next-page // links. Things like having "page" in their textContent, className or // id, or being a child of a node with a page-y className or id. // // After we do that, assign each page a score, and pick the one that // looks most like the next page link, as long as its score is strong // enough to have decent confidence. var scoredPages = links.reduce(function(possiblePages, link) { // Remove any anchor data since we don't do a good job // standardizing URLs (it's hard), we're going to do // some checking with and without a trailing slash var attrs = getAttrs(link); // if href is undefined, return if (!attrs.href) return possiblePages; var href = removeAnchor(attrs.href); var $link = $(link); var linkText = $link.text(); if ( !shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls) ) { return possiblePages; } // ## PASSED THE FIRST-PASS TESTS. Start scoring. 
## if (!possiblePages[href]) { possiblePages[href] = { score: 0, linkText: linkText, href: href, }; } else { possiblePages[href].linkText = '' .concat(possiblePages[href].linkText, '|') .concat(linkText); } var possiblePage = possiblePages[href]; var linkData = makeSig$1($link, linkText); var pageNum = pageNumFromUrl(href); var score = scoreBaseUrl(href, baseRegex); score += scoreNextLinkText(linkData); score += scoreCapLinks(linkData); score += scorePrevLink(linkData); score += scoreByParents$1($link); score += scoreExtraneousLinks(href); score += scorePageInLink(pageNum, isWp); score += scoreLinkText(linkText, pageNum); score += scoreSimilarity(score, articleUrl, href); possiblePage.score = score; return possiblePages; }, {}); return _Reflect$ownKeys(scoredPages).length === 0 ? null : scoredPages; } // for multi-page articles var GenericNextPageUrlExtractor = { extract: function extract(_ref) { var $ = _ref.$, url = _ref.url, parsedUrl = _ref.parsedUrl, _ref$previousUrls = _ref.previousUrls, previousUrls = _ref$previousUrls === void 0 ? [] : _ref$previousUrls; parsedUrl = parsedUrl || URL.parse(url); var articleUrl = removeAnchor(url); var baseUrl = articleBaseUrl(url, parsedUrl); var links = $('a[href]').toArray(); var scoredLinks = scoreLinks({ links: links, articleUrl: articleUrl, baseUrl: baseUrl, parsedUrl: parsedUrl, $: $, previousUrls: previousUrls, }); // If no links were scored, return null if (!scoredLinks) return null; // now that we've scored all possible pages, // find the biggest one. var topPage = _Reflect$ownKeys(scoredLinks).reduce( function(acc, link) { var scoredLink = scoredLinks[link]; return scoredLink.score > acc.score ? scoredLink : acc; }, { score: -100, } ); // If the score is less than 50, we're not confident enough to use it, // so we fail. if (topPage.score >= 50) { return topPage.href; } return null; }, }; var CANONICAL_META_SELECTORS = ['og:url']; function parseDomain(url) { var parsedUrl = URL.parse(url); var hostname = parsedUrl.hostname; return hostname; } function result(url) { return { url: url, domain: parseDomain(url), }; } var GenericUrlExtractor = { extract: function extract(_ref) { var $ = _ref.$, url = _ref.url, metaCache = _ref.metaCache; var $canonical = $('link[rel=canonical]'); if ($canonical.length !== 0) { var href = $canonical.attr('href'); if (href) { return result(href); } } var metaUrl = extractFromMeta$$1($, CANONICAL_META_SELECTORS, metaCache); if (metaUrl) { return result(metaUrl); } return result(url); }, }; var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description']; function clean$2(content, $) { var maxLength = arguments.length > 2 && arguments[2] !== undefined ? 
arguments[2] : 200; content = content.replace(/[\s\n]+/g, ' ').trim(); return ellipsize(content, maxLength, { ellipse: '…', }); } var GenericExcerptExtractor = { extract: function extract(_ref) { var $ = _ref.$, content = _ref.content, metaCache = _ref.metaCache; var excerpt = extractFromMeta$$1($, EXCERPT_META_SELECTORS, metaCache); if (excerpt) { return clean$2(stripTags(excerpt, $)); } // Fall back to excerpting from the extracted content var maxLength = 200; var shortContent = content.slice(0, maxLength * 5); return clean$2($(shortContent).text(), $, maxLength); }, }; var GenericWordCountExtractor = { extract: function extract(_ref) { var content = _ref.content; var $ = cheerio.load(content); var $content = $('div').first(); var text = normalizeSpaces($content.text()); return text.split(/\s/).length; }, }; var GenericExtractor = { // This extractor is the default for all domains domain: '*', title: GenericTitleExtractor.extract, date_published: GenericDatePublishedExtractor.extract, author: GenericAuthorExtractor.extract, content: GenericContentExtractor.extract.bind(GenericContentExtractor), lead_image_url: GenericLeadImageUrlExtractor.extract, dek: GenericDekExtractor.extract, next_page_url: GenericNextPageUrlExtractor.extract, url_and_domain: GenericUrlExtractor.extract, excerpt: GenericExcerptExtractor.extract, word_count: GenericWordCountExtractor.extract, direction: function direction(_ref) { var title = _ref.title; return stringDirection.getDirection(title); }, extract: function extract(options) { var html = options.html, $ = options.$; if (html && !$) { var loaded = cheerio.load(html); options.$ = loaded; } var title = this.title(options); var date_published = this.date_published(options); var author = this.author(options); var content = this.content( _objectSpread({}, options, { title: title, }) ); var lead_image_url = this.lead_image_url( _objectSpread({}, options, { content: content, }) ); var dek = this.dek( _objectSpread({}, options, { content: content, }) ); var next_page_url = this.next_page_url(options); var excerpt = this.excerpt( _objectSpread({}, options, { content: content, }) ); var word_count = this.word_count( _objectSpread({}, options, { content: content, }) ); var direction = this.direction({ title: title, }); var _this$url_and_domain = this.url_and_domain(options), url = _this$url_and_domain.url, domain = _this$url_and_domain.domain; return { title: title, author: author, date_published: date_published || null, dek: dek, lead_image_url: lead_image_url, content: content, next_page_url: next_page_url, url: url, domain: domain, excerpt: excerpt, word_count: word_count, direction: direction, }; }, }; var Detectors = { 'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor, 'meta[name="generator"][value="blogger"]': BloggerExtractor, }; function detectByHtml($) { var selector = _Reflect$ownKeys(Detectors).find(function(s) { return $(s).length > 0; }); return Detectors[selector]; } function getExtractor(url, parsedUrl, $) { parsedUrl = parsedUrl || URL.parse(url); var _parsedUrl = parsedUrl, hostname = _parsedUrl.hostname; var baseDomain = hostname .split('.') .slice(-2) .join('.'); return ( Extractors[hostname] || Extractors[baseDomain] || detectByHtml($) || GenericExtractor ); } function cleanBySelectors($content, $, _ref) { var clean = _ref.clean; if (!clean) return $content; $(clean.join(','), $content).remove(); return $content; } // Transform matching elements function transformElements($content, $, _ref2) { var transforms = _ref2.transforms; if 
// Transform matching elements
function transformElements($content, $, _ref2) {
  var transforms = _ref2.transforms;
  if (!transforms) return $content;

  _Reflect$ownKeys(transforms).forEach(function(key) {
    var $matches = $(key, $content);
    var value = transforms[key];
    // If value is a string, convert directly
    if (typeof value === 'string') {
      $matches.each(function(index, node) {
        convertNodeTo$$1($(node), $, transforms[key]);
      });
    } else if (typeof value === 'function') {
      // If value is function, apply function to node
      $matches.each(function(index, node) {
        var result = value($(node), $);
        // If function returns a string, convert node to that value
        if (typeof result === 'string') {
          convertNodeTo$$1($(node), $, result);
        }
      });
    }
  });
  return $content;
}

function findMatchingSelector($, selectors, extractHtml) {
  return selectors.find(function(selector) {
    if (_Array$isArray(selector)) {
      if (extractHtml) {
        return selector.reduce(function(acc, s) {
          return acc && $(s).length > 0;
        }, true);
      }
      var _selector = _slicedToArray(selector, 2),
        s = _selector[0],
        attr = _selector[1];
      return (
        $(s).length === 1 &&
        $(s).attr(attr) &&
        $(s)
          .attr(attr)
          .trim() !== ''
      );
    }
    return (
      $(selector).length === 1 &&
      $(selector)
        .text()
        .trim() !== ''
    );
  });
}
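// Illustrative sketch (not registered anywhere): the extraction options that
// findMatchingSelector, transformElements and cleanBySelectors understand.
// The domain, selectors and field values below are hypothetical.
// eslint-disable-next-line no-unused-vars
var ExampleCustomExtractor = {
  domain: 'blog.example.com',
  title: {
    // a plain string selector must match exactly one non-empty element
    selectors: ['h1.post-title'],
  },
  author: {
    // a [selector, attr] pair extracts an attribute instead of text
    selectors: [['meta[name="author"]', 'value']],
  },
  content: {
    // when html is extracted, an inner array is a multi-match group:
    // every selector in it must be present for the group to be used
    selectors: [['.post-body', '.post-footnotes'], '.post-body'],
    // elements matching these selectors are removed before cleaning
    clean: ['.advertisement'],
    transforms: {
      // a string value converts the matched node to that tag
      h1: 'h2',
      // a function may mutate the node; returning a string also converts it
      'img[data-src]': function($node) {
        $node.attr('src', $node.attr('data-src'));
      },
    },
  },
};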
function select(opts) {
  var $ = opts.$,
    type = opts.type,
    extractionOpts = opts.extractionOpts,
    _opts$extractHtml = opts.extractHtml,
    extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml;
  // Skip if there's no extraction for this type
  if (!extractionOpts) return null;

  // If a string is hardcoded for a type (e.g., Wikipedia
  // contributors), return the string
  if (typeof extractionOpts === 'string') return extractionOpts;

  var selectors = extractionOpts.selectors,
    _extractionOpts$defau = extractionOpts.defaultCleaner,
    defaultCleaner =
      _extractionOpts$defau === void 0 ? true : _extractionOpts$defau;
  var matchingSelector = findMatchingSelector($, selectors, extractHtml);
  if (!matchingSelector) return null;

  // Declaring result; will contain either
  // text or html, which will be cleaned
  // by the appropriate cleaner type

  // If the selector type requests html as its return type
  // transform and clean the element with provided selectors
  var $content;
  if (extractHtml) {
    // If matching selector is an array, we're considering this a
    // multi-match selection, which allows the parser to choose several
    // selectors to include in the result. Note that all selectors in the
    // array must match in order for this selector to trigger
    if (_Array$isArray(matchingSelector)) {
      $content = $(matchingSelector.join(','));
      var $wrapper = $('<div></div>');
      $content.each(function(index, element) {
        $wrapper.append(element);
      });
      $content = $wrapper;
    } else {
      $content = $(matchingSelector);
    }

    // Wrap in div so transformation can take place on root element
    $content.wrap($('<div></div>'));
    $content = $content.parent();
    $content = transformElements($content, $, extractionOpts);
    $content = cleanBySelectors($content, $, extractionOpts);
    $content = Cleaners[type](
      $content,
      _objectSpread({}, opts, {
        defaultCleaner: defaultCleaner,
      })
    );
    return $.html($content);
  }

  var result;
  // if selector is an array (e.g., ['img', 'src']),
  // extract the attr
  if (_Array$isArray(matchingSelector)) {
    var _matchingSelector = _slicedToArray(matchingSelector, 2),
      selector = _matchingSelector[0],
      attr = _matchingSelector[1];
    result = $(selector)
      .attr(attr)
      .trim();
  } else {
    var $node = $(matchingSelector);
    $node = cleanBySelectors($node, $, extractionOpts);
    $node = transformElements($node, $, extractionOpts);
    result = $node.text().trim();
  }

  // Allow custom extractor to skip default cleaner
  // for this type; defaults to true
  if (defaultCleaner) {
    return Cleaners[type](result, _objectSpread({}, opts, extractionOpts));
  }
  return result;
}

function extractResult(opts) {
  var type = opts.type,
    extractor = opts.extractor,
    _opts$fallback = opts.fallback,
    fallback = _opts$fallback === void 0 ? true : _opts$fallback;
  var result = select(
    _objectSpread({}, opts, {
      extractionOpts: extractor[type],
    })
  );
  // If custom parser succeeds, return the result
  if (result) {
    return result;
  }
  // If nothing matches the selector, and fallback is enabled,
  // run the Generic extraction
  if (fallback) return GenericExtractor[type](opts);
  return null;
}
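// Illustrative sketch (not invoked by the bundle): extractResult above tries
// the custom extractor's selectors for a field and, because fallback defaults
// to true, falls back to the generic extractor when nothing matches. The
// arguments below are hypothetical.
// eslint-disable-next-line no-unused-vars
function exampleTitleExtraction($, url, customExtractor) {
  return extractResult({
    $: $,
    url: url,
    type: 'title',
    extractor: customExtractor,
    metaCache: [],
    fallback: true,
  });
}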
var RootExtractor = {
  extract: function extract() {
    var extractor =
      arguments.length > 0 && arguments[0] !== undefined
        ? arguments[0]
        : GenericExtractor;
    var opts = arguments.length > 1 ? arguments[1] : undefined;
    var _opts = opts,
      contentOnly = _opts.contentOnly,
      extractedTitle = _opts.extractedTitle;

    // This is the generic extractor. Run its extract method
    if (extractor.domain === '*') return extractor.extract(opts);

    opts = _objectSpread({}, opts, {
      extractor: extractor,
    });

    if (contentOnly) {
      var _content = extractResult(
        _objectSpread({}, opts, {
          type: 'content',
          extractHtml: true,
          title: extractedTitle,
        })
      );
      return {
        content: _content,
      };
    }

    var title = extractResult(
      _objectSpread({}, opts, {
        type: 'title',
      })
    );
    var date_published = extractResult(
      _objectSpread({}, opts, {
        type: 'date_published',
      })
    );
    var author = extractResult(
      _objectSpread({}, opts, {
        type: 'author',
      })
    );
    var next_page_url = extractResult(
      _objectSpread({}, opts, {
        type: 'next_page_url',
      })
    );
    var content = extractResult(
      _objectSpread({}, opts, {
        type: 'content',
        extractHtml: true,
        title: title,
      })
    );
    var lead_image_url = extractResult(
      _objectSpread({}, opts, {
        type: 'lead_image_url',
        content: content,
      })
    );
    var excerpt = extractResult(
      _objectSpread({}, opts, {
        type: 'excerpt',
        content: content,
      })
    );
    var dek = extractResult(
      _objectSpread({}, opts, {
        type: 'dek',
        content: content,
        excerpt: excerpt,
      })
    );
    var word_count = extractResult(
      _objectSpread({}, opts, {
        type: 'word_count',
        content: content,
      })
    );
    var direction = extractResult(
      _objectSpread({}, opts, {
        type: 'direction',
        title: title,
      })
    );
    var _ref3 = extractResult(
        _objectSpread({}, opts, {
          type: 'url_and_domain',
        })
      ) || {
        url: null,
        domain: null,
      },
      url = _ref3.url,
      domain = _ref3.domain;
    return {
      title: title,
      content: content,
      author: author,
      date_published: date_published,
      lead_image_url: lead_image_url,
      dek: dek,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction,
    };
  },
};
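// Illustrative sketch (not invoked by the bundle): the paging loop below runs
// RootExtractor in contentOnly mode, so only the content field is re-extracted
// for follow-on pages while the title from page one is passed through to the
// content cleaner. The parameters below are hypothetical.
// eslint-disable-next-line no-unused-vars
function exampleContentOnlyExtraction($, url, Extractor, firstPageTitle) {
  return RootExtractor.extract(Extractor, {
    $: $,
    url: url,
    html: $.html(),
    metaCache: [],
    contentOnly: true,
    extractedTitle: firstPageTitle,
    previousUrls: [],
  });
  // => { content: '...' }
}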
function collectAllPages(_x) {
  return _collectAllPages.apply(this, arguments);
}

function _collectAllPages() {
  _collectAllPages = _asyncToGenerator(
    /*#__PURE__*/ _regeneratorRuntime.mark(function _callee(_ref) {
      var next_page_url,
        html,
        $,
        metaCache,
        result,
        Extractor,
        title,
        url,
        pages,
        previousUrls,
        extractorOpts,
        nextPageResult,
        word_count;
      return _regeneratorRuntime.wrap(
        function _callee$(_context) {
          while (1) {
            switch ((_context.prev = _context.next)) {
              case 0:
                (next_page_url = _ref.next_page_url),
                  (html = _ref.html),
                  ($ = _ref.$),
                  (metaCache = _ref.metaCache),
                  (result = _ref.result),
                  (Extractor = _ref.Extractor),
                  (title = _ref.title),
                  (url = _ref.url);

                // At this point, we've fetched just the first page
                pages = 1;
                previousUrls = [removeAnchor(url)];

              // If we've gone over 26 pages, something has
              // likely gone wrong.
              case 3:
                if (!(next_page_url && pages < 26)) {
                  _context.next = 16;
                  break;
                }
                pages += 1;
                // eslint-disable-next-line no-await-in-loop
                _context.next = 7;
                return Resource.create(next_page_url);

              case 7:
                $ = _context.sent;
                html = $.html();
                extractorOpts = {
                  url: next_page_url,
                  html: html,
                  $: $,
                  metaCache: metaCache,
                  contentOnly: true,
                  extractedTitle: title,
                  previousUrls: previousUrls,
                };
                nextPageResult = RootExtractor.extract(
                  Extractor,
                  extractorOpts
                );
                previousUrls.push(next_page_url);
                result = _objectSpread({}, result, {
                  content: ''
                    .concat(result.content, '<hr><h4>Page ')
                    .concat(pages, '</h4>')
                    .concat(nextPageResult.content),
                });
                // eslint-disable-next-line prefer-destructuring
                next_page_url = nextPageResult.next_page_url;
                _context.next = 3;
                break;

              case 16:
                word_count = GenericExtractor.word_count({
                  content: '<div>'.concat(result.content, '</div>'),
                });
                return _context.abrupt(
                  'return',
                  _objectSpread({}, result, {
                    total_pages: pages,
                    pages_rendered: pages,
                    word_count: word_count,
                  })
                );

              case 18:
              case 'end':
                return _context.stop();
            }
          }
        },
        _callee,
        this
      );
    })
  );
  return _collectAllPages.apply(this, arguments);
}
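// Illustrative sketch (not invoked by the bundle): each additional page is
// stitched onto the running result with an <hr> and a page heading, mirroring
// the concatenation in the loop above. The arguments below are hypothetical.
// eslint-disable-next-line no-unused-vars
function exampleAppendPage(result, nextPageResult, pageNumber) {
  return _objectSpread({}, result, {
    content: ''
      .concat(result.content, '<hr><h4>Page ')
      .concat(pageNumber, '</h4>')
      .concat(nextPageResult.content),
  });
}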
var Mercury = {
  parse: function parse(url, html) {
    var opts =
      arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
    return _asyncToGenerator(
      /*#__PURE__*/ _regeneratorRuntime.mark(function _callee() {
        var _opts$fetchAllPages,
          fetchAllPages,
          _opts$fallback,
          fallback,
          parsedUrl,
          $,
          Extractor,
          metaCache,
          result,
          _result,
          title,
          next_page_url;
        return _regeneratorRuntime.wrap(
          function _callee$(_context) {
            while (1) {
              switch ((_context.prev = _context.next)) {
                case 0:
                  (_opts$fetchAllPages = opts.fetchAllPages),
                    (fetchAllPages =
                      _opts$fetchAllPages === void 0
                        ? true
                        : _opts$fetchAllPages),
                    (_opts$fallback = opts.fallback),
                    (fallback =
                      _opts$fallback === void 0 ? true : _opts$fallback);

                  // if no url was passed and this is the browser version,
                  // set url to window.location.href and load the html
                  // from the current page
                  if (!url && cheerio.browser) {
                    url = window.location.href; // eslint-disable-line no-undef
                    html = html || cheerio.html();
                  }
                  parsedUrl = URL.parse(url);
                  if (validateUrl(parsedUrl)) {
                    _context.next = 5;
                    break;
                  }
                  return _context.abrupt('return', Errors.badUrl);

                case 5:
                  _context.next = 7;
                  return Resource.create(url, html, parsedUrl);

                case 7:
                  $ = _context.sent;
                  Extractor = getExtractor(url, parsedUrl, $);
                  // console.log(`Using extractor for ${Extractor.domain}`);

                  // If we found an error creating the resource, return that error
                  if (!$.failed) {
                    _context.next = 11;
                    break;
                  }
                  return _context.abrupt('return', $);

                case 11:
                  // if html still has not been set (i.e., url passed to Mercury.parse),
                  // set html from the response of Resource.create
                  if (!html) {
                    html = $.html();
                  }

                  // Cached value of every meta name in our document.
                  // Used when extracting title/author/date_published/dek
                  metaCache = $('meta')
                    .map(function(_, node) {
                      return $(node).attr('name');
                    })
                    .toArray();
                  result = RootExtractor.extract(Extractor, {
                    url: url,
                    html: html,
                    $: $,
                    metaCache: metaCache,
                    parsedUrl: parsedUrl,
                    fallback: fallback,
                  });
                  (_result = result),
                    (title = _result.title),
                    (next_page_url = _result.next_page_url);

                  // Fetch more pages if next_page_url found
                  if (!(fetchAllPages && next_page_url)) {
                    _context.next = 21;
                    break;
                  }
                  _context.next = 18;
                  return collectAllPages({
                    Extractor: Extractor,
                    next_page_url: next_page_url,
                    html: html,
                    $: $,
                    metaCache: metaCache,
                    result: result,
                    title: title,
                    url: url,
                  });

                case 18:
                  result = _context.sent;
                  _context.next = 22;
                  break;

                case 21:
                  result = _objectSpread({}, result, {
                    total_pages: 1,
                    rendered_pages: 1,
                  });

                case 22:
                  return _context.abrupt('return', result);

                case 23:
                case 'end':
                  return _context.stop();
              }
            }
          },
          _callee,
          this
        );
      })
    )();
  },
  browser: !!cheerio.browser,
  // A convenience method for getting a resource
  // to work with, e.g., for custom extractor generator
  fetchResource: function fetchResource(url) {
    return Resource.create(url);
  },
};

module.exports = Mercury;
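// Example usage (illustrative sketch; the URL and option values below are
// hypothetical). Mercury.parse returns a promise for the result object
// assembled above; a pre-fetched HTML string may be passed as the second
// argument.
// eslint-disable-next-line no-unused-vars
function exampleParse() {
  return Mercury.parse('https://example.com/article', null, {
    fetchAllPages: true,
    fallback: true,
  }).then(function(result) {
    return ''.concat(result.title, ' (').concat(result.word_count, ' words)');
  });
}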