varmercury=createCommonjsModule(function(module){'use strict';function_interopDefault(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault(regenerator);var_extends$$=_interopDefault(_extends);var_asyncToGenerator=_interopDefault(asyncToGenerator);varURL$$=_interopDefault(URL);varbabelPolyfill$$=babelPolyfill;varcheerio$$=_interopDefault(cheerio);var_Promise=_interopDefault(promise);varrequest$$=_interopDefault(request);var_Reflect$ownKeys$$=_interopDefault(_Reflect$ownKeys);varstringDirection$$=_interopDefault(stringDirection);var_getIterator$$=_interopDefault(_getIterator);var_defineProperty=_interopDefault(defineProperty);var_slicedToArray$$=_interopDefault(_slicedToArray);var_typeof$$=_interopDefault(_typeof);varvalidUrl$$=_interopDefault(validUrl);varmoment$$=_interopDefault(moment);varwuzzy$$=_interopDefault(wuzzy);vardifflib$$=_interopDefault(difflib);var_Array$from=_interopDefault(from);varellipsize$$=_interopDefault(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length<=0||arguments[0]===undefined?1:arguments[0];varend=arguments.length<=1||arguments[1]===undefined?1:arguments[1];return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
varmercury=createCommonjsModule(function(module){'use strict';function_interopDefault(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault(regenerator);var_extends$$=_interopDefault(_extends);var_asyncToGenerator=_interopDefault(asyncToGenerator);varURL$$=_interopDefault(URL);var cheerio$$=_interopDefault(cheerio);var_Promise=_interopDefault(promise);varrequest$$=_interopDefault(request);var_Reflect$ownKeys$$=_interopDefault(_Reflect$ownKeys);varstringDirection$$=_interopDefault(stringDirection);var_getIterator$$=_interopDefault(_getIterator);var_defineProperty=_interopDefault(defineProperty);var_slicedToArray$$=_interopDefault(_slicedToArray);var_typeof$$=_interopDefault(_typeof);varvalidUrl$$=_interopDefault(validUrl);varmoment$$=_interopDefault(moment);varwuzzy$$=_interopDefault(wuzzy);vardifflib$$=_interopDefault(difflib);var_Array$from=_interopDefault(from);varellipsize$$=_interopDefault(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length>0&&arguments[0]!==undefined?arguments[0]:1;varend=arguments.length>1&&arguments[1]!==undefined?arguments[1]:1;return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
// Cheap first-pass URL check.
// :param _ref: a parsed URL object; only its `hostname` property is read
// :returns: true when a non-empty hostname is present, false otherwise
function validateUrl(_ref) {
  // A missing argument or a missing hostname both count as "not valid";
  // the caller decides what error to surface.
  var hostname = _ref ? _ref.hostname : undefined;
  return !!hostname;
}

// Error payloads, keyed by failure kind.
var Errors = {
  badUrl: {
    error: true,
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
  }
};

// Request headers (a fixed User-Agent string).
var REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
};

// The number of milliseconds to attempt to fetch a resource before timing out.
var FETCH_TIMEOUT = 10000;
// Content types that we do not extract content from
@ -41,7 +40,7 @@ var MAX_CONTENT_LENGTH=5242880;function get(options){return new _Promise(functio
// This does not validate in the sense of a response being 200 level or
// not. Validation here means that we haven't found reason to bail from
// further processing of this url.
// Decide whether a fetched response is worth processing further.
// :param response: response object; reads statusMessage, statusCode, error
//                  and the content-type / content-length headers
// :param parseNon2xx: (optional, default false) when true, a response that
//                     carries a non-2xx status code is still accepted
// :returns: true when nothing disqualifies the response
// :throws Error: when the fetch failed, the status is rejected, the
//                content type is blacklisted, or the body is too large
function validateResponse(response) {
  var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;

  // Check if we got a valid status code.
  // The reason phrase alone is not reliable (it may be empty or something
  // other than "OK" on a perfectly good 2xx reply), so any 2xx status code
  // is accepted in addition to the historical statusMessage === 'OK' check.
  var statusCode = response.statusCode;
  var isSuccess = response.statusMessage === 'OK' || (statusCode >= 200 && statusCode < 300);

  if (!isSuccess) {
    if (!statusCode) {
      // No status code at all: the request itself failed.
      throw new Error('Unable to fetch content. Original exception was ' + response.error);
    } else if (!parseNon2xx) {
      throw new Error('Resource returned a response status code of ' + statusCode + ' and resource was instructed to reject non-2xx level status codes.');
    }
  }

  var _response$headers = response.headers;
  var contentType = _response$headers['content-type'];
  var contentLength = _response$headers['content-length'];

  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error('Content-type for this resource was ' + contentType + ' and is not allowed.');
  }

  // Check that the content length is below maximum
  // (a string header value is coerced to a number by the comparison).
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error('Content for this resource was too large. Maximum content length is ' + MAX_CONTENT_LENGTH + '.');
  }

  return true;
}
// Set our response attribute to the result of fetching our URL.
@ -267,7 +266,7 @@ paragraphize(element,$,true);}});return $;}// Given a node, turn it into a P if
// :param node: The node to paragraphize; this is a raw node
// :param $: The cheerio object to handle dom manipulation
// :param br: Whether or not the passed node is a br
// When `br` is true, gather the nodes that follow `node` into a new <p>
// and put that <p> in place of `node`. When `br` is false nothing is changed.
// :returns: the `$` that was passed in
function paragraphize(node, $) {
  var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
  var $node = $(node);

  if (br) {
    var sibling = node.nextSibling;
    var p = $('<p></p>');

    // while the next node is text or not a block level element
    // append it to a new p node
    while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
      // Remember the follower first: appendTo moves `sibling` out of
      // its current position.
      var nextSibling = sibling.nextSibling;
      $(sibling).appendTo(p);
      sibling = nextSibling;
    }

    $node.replaceWith(p);
    $node.remove();
    return $;
  }

  return $;
}

// Convert every <div> that has no children matching DIV_TO_P_BLOCK_TAGS
// into a <p>.
// :returns: the `$` that was passed in
function convertDivs($) {
  $('div').each(function (index, div) {
    var $div = $(div);
    var convertable = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0;
    if (convertable) {
      convertNodeTo($div, $, 'p');
    }
  });
  return $;
}

// Convert every <span> that is not nested inside a <p> or <div> into a <p>.
// :returns: the `$` that was passed in
function convertSpans($) {
  $('span').each(function (index, span) {
    var $span = $(span);
    var convertable = $span.parents('p, div').length === 0;
    if (convertable) {
      convertNodeTo($span, $, 'p');
    }
  });
  return $;
}
// Loop through the provided doc, and convert any p-like elements to
// (By-reference mutation, though. Returned just for convenience.)
// Run the three paragraph conversions (brs, divs, spans) in order.
// :returns: the `$` produced by the last conversion
function convertToParagraphs($) {
  $ = brsToPs($);
  $ = convertDivs($);
  $ = convertSpans($);
  return $;
}

// Replace `$node` with an element named `tag` that carries the same
// attributes and contents.
// :param $node: wrapped node to convert
// :param $: returned unchanged, for chaining
// :param tag: (optional, default 'p') name of the replacement element
// :returns: the `$` that was passed in
function convertNodeTo($node, $) {
  var tag = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';
  var node = $node.get(0);
  if (!node) {
    return $;
  }

  var attribs = node.attribs || {};

  // Attribute values must be quoted and escaped: an unquoted value such as
  // class=a b would be re-parsed as class="a" plus a stray attribute "b".
  var escapeAttr = function escapeAttr(value) {
    return String(value).replace(/&/g, '&amp;').replace(/"/g, '&quot;');
  };
  var attribString = _Reflect$ownKeys$$(attribs).map(function (key) {
    return key + '="' + escapeAttr(attribs[key]) + '"';
  }).join(' ');

  $node.replaceWith('<' + tag + ' ' + attribString + '>' + $node.contents() + '</' + tag + '>');
  return $;
}

// Drop tiny images and strip explicit heights from the rest.
// :returns: the `$` that was passed in
function cleanForHeight($img, $) {
  var height = parseInt($img.attr('height'), 10);
  // A missing or unparsable width is treated as 20, i.e. "not tiny".
  var width = parseInt($img.attr('width'), 10) || 20;

  // Remove images that explicitly have very small heights or
  // widths, because they are most likely shims or icons,
  // which aren't very useful for reading.
  if ((height || 20) < 10 || width < 10) {
    $img.remove();
  } else if (height) {
    // Don't ever specify a height on images, so that the image can
    // keep its aspect ratio.
    $img.removeAttr('height');
  }

  return $;
}
// Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
// Remove `$img` when its src attribute matches SPACER_RE.
// :returns: the `$` that was passed in
function removeSpacers($img, $) {
  if (SPACER_RE.test($img.attr('src'))) {
    $img.remove();
  }
  return $;
}

// Run cleanForHeight and then removeSpacers over every <img> in `$article`.
// :returns: the `$` that was passed in
function cleanImages($article, $) {
  $article.find('img').each(function (index, img) {
    var $img = $(img);
    cleanForHeight($img, $);
    removeSpacers($img, $);
  });
  return $;
}

// Remove every element inside `article` that matches one of `tags`.
// :param tags: (optional) list of selectors; an omitted or empty list
//              falls back to STRIP_OUTPUT_TAGS
// :returns: the `$` that was passed in
function stripJunkTags(article, $) {
  var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];
  if (tags.length === 0) {
    tags = STRIP_OUTPUT_TAGS;
  }
  $(tags.join(','), article).remove();
  return $;
}
// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
@ -364,7 +363,7 @@ if(READABILITY_ASSET$1.test(classes)){score+=25;}}return score;}// returns the s
// the node's score attribute
// returns null if no score set
// Read the numeric `score` attribute of `$node`.
// :returns: the parsed score, or null when the attribute is missing,
//           unparsable, or parses to 0 (a stored 0 is indistinguishable
//           from "no score" here)
function getScore($node) {
  return parseFloat($node.attr('score')) || null;
}
// return 1 for every comma in text
functionscoreCommas(text){return(text.match(/,/g)||[]).length;}varidkRe=newRegExp('^(p|pre)$','i');functionscoreLength(textLength){vartagName=arguments.length<=1||arguments[1]===undefined?'p':arguments[1];varchunks=textLength/50;if(chunks>0){varlengthBonus=void0;// No idea why p or pre are being tamped down here
functionscoreCommas(text){return(text.match(/,/g)||[]).length;}varidkRe=newRegExp('^(p|pre)$','i');functionscoreLength(textLength){vartagName=arguments.length>1&&arguments[1]!==undefined?arguments[1]:'p';varchunks=textLength/50;if(chunks>0){varlengthBonus=void0;// No idea why p or pre are being tamped down here
// but just following the source for now
// Not even sure why tagName is included here,
// since this is only being called from the context
// Pass a quarter of `score` on to the parent of `node` via addScore.
// :returns: `node`, unchanged
function addToParent(node, $, score) {
  var parent = node.parent();
  if (parent) {
    addScore(parent, $, score * 0.25);
  }
  return node;
}
// gets and returns the score if it exists
// if not, initializes a score based on
// the node's tag type
// Return the score already stored on `$node`; otherwise compute one.
// :param weightNodes: (optional, default true) add getWeight($node) to the
//                     freshly computed score
// :returns: the existing score, or the newly computed one
function getOrInitScore($node, $) {
  var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;

  var score = getScore($node);
  if (score) {
    return score;
  }

  score = scoreNode($node);
  if (weightNodes) {
    score += getWeight($node);
  }

  // The parent is only credited when the score is computed here,
  // not when an existing score is returned above.
  addToParent($node, $, score);
  return score;
}
// Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag.
functionscoreNode($node){var_$node$get=$node.get(0);vartagName=_$node$get.tagName;// TODO: Consider ordering by most likely.
// E.g., if divs are a more common tag on a page,
@ -396,7 +395,7 @@ var $node=$(node);$node=setScore($node,$,getOrInitScore($node,$,weightNodes));va
// grandparent
addScoreTo($parent.parent(),$,rawScore/2,weightNodes);}});return$;}// score content. Parents get the full value of their children's
// content score, grandparents half
functionscoreContent($){varweightNodes=arguments.length<=1||arguments[1]===undefined?true:arguments[1];// First, look for special hNews based selectors and give them a big
functionscoreContent($){varweightNodes=arguments.length>1&&arguments[1]!==undefined?arguments[1]:true;// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS$1.forEach(function(_ref){var_ref2=_slicedToArray$$(_ref,2);varparentSelector=_ref2[0];varchildSelector=_ref2[1];$(parentSelector+' '+childSelector).each(function(index,node){addScore($(node).parent(parentSelector),$,80);});});// Doubling this again
// Previous solution caused a bug
@ -487,7 +486,7 @@ if(scriptCount>0&&contentLength<150){$node.remove();return;}}}// Given an articl
// Return this same doc.
functioncleanTags($article,$){$(CLEAN_CONDITIONALLY_TAGS,$article).each(function(index,node){var$node=$(node);varweight=getScore($node);if(!weight){weight=getOrInitScore($node,$);setScore($node,$,weight);}// drop node if its weight is < 0
if(weight<0){$node.remove();}else{// deteremine if node seems like content
removeUnlessContent($node,$,weight);}});return$;}functioncleanHeaders($article,$){vartitle=arguments.length<=2||arguments[2]===undefined?'':arguments[2];$(HEADER_TAG_LIST,$article).each(function(index,header){var$header=$(header);// Remove any headers that appear before all other p tags in the
removeUnlessContent($node,$,weight);}});return$;}functioncleanHeaders($article,$){vartitle=arguments.length>2&&arguments[2]!==undefined?arguments[2]:'';$(HEADER_TAG_LIST,$article).each(function(index,header){var$header=$(header);// Remove any headers that appear before all other p tags in the
// document. This probably means that it was part of the title, a
// subtitle or something else extraneous like a datestamp or byline,
// all of which should be handled by other metadata handling.
// Ratio of link text to all text inside `$node`, both measured
// with textLength.
// :returns: linkLength / totalLength; 1 when there is link text but no
//           measurable total text; 0 when neither is present
function linkDensity($node) {
  var totalTextLength = textLength($node.text());
  var linkText = $node.find('a').text();
  var linkLength = textLength(linkText);

  if (totalTextLength > 0) {
    return linkLength / totalTextLength;
  } else if (totalTextLength === 0 && linkLength > 0) {
    return 1;
  }

  return 0;
}
// Given a node type to search for, and a list of meta tag names to
// search for, find a meta tag associated.
functionextractFromMeta($,metaNames,cachedNames){varcleanTags=arguments.length<=3||arguments[3]===undefined?true:arguments[3];varfoundNames=metaNames.filter(function(name){returncachedNames.indexOf(name)!==-1;});var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{var_loop=function_loop(){varname=_step.value;vartype='name';varvalue='value';varnodes=$('meta['+type+'="'+name+'"]');// Get the unique value of every matching node, in case there
functionextractFromMeta($,metaNames,cachedNames){varcleanTags=arguments.length>3&&arguments[3]!==undefined?arguments[3]:true;varfoundNames=metaNames.filter(function(name){returncachedNames.indexOf(name)!==-1;});var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{var_loop=function_loop(){varname=_step.value;vartype='name';varvalue='value';varnodes=$('meta['+type+'="'+name+'"]');// Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
varvalues=nodes.map(function(index,node){return$(node).attr(value);}).toArray().filter(function(text){returntext!=='';});// If we have more than one value for the same name, we have a
@ -520,7 +519,7 @@ if($node.children().length>maxChildren){return false;}// If it looks to be withi
if(withinComment($node)){returnfalse;}returntrue;}// Given a a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
// Try each selector in turn and return the content of the first one that
// matches exactly one node accepted by isGoodNode.
// :param $: document query function
// :param selectors: iterable of selector strings, tried in order
// :param maxChildren: (optional, default 1) forwarded to isGoodNode
// :param textOnly: (optional, default true) return .text() when true,
//                  .html() when false
// :returns: the first non-empty content found, or null
function extractFromSelectors($, selectors) {
  var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
  var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;

  // Manual iterator-protocol loop (transpiled for...of): the flags below
  // let the finally block close the iterator on an early exit and rethrow
  // any error raised while iterating.
  var _iteratorNormalCompletion = true;
  var _didIteratorError = false;
  var _iteratorError = undefined;

  try {
    for (var _iterator = _getIterator$$(selectors), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) {
      var selector = _step.value;
      var nodes = $(selector);

      // If we didn't get exactly one of this selector, this may be
      // a list of articles or comments. Skip it.
      if (nodes.length === 1) {
        var $node = $(nodes[0]);

        if (isGoodNode($node, maxChildren)) {
          var content = void 0;
          if (textOnly) {
            content = $node.text();
          } else {
            content = $node.html();
          }

          if (content) {
            return content;
          }
        }
      }
    }
  } catch (err) {
    _didIteratorError = true;
    _iteratorError = err;
  } finally {
    try {
      if (!_iteratorNormalCompletion && _iterator.return) {
        _iterator.return();
      }
    } finally {
      if (_didIteratorError) {
        throw _iteratorError;
      }
    }
  }

  return null;
}
// strips all tags from a string of text
functionstripTags(text,$){// Wrapping text in html element prevents errors when text
// accurate. Not the end of the world if it doesn't strip right.
var_URL$parse=URL$$.parse(url);varhost=_URL$parse.host;varnakedDomain=host.replace(DOMAIN_ENDINGS_RE,'');varstartSlug=splitTitle[0].toLowerCase().replace(' ','');varstartSlugRatio=wuzzy$$.levenshtein(startSlug,nakedDomain);if(startSlugRatio>0.4&&startSlug.length>5){returnsplitTitle.slice(2).join('');}varendSlug=splitTitle.slice(-1)[0].toLowerCase().replace(' ','');varendSlugRatio=wuzzy$$.levenshtein(endSlug,nakedDomain);if(endSlugRatio>0.4&&endSlug.length>=5){returnsplitTitle.slice(0,-2).join('');}returnnull;}// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
functionresolveSplitTitle(title){varurl=arguments.length<=1||arguments[1]===undefined?'':arguments[1];// Splits while preserving splitters, like:
functionresolveSplitTitle(title){varurl=arguments.length>1&&arguments[1]!==undefined?arguments[1]:'';// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
varsplitTitle=title.split(TITLE_SPLITTERS_RE);if(splitTitle.length===1){returntitle;}varnewTitle=extractBreadcrumbTitle(splitTitle,title);if(newTitle)returnnewTitle;newTitle=cleanDomainFromTitle(splitTitle,url);if(newTitle)returnnewTitle;// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
@ -832,7 +831,7 @@ if(!scoredLinks)return null;// now that we've scored all possible pages,
// find the biggest one.
vartopPage=_Reflect$ownKeys$$(scoredLinks).reduce(function(acc,link){varscoredLink=scoredLinks[link];returnscoredLink.score>acc.score?scoredLink:acc;},{score:-100});// If the score is less than 50, we're not confident enough to use it,
// so we fail.
if(topPage.score>=50){returntopPage.href;}returnnull;}};varCANONICAL_META_SELECTORS=['og:url'];functionparseDomain(url){varparsedUrl=URL$$.parse(url);varhostname=parsedUrl.hostname;returnhostname;}functionresult(url){return{url:url,domain:parseDomain(url)};}varGenericUrlExtractor={extract:functionextract(_ref){var$=_ref.$;varurl=_ref.url;varmetaCache=_ref.metaCache;var$canonical=$('link[rel=canonical]');if($canonical.length!==0){varhref=$canonical.attr('href');if(href){returnresult(href);}}varmetaUrl=extractFromMeta($,CANONICAL_META_SELECTORS,metaCache);if(metaUrl){returnresult(metaUrl);}returnresult(url);}};varEXCERPT_META_SELECTORS=['og:description','twitter:description'];functionclean$2(content,$){varmaxLength=arguments.length<=2||arguments[2]===undefined?200:arguments[2];content=content.replace(/[\s\n]+/g,' ').trim();returnellipsize$$(content,maxLength,{ellipse:'…'});}varGenericExcerptExtractor={extract:functionextract(_ref){var$=_ref.$;varcontent=_ref.content;varmetaCache=_ref.metaCache;varexcerpt=extractFromMeta($,EXCERPT_META_SELECTORS,metaCache);if(excerpt){returnclean$2(stripTags(excerpt,$));}// Fall back to excerpting from the extracted content
if(topPage.score>=50){returntopPage.href;}returnnull;}};varCANONICAL_META_SELECTORS=['og:url'];functionparseDomain(url){varparsedUrl=URL$$.parse(url);varhostname=parsedUrl.hostname;returnhostname;}functionresult(url){return{url:url,domain:parseDomain(url)};}varGenericUrlExtractor={extract:functionextract(_ref){var$=_ref.$;varurl=_ref.url;varmetaCache=_ref.metaCache;var$canonical=$('link[rel=canonical]');if($canonical.length!==0){varhref=$canonical.attr('href');if(href){returnresult(href);}}varmetaUrl=extractFromMeta($,CANONICAL_META_SELECTORS,metaCache);if(metaUrl){returnresult(metaUrl);}returnresult(url);}};varEXCERPT_META_SELECTORS=['og:description','twitter:description'];functionclean$2(content,$){varmaxLength=arguments.length>2&&arguments[2]!==undefined?arguments[2]:200;content=content.replace(/[\s\n]+/g,' ').trim();returnellipsize$$(content,maxLength,{ellipse:'…'});}varGenericExcerptExtractor={extract:functionextract(_ref){var$=_ref.$;varcontent=_ref.content;varmetaCache=_ref.metaCache;varexcerpt=extractFromMeta($,EXCERPT_META_SELECTORS,metaCache);if(excerpt){returnclean$2(stripTags(excerpt,$));}// Fall back to excerpting from the extracted content
varmaxLength=200;varshortContent=content.slice(0,maxLength*5);returnclean$2($(shortContent).text(),$,maxLength);}};varGenericWordCountExtractor={extract:functionextract(_ref){varcontent=_ref.content;var$=cheerio$$.load(content);vartext=normalizeSpaces($('div').first().text());returntext.split(/\s/).length;}};varGenericExtractor={// This extractor is the default for all domains
domain:'*',title:GenericTitleExtractor.extract,date_published:GenericDatePublishedExtractor.extract,author:GenericAuthorExtractor.extract,content:GenericContentExtractor.extract.bind(GenericContentExtractor),lead_image_url:GenericLeadImageUrlExtractor.extract,dek:GenericDekExtractor.extract,next_page_url:GenericNextPageUrlExtractor.extract,url_and_domain:GenericUrlExtractor.extract,excerpt:GenericExcerptExtractor.extract,word_count:GenericWordCountExtractor.extract,direction:functiondirection(_ref){vartitle=_ref.title;returnstringDirection$$.getDirection(title);},extract:functionextract(options){varhtml=options.html;if(html){var$=cheerio$$.load(html);options.$=$;}vartitle=this.title(options);vardate_published=this.date_published(options);varauthor=this.author(options);varcontent=this.content(_extends$$({},options,{title:title}));varlead_image_url=this.lead_image_url(_extends$$({},options,{content:content}));vardek=this.dek(_extends$$({},options,{content:content}));varnext_page_url=this.next_page_url(options);varexcerpt=this.excerpt(_extends$$({},options,{content:content}));varword_count=this.word_count(_extends$$({},options,{content:content}));vardirection=this.direction({title:title});var_url_and_domain=this.url_and_domain(options);varurl=_url_and_domain.url;vardomain=_url_and_domain.domain;return{title:title,author:author,date_published:date_published||null,dek:dek,lead_image_url:lead_image_url,content:content,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};functiongetExtractor(url,parsedUrl){parsedUrl=parsedUrl||URL$$.parse(url);var_parsedUrl=parsedUrl;varhostname=_parsedUrl.hostname;varbaseDomain=hostname.split('.').slice(-2).join('.');returnExtractors[hostname]||Extractors[baseDomain]||GenericExtractor;}// Remove elements by an array of selectors
// Remove elements by an array of selectors.
// :param $content: context the selectors are evaluated in
// :param $: document query function
// :param _ref: options object; only its `clean` selector list is read
// :returns: `$content`
function cleanBySelectors($content, $, _ref) {
  var clean = _ref.clean;

  // Nothing to remove: no list at all, or an empty one (which would
  // otherwise produce an empty selector string).
  if (!clean || clean.length === 0) return $content;

  $(clean.join(','), $content).remove();
  return $content;
}
// Transform matching elements
if(defaultCleaner){returnCleaners[type](result,opts);}returnresult;}functionextractResult(opts){vartype=opts.type;varextractor=opts.extractor;var_opts$fallback=opts.fallback;varfallback=_opts$fallback===undefined?true:_opts$fallback;varresult=select(_extends$$({},opts,{extractionOpts:extractor[type]}));// If custom parser succeeds, return the result
if(result){returnresult;}// If nothing matches the selector, and fallback is enabled,
// run the Generic extraction
if(fallback)returnGenericExtractor[type](opts);returnnull;}varRootExtractor={extract:functionextract(){varextractor=arguments.length<=0||arguments[0]===undefined?GenericExtractor:arguments[0];varopts=arguments[1];var_opts=opts;varcontentOnly=_opts.contentOnly;varextractedTitle=_opts.extractedTitle;// This is the generic extractor. Run its extract method
if(fallback)returnGenericExtractor[type](opts);returnnull;}varRootExtractor={extract:functionextract(){varextractor=arguments.length>0&&arguments[0]!==undefined?arguments[0]:GenericExtractor;varopts=arguments[1];var_opts=opts;varcontentOnly=_opts.contentOnly;varextractedTitle=_opts.extractedTitle;// This is the generic extractor. Run its extract method
if(extractor.domain==='*')returnextractor.extract(opts);opts=_extends$$({},opts,{extractor:extractor});if(contentOnly){var_content=extractResult(_extends$$({},opts,{type:'content',extractHtml:true,title:extractedTitle}));return{content:_content};}vartitle=extractResult(_extends$$({},opts,{type:'title'}));vardate_published=extractResult(_extends$$({},opts,{type:'date_published'}));varauthor=extractResult(_extends$$({},opts,{type:'author'}));varnext_page_url=extractResult(_extends$$({},opts,{type:'next_page_url'}));varcontent=extractResult(_extends$$({},opts,{type:'content',extractHtml:true,title:title}));varlead_image_url=extractResult(_extends$$({},opts,{type:'lead_image_url',content:content}));vardek=extractResult(_extends$$({},opts,{type:'dek',content:content}));varexcerpt=extractResult(_extends$$({},opts,{type:'excerpt',content:content}));varword_count=extractResult(_extends$$({},opts,{type:'word_count',content:content}));vardirection=extractResult(_extends$$({},opts,{type:'direction',title:title}));var_ref3=extractResult(_extends$$({},opts,{type:'url_and_domain'}))||{url:null,domain:null};varurl=_ref3.url;vardomain=_ref3.domain;return{title:title,content:content,author:author,date_published:date_published,lead_image_url:lead_image_url,dek:dek,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};varcollectAllPages=function(){var_ref=_asyncToGenerator(_regeneratorRuntime.mark(function_callee(_ref2){varnext_page_url=_ref2.next_page_url;varhtml=_ref2.html;var$=_ref2.$;varmetaCache=_ref2.metaCache;varresult=_ref2.result;varExtractor=_ref2.Extractor;vartitle=_ref2.title;varurl=_ref2.url;varpages,previousUrls,extractorOpts,nextPageResult,word_count;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:// At this point, we've fetched just the first page
pages=1;previousUrls=[removeAnchor(url)];// If we've gone over 26 pages, something has
_context.next=10;returnResource.create(url,html,parsedUrl);case10:$=_context.sent;if(!$.error){_context.next=13;break;}return_context.abrupt('return',$);case13:html=$.html();// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
metaCache=$('meta').map(function(_,node){return$(node).attr('name');}).toArray();result=RootExtractor.extract(Extractor,{url:url,html:html,$:$,metaCache:metaCache,parsedUrl:parsedUrl,fallback:fallback});_result=result;title=_result.title;next_page_url=_result.next_page_url;// Fetch more pages if next_page_url found
@ -974,7 +973,7 @@ var candidatesBlacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|');