varmercury=createCommonjsModule(function(module){'use strict';function_interopDefault(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault(regenerator);var_extends$$=_interopDefault(_extends);var_asyncToGenerator=_interopDefault(asyncToGenerator);varURL$$=_interopDefault(URL);varcheerio$$=_interopDefault(cheerio);var_Promise=_interopDefault(promise);varrequest$$=_interopDefault(request);var_Reflect$ownKeys$$=_interopDefault(_Reflect$ownKeys);varstringDirection$$=_interopDefault(stringDirection);var_getIterator$$=_interopDefault(_getIterator);var_defineProperty=_interopDefault(defineProperty);var_slicedToArray$$=_interopDefault(_slicedToArray);var_typeof$$=_interopDefault(_typeof);varvalidUrl$$=_interopDefault(validUrl);varmoment$$=_interopDefault(moment);varwuzzy$$=_interopDefault(wuzzy);vardifflib$$=_interopDefault(difflib);var_Array$from=_interopDefault(from);varellipsize$$=_interopDefault(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length>0&&arguments[0]!==undefined?arguments[0]:1;varend=arguments.length>1&&arguments[1]!==undefined?arguments[1]:1;return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
// Extremely simple URL validation as a first step: a parsed URL object is
// considered valid as soon as it carries a non-empty hostname.
function validateUrl(_ref) {
  var hostname = _ref.hostname;
  // If this isn't a valid url, return an error message
  return !!hostname;
}

// Canned error payloads returned to callers instead of throwing.
var Errors = {
  badUrl: {
    error: true,
    messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
  }
};

// Headers sent with every outgoing fetch.
var REQUEST_HEADERS = {
  'User-Agent': 'Readability - http://readability.com/about/'
};
// The number of milliseconds to attempt to fetch a resource before timing out.
var FETCH_TIMEOUT = 10000;

// Content types that we do not extract content from
var BAD_CONTENT_TYPES = ['audio/mpeg', 'image/gif', 'image/jpeg', 'image/jpg'];

var BAD_CONTENT_TYPES_RE = new RegExp('^(' + BAD_CONTENT_TYPES.join('|') + ')$', 'i');

// Use this setting as the maximum size an article can be
var MAX_CONTENT_LENGTH = 5242880; // 5 MiB

// Promisified wrapper around the callback-style `request` client.
// Resolves with { body, response }; rejects with the transport error.
function get(options) {
  return new _Promise(function (resolve, reject) {
    request$$(options, function (err, response, body) {
      if (err) {
        reject(err);
      } else {
        resolve({ body: body, response: response });
      }
    });
  });
}

// Evaluate a response to ensure it's something we should be keeping.
// Throws on bad status, disallowed content type, or oversized content;
// returns true otherwise.
function validateResponse(response) {
  var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;
  // Check if we got a valid status code
  // NOTE(review): this keys off statusMessage === 'OK' rather than the
  // numeric statusCode range; preserved as-is from the bundle.
  if (response.statusMessage !== 'OK') {
    if (!response.statusCode) {
      throw new Error('Unable to fetch content. Original exception was ' + response.error);
    } else if (!parseNon2xx) {
      throw new Error('Resource returned a response status code of ' + response.statusCode + ' and resource was instructed to reject non-2xx level status codes.');
    }
  }
  var _response$headers = response.headers;
  var contentType = _response$headers['content-type'];
  var contentLength = _response$headers['content-length'];
  // Check that the content is not in BAD_CONTENT_TYPES
  if (BAD_CONTENT_TYPES_RE.test(contentType)) {
    throw new Error('Content-type for this resource was ' + contentType + ' and is not allowed.');
  }
  // Check that the content length is below maximum
  // (header value is a string; relational compare coerces it to a number)
  if (contentLength > MAX_CONTENT_LENGTH) {
    throw new Error('Content for this resource was too large. Maximum content length is ' + MAX_CONTENT_LENGTH + '.');
  }
  return true;
}
// Set our response attribute to the result of fetching our URL.
// TODO: This should gracefully handle timeouts and raise the
// proper exceptions on the many failure cases of HTTP.
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
followAllRedirects:true};_context.next=4;returnget(options);case4:_ref3=_context.sent;response=_ref3.response;body=_ref3.body;_context.prev=7;validateResponse(response);return_context.abrupt('return',{body:body,response:response});case12:_context.prev=12;_context.t0=_context['catch'](7);return_context.abrupt('return',Errors.badUrl);case15:case'end':return_context.stop();}}},_callee,this,[[7,12]]);}));functionfetchResource(_x2,_x3){return_ref2.apply(this,arguments);}returnfetchResource;}();functionconvertMetaProp($,from,to){$('meta['+from+']').each(function(_,node){var$node=$(node);varvalue=$node.attr(from);$node.attr(to,value);$node.removeAttr(from);});return$;}// For ease of use in extracting from meta tags,
// replace the "content" attribute on meta tags with the
// "value" attribute.
//
// In addition, normalize 'property' attributes to 'name' for ease of
// querying later. See, e.g., og or twitter meta tags.
function normalizeMetaTags($) {
  $ = convertMetaProp($, 'content', 'value');
  $ = convertMetaProp($, 'property', 'name');
  return $;
}

// Matches http:// or https:// anywhere in a string (case-insensitive).
var IS_LINK = new RegExp('https?://', 'i');
// NOTE(review): the leading '.' is unescaped, so any character before
// png/gif/jpe?g matches (e.g. '?fmt=jpeg'). Looks like a deliberately loose
// match — confirm before tightening to '\\.'.
var IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');
// Tags always removed before extraction.
var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// Convert all instances of images with potentially
// lazy loaded images into normal images.
// Many sites will have img tags with no source, or an image tag with a src
// attribute that is a placeholder. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
functionconvertLazyLoadedImages($){$('img').each(function(_,img){_Reflect$ownKeys$$(img.attribs).forEach(function(attr){varvalue=img.attribs[attr];if(attr!=='src'&&IS_LINK.test(value)&&IS_IMAGE.test(value)){$(img).attr('src',value);}});});return$;}functionisComment(index,node){returnnode.type==='comment';}functioncleanComments($){$.root().find('*').contents().filter(isComment).remove();return$;}functionclean($){$(TAGS_TO_REMOVE).remove();$=cleanComments($);return$;}varResource={// Create a Resource.
create:functioncreate(url,preparedResponse,parsedUrl){var_this=this;return_asyncToGenerator(_regeneratorRuntime.mark(function_callee(){varresult,validResponse;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:result=void0;if(!preparedResponse){_context.next=6;break;}validResponse={statusMessage:'OK',statusCode:200,headers:{'content-type':'text/html','content-length':500}};result={body:preparedResponse,response:validResponse};_context.next=9;break;case6:_context.next=8;returnfetchResource(url,parsedUrl);case8:result=_context.sent;case9:if(!result.error){_context.next=11;break;}return_context.abrupt('return',result);case11:return_context.abrupt('return',_this.generateDoc(result));case12:case'end':return_context.stop();}}},_callee,_this);}))();},generateDoc:functiongenerateDoc(_ref){varcontent=_ref.body;varresponse=_ref.response;varcontentType=response.headers['content-type'];// TODO: Implement is_text function from
if(!contentType.includes('html')&&!contentType.includes('text')){thrownewError('Content does not appear to be text.');}var$=cheerio$$.load(content,{normalizeWhitespace:true});if($.root().children().length===0){thrownewError('No children, likely a bad parse.');}$=normalizeMetaTags($);$=convertLazyLoadedImages($);$=clean($);return$;}};varNYMagExtractor={domain:'nymag.com',content:{// Order by most likely. Extractor will stop on first occurrence
noscript:functionnoscript($node){var$children=$node.children();if($children.length===1&&$children.get(0).tagName==='img'){return'figure';}returnnull;}}},title:{selectors:['h1.lede-feature-title','h1.headline-primary','h1']},author:{selectors:['.by-authors','.lede-feature-author']},dek:{selectors:['.lede-feature-teaser']},date_published:{selectors:[['time.article-timestamp[datetime]','datetime'],'time.article-timestamp']}};varBloggerExtractor={domain:'blogspot.com',content:{// Blogger is insane and does not load its content
selectors:['.post-content noscript'],// Selectors to remove from the extracted content
clean:[],// Convert the noscript tag to a div
transforms:{noscript:'div'}},author:{selectors:['.post-author-name']},title:{selectors:['h2.title']},date_published:{selectors:['span.publishdate']}};varWikipediaExtractor={domain:'wikipedia.org',content:{selectors:['#mw-content-text'],defaultCleaner:false,// transform top infobox to an image with caption
transforms:{'.infobox img':functioninfoboxImg($node){var$parent=$node.parents('.infobox');// Only prepend the first image in .infobox
if($parent.children('img').length===0){$parent.prepend($node);}},'.infobox caption':'figcaption','.infobox':'figure'},// Selectors to remove from the extracted content
// Twitter doesn't have nice selectors, so our initial
// selector grabs the whole page, then we're re-writing
// it to fit our needs before we clean it up.
'.permalink[role=main]':functionpermalinkRoleMain($node,$){vartweets=$node.find('.tweet');var$tweetContainer=$('<div id="TWEETS_GO_HERE"></div>');$tweetContainer.append(tweets);$node.replaceWith($tweetContainer);},// Twitter wraps @ with s, which
varTheAtlanticExtractor={domain:'www.theatlantic.com',title:{selectors:['h1.hed']},author:{selectors:['article#article .article-cover-extra .metadata .byline a']},content:{selectors:['.article-body'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varNewYorkerExtractor={domain:'www.newyorker.com',title:{selectors:['h1.title']},author:{selectors:['.contributors']},content:{selectors:['div#articleBody','div.articleBody'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varWiredExtractor={domain:'www.wired.com',title:{selectors:['h1.post-title']},author:{selectors:['a[rel="author"]']},content:{selectors:['article.content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varMSNExtractor={domain:'www.msn.com',title:{selectors:['h1']},author:{selectors:['span.authorname-txt']},content:{selectors:['div.richtext'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varYahooExtractor={domain:'www.yahoo.com',title:{selectors:['header.canvas-header']},author:{selectors:['span.provider-name']},content:{selectors:[// enter content selectors
'.content-canvas'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varBuzzfeedExtractor={domain:'www.buzzfeed.com',title:{selectors:['h1[id="post-title"]']},author:{selectors:['a[data-action="user/username"]','byline__author']},content:{selectors:['#buzz_sub_buzz'],defaultCleaner:false,// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{h2:'b'},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varWikiaExtractor={domain:'fandom.wikia.com',title:{selectors:['h1.entry-title']},author:{selectors:['.author vcard','.fn']},content:{selectors:['.grid-content','.entry-content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
varLittleThingsExtractor={domain:'www.littlethings.com',title:{selectors:['h1.post-title']},author:{selectors:[['meta[name="author"]','value']]},content:{selectors:[// enter content selectors
'.mainContentIntro','.content-wrapper'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// Site-specific extractor for politico.com.
var PoliticoExtractor = {
  domain: 'www.politico.com',
  title: {
    selectors: [
    // enter title selectors
    ['meta[name="og:title"]', 'value']]
  },
  author: {
    selectors: ['.story-main-content .byline .vcard']
  },
  content: {
    selectors: [
    // enter content selectors
    '.story-main-content', '.content-group', '.story-core', '.story-text'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
    clean: ['figcaption']
  },
  date_published: {
    selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [
    // enter lead_image_url selectors
    ['meta[name="og:image"]', 'value']]
  },
  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },
  next_page_url: null,
  excerpt: null
};

// Registry mapping hostnames to their custom extractors.
// NOTE(review): TwitterExtractor and NYTimesExtractor are referenced here
// but their definitions are not visible in this bundle view.
var Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
  'wikipedia.org': WikipediaExtractor,
  'twitter.com': TwitterExtractor,
  'www.nytimes.com': NYTimesExtractor,
  'www.theatlantic.com': TheAtlanticExtractor,
  'www.newyorker.com': NewYorkerExtractor,
  'www.wired.com': WiredExtractor,
  'www.msn.com': MSNExtractor,
  'www.yahoo.com': YahooExtractor,
  'www.buzzfeed.com': BuzzfeedExtractor,
  'fandom.wikia.com': WikiaExtractor,
  'www.littlethings.com': LittleThingsExtractor,
  'www.politico.com': PoliticoExtractor
};
// Spacer images to be removed
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
var UNLIKELY_CANDIDATES_BLACKLIST = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot',
// 'form', // This is too generic, has too many false positives
'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
'menu', 'meta', 'nav', 'outbrain', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taboola', 'tools'];

// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
var UNLIKELY_CANDIDATES_WHITELIST = ['and', 'article', 'body', 'blogindex', 'column', 'content', 'entry-content-asset', 'format', // misuse of form
'hfeed', 'hentry', 'hatom', 'main', 'page', 'posts', 'shadow'];

// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
var DIV_TO_P_BLOCK_TAGS = ['a', 'blockquote', 'dl', 'div', 'img', 'p', 'pre', 'table'].join(',');

// A list of tags that should be ignored when trying to find the top candidate
// for a document.
var NON_TOP_CANDIDATE_TAGS = ['br', 'b', 'i', 'label', 'hr', 'area', 'base', 'basefont', 'input', 'img', 'link', 'meta'];

var NON_TOP_CANDIDATE_TAGS_RE = new RegExp('^(' + NON_TOP_CANDIDATE_TAGS.join('|') + ')$', 'i');

// Hints that an element's class/id marks photo-related content worth keeping.
var PHOTO_HINTS = ['figure', 'photo', 'image', 'caption'];
var PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
'\\Bcopy'];// The above list, joined into a matching regular expression
varPOSITIVE_SCORE_RE=newRegExp(POSITIVE_SCORE_HINTS.join('|'),'i');// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
varNEGATIVE_SCORE_HINTS=['adbox','advert','author','bio','bookmark','bottom','byline','clear','com-','combx','comment','comment\\B','contact','copy','credit','crumb','date','deck','excerpt','featured',// tnr.com has a featured_content which throws us off
'scroll','secondary','share','shopping','shoutbox','side','sidebar','sponsor','stamp','sub','summary','tags','tools','widget'];// The above list, joined into a matching regular expression
varNEGATIVE_SCORE_RE=newRegExp(NEGATIVE_SCORE_HINTS.join('|'),'i');// XPath to try to determine if a page is wordpress. Not always successful.
varIS_WP_SELECTOR='meta[name=generator][value^=WordPress]';// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
varEXTRANEOUS_LINK_HINTS=['print','archive','comment','discuss','e-mail','email','share','reply','all','login','sign','single','adx','entry-unrelated'];varEXTRANEOUS_LINK_HINTS_RE=newRegExp(EXTRANEOUS_LINK_HINTS.join('|'),'i');// Match any phrase that looks like it could be page, or paging, or pagination
varPAGE_RE=newRegExp('pag(e|ing|inat)','i');// A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT
varBLOCK_LEVEL_TAGS=['article','aside','blockquote','body','br','button','canvas','caption','col','colgroup','dd','div','dl','dt','embed','fieldset','figcaption','figure','footer','form','h1','h2','h3','h4','h5','h6','header','hgroup','hr','li','map','object','ol','output','p','pre','progress','section','table','tbody','textarea','tfoot','th','thead','tr','ul','video'];varBLOCK_LEVEL_TAGS_RE=newRegExp('^('+BLOCK_LEVEL_TAGS.join('|')+')$','i');// The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
varcandidatesBlacklist=UNLIKELY_CANDIDATES_BLACKLIST.join('|');varCANDIDATES_BLACKLIST=newRegExp(candidatesBlacklist,'i');varcandidatesWhitelist=UNLIKELY_CANDIDATES_WHITELIST.join('|');varCANDIDATES_WHITELIST=newRegExp(candidatesWhitelist,'i');functionstripUnlikelyCandidates($){// Loop through the provided document and remove any non-link nodes
// that are unlikely candidates for article content.
//
// Links are ignored because there are very often links to content
// that are identified as non-body-content, but may be inside
// Wrap `node` in a fresh <p>. When `br` is true the node is treated as a
// <br>: all following siblings up to the next block-level element are moved
// into the new paragraph, then the br itself is removed.
function paragraphize(node, $) {
  var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;
  var $node = $(node);
  if (br) {
    var sibling = node.nextSibling;
    var p = $('<p></p>');
    // while the next node is text or not a block level element
    while (sibling && !(sibling.tagName && BLOCK_LEVEL_TAGS_RE.test(sibling.tagName))) {
      var nextSibling = sibling.nextSibling;
      $(sibling).appendTo(p);
      sibling = nextSibling;
    }
    $node.replaceWith(p);
    $node.remove();
    return $;
  }
  return $;
}

// Convert <div>s that contain no block-level children into <p>s.
function convertDivs($) {
  $('div').each(function (index, div) {
    var $div = $(div);
    var convertable = $div.children(DIV_TO_P_BLOCK_TAGS).length === 0;
    if (convertable) {
      convertNodeTo($div, $, 'p');
    }
  });
  return $;
}

// Convert <span>s that are not inside a <p> or <div> into <p>s.
function convertSpans($) {
  $('span').each(function (index, span) {
    var $span = $(span);
    var convertable = $span.parents('p, div').length === 0;
    if (convertable) {
      convertNodeTo($span, $, 'p');
    }
  });
  return $;
}
// Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
// * Multiple consecutive <br /> tags.
// * <div /> tags without block level elements inside of them
// * <span /> tags who are not children of <p /> or <div /> tags.
//
// :param $: A cheerio object to search
// :return cheerio object with new p elements
// (By-reference mutation, though. Returned just for convenience.)
functionconvertToParagraphs($){$=brsToPs($);$=convertDivs($);$=convertSpans($);return$;}functionconvertNodeTo($node,$){vartag=arguments.length>2&&arguments[2]!==undefined?arguments[2]:'p';varnode=$node.get(0);if(!node){return$;}var_$node$get=$node.get(0);varattribs=_$node$get.attribs;varattribString=_Reflect$ownKeys$$(attribs).map(function(key){returnkey+'='+attribs[key];}).join(' ');$node.replaceWith('<'+tag+' '+attribString+'>'+$node.contents()+'</'+tag+'>');return$;}functioncleanForHeight($img,$){varheight=parseInt($img.attr('height'),10);varwidth=parseInt($img.attr('width'),10)||20;// Remove images that explicitly have very small heights or
// Remove spacer/tracking images whose src matches SPACER_RE.
function removeSpacers($img, $) {
  if (SPACER_RE.test($img.attr('src'))) {
    $img.remove();
  }
  return $;
}

// Apply the per-image cleaners to every <img> in the article.
function cleanImages($article, $) {
  $article.find('img').each(function (index, img) {
    var $img = $(img);
    cleanForHeight($img, $);
    removeSpacers($img, $);
  });
  return $;
}

// Remove junk tags from `article`; defaults to STRIP_OUTPUT_TAGS when no
// explicit tag list is given.
function stripJunkTags(article, $) {
  var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];
  if (tags.length === 0) {
    tags = STRIP_OUTPUT_TAGS;
  }
  $(tags.join(','), article).remove();
  return $;
}
// H1 tags are typically the article title, which should be extracted
$article.find('*').each(function(index,node){node.attribs=_Reflect$ownKeys$$(node.attribs).reduce(function(acc,attr){if(WHITELIST_ATTRS_RE.test(attr)){return_extends$$({},acc,_defineProperty({},attr,node.attribs[attr]));}returnacc;},{});});}// function removeAttrs(article, $) {
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
varUNLIKELY_CANDIDATES_BLACKLIST$1=['ad-break','adbox','advert','addthis','agegate','aux','blogger-labels','combx','comment','conversation','disqus','entry-unrelated','extra','foot','form','header','hidden','loader','login',// Note: This can hit 'blogindex'.
'menu','meta','nav','pager','pagination','predicta',// readwriteweb inline ad box
'presence_control_external',// lifehacker.com container full of false positives
'popup','printfriendly','related','remove','remark','rss','share','shoutbox','sidebar','sociable','sponsor','tools'];// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
varUNLIKELY_CANDIDATES_WHITELIST$1=['and','article','body','blogindex','column','content','entry-content-asset','format',// misuse of form
'hfeed','hentry','hatom','main','page','posts','shadow'];// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
varDIV_TO_P_BLOCK_TAGS$1=['a','blockquote','dl','div','img','p','pre','table'].join(',');// A list of tags that should be ignored when trying to find the top candidate
// for a document.
varNON_TOP_CANDIDATE_TAGS$1=['br','b','i','label','hr','area','base','basefont','input','img','link','meta'];varNON_TOP_CANDIDATE_TAGS_RE$1=newRegExp('^('+NON_TOP_CANDIDATE_TAGS$1.join('|')+')$','i');// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
varHNEWS_CONTENT_SELECTORS$1=[['.hentry','.entry-content'],['entry','.entry-content'],['.entry','.entry_content'],['.post','.postbody'],['.post','.post_body'],['.post','.post-body']];varPHOTO_HINTS$1=['figure','photo','image','caption'];varPHOTO_HINTS_RE$1=newRegExp(PHOTO_HINTS$1.join('|'),'i');// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
varREADABILITY_ASSET$1=newRegExp('entry-content-asset','i');// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
varNEGATIVE_SCORE_HINTS$1=['adbox','advert','author','bio','bookmark','bottom','byline','clear','com-','combx','comment','comment\\B','contact','copy','credit','crumb','date','deck','excerpt','featured',// tnr.com has a featured_content which throws us off
'scroll','secondary','share','shopping','shoutbox','side','sidebar','sponsor','stamp','sub','summary','tags','tools','widget'];// The above list, joined into a matching regular expression
varNEGATIVE_SCORE_RE$1=newRegExp(NEGATIVE_SCORE_HINTS$1.join('|'),'i');// A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT
varBLOCK_LEVEL_TAGS$1=['article','aside','blockquote','body','br','button','canvas','caption','col','colgroup','dd','div','dl','dt','embed','fieldset','figcaption','figure','footer','form','h1','h2','h3','h4','h5','h6','header','hgroup','hr','li','map','object','ol','output','p','pre','progress','section','table','tbody','textarea','tfoot','th','thead','tr','ul','video'];varBLOCK_LEVEL_TAGS_RE$1=newRegExp('^('+BLOCK_LEVEL_TAGS$1.join('|')+')$','i');// The removal is implemented as a blacklist and whitelist, this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression-both because it's only one pass, and because this skips the
// serialization for whitelisted nodes.
varcandidatesBlacklist$1=UNLIKELY_CANDIDATES_BLACKLIST$1.join('|');varcandidatesWhitelist$1=UNLIKELY_CANDIDATES_WHITELIST$1.join('|');varPARAGRAPH_SCORE_TAGS$1=newRegExp('^(p|li|span|pre)$','i');varCHILD_CONTENT_TAGS$1=newRegExp('^(td|blockquote|ol|ul|dl)$','i');varBAD_TAGS$1=newRegExp('^(address|form)$','i');// Get the score of a node based on its className and id.
functiongetWeight(node){varclasses=node.attr('class');varid=node.attr('id');varscore=0;if(id){// if id exists, try to score on both positive and negative
if(POSITIVE_SCORE_RE$1.test(id)){score+=25;}if(NEGATIVE_SCORE_RE$1.test(id)){score-=25;}}if(classes){if(score===0){// if classes exist and id did not contribute to score
// try to score on both positive and negative
if(POSITIVE_SCORE_RE$1.test(classes)){score+=25;}if(NEGATIVE_SCORE_RE$1.test(classes)){score-=25;}}// even if score has been set by id, add score for
// possible photo matches
// "try to keep photos if we can"
if(PHOTO_HINTS_RE$1.test(classes)){score+=10;}// add 25 if class matches entry-content-asset,
// Count of commas in `text` — a cheap prose-density signal.
function scoreCommas(text) {
  return (text.match(/,/g) || []).length;
}

var idkRe = new RegExp('^(p|pre)$', 'i');

// Length bonus: roughly one point per 50 characters past a tag-dependent
// threshold, clamped to [0, 3].
function scoreLength(textLength) {
  var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';
  var chunks = textLength / 50;
  if (chunks > 0) {
    var lengthBonus = void 0;
    // No idea why p or pre are being tamped down here
    // since this is only being called from the context
    // of scoreParagraph
    if (idkRe.test(tagName)) {
      lengthBonus = chunks - 2;
    } else {
      lengthBonus = chunks - 1.25;
    }
    return Math.min(Math.max(lengthBonus, 0), 3);
  }
  return 0;
}
// Score a paragraph using various methods. Things like number of
// commas, etc. Higher is better.
functionscoreParagraph(node){varscore=1;vartext=node.text().trim();vartextLength=text.length;// If this paragraph is less than 25 characters, don't count it.
if(textLength<25){return0;}// Add points for any commas within this paragraph
score+=scoreCommas(text);// For every 50 characters in this paragraph, add another point. Up
// to 3 points.
score+=scoreLength(textLength);// Articles can end with short paragraphs when people are being clever
// but they can also end with short paragraphs setting up lists of junk
// that we strip. This negative tweaks junk setup paragraphs just below
// the cutoff threshold.
if(text.slice(-1)===':'){score-=1;}returnscore;}functionsetScore($node,$,score){$node.attr('score',score);return$node;}functionaddScore($node,$,amount){try{varscore=getOrInitScore($node,$)+amount;setScore($node,$,score);}catch(e){// Ignoring; error occurs in scoreNode
}return$node;}// Adds 1/4 of a child's score to its parent
// Propagate a quarter of a child's score up to its parent.
function addToParent(node, $, score) {
  var parent = node.parent();
  if (parent) {
    addScore(parent, $, score * 0.25);
  }
  return node;
}
// gets and returns the score if it exists
// Returns the node's existing score, or computes one (optionally weighted
// by class/id hints) and propagates a share to the parent.
function getOrInitScore($node, $) {
  var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;
  var score = getScore($node);
  if (score) {
    return score;
  }
  score = scoreNode($node);
  if (weightNodes) {
    score += getWeight($node);
  }
  addToParent($node, $, score);
  return score;
}
// Score an individual node. Has some smarts for paragraphs, otherwise
function scoreNode($node) {
  var _$node$get = $node.get(0);
  var tagName = _$node$get.tagName;
  // TODO: Consider ordering by most likely.
  // E.g., if divs are a more common tag on a page,
  // Could save doing that regex test on every node – AP
  if (PARAGRAPH_SCORE_TAGS$1.test(tagName)) {
    return scoreParagraph($node);
  } else if (tagName === 'div') {
    return 5;
  } else if (CHILD_CONTENT_TAGS$1.test(tagName)) {
    return 3;
  } else if (BAD_TAGS$1.test(tagName)) {
    return -3;
  } else if (tagName === 'th') {
    return -5;
  }
  return 0;
}

// If the node is a <span>, convert it to a <div> before scoring.
function convertSpans$1($node, $) {
  if ($node.get(0)) {
    var _$node$get = $node.get(0);
    var tagName = _$node$get.tagName;
    if (tagName === 'span') {
      // convert spans to divs
      convertNodeTo($node, $, 'div');
    }
  }
}

// Guarded score addition: skips null nodes, converting spans first.
function addScoreTo($node, $, score) {
  if ($node) {
    convertSpans$1($node, $);
    addScore($node, $, score);
  }
}

// Score every unscored <p>/<pre>, bubbling score to parent and grandparent.
function scorePs($, weightNodes) {
  $('p, pre').not('[score]').each(function (index, node) {
    // The raw score for this paragraph, before we add any parent/child
    // scores.
    var $node = $(node);
    $node = setScore($node, $, getOrInitScore($node, $, weightNodes));
    var $parent = $node.parent();
    var rawScore = scoreNode($node);
    addScoreTo($parent, $, rawScore, weightNodes);
    if ($parent) {
      // Add half of the individual content score to the
      // grandparent
      addScoreTo($parent.parent(), $, rawScore / 2, weightNodes);
    }
  });
  return $;
}
// score content. Parents get the full value of their children's
// Score a document's nodes to find the most article-like content.
function scoreContent($) {
  var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;
  // First, look for special hNews based selectors and give them a big
  // boost, if they exist.
  HNEWS_CONTENT_SELECTORS$1.forEach(function (_ref) {
    var _ref2 = _slicedToArray$$(_ref, 2);
    var parentSelector = _ref2[0];
    var childSelector = _ref2[1];
    $(parentSelector + ' ' + childSelector).each(function (index, node) {
      addScore($(node).parent(parentSelector), $, 80);
    });
  });
  // Doubling this again
  scorePs($, weightNodes);
  scorePs($, weightNodes);
  return $;
}

// Collapse runs of 2+ whitespace characters into a single space and trim.
var NORMALIZE_RE = /\s{2,}/g;

function normalizeSpaces(text) {
  return text.replace(NORMALIZE_RE, ' ').trim();
}
// Given a node type to search for,
// look to see if this extraction can be found in the URL. Expects
// that each expression in r_list will return group(1) as the proper
// string to be cleaned.
// Only used for date_published currently.
function extractFromUrl(url, regexList) {
  var matchRe = regexList.find(function (re) {
    return re.test(url);
  });
  if (matchRe) {
    // NOTE(review): assumes exec() succeeds right after test(); a /g-flagged
    // regex would advance lastIndex between the two calls — avoid /g here.
    return matchRe.exec(url)[1];
  }
  return null;
}
// An expression that looks to try to find the page digit within a URL, if
returnpageNum<100?pageNum:null;}functionremoveAnchor(url){returnurl.split('#')[0].replace(/\/$/,'');}functionisGoodSegment(segment,index,firstSegmentHasLetters){vargoodSegment=true;// If this is purely a number, and it's the first or second
// url_segment, it's probably a page number. Remove it.
if(index<2&&IS_DIGIT_RE.test(segment)&&segment.length<3){goodSegment=true;}// If this is the first url_segment and it's just "index",
// remove it
if(index===0&&segment.toLowerCase()==='index'){goodSegment=false;}// If our first or second url_segment is smaller than 3 characters,
// and the first url_segment had no alphas, remove it.
if(index<2&&segment.length<3&&!firstSegmentHasLetters){goodSegment=false;}returngoodSegment;}// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
function articleBaseUrl(url, parsed) {
  var parsedUrl = parsed || URL$$.parse(url);
  var protocol = parsedUrl.protocol;
  var host = parsedUrl.host;
  var path = parsedUrl.path;

  var firstSegmentHasLetters = false;
  var cleanedSegments = path.split('/').reverse().reduce(function (acc, rawSegment, index) {
    var segment = rawSegment;

    // Split off and save anything that looks like a file type.
    if (segment.includes('.')) {
      var parts = segment.split('.');
      var possibleSegment = parts[0];
      var fileExt = parts[1];
      if (IS_ALPHA_RE.test(fileExt)) {
        segment = possibleSegment;
      }
    }

    // If our first or second segment has anything looking like a page
    // reference, strip it out.
    if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
      segment = segment.replace(PAGE_IN_HREF_RE, '');
    }

    // If we're on the first segment, check to see if we have any
    // characters in it. The first segment is actually the last bit of
    // the URL, and this will be helpful to determine if we're on a URL
    // segment that looks like "/2/" for example.
    if (index === 0) {
      firstSegmentHasLetters = HAS_ALPHA_RE.test(segment);
    }

    // If it's not marked for deletion, push it to cleaned_segments.
    if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
      acc.push(segment);
    }
    return acc;
  }, []);

  return protocol + '//' + host + cleanedSegments.reverse().join('/');
}
// Given a string, return True if it appears to have an ending sentence
// A literal period followed by a space or end-of-string.
// FIX: the pattern was '.( |$)' with an unescaped dot, which matched
// ANY character followed by a space/end, so nearly every non-empty
// string passed; the dot is now escaped to match a literal period.
var SENTENCE_END_RE = new RegExp('\\.( |$)');

// Return true if the text appears to contain an ending sentence.
function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text);
}

// Return the first `words` whitespace-separated tokens of `content`
// (default 10), joined by single spaces.
function excerptContent(content) {
  var words = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 10;
  return content.trim().split(/\s+/).slice(0, words).join(' ');
}
// Now that we have a top_candidate, look through the siblings of
// Merge content-like siblings of the top candidate into a wrapping
// div so adjacent paragraphs are not lost.
function mergeSiblings($candidate, topScore, $) {
  if (!$candidate.parent().length) {
    return $candidate;
  }
  var siblingScoreThreshold = Math.max(10, topScore * 0.25);
  var wrappingDiv = $('<div></div>');

  $candidate.parent().children().each(function (index, sibling) {
    var $sibling = $(sibling);
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1.test(sibling.tagName)) {
      return null;
    }
    var siblingScore = getScore($sibling);
    if (siblingScore) {
      if ($sibling === $candidate) {
        // NOTE(review): this compares two cheerio wrappers by identity,
        // which is never true for separate $() calls; the candidate is
        // appended via the score path below instead — confirm intent.
        wrappingDiv.append($sibling);
      } else {
        var contentBonus = 0;
        var density = linkDensity($sibling);
        // If sibling has a very low link density,
        // give it a small bonus
        if (density < 0.05) {
          contentBonus += 20;
        }
        // If sibling has a high link density,
        // give it a penalty
        if (density >= 0.5) {
          contentBonus -= 20;
        }
        // If sibling node has the same class as
        // candidate, give it a bonus
        if ($sibling.attr('class') === $candidate.attr('class')) {
          contentBonus += topScore * 0.2;
        }
        var newScore = siblingScore + contentBonus;
        if (newScore >= siblingScoreThreshold) {
          return wrappingDiv.append($sibling);
        } else if (sibling.tagName === 'p') {
          var siblingContent = $sibling.text();
          var siblingContentLength = textLength(siblingContent);
          if (siblingContentLength > 80 && density < 0.25) {
            return wrappingDiv.append($sibling);
          } else if (siblingContentLength <= 80 && density === 0 && hasSentenceEnd(siblingContent)) {
            return wrappingDiv.append($sibling);
          }
        }
      }
    }
    return null;
  });

  return wrappingDiv;
}
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
function findTopCandidate($) {
  var $candidate = void 0;
  var topScore = 0;

  $('[score]').each(function (index, node) {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE$1.test(node.tagName)) {
      return;
    }
    var $node = $(node);
    var score = getScore($node);
    if (score > topScore) {
      topScore = score;
      $candidate = $node;
    }
  });

  // If we don't have a candidate, return the body
  // or whatever the first element is
  // NOTE(review): a cheerio object is always truthy, so the `||`
  // fallback to $('*').first() can never trigger — confirm intent.
  if (!$candidate) {
    return $('body') || $('*').first();
  }

  $candidate = mergeSiblings($candidate, topScore, $);
  return $candidate;
}

// Remove $node unless it scores as likely content.
function removeUnlessContent($node, $, weight) {
  // Explicitly save entry-content-asset tags, which are
  // noted as valuable in the Publisher guidelines. For now
  // this works everywhere. We may want to consider making
  // this less of a sure-thing later.
  if ($node.hasClass('entry-content-asset')) {
    return;
  }

  var content = normalizeSpaces($node.text());
  if (scoreCommas(content) < 10) {
    var pCount = $('p', $node).length;
    var inputCount = $('input', $node).length;

    // Looks like a form, too many inputs.
    if (inputCount > pCount / 3) {
      $node.remove();
      return;
    }

    var contentLength = content.length;
    var imgCount = $('img', $node).length;

    // Content is too short, and there are no images, so
    // this is probably junk content.
    if (contentLength < 25 && imgCount === 0) {
      $node.remove();
      return;
    }

    var density = linkDensity($node);

    // Too high of link density, is probably a menu or
    // something similar.
    if (weight < 25 && density > 0.2 && contentLength > 75) {
      $node.remove();
      return;
    }

    // Too high of a link density, despite the score being
    // high.
    if (weight >= 25 && density > 0.5) {
      // Don't remove the node if it's a list and the
      // previous sibling starts with a colon though. That
      // means it's probably content.
      var tagName = $node.get(0).tagName;
      var nodeIsList = tagName === 'ol' || tagName === 'ul';
      if (nodeIsList) {
        var previousNode = $node.prev();
        if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') {
          return;
        }
      }
      $node.remove();
      return;
    }

    var scriptCount = $('script', $node).length;

    // Too many script tags, not enough content.
    if (scriptCount > 0 && contentLength < 150) {
      $node.remove();
      return;
    }
  }
}
// Given an article, clean it of some superfluous content specified by
// tags. Things like forms, ads, etc.
//
// Tags is an array of tag name's to search through. (like div, form,
// etc)
//
// Return this same doc.
function cleanTags($article, $) {
  $(CLEAN_CONDITIONALLY_TAGS, $article).each(function (index, node) {
    var $node = $(node);
    var weight = getScore($node);
    if (!weight) {
      weight = getOrInitScore($node, $);
      setScore($node, $, weight);
    }
    // drop node if its weight is < 0
    if (weight < 0) {
      $node.remove();
    } else {
      // determine if node seems like content
      removeUnlessContent($node, $, weight);
    }
  });
  return $;
}

// Remove headers that duplicate the title, appear before the body
// copy, or carry a negative weight.
function cleanHeaders($article, $) {
  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';
  $(HEADER_TAG_LIST, $article).each(function (index, header) {
    var $header = $(header);
    // Remove any headers that appear before all other p tags in the
    // document. This probably means that it was part of the title, a
    // subtitle or something else extraneous like a datestamp or byline,
    // all of which should be handled by other metadata handling.
    if ($($header, $article).prevAll('p').length === 0) {
      return $header.remove();
    }
    // Remove any headers that match the title exactly.
    if (normalizeSpaces($(header).text()) === title) {
      return $header.remove();
    }
    // If this header has a negative weight, it's probably junk.
    // Get rid of it.
    if (getWeight($(header)) < 0) {
      return $header.remove();
    }
    return $header;
  });
  return $;
}
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.
function rewriteTopLevel(article, $) {
  // I'm not using context here because
  // it's problematic when converting the
  // top-level/root node - AP
  $ = convertNodeTo($('html'), $, 'div');
  $ = convertNodeTo($('body'), $, 'div');
  return $;
}

// Rewrite every `attr` attribute under $content to an absolute URL,
// resolved against rootUrl.
function absolutize($, rootUrl, attr, $content) {
  $('[' + attr + ']', $content).each(function (_, node) {
    var url = node.attribs[attr];
    var absoluteUrl = URL$$.resolve(rootUrl, url);
    node.attribs[attr] = absoluteUrl;
  });
}

// Make every href and src under $content absolute relative to url.
function makeLinksAbsolute($content, $, url) {
  ['href', 'src'].forEach(function (attr) {
    return absolutize($, url, attr, $content);
  });
  return $content;
}

// Length of the text after trimming and collapsing all whitespace
// runs to single spaces.
function textLength(text) {
  return text.trim().replace(/\s+/g, ' ').length;
}

// Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
function linkDensity($node) {
  var totalTextLength = textLength($node.text());
  var linkText = $node.find('a').text();
  var linkLength = textLength(linkText);
  if (totalTextLength > 0) {
    return linkLength / totalTextLength;
  } else if (totalTextLength === 0 && linkLength > 0) {
    return 1;
  }
  return 0;
}
// Given a node type to search for, and a list of meta tag names to
// Look up the given meta tag names in the document and return the
// value of the first name that has exactly one non-empty value.
// NOTE(review): reads the meta tag's `value` attribute, not `content`
// — preserved from the original; confirm against target documents.
function extractFromMeta($, metaNames, cachedNames) {
  var cleanTags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
  // Only bother with names actually present in the document cache.
  var foundNames = metaNames.filter(function (name) {
    return cachedNames.indexOf(name) !== -1;
  });
  for (var i = 0; i < foundNames.length; i += 1) {
    var name = foundNames[i];
    var type = 'name';
    var value = 'value';
    var nodes = $('meta[' + type + '="' + name + '"]');
    // Get the unique value of every matching node, in case there
    // are two meta tags with the same name and value.
    // Remove empty values.
    var values = nodes.map(function (index, node) {
      return $(node).attr(value);
    }).toArray().filter(function (text) {
      return text !== '';
    });
    // If we have more than one value for the same name, we have a
    // conflict and can't trust any of them. Skip this name. If we have
    // zero, that means our meta tags had no values. Skip this name
    // also.
    if (values.length === 1) {
      // Meta values that contain HTML should be stripped when cleanTags
      // is set.
      return cleanTags ? stripTags(values[0], $) : values[0];
    }
  }
  // If nothing is found, return null
  return null;
}

// Reject nodes that look like containers (too many children) or that
// sit inside a comment area.
function isGoodNode($node, maxChildren) {
  // If it has a number of children, it's more likely a container
  // element. Skip it.
  if ($node.children().length > maxChildren) {
    return false;
  }
  // If it looks to be within a comment, skip it.
  if (withinComment($node)) {
    return false;
  }
  return true;
}
// Given a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
function extractFromSelectors($, selectors) {
  var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
  var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
  for (var i = 0; i < selectors.length; i += 1) {
    var selector = selectors[i];
    var nodes = $(selector);
    // If we didn't get exactly one of this selector, this may be
    // a list of articles or comments; skip it.
    if (nodes.length === 1) {
      var $node = $(nodes[0]);
      if (isGoodNode($node, maxChildren)) {
        var content = textOnly ? $node.text() : $node.html();
        if (content) {
          return content;
        }
      }
    }
  }
  return null;
}

// strips all tags from a string of text
function stripTags(text, $) {
  // Wrapping text in html element prevents errors when text
  // has no html
  var cleanText = $('<span>' + text + '</span>').text();
  return cleanText === '' ? text : cleanText;
}

// True if any ancestor's class or id mentions "comment".
function withinComment($node) {
  var parents = $node.parents().toArray();
  var commentParent = parents.find(function (parent) {
    var classAndId = parent.attribs.class + ' ' + parent.attribs.id;
    return classAndId.includes('comment');
  });
  return commentParent !== undefined;
}
// Given a node, determine if it's article-like enough to return
var TEXT_LINK_RE = new RegExp('http(s)?://', 'i');

// CLEAN DATE PUBLISHED CONSTANTS
var MS_DATE_STRING = /^\d{13}$/i;
var SEC_DATE_STRING = /^\d{10}$/i;
var CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i;
var TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i;
var TIME_MERIDIAN_DOTS_RE = /\.m\./i;
var months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
var allMonths = months.join('|');
var timestamp1 = '[0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?';
var timestamp2 = '[0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4}';
var SPLIT_DATE_STRING = new RegExp('(' + timestamp1 + ')|(' + timestamp2 + ')|([0-9]{1,4})|(' + allMonths + ')', 'ig');

// CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
var TITLE_SPLITTERS_RE = /(: | - | \| )/g;
// NOTE(review): the dots here are unescaped, so this also matches e.g.
// "xcom" — preserved from the original; confirm before tightening.
var DOMAIN_ENDINGS_RE = new RegExp('.com$|.net$|.org$|.co.uk$', 'g');

// Take an author string (like 'By David Smith ') and clean it to
// just the name(s): 'David Smith'.
function cleanAuthor(author) {
  return author.replace(CLEAN_AUTHOR_RE, '$2').trim();
}

// Validate a candidate lead-image URL; return null unless it is a
// well-formed web URI.
function clean$1(leadImageUrl) {
  leadImageUrl = leadImageUrl.trim();
  if (validUrl$$.isWebUri(leadImageUrl)) {
    return leadImageUrl;
  }
  return null;
}

// Take a dek HTML fragment, and return the cleaned version of it.
function cleanDek(dek, _ref) {
  var $ = _ref.$;
  var excerpt = _ref.excerpt;
  // Sanity check that we didn't get too short or long of a dek.
  if (dek.length > 1000 || dek.length < 5) return null;
  // Check that dek isn't the same as excerpt
  if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;
  var dekText = stripTags(dek, $);
  // Plain text links shouldn't exist in the dek. If we have some, it's
  // not a usable dek.
  if (TEXT_LINK_RE.test(dekText)) return null;
  return dekText.trim();
}

// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.

// Reduce a messy date string to its date-like tokens, normalizing
// meridian punctuation and stripping a leading "published:" label.
function cleanDateString(dateString) {
  return (dateString.match(SPLIT_DATE_STRING) || []).join(' ').replace(TIME_MERIDIAN_DOTS_RE, 'm').replace(TIME_MERIDIAN_SPACE_RE, '$1 $2 $3').replace(CLEAN_DATE_STRING_RE, '$1').trim();
}

// Take a date published string, and hopefully return a date out of
// it. Return none if we fail.
function cleanDatePublished(dateString) {
  // If string is in milliseconds or seconds, convert to int
  if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
    dateString = parseInt(dateString, 10);
  }
  var date = moment$$(new Date(dateString));
  if (!date.isValid()) {
    dateString = cleanDateString(dateString);
    date = moment$$(new Date(dateString));
  }
  return date.isValid() ? date.toISOString() : null;
}
// Clean our article content, returning a new, cleaned node.
functionextractCleanNode(article,_ref){var$=_ref.$;var_ref$cleanConditional=_ref.cleanConditionally;varcleanConditionally=_ref$cleanConditional===undefined?true:_ref$cleanConditional;var_ref$title=_ref.title;vartitle=_ref$title===undefined?'':_ref$title;var_ref$url=_ref.url;varurl=_ref$url===undefined?'':_ref$url;var_ref$defaultCleaner=_ref.defaultCleaner;vardefaultCleaner=_ref$defaultCleaner===undefined?true:_ref$defaultCleaner;// Rewrite the tag name to div if it's a top level node like body or
vartermCounts=splitTitle.reduce(function(acc,titleText){acc[titleText]=acc[titleText]?acc[titleText]+1:1;returnacc;},{});var_Reflect$ownKeys$redu=_Reflect$ownKeys$$(termCounts).reduce(function(acc,key){if(acc[1]<termCounts[key]){return[key,termCounts[key]];}returnacc;},[0,0]);var_Reflect$ownKeys$redu2=_slicedToArray$$(_Reflect$ownKeys$redu,2);varmaxTerm=_Reflect$ownKeys$redu2[0];vartermCount=_Reflect$ownKeys$redu2[1];// We found a splitter that was used more than once, so it
if(termCount>=2&&maxTerm.length<=4){splitTitle=text.split(maxTerm);}varsplitEnds=[splitTitle[0],splitTitle.slice(-1)];varlongestEnd=splitEnds.reduce(function(acc,end){returnacc.length>end.length?acc:end;},'');if(longestEnd.length>10){return{v:longestEnd};}return{v:text};}();if((typeof_ret==='undefined'?'undefined':_typeof$$(_ret))==="object")return_ret.v;}returnnull;}functioncleanDomainFromTitle(splitTitle,url){// Search the ends of the title, looking for bits that fuzzy match
// the URL too closely. If one is found, discard it and return the
// rest.
//
// Strip out the big TLDs - it just makes the matching a bit more
// accurate. Not the end of the world if it doesn't strip right.
var_URL$parse=URL$$.parse(url);varhost=_URL$parse.host;varnakedDomain=host.replace(DOMAIN_ENDINGS_RE,'');varstartSlug=splitTitle[0].toLowerCase().replace(' ','');varstartSlugRatio=wuzzy$$.levenshtein(startSlug,nakedDomain);if(startSlugRatio>0.4&&startSlug.length>5){returnsplitTitle.slice(2).join('');}varendSlug=splitTitle.slice(-1)[0].toLowerCase().replace(' ','');varendSlugRatio=wuzzy$$.levenshtein(endSlug,nakedDomain);if(endSlugRatio>0.4&&endSlug.length>=5){returnsplitTitle.slice(0,-2).join('');}returnnull;}// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
// ['The New New York', ' - ', 'The Washington Post']
varsplitTitle=title.split(TITLE_SPLITTERS_RE);if(splitTitle.length===1){returntitle;}varnewTitle=extractBreadcrumbTitle(splitTitle,title);if(newTitle)returnnewTitle;newTitle=cleanDomainFromTitle(splitTitle,url);if(newTitle)returnnewTitle;// Fuzzy ratio didn't find anything, so this title is probably legit.
// Just return it all.
returntitle;}varCleaners={author:cleanAuthor,lead_image_url:clean$1,dek:cleanDek,date_published:cleanDatePublished,content:extractCleanNode,title:cleanTitle};// Using a variety of scoring techniques, extract the content most
// likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
// match certain criteria first. (Like, does this element have a
// classname of "comment")
//
// If weight_nodes is True, use classNames and IDs to determine the
// worthiness of nodes.
//
// Returns a cheerio object $
// Strip unlikely candidates (if asked), convert to paragraphs, score
// the content, and return the highest-scoring candidate node.
function extractBestNode($, opts) {
  // TODO Do I need this? – AP
  if (opts.stripUnlikelyCandidates) {
    $ = stripUnlikelyCandidates($);
  }
  $ = convertToParagraphs($);
  $ = scoreContent($, opts.weightNodes);
  var $topCandidate = findTopCandidate($);
  return $topCandidate;
}

var GenericContentExtractor = {
  defaultOpts: {
    stripUnlikelyCandidates: true,
    weightNodes: true,
    cleanConditionally: true
  },

  // Extract the content for this resource - initially, pass in our
  // most restrictive opts which will return the highest quality
  // content. On each failure, retry with slightly more lax opts.
  //
  // :param return_type: string. If "node", should return the content
  // as a cheerio node rather than as an HTML string.
  //
  // Opts:
  // stripUnlikelyCandidates: Remove any elements that match
  // non-article-like criteria first.(Like, does this element
  // have a classname of "comment")
  //
  // weightNodes: Modify an elements score based on whether it has
  // certain classNames or IDs. Examples: Subtract if a node has
  // a className of 'comment', Add if a node has an ID of
  // 'entry-content'.
  //
  // cleanConditionally: Clean the node to return of some
  // superfluous content. Things like forms, ads, etc.
  extract: function extract(_ref, opts) {
    var $ = _ref.$;
    var html = _ref.html;
    var title = _ref.title;
    var url = _ref.url;
    opts = _extends$$({}, this.defaultOpts, opts);
    $ = $ || cheerio$$.load(html);

    // Cascade through our extraction-specific opts in an ordered fashion,
    // turning them off one at a time until we get sufficient content.
    var node = this.getContentNode($, title, url, opts);
    if (nodeIsSufficient(node)) {
      return this.cleanAndReturnNode(node, $);
    }

    // We didn't succeed on first pass, one by one disable our
    // extraction opts and retry.
    var enabledOpts = _Reflect$ownKeys$$(opts).filter(function (k) {
      return opts[k] === true;
    });
    for (var i = 0; i < enabledOpts.length; i += 1) {
      var key = enabledOpts[i];
      opts[key] = false;
      $ = cheerio$$.load(html);
      node = this.getContentNode($, title, url, opts);
      if (nodeIsSufficient(node)) {
        break;
      }
    }
    return this.cleanAndReturnNode(node, $);
  },

  // Get node given current options
  getContentNode: function getContentNode($, title, url, opts) {
    return extractCleanNode(extractBestNode($, opts), {
      $: $,
      cleanConditionally: opts.cleanConditionally,
      title: title,
      url: url
    });
  },

  // Once we got here, either we're at our last-resort node, or
  // we broke early. Make sure we at least have -something- before we
  // move forward.
  cleanAndReturnNode: function cleanAndReturnNode(node, $) {
    if (!node) {
      return null;
    }
    return normalizeSpaces($.html(node));
  }
};
// TODO: It would be great if we could merge the meta and selector lists into
// a list of objects, because we could then rank them better. For example,
// .hentry .entry-title is far better suited than <meta title>.

// An ordered list of meta tag names that denote likely article titles. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
var STRONG_TITLE_META_TAGS = ['tweetmeme-title', 'dc.title', 'rbtitle', 'headline', 'title'];

// og:title is weak because it typically contains context that we don't like,
// for example the source site's name. Gotta get that brand into facebook!
var WEAK_TITLE_META_TAGS = ['og:title'];

// An ordered list of XPath Selectors to find likely article titles. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
var STRONG_TITLE_SELECTORS = ['.hentry .entry-title', 'h1#articleHeader', 'h1.articleHeader', 'h1.article', '.instapaper_title', '#meebo-title'];

var WEAK_TITLE_SELECTORS = ['article h1', '#entry-title', '.entry-title', '#entryTitle', '#entrytitle', '.entryTitle', '.entrytitle', '#articleTitle', '.articleTitle', 'post post-title', 'h1.title', 'h2.article', 'h1', 'html head title', 'title'];

var GenericTitleExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var url = _ref.url;
    var metaCache = _ref.metaCache;
    // First, check to see if we have a matching meta tag that we can make
    // use of that is strongly associated with the headline.
    var title = extractFromMeta($, STRONG_TITLE_META_TAGS, metaCache);
    if (title) return cleanTitle(title, { url: url, $: $ });
    // Second, look through our content selectors for the most likely
    // article title that is strongly associated with the headline.
    title = extractFromSelectors($, STRONG_TITLE_SELECTORS);
    if (title) return cleanTitle(title, { url: url, $: $ });
    // Third, check for weaker meta tags that may match.
    title = extractFromMeta($, WEAK_TITLE_META_TAGS, metaCache);
    if (title) return cleanTitle(title, { url: url, $: $ });
    // Last, look for weaker selector tags that may match.
    title = extractFromSelectors($, WEAK_TITLE_SELECTORS);
    if (title) return cleanTitle(title, { url: url, $: $ });
    // If no matches, return an empty string
    return '';
  }
};
// An ordered list of meta tag names that denote likely article authors. All
// attributes should be lowercase for faster case-insensitive matching. From
// most distinct to least distinct.
//
// Note: "author" is too often the -developer- of the page, so it is not
// added here.
var AUTHOR_META_TAGS = ['byl', 'clmst', 'dc.author', 'dcsext.author', 'dc.creator', 'rbauthors', 'authors'];

var AUTHOR_MAX_LENGTH = 300;

// An ordered list of XPath Selectors to find likely article authors. From
// most explicit to least explicit.
//
// Note - this does not use classes like CSS. This checks to see if the string
// exists in the className, which is not as accurate as .className (which
// splits on spaces/endlines), but for our purposes it's close enough. The
// speed tradeoff is worth the accuracy hit.
var AUTHOR_SELECTORS = ['.entry .entry-author', '.author.vcard .fn', '.author .vcard .fn', '.byline.vcard .fn', '.byline .vcard .fn', '.byline .by .author', '.byline .by', '.byline .author', '.post-author.vcard', '.post-author .vcard', 'a[rel=author]', '#by_author', '.by_author', '#entryAuthor', '.entryAuthor', '.byline a[href*=author]', '#author .authorname', '.author .authorname', '#author', '.author', '.articleauthor', '.ArticleAuthor', '.byline'];

// An ordered list of Selectors to find likely article authors, with
// regular expression for content.
var bylineRe = /^[\n\s]*By/i;
var BYLINE_SELECTORS_RE = [['#byline', bylineRe], ['.byline', bylineRe]];

var GenericAuthorExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var metaCache = _ref.metaCache;
    // First, check to see if we have a matching
    // meta tag that we can make use of.
    var author = extractFromMeta($, AUTHOR_META_TAGS, metaCache);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }
    // Second, look through our selectors looking for potential authors.
    author = extractFromSelectors($, AUTHOR_SELECTORS, 2);
    if (author && author.length < AUTHOR_MAX_LENGTH) {
      return cleanAuthor(author);
    }
    // Last, use our looser regular-expression based selectors for
    // potential authors.
    for (var i = 0; i < BYLINE_SELECTORS_RE.length; i += 1) {
      var selector = BYLINE_SELECTORS_RE[i][0];
      var regex = BYLINE_SELECTORS_RE[i][1];
      var node = $(selector);
      if (node.length === 1) {
        var text = node.text();
        if (regex.test(text)) {
          return cleanAuthor(text);
        }
      }
    }
    return null;
  }
};
// An ordered list of meta tag names that denote
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var DATE_PUBLISHED_META_TAGS = ['article:published_time', 'displaydate', 'dc.date', 'dc.date.issued', 'rbpubdate', 'publish_date', 'pub_date', 'pagedate', 'pubdate', 'revision_date', 'doc_date', 'date_created', 'content_create_date', 'lastmodified', 'created', 'date'];

// An ordered list of XPath Selectors to find
// likely date published dates. From most explicit
// to least explicit.
var DATE_PUBLISHED_SELECTORS = ['.hentry .dtstamp.published', '.hentry .published', '.hentry .dtstamp.updated', '.hentry .updated', '.single .published', '.meta .published', '.meta .postDate', '.entry-date', '.byline .date', '.postmetadata .date', '.article_datetime', '.date-header', '.story-date', '.dateStamp', '#story .datetime', '.dateline', '.pubdate'];

// An ordered list of compiled regular expressions to find likely date
// published dates from the URL. These should always have the first
// reference be a date string that is parseable by dateutil.parser.parse
varabbrevMonthsStr='(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';varDATE_PUBLISHED_URL_RES=[// /2012/01/27/ but not /2012/01/293
newRegExp('/(20\\d{2}/\\d{2}/\\d{2})/','i'),// 20120127 or 20120127T but not 2012012733 or 8201201733
newRegExp('/(20\\d{2}/'+abbrevMonthsStr+'/[0-3]\\d)/','i')];varGenericDatePublishedExtractor={extract:functionextract(_ref){var$=_ref.$;varurl=_ref.url;varmetaCache=_ref.metaCache;vardatePublished=void0;// First, check to see if we have a matching meta tag
// that we can make use of.
// Don't try cleaning tags from this string
datePublished=extractFromMeta($,DATE_PUBLISHED_META_TAGS,metaCache,false);if(datePublished)returncleanDatePublished(datePublished);// Second, look through our selectors looking for potential
// date_published's.
datePublished=extractFromSelectors($,DATE_PUBLISHED_SELECTORS);if(datePublished)returncleanDatePublished(datePublished);// Lastly, look to see if a dately string exists in the URL
extract:functionextract(){returnnull;}};// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
var LEAD_IMAGE_URL_META_TAGS = ['og:image', 'twitter:image', 'image_src'];
var LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]'];
var POSITIVE_LEAD_IMAGE_URL_HINTS = ['upload', 'wp-content', 'large', 'photo', 'wp-image'];
var POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
var NEGATIVE_LEAD_IMAGE_URL_HINTS = ['spacer', 'sprite', 'blank', 'throbber', 'gradient', 'tile', 'bg', 'background', 'icon', 'social', 'header', 'hdr', 'advert', 'spinner', 'loader', 'loading', 'default', 'rating', 'share', 'facebook', 'twitter', 'theme', 'promo', 'ads', 'wp-includes'];
var NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i');
var GIF_RE = /\.gif(\?.*)?$/i;
var JPG_RE = /\.jpe?g(\?.*)?$/i;

// Combine a node's class and id into one signature string.
function getSig($node) {
  return ($node.attr('class') || '') + ' ' + ($node.attr('id') || '');
}

// Scores image urls based on a variety of heuristics.
function scoreImageUrl(url) {
  url = url.trim();
  var score = 0;
  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
    score += 20;
  }
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) {
    score -= 20;
  }
  // TODO: We might want to consider removing this as
  // gifs are much more common/popular than they once were
  if (GIF_RE.test(url)) {
    score -= 10;
  }
  if (JPG_RE.test(url)) {
    score += 10;
  }
  // PNGs are neutral.
  return score;
}

// Alt attribute usually means non-presentational image.
function scoreAttr($img) {
  if ($img.attr('alt')) {
    return 5;
  }
  return 0;
}

// Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them
function scoreByParents($img) {
  var score = 0;
  var $figParent = $img.parents('figure').first();
  if ($figParent.length === 1) {
    score += 25;
  }
  var $parent = $img.parent();
  var $gParent = void 0;
  if ($parent.length === 1) {
    $gParent = $parent.parent();
  }
  [$parent, $gParent].forEach(function ($node) {
    // FIX: guard against $gParent being undefined (when $parent did not
    // resolve to exactly one node), which previously crashed in getSig.
    if ($node && PHOTO_HINTS_RE$1.test(getSig($node))) {
      score += 15;
    }
  });
  return score;
}

// Look at our immediate sibling and see if it looks like it's a
// caption. Bonus if so.
function scoreBySibling($img) {
  var score = 0;
  var $sibling = $img.next();
  var sibling = $sibling.get(0);
  if (sibling && sibling.tagName === 'figcaption') {
    score += 25;
  }
  if (PHOTO_HINTS_RE$1.test(getSig($sibling))) {
    score += 15;
  }
  return score;
}

// Penalize tiny/skinny images; reward large image areas.
function scoreByDimensions($img) {
  var score = 0;
  var width = parseFloat($img.attr('width'));
  var height = parseFloat($img.attr('height'));
  var src = $img.attr('src');
  // Penalty for skinny images
  if (width && width <= 50) {
    score -= 50;
  }
  // Penalty for short images
  if (height && height <= 50) {
    score -= 50;
  }
  if (width && height && !src.includes('sprite')) {
    var area = width * height;
    if (area < 5000) {
      // Smaller than 50 x 100
      score -= 100;
    } else {
      score += Math.round(area / 1000);
    }
  }
  return score;
}

// Earlier images in the document score higher than later ones.
function scoreByPosition($imgs, index) {
  return $imgs.length / 2 - index;
}
// Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system
// to determine what the most likely image may be. Short circuits
// on really probable things like og:image meta tags.
//
// Potential signals to still take advantage of:
//   * domain
//   * weird aspect ratio
var GenericLeadImageUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var content = _ref.content;
    var metaCache = _ref.metaCache;
    var cleanUrl = void 0;

    // Check to see if we have a matching meta tag that we can make use of.
    // Moving this higher because common practice is now to use large
    // images on things like Open Graph or Twitter cards.
    var imageUrl = extractFromMeta($, LEAD_IMAGE_URL_META_TAGS, metaCache, false);
    if (imageUrl) {
      cleanUrl = clean$1(imageUrl);
      if (cleanUrl) return cleanUrl;
    }

    // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.
    var imgs = $('img', content).toArray();
    var imgScores = {};
    imgs.forEach(function (img, index) {
      var $img = $(img);
      var src = $img.attr('src');
      if (!src) return;
      var score = scoreImageUrl(src);
      score += scoreAttr($img);
      score += scoreByParents($img);
      score += scoreBySibling($img);
      score += scoreByDimensions($img);
      score += scoreByPosition(imgs, index);
      imgScores[src] = score;
    });

    var topPair = _Reflect$ownKeys$$(imgScores).reduce(function (acc, key) {
      return imgScores[key] > acc[1] ? [key, imgScores[key]] : acc;
    }, [null, 0]);
    var topUrl = topPair[0];
    var topScore = topPair[1];
    if (topScore > 0) {
      cleanUrl = clean$1(topUrl);
      if (cleanUrl) return cleanUrl;
    }

    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like <link rel="image_src" />.
    for (var i = 0; i < LEAD_IMAGE_URL_SELECTORS.length; i += 1) {
      var $node = $(LEAD_IMAGE_URL_SELECTORS[i]).first();
      var src = $node.attr('src');
      if (src) {
        cleanUrl = clean$1(src);
        if (cleanUrl) return cleanUrl;
      }
      var href = $node.attr('href');
      if (href) {
        cleanUrl = clean$1(href);
        if (cleanUrl) return cleanUrl;
      }
      var value = $node.attr('value');
      if (value) {
        cleanUrl = clean$1(value);
        if (cleanUrl) return cleanUrl;
      }
    }
    return null;
  }
};

function scoreSimilarity(score, articleUrl, href) {
  // Do this last and only if we have a real candidate, because it's
  // potentially expensive computationally. Compare the link to this
  // URL using difflib to get the % similarity of these URLs. On a
  // sliding scale, subtract points from this link based on
  // similarity.
  if (score > 0) {
    var similarity = new difflib$$.SequenceMatcher(null, articleUrl, href).ratio();
    // Subtract .1 from diff_percent when calculating modifier,
    // which means that if it's less than 10% different, we give a
    // bonus instead. Ex:
    //  3% different = +17.5 points
    //  10% different = 0 points
    //  20% different = -25 points
    // NOTE(review): the examples above correspond to an offset of 0.1,
    // but the code below uses 0.2 — confirm which is intended.
    var diffPercent = 1.0 - similarity;
    var diffModifier = -(250 * (diffPercent - 0.2));
    return score + diffModifier;
  }
  return 0;
}

function scoreLinkText(linkText, pageNum) {
  // If the link text can be parsed as a number, give it a minor
  // bonus, with a slight bias towards lower numbered pages. This is
  // so that pages that might not have 'next' in their text can still
  // get scored, and sorted properly by score.
  var score = 0;
  if (IS_DIGIT_RE.test(linkText.trim())) {
    var linkTextAsNum = parseInt(linkText, 10);
    // If it's the first page, we already got it on the first call.
    // Give it a negative score. Otherwise, up to page 10, give a
    // small bonus.
    if (linkTextAsNum < 2) {
      score = -30;
    } else {
      score = Math.max(0, 10 - linkTextAsNum);
    }
    // If it appears that the current page number is greater than
    // this links page number, it's a very bad sign. Give it a big
    // penalty.
    if (pageNum && pageNum >= linkTextAsNum) {
      score -= 50;
    }
  }
  return score;
}

function scorePageInLink(pageNum, isWp) {
  // page in the link = bonus. Intentionally ignore wordpress because
  // their ?p=123 link style gets caught by this even though it means
  // separate documents entirely.
  if (pageNum && !isWp) {
    return 50;
  }
  return 0;
}

var DIGIT_RE$2 = /\d/;

// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
varEXTRANEOUS_LINK_HINTS$1=['print','archive','comment','discuss','e-mail','email','share','reply','all','login','sign','single','adx','entry-unrelated'];varEXTRANEOUS_LINK_HINTS_RE$1=newRegExp(EXTRANEOUS_LINK_HINTS$1.join('|'),'i');// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
varNEXT_LINK_TEXT_RE$1=newRegExp('(next|weiter|continue|>([^|]|$)|»([^|]|$))','i');// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
varCAP_LINK_TEXT_RE$1=newRegExp('(first|last|end)','i');// Match any link text/classname/id that looks like it means the previous
// page.
varPREV_LINK_TEXT_RE$1=newRegExp('(prev|earl|old|new|<|«)','i');functionscoreExtraneousLinks(href){// If the URL itself contains extraneous values, give a penalty.
if(EXTRANEOUS_LINK_HINTS_RE$1.test(href)){return-25;}return0;}functionmakeSig$1($link){return($link.attr('class')||'')+' '+($link.attr('id')||'');}functionscoreByParents$1($link){// If a parent node contains paging-like classname or id, give a
// bonus. Additionally, if a parent_node contains bad content
var$parent=$link.parent();varpositiveMatch=false;varnegativeMatch=false;varscore=0;_Array$from(range(0,4)).forEach(function(){if($parent.length===0){return;}varparentData=makeSig$1($parent,' ');// If we have 'page' or 'paging' in our data, that's a good
if(!positiveMatch&&PAGE_RE.test(parentData)){positiveMatch=true;score+=25;}// If we have 'comment' or something in our data, and
// we don't have something like 'content' as well, that's
// a bad sign. Give a penalty.
if(!negativeMatch&&NEGATIVE_SCORE_RE.test(parentData)&&EXTRANEOUS_LINK_HINTS_RE$1.test(parentData)){if(!POSITIVE_SCORE_RE.test(parentData)){negativeMatch=true;score-=25;}}$parent=$parent.parent();});returnscore;}functionscorePrevLink(linkData){// If the link has something like "previous", its definitely
// an old link, skip it.
if(PREV_LINK_TEXT_RE$1.test(linkData)){return-200;}return0;}functionshouldScore(href,articleUrl,baseUrl,parsedUrl,linkText,previousUrls){// skip if we've already fetched this url
if(previousUrls.find(function(url){returnhref===url;})!==undefined){returnfalse;}// If we've already parsed this URL, or the URL matches the base
if(!baseRegex.test(href)){return-25;}return0;}functionscoreNextLinkText(linkData){// Things like "next", ">>", etc.
if(NEXT_LINK_TEXT_RE$1.test(linkData)){return50;}return0;}functionscoreCapLinks(linkData){// Cap links are links like "last", etc.
if(CAP_LINK_TEXT_RE$1.test(linkData)){// If we found a link like "last", but we've already seen that
// this link is also "next", it's fine. If it's not been
// previously marked as "next", then it's probably bad.
// Penalize.
if(NEXT_LINK_TEXT_RE$1.test(linkData)){return-65;}}return0;}functionmakeBaseRegex(baseUrl){returnnewRegExp('^'+baseUrl,'i');}functionmakeSig($link,linkText){return(linkText||$link.text())+' '+($link.attr('class')||'')+' '+($link.attr('id')||'');}functionscoreLinks(_ref){varlinks=_ref.links;vararticleUrl=_ref.articleUrl;varbaseUrl=_ref.baseUrl;varparsedUrl=_ref.parsedUrl;var$=_ref.$;var_ref$previousUrls=_ref.previousUrls;varpreviousUrls=_ref$previousUrls===undefined?[]:_ref$previousUrls;parsedUrl=parsedUrl||URL$$.parse(articleUrl);varbaseRegex=makeBaseRegex(baseUrl);varisWp=isWordpress($);// Loop through all links, looking for hints that they may be next-page
// links. Things like having "page" in their textContent, className or
// id, or being a child of a node with a page-y className or id.
//
// After we do that, assign each page a score, and pick the one that
// looks most like the next page link, as long as its score is strong
// enough to have decent confidence.
varscoredPages=links.reduce(function(possiblePages,link){// Remove any anchor data since we don't do a good job
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
varhref=removeAnchor(link.attribs.href);var$link=$(link);varlinkText=$link.text();if(!shouldScore(href,articleUrl,baseUrl,parsedUrl,linkText,previousUrls)){returnpossiblePages;}// ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if(!possiblePages[href]){possiblePages[href]={score:0,linkText:linkText,href:href};}else{possiblePages[href].linkText=possiblePages[href].linkText+'|'+linkText;}varpossiblePage=possiblePages[href];varlinkData=makeSig($link,linkText);varpageNum=pageNumFromUrl(href);varscore=scoreBaseUrl(href,baseRegex);score+=scoreNextLinkText(linkData);score+=scoreCapLinks(linkData);score+=scorePrevLink(linkData);score+=scoreByParents$1($link);score+=scoreExtraneousLinks(href);score+=scorePageInLink(pageNum,isWp);score+=scoreLinkText(linkText,pageNum);score+=scoreSimilarity(score,articleUrl,href);possiblePage.score=score;returnpossiblePages;},{});return_Reflect$ownKeys$$(scoredPages).length===0?null:scoredPages;}// Looks for and returns next page url
var GenericNextPageUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var url = _ref.url;
    var parsedUrl = _ref.parsedUrl;
    var _ref$previousUrls = _ref.previousUrls;
    var previousUrls = _ref$previousUrls === undefined ? [] : _ref$previousUrls;

    parsedUrl = parsedUrl || URL$$.parse(url);

    var articleUrl = removeAnchor(url);
    var baseUrl = articleBaseUrl(url, parsedUrl);

    var links = $('a[href]').toArray();

    var scoredLinks = scoreLinks({
      links: links,
      articleUrl: articleUrl,
      baseUrl: baseUrl,
      parsedUrl: parsedUrl,
      $: $,
      previousUrls: previousUrls
    });

    // If no links were scored, return null
    if (!scoredLinks) return null;

    // now that we've scored all possible pages,
    // find the one with the highest score.
    var topPage = _Reflect$ownKeys$$(scoredLinks).reduce(function (acc, link) {
      var scoredLink = scoredLinks[link];
      return scoredLink.score > acc.score ? scoredLink : acc;
    }, { score: -100 });

    // If the score is less than 50, we're not confident enough to use it,
    // so we fail.
    if (topPage.score >= 50) {
      return topPage.href;
    }
    return null;
  }
};

var CANONICAL_META_SELECTORS = ['og:url'];

function parseDomain(url) {
  var parsedUrl = URL$$.parse(url);
  var hostname = parsedUrl.hostname;
  return hostname;
}

function result(url) {
  return {
    url: url,
    domain: parseDomain(url)
  };
}

var GenericUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var url = _ref.url;
    var metaCache = _ref.metaCache;

    // Prefer the canonical link element when present and non-empty.
    var $canonical = $('link[rel=canonical]');
    if ($canonical.length !== 0) {
      var href = $canonical.attr('href');
      if (href) {
        return result(href);
      }
    }

    var metaUrl = extractFromMeta($, CANONICAL_META_SELECTORS, metaCache);
    if (metaUrl) {
      return result(metaUrl);
    }

    return result(url);
  }
};

var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];

// Collapse whitespace and truncate to maxLength with an ellipsis.
function clean$2(content, $) {
  var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;

  content = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize$$(content, maxLength, { ellipse: '…' });
}

var GenericExcerptExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var content = _ref.content;
    var metaCache = _ref.metaCache;

    var excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean$2(stripTags(excerpt, $));
    }
    // Fall back to excerpting from the extracted content
    var maxLength = 200;
    var shortContent = content.slice(0, maxLength * 5);
    return clean$2($(shortContent).text(), $, maxLength);
  }
};

var GenericWordCountExtractor = {
  extract: function extract(_ref) {
    var content = _ref.content;

    var $ = cheerio$$.load(content);
    var text = normalizeSpaces($('div').first().text());
    return text.split(/\s/).length;
  }
};

var GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,
  direction: function direction(_ref) {
    var title = _ref.title;
    return stringDirection$$.getDirection(title);
  },
  extract: function extract(options) {
    var html = options.html;

    if (html) {
      var $ = cheerio$$.load(html);
      options.$ = $;
    }

    var title = this.title(options);
    var date_published = this.date_published(options);
    var author = this.author(options);
    var content = this.content(_extends$$({}, options, { title: title }));
    var lead_image_url = this.lead_image_url(_extends$$({}, options, { content: content }));
    var dek = this.dek(_extends$$({}, options, { content: content }));
    var next_page_url = this.next_page_url(options);
    var excerpt = this.excerpt(_extends$$({}, options, { content: content }));
    var word_count = this.word_count(_extends$$({}, options, { content: content }));
    var direction = this.direction({ title: title });

    var _url_and_domain = this.url_and_domain(options);
    var url = _url_and_domain.url;
    var domain = _url_and_domain.domain;

    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    };
  }
};

// Pick a custom extractor for this hostname (or base domain), falling back
// to the generic one.
function getExtractor(url, parsedUrl) {
  parsedUrl = parsedUrl || URL$$.parse(url);
  var _parsedUrl = parsedUrl;
  var hostname = _parsedUrl.hostname;
  var baseDomain = hostname.split('.').slice(-2).join('.');
  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
}

// Remove elements by an array of selectors
functioncleanBySelectors($content,$,_ref){varclean=_ref.clean;if(!clean)return$content;$(clean.join(','),$content).remove();return$content;}// Transform matching elements
functiontransformElements($content,$,_ref2){vartransforms=_ref2.transforms;if(!transforms)return$content;_Reflect$ownKeys$$(transforms).forEach(function(key){var$matches=$(key,$content);varvalue=transforms[key];// If value is a string, convert directly
if(typeofvalue==='string'){$matches.each(function(index,node){convertNodeTo($(node),$,transforms[key]);});}elseif(typeofvalue==='function'){// If value is function, apply function to node
$matches.each(function(index,node){varresult=value($(node),$);// If function returns a string, convert node to that value
if(typeofresult==='string'){convertNodeTo($(node),$,result);}});}});return$content;}functionfindMatchingSelector($,selectors){returnselectors.find(function(selector){if(Array.isArray(selector)){var_selector=_slicedToArray$$(selector,2);vars=_selector[0];varattr=_selector[1];return$(s).length===1&&$(s).attr(attr)&&$(s).attr(attr).trim()!=='';}return$(selector).length===1&&$(selector).text().trim()!=='';});}functionselect(opts){var$=opts.$;vartype=opts.type;varextractionOpts=opts.extractionOpts;var_opts$extractHtml=opts.extractHtml;varextractHtml=_opts$extractHtml===undefined?false:_opts$extractHtml;// Skip if there's not extraction for this type
if(typeofextractionOpts==='string')returnextractionOpts;varselectors=extractionOpts.selectors;var_extractionOpts$defau=extractionOpts.defaultCleaner;vardefaultCleaner=_extractionOpts$defau===undefined?true:_extractionOpts$defau;varmatchingSelector=findMatchingSelector($,selectors);if(!matchingSelector)returnnull;// Declaring result; will contain either
$content.wrap($('<div></div>'));$content=$content.parent();$content=transformElements($content,$,extractionOpts);$content=cleanBySelectors($content,$,extractionOpts);$content=Cleaners[type]($content,_extends$$({},opts,{defaultCleaner:defaultCleaner}));return$.html($content);}varresult=void0;// if selector is an array (e.g., ['img', 'src']),
if(defaultCleaner){returnCleaners[type](result,opts);}returnresult;}functionextractResult(opts){vartype=opts.type;varextractor=opts.extractor;var_opts$fallback=opts.fallback;varfallback=_opts$fallback===undefined?true:_opts$fallback;varresult=select(_extends$$({},opts,{extractionOpts:extractor[type]}));// If custom parser succeeds, return the result
if(result){returnresult;}// If nothing matches the selector, and fallback is enabled,
if(fallback)returnGenericExtractor[type](opts);returnnull;}varRootExtractor={extract:functionextract(){varextractor=arguments.length>0&&arguments[0]!==undefined?arguments[0]:GenericExtractor;varopts=arguments[1];var_opts=opts;varcontentOnly=_opts.contentOnly;varextractedTitle=_opts.extractedTitle;// This is the generic extractor. Run its extract method
if(extractor.domain==='*')returnextractor.extract(opts);opts=_extends$$({},opts,{extractor:extractor});if(contentOnly){var_content=extractResult(_extends$$({},opts,{type:'content',extractHtml:true,title:extractedTitle}));return{content:_content};}vartitle=extractResult(_extends$$({},opts,{type:'title'}));vardate_published=extractResult(_extends$$({},opts,{type:'date_published'}));varauthor=extractResult(_extends$$({},opts,{type:'author'}));varnext_page_url=extractResult(_extends$$({},opts,{type:'next_page_url'}));varcontent=extractResult(_extends$$({},opts,{type:'content',extractHtml:true,title:title}));varlead_image_url=extractResult(_extends$$({},opts,{type:'lead_image_url',content:content}));varexcerpt=extractResult(_extends$$({},opts,{type:'excerpt',content:content}));vardek=extractResult(_extends$$({},opts,{type:'dek',content:content,excerpt:excerpt}));varword_count=extractResult(_extends$$({},opts,{type:'word_count',content:content}));vardirection=extractResult(_extends$$({},opts,{type:'direction',title:title}));var_ref3=extractResult(_extends$$({},opts,{type:'url_and_domain'}))||{url:null,domain:null};varurl=_ref3.url;vardomain=_ref3.domain;return{title:title,content:content,author:author,date_published:date_published,lead_image_url:lead_image_url,dek:dek,next_page_url:next_page_url,url:url,domain:domain,excerpt:excerpt,word_count:word_count,direction:direction};}};varcollectAllPages=function(){var_ref=_asyncToGenerator(_regeneratorRuntime.mark(function_callee(_ref2){varnext_page_url=_ref2.next_page_url;varhtml=_ref2.html;var$=_ref2.$;varmetaCache=_ref2.metaCache;varresult=_ref2.result;varExtractor=_ref2.Extractor;vartitle=_ref2.title;varurl=_ref2.url;varpages,previousUrls,extractorOpts,nextPageResult,word_count;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:// At this point, we've fetched just the first page
_context.next=10;returnResource.create(url,html,parsedUrl);case10:$=_context.sent;if(!$.error){_context.next=13;break;}return_context.abrupt('return',$);case13:html=$.html();// Cached value of every meta name in our document.
metaCache=$('meta').map(function(_,node){return$(node).attr('name');}).toArray();result=RootExtractor.extract(Extractor,{url:url,html:html,$:$,metaCache:metaCache,parsedUrl:parsedUrl,fallback:fallback});_result=result;title=_result.title;next_page_url=_result.next_page_url;// Fetch more pages if next_page_url found
if(!(fetchAllPages&&next_page_url)){_context.next=25;break;}_context.next=22;returncollectAllPages({Extractor:Extractor,next_page_url:next_page_url,html:html,$:$,metaCache:metaCache,result:result,title:title,url:url});case22:result=_context.sent;_context.next=26;break;case25:result=_extends$$({},result,{total_pages:1,rendered_pages:1});case26:return_context.abrupt('return',result);case27:case'end':return_context.stop();}}},_callee,_this);}))();},// A convenience method for getting a resource
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
varNEGATIVE_SCORE_HINTS=['adbox','advert','author','bio','bookmark','bottom','byline','clear','com-','combx','comment','comment\\B','contact','copy','credit','crumb','date','deck','excerpt','featured',// tnr.com has a featured_content which throws us off
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
// NOTE(review): this array literal was truncated in this copy after
// 'presence_control_external' (it was never closed). The remaining entries
// are restored from the upstream mercury-parser source — verify against the
// original bundle.
var UNLIKELY_CANDIDATES_BLACKLIST$1 = ['ad-break', 'adbox', 'advert', 'addthis', 'agegate', 'aux', 'blogger-labels', 'combx', 'comment', 'conversation', 'disqus', 'entry-unrelated', 'extra', 'foot', 'form', 'header', 'hidden', 'loader', 'login', // Note: This can hit 'blogindex'.
'menu', 'meta', 'nav', 'pager', 'pagination', 'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup', 'printfriendly', 'related', 'remove', 'remark', 'rss', 'share', 'shoutbox', 'sidebar', 'sociable', 'sponsor', 'taxonomy', 'tools', 'viral', 'wallop'];
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
varNEGATIVE_SCORE_HINTS$1=['adbox','advert','author','bio','bookmark','bottom','byline','clear','com-','combx','comment','comment\\B','contact','copy','credit','crumb','date','deck','excerpt','featured',// tnr.com has a featured_content which throws us off
// Scaffold template for a new custom extractor (tagged template literal).
// NOTE(review): this statement was split mid-string across two lines in this
// copy (an unterminated string literal); rejoined, and the mangled variable
// name `var_templateObject` restored to `var _templateObject`.
var _templateObject = _taggedTemplateLiteral(['\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n export const CustomExtractor = {\n domain: \'', '\',\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n }\n '], ['\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n export const CustomExtractor = {\n domain: \'', '\',\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n next_page_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n excerpt: {\n selectors: [\n // enter selectors\n ],\n },\n }\n ']);
var_templateObject$1=_taggedTemplateLiteral(['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const html =\n fs.readFileSync(\'','\');\n const articleUrl =\n \'','\';\n\n const { ',' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n '],['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const html =\n fs.readFileSync(\'','\');\n const articleUrl =\n \'','\';\n\n const { ',' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n ']);
// Scaffold template for a new extractor test file (tagged template literal).
// NOTE(review): this statement was split mid-string across two lines in this
// copy (an unterminated string literal); rejoined, and the mangled variable
// name `var_templateObject2` restored to `var _templateObject2`.
var _templateObject2 = _taggedTemplateLiteral(['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n // Rename CustomExtractor\n describe(\'CustomExtractor\', () => {\n it(\'is selected properly\', () => {\n // To pass this test, rename your extractor in\n // ', '/index.js\n // (e.g., CustomExtractor => NYTimesExtractor)\n // then add your new extractor to\n // src/extractors/all.js\n const url =\n \'', '\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ', '\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ', '/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'', '\');\n const url =\n \'', '\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, null);\n });\n });\n '], ['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n // Rename CustomExtractor\n describe(\'CustomExtractor\', () => {\n it(\'is selected properly\', () => {\n // To pass this test, rename your extractor in\n // ', '/index.js\n // (e.g., CustomExtractor => NYTimesExtractor)\n // then add your new extractor to\n // src/extractors/all.js\n const url =\n \'', '\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ', '\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ', '/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'', '\');\n const url =\n \'', '\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, null);\n });\n });\n ']);
confirm(generateScaffold,[url,file,result],'Generating parser and tests');
console.log('Your custom site extractor has been set up. To get started building it, run\n npm run watch:test -- '+hostname);
}else{
console.log('\n It looks like you already have a custom parser for this url.\n The page you linked to has been added to '+file+'. Copy and paste\n the following code to use that page in your tests:\n const html = fs.readFileSync(\''+file+'\');');