varmercury=createCommonjsModule(function(module){'use strict';function_interopDefault(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault(regenerator);var_extends$$=_interopDefault(_extends);var_asyncToGenerator=_interopDefault(asyncToGenerator);varURL$$=_interopDefault(URL);varcheerio$$=_interopDefault(cheerio);var_Promise=_interopDefault(promise);varrequest$$=_interopDefault(request);var_Reflect$ownKeys$$=_interopDefault(_Reflect$ownKeys);varstringDirection$$=_interopDefault(stringDirection);var_getIterator$$=_interopDefault(_getIterator);var_defineProperty=_interopDefault(defineProperty);var_slicedToArray$$=_interopDefault(_slicedToArray);var_typeof$$=_interopDefault(_typeof);varvalidUrl$$=_interopDefault(validUrl);varmoment$$=_interopDefault(moment);varwuzzy$$=_interopDefault(wuzzy);vardifflib$$=_interopDefault(difflib);var_Array$from=_interopDefault(from);varellipsize$$=_interopDefault(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length>0&&arguments[0]!==undefined?arguments[0]:1;varend=arguments.length>1&&arguments[1]!==undefined?arguments[1]:1;return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
varmercury=createCommonjsModule(function(module){'use strict';function_interopDefault(ex){returnex&&(typeofex==='undefined'?'undefined':_typeof$1(ex))==='object'&&'default'inex?ex['default']:ex;}var_regeneratorRuntime=_interopDefault(regenerator);var_extends$$=_interopDefault(_extends);var_asyncToGenerator=_interopDefault(asyncToGenerator);varURL$$=_interopDefault(URL);varcheerio$$=_interopDefault(cheerio);var_Promise=_interopDefault(promise);varrequest$$=_interopDefault(request);var_Reflect$ownKeys=_interopDefault(ownKeys);var_toConsumableArray=_interopDefault(toConsumableArray);var_slicedToArray=_interopDefault(slicedToArray);varstringDirection$$=_interopDefault(stringDirection);var_getIterator=_interopDefault(getIterator);var_defineProperty=_interopDefault(defineProperty);var_typeof$$=_interopDefault(_typeof);varvalidUrl$$=_interopDefault(validUrl);varmoment$$=_interopDefault(moment);varwuzzy$$=_interopDefault(wuzzy);vardifflib$$=_interopDefault(difflib);var_Array$from=_interopDefault(from);varellipsize$$=_interopDefault(ellipsize);var_marked=[range].map(_regeneratorRuntime.mark);functionrange(){varstart=arguments.length>0&&arguments[0]!==undefined?arguments[0]:1;varend=arguments.length>1&&arguments[1]!==undefined?arguments[1]:1;return_regeneratorRuntime.wrap(functionrange$(_context){while(1){switch(_context.prev=_context.next){case0:if(!(start<=end)){_context.next=5;break;}_context.next=3;returnstart+=1;case3:_context.next=0;break;case5:case"end":return_context.stop();}}},_marked[0],this);}// extremely simple url validation as a first step
functionvalidateUrl(_ref){varhostname=_ref.hostname;// If this isn't a valid url, return an error message
return!!hostname;}varErrors={badUrl:{error:true,messages:'The url parameter passed does not look like a valid URL. Please check your data and try again.'}};varREQUEST_HEADERS={'User-Agent':'Readability - http://readability.com/about/'};// The number of milliseconds to attempt to fetch a resource before timing out.
varFETCH_TIMEOUT=10000;// Content types that we do not extract content from
@ -64,7 +229,7 @@ function normalizeMetaTags($){$=convertMetaProp($,'content','value');$=convertMe
// Many sites will have img tags with no source, or an image tag with a src
// attribute that a is a placeholer. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
functionconvertLazyLoadedImages($){$('img').each(function(_,img){_Reflect$ownKeys$$(img.attribs).forEach(function(attr){varvalue=img.attribs[attr];if(attr!=='src'&&IS_LINK.test(value)&&IS_IMAGE.test(value)){$(img).attr('src',value);}});});return$;}functionisComment(index,node){returnnode.type==='comment';}functioncleanComments($){$.root().find('*').contents().filter(isComment).remove();return$;}functionclean($){$(TAGS_TO_REMOVE).remove();$=cleanComments($);return$;}varResource={// Create a Resource.
functionconvertLazyLoadedImages($){$('img').each(function(_,img){_Reflect$ownKeys(img.attribs).forEach(function(attr){varvalue=img.attribs[attr];if(attr!=='src'&&IS_LINK.test(value)&&IS_IMAGE.test(value)){$(img).attr('src',value);}});});return$;}functionisComment(index,node){returnnode.type==='comment';}functioncleanComments($){$.root().find('*').contents().filter(isComment).remove();return$;}functionclean($){$(TAGS_TO_REMOVE).remove();$=cleanComments($);return$;}varResource={// Create a Resource.
//
// :param url: The URL for the document we should retrieve.
// :param response: If set, use as the response rather than
@ -72,7 +237,7 @@ function convertLazyLoadedImages($){$('img').each(function(_,img){_Reflect$ownKe
// string.
create:functioncreate(url,preparedResponse,parsedUrl){var_this=this;return_asyncToGenerator(_regeneratorRuntime.mark(function_callee(){varresult,validResponse;return_regeneratorRuntime.wrap(function_callee$(_context){while(1){switch(_context.prev=_context.next){case0:result=void0;if(!preparedResponse){_context.next=6;break;}validResponse={statusMessage:'OK',statusCode:200,headers:{'content-type':'text/html','content-length':500}};result={body:preparedResponse,response:validResponse};_context.next=9;break;case6:_context.next=8;returnfetchResource(url,parsedUrl);case8:result=_context.sent;case9:if(!result.error){_context.next=11;break;}return_context.abrupt('return',result);case11:return_context.abrupt('return',_this.generateDoc(result));case12:case'end':return_context.stop();}}},_callee,_this);}))();},generateDoc:functiongenerateDoc(_ref){varcontent=_ref.body;varresponse=_ref.response;varcontentType=response.headers['content-type'];// TODO: Implement is_text function from
if(!contentType.includes('html')&&!contentType.includes('text')){thrownewError('Content does not appear to be text.');}var$=cheerio$$.load(content,{normalizeWhitespace:true});if($.root().children().length===0){thrownewError('No children, likely a bad parse.');}$=normalizeMetaTags($);$=convertLazyLoadedImages($);$=clean($);return$;}};var NYMagExtractor={domain:'nymag.com',content:{// Order by most likely. Extractor will stop on first occurrence
if(!contentType.includes('html')&&!contentType.includes('text')){thrownewError('Content does not appear to be text.');}var$=cheerio$$.load(content,{normalizeWhitespace:true});if($.root().children().length===0){thrownewError('No children, likely a bad parse.');}$=normalizeMetaTags($);$=convertLazyLoadedImages($);$=clean($);return$;}};varmerge=functionmerge(extractor,domains){returndomains.reduce(function(acc,domain){acc[domain]=extractor;returnacc;},{});};functionmergeSupportedDomains(extractor){returnextractor.supportedDomains?merge(extractor,[extractor.domain].concat(_toConsumableArray(extractor.supportedDomains))):merge(extractor,[extractor.domain]);}varNYMagExtractor={domain:'nymag.com',content:{// Order by most likely. Extractor will stop on first occurrence
selectors:['div.article-content','section.body','article.article'],// Selectors to remove from the extracted content
clean:['.ad','.single-related-story'],// Object of tranformations to make on matched elements
// Each key is the selector, each value is the tag to
selectors:['.post-content noscript'],// Selectors to remove from the extracted content
clean:[],// Convert the noscript tag to a div
transforms:{noscript:'div'}},author:{selectors:['.post-author-name']},title:{selectors:['h2.title']},date_published:{selectors:['span.publishdate']}};varWikipediaExtractor={domain:'wikipedia.org',content:{selectors:['#mw-content-text'],defaultCleaner:false,// transform top infobox to an image with caption
transforms:{noscript:'div'}},author:{selectors:['.post-author-name']},title:{selectors:['.post h2.title']},date_published:{selectors:['span.publishdate']}};varWikipediaExtractor={domain:'wikipedia.org',content:{selectors:['#mw-content-text'],defaultCleaner:false,// transform top infobox to an image with caption
transforms:{'.infobox img':functioninfoboxImg($node){var$parent=$node.parents('.infobox');// Only prepend the first image in .infobox
if($parent.children('img').length===0){$parent.prepend($node);}},'.infobox caption':'figcaption','.infobox':'figure'},// Selectors to remove from the extracted content
'.permalink[role=main]':functionpermalinkRoleMain($node,$){vartweets=$node.find('.tweet');var$tweetContainer=$('<div id="TWEETS_GO_HERE"></div>');$tweetContainer.append(tweets);$node.replaceWith($tweetContainer);},// Twitter wraps @ with s, which
@ -181,10 +346,50 @@ transforms:[],// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:['figcaption']},date_published:{selectors:[['.story-main-content .timestamp time[datetime]','datetime']]},lead_image_url:{selectors:[// enter lead_image_url selectors
['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name="description"]','value']]},next_page_url:null,excerpt:null};varExtractors={'nymag.com':NYMagExtractor,'blogspot.com':BloggerExtractor,'wikipedia.org':WikipediaExtractor,'twitter.com':TwitterExtractor,'www.nytimes.com':NYTimesExtractor,'www.theatlantic.com':TheAtlanticExtractor,'www.newyorker.com':NewYorkerExtractor,'www.wired.com':WiredExtractor,'www.msn.com':MSNExtractor,'www.yahoo.com':YahooExtractor,'www.buzzfeed.com':BuzzfeedExtractor,'fandom.wikia.com':WikiaExtractor,'www.littlethings.com':LittleThingsExtractor,'www.politico.com':PoliticoExtractor};// Spacer images to be removed
varSPACER_RE=newRegExp('trans|transparent|spacer|blank','i');// A list of tags to strip from the output if we encounter them.
['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name="description"]','value']]},next_page_url:null,excerpt:null};varDeadspinExtractor={domain:'deadspin.com',supportedDomains:['jezebel.com','lifehacker.com','kotaku.com','gizmodo.com','jalopnik.com','kinja.com'],title:{selectors:['h1.headline']},author:{selectors:['.author']},content:{selectors:['.post-content','.entry-content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{'iframe.lazyload[data-recommend-id^="youtube://"]':functioniframeLazyloadDataRecommendIdYoutube($node){varyoutubeId=$node.attr('id').split('youtube-')[1];$node.attr('src','https://www.youtube.com/embed/'+youtubeId);}},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['time.updated[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[// enter selectors
]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};// Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
varBroadwayWorldExtractor={domain:'www.broadwayworld.com',title:{selectors:['h1.article-title']},author:{selectors:['span[itemprop=author]']},content:{selectors:['div[itemprop=articlebody]'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['meta[itemprop=datePublished]','value']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name="og:description"]','value']]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};// Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
varApartmentTherapyExtractor={domain:'www.apartmenttherapy.com',title:{selectors:['h1.headline']},author:{selectors:['.PostByline__name']},content:{selectors:['div.post__content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms:{'div[data-render-react-id="images/LazyPicture"]':functiondivDataRenderReactIdImagesLazyPicture($node,$){vardata=JSON.parse($node.attr('data-props'));varsrc=data.sources[0].src;var$img=$('<img />').attr('src',src);$node.replaceWith($img);}},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['.PostByline__timestamp[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[['meta[name=description]','value']]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};varMediumExtractor={domain:'medium.com',supportedDomains:['trackchanges.postlight.com'],title:{selectors:['h1']},author:{selectors:[['meta[name="author"]','value']]},content:{selectors:['.section-content'],// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
$node.attr('src','https://www.youtube.com/embed/'+youtubeId);var$parent=$node.parents('figure');$parent.prepend($node.clone());$node.remove();}}},// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean:[]},date_published:{selectors:[['time[datetime]','datetime']]},lead_image_url:{selectors:[['meta[name="og:image"]','value']]},dek:{selectors:[// enter selectors
]},next_page_url:{selectors:[// enter selectors
]},excerpt:{selectors:[// enter selectors
]}};varExtractors=_extends$$({'nymag.com':NYMagExtractor,'blogspot.com':BloggerExtractor,'wikipedia.org':WikipediaExtractor,'twitter.com':TwitterExtractor,'www.nytimes.com':NYTimesExtractor,'www.theatlantic.com':TheAtlanticExtractor,'www.newyorker.com':NewYorkerExtractor,'www.wired.com':WiredExtractor,'www.msn.com':MSNExtractor,'www.yahoo.com':YahooExtractor,'www.buzzfeed.com':BuzzfeedExtractor,'fandom.wikia.com':WikiaExtractor,'www.littlethings.com':LittleThingsExtractor,'www.politico.com':PoliticoExtractor},mergeSupportedDomains(DeadspinExtractor),{'www.broadwayworld.com':BroadwayWorldExtractor,'www.apartmenttherapy.com':ApartmentTherapyExtractor},mergeSupportedDomains(MediumExtractor));// Spacer images to be removed
varSPACER_RE=newRegExp('trans|transparent|spacer|blank','i');// The class we will use to mark elements we want to keep
// but would normally remove
varKEEP_CLASS='mercury-parser-keep';varKEEP_SELECTORS=['iframe[src^="https://www.youtube.com"]','iframe[src^="http://www.youtube.com"]','iframe[src^="https://player.vimeo"]','iframe[src^="http://player.vimeo"]'];// A list of tags to strip from the output if we encounter them.
// (By-reference mutation, though. Returned just for convenience.)
functionconvertToParagraphs($){$=brsToPs($);$=convertDivs($);$=convertSpans($);return$;}functionconvertNodeTo($node,$){vartag=arguments.length>2&&arguments[2]!==undefined?arguments[2]:'p';varnode=$node.get(0);if(!node){return$;}var_$node$get=$node.get(0);varattribs=_$node$get.attribs;varattribString=_Reflect$ownKeys$$(attribs).map(function(key){returnkey+'='+attribs[key];}).join(' ');$node.replaceWith('<'+tag+' '+attribString+'>'+$node.contents()+'</'+tag+'>');return$;}functioncleanForHeight($img,$){varheight=parseInt($img.attr('height'),10);varwidth=parseInt($img.attr('width'),10)||20;// Remove images that explicitly have very small heights or
functionconvertToParagraphs($){$=brsToPs($);$=convertDivs($);$=convertSpans($);return$;}functionconvertNodeTo($node,$){vartag=arguments.length>2&&arguments[2]!==undefined?arguments[2]:'p';varnode=$node.get(0);if(!node){return$;}var_$node$get=$node.get(0);varattribs=_$node$get.attribs;varattribString=_Reflect$ownKeys(attribs).map(function(key){returnkey+'='+attribs[key];}).join(' ');$node.replaceWith('<'+tag+' '+attribString+'>'+$node.contents()+'</'+tag+'>');return$;}functioncleanForHeight($img,$){varheight=parseInt($img.attr('height'),10);varwidth=parseInt($img.attr('width'),10)||20;// Remove images that explicitly have very small heights or
// widths, because they are most likely shims or icons,
// which aren't very useful for reading.
if((height||20)<10||width<10){$img.remove();}elseif(height){// Don't ever specify a height on images, so that we can
@ -287,17 +492,22 @@ if((height||20)<10||width<10){$img.remove();}else if(height){// Don't ever speci
// aspect ratio.
$img.removeAttr('height');}return$;}// Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
functionremoveSpacers($img,$){if(SPACER_RE.test($img.attr('src'))){$img.remove();}return$;}functioncleanImages($article,$){$article.find('img').each(function(index,img){var$img=$(img);cleanForHeight($img,$);removeSpacers($img,$);});return$;}functionstripJunkTags(article,$){vartags=arguments.length>2&&arguments[2]!==undefined?arguments[2]:[];if(tags.length===0){tags=STRIP_OUTPUT_TAGS;}$(tags.join(','),article).remove();return$;}// H1 tags are typically the article title, which should be extracted
functionremoveSpacers($img,$){if(SPACER_RE.test($img.attr('src'))){$img.remove();}return$;}functioncleanImages($article,$){$article.find('img').each(function(index,img){var$img=$(img);cleanForHeight($img,$);removeSpacers($img,$);});return$;}functionmarkToKeep(article,$,url){vartags=arguments.length>3&&arguments[3]!==undefined?arguments[3]:[];if(tags.length===0){tags=KEEP_SELECTORS;}if(url){var_URL$parse=URL$$.parse(url);varprotocol=_URL$parse.protocol;varhostname=_URL$parse.hostname;tags=[].concat(_toConsumableArray(tags),['iframe[src^="'+protocol+'//'+hostname+'"]']);}$(tags.join(','),article).addClass(KEEP_CLASS);return$;}functionstripJunkTags(article,$){vartags=arguments.length>2&&arguments[2]!==undefined?arguments[2]:[];if(tags.length===0){tags=STRIP_OUTPUT_TAGS;}// Remove matching elements, but ignore
// any element with a class of mercury-parser-keep
$(tags.join(','),article).not('.'+KEEP_CLASS).remove();// Remove the mercury-parser-keep class from result
$('.'+KEEP_CLASS,article).removeClass(KEEP_CLASS);return$;}// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
$article.find('*').each(function(index,node){node.attribs=_Reflect$ownKeys$$(node.attribs).reduce(function(acc,attr){if(WHITELIST_ATTRS_RE.test(attr)){return_extends$$({},acc,_defineProperty({},attr,node.attribs[attr]));}returnacc;},{});});}// function removeAttrs(article, $) {
functioncleanHOnes(article,$){var$hOnes=$('h1',article);if($hOnes.length<3){$hOnes.each(function(index,node){return$(node).remove();});}else{$hOnes.each(function(index,node){convertNodeTo($(node),$,'h2');});}return$;}functionremoveAllButWhitelist($article){$article.find('*').each(function(index,node){node.attribs=_Reflect$ownKeys(node.attribs).reduce(function(acc,attr){if(WHITELIST_ATTRS_RE.test(attr)){return_extends$$({},acc,_defineProperty({},attr,node.attribs[attr]));}returnacc;},{});});return$article;}// function removeAttrs(article, $) {
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
@ -397,7 +607,7 @@ addScoreTo($parent.parent(),$,rawScore/2,weightNodes);}});return $;}// score con
// content score, grandparents half
functionscoreContent($){varweightNodes=arguments.length>1&&arguments[1]!==undefined?arguments[1]:true;// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS$1.forEach(function(_ref){var_ref2=_slicedToArray$$(_ref,2);varparentSelector=_ref2[0];varchildSelector=_ref2[1];$(parentSelector+' '+childSelector).each(function(index,node){addScore($(node).parent(parentSelector),$,80);});});// Doubling this again
HNEWS_CONTENT_SELECTORS$1.forEach(function(_ref){var_ref2=_slicedToArray(_ref,2);varparentSelector=_ref2[0];varchildSelector=_ref2[1];$(parentSelector+' '+childSelector).each(function(index,node){addScore($(node).parent(parentSelector),$,80);});});// Doubling this again
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
functionarticleBaseUrl(url,parsed){varparsedUrl=parsed||URL$$.parse(url);varprotocol=parsedUrl.protocol;varhost=parsedUrl.host;varpath=parsedUrl.path;varfirstSegmentHasLetters=false;varcleanedSegments=path.split('/').reverse().reduce(function(acc,rawSegment,index){varsegment=rawSegment;// Split off and save anything that looks like a file type.
if(segment.includes('.')){var_segment$split=segment.split('.');var_segment$split2=_slicedToArray$$(_segment$split,2);varpossibleSegment=_segment$split2[0];varfileExt=_segment$split2[1];if(IS_ALPHA_RE.test(fileExt)){segment=possibleSegment;}}// If our first or second segment has anything looking like a page
if(segment.includes('.')){var_segment$split=segment.split('.');var_segment$split2=_slicedToArray(_segment$split,2);varpossibleSegment=_segment$split2[0];varfileExt=_segment$split2[1];if(IS_ALPHA_RE.test(fileExt)){segment=possibleSegment;}}// If our first or second segment has anything looking like a page
// number, remove it.
if(PAGE_IN_HREF_RE.test(segment)&&index<2){segment=segment.replace(PAGE_IN_HREF_RE,'');}// If we're on the first segment, check to see if we have any
// characters in it. The first segment is actually the last bit of
@ -512,14 +722,14 @@ var values=nodes.map(function(index,node){return $(node).attr(value);}).toArray(
// also.
if(values.length===1){varmetaValue=void0;// Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
if(cleanTags){metaValue=stripTags(values[0],$);}else{metaValue=values[0];}return{v:metaValue};}};for(var_iterator=_getIterator$$(foundNames),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){var_ret=_loop();if((typeof_ret==='undefined'?'undefined':_typeof$$(_ret))==="object")return_ret.v;}// If nothing is found, return null
if(cleanTags){metaValue=stripTags(values[0],$);}else{metaValue=values[0];}return{v:metaValue};}};for(var_iterator=_getIterator(foundNames),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){var_ret=_loop();if((typeof_ret==='undefined'?'undefined':_typeof$$(_ret))==="object")return_ret.v;}// If nothing is found, return null
}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnnull;}functionisGoodNode($node,maxChildren){// If it has a number of children, it's more likely a container
// element. Skip it.
if($node.children().length>maxChildren){returnfalse;}// If it looks to be within a comment, skip it.
if(withinComment($node)){returnfalse;}returntrue;}// Given a a list of selectors find content that may
// be extractable from the document. This is for flat
// meta-information, like author, title, date published, etc.
functionextractFromSelectors($,selectors){varmaxChildren=arguments.length>2&&arguments[2]!==undefined?arguments[2]:1;vartextOnly=arguments.length>3&&arguments[3]!==undefined?arguments[3]:true;var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator$$(selectors),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varselector=_step.value;varnodes=$(selector);// If we didn't get exactly one of this selector, this may be
functionextractFromSelectors($,selectors){varmaxChildren=arguments.length>2&&arguments[2]!==undefined?arguments[2]:1;vartextOnly=arguments.length>3&&arguments[3]!==undefined?arguments[3]:true;var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator(selectors),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varselector=_step.value;varnodes=$(selector);// If we didn't get exactly one of this selector, this may be
// a list of articles or comments. Skip it.
if(nodes.length===1){var$node=$(nodes[0]);if(isGoodNode($node,maxChildren)){varcontent=void0;if(textOnly){content=$node.text();}else{content=$node.html();}if(content){returncontent;}}}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnnull;}// strips all tags from a string of text
functionstripTags(text,$){// Wrapping text in html element prevents errors when text
@ -554,20 +764,23 @@ function extractCleanNode(article,_ref){var $=_ref.$;var _ref$cleanConditional=_
rewriteTopLevel(article,$);// Drop small images and spacer images
// Only do this is defaultCleaner is set to true;
// this can sometimes be too aggressive.
if(defaultCleaner)cleanImages(article,$);// Drop certain tags like <title>, etc
if(defaultCleaner)cleanImages(article,$);// Mark elements to keep that would normally be removed.
// E.g., stripJunkTags will remove iframes, so we're going to mark
// YouTube/Vimeo videos as elements we want to keep.
markToKeep(article,$,url);// Drop certain tags like <title>, etc
// This is -mostly- for cleanliness, not security.
stripJunkTags(article,$);// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),
// strip them. Otherwise, turn 'em into H2s.
cleanHOnes(article,$);// Clean headers
cleanHeaders(article,$,title);// Make links absolute
if(splitTitle.length>=6){var_ret=function(){// Look to see if we can find a breadcrumb splitter that happens
// more than once. If we can, we'll be able to better pull out
// the title.
vartermCounts=splitTitle.reduce(function(acc,titleText){acc[titleText]=acc[titleText]?acc[titleText]+1:1;returnacc;},{});var_Reflect$ownKeys$redu=_Reflect$ownKeys$$(termCounts).reduce(function(acc,key){if(acc[1]<termCounts[key]){return[key,termCounts[key]];}returnacc;},[0,0]);var_Reflect$ownKeys$redu2=_slicedToArray$$(_Reflect$ownKeys$redu,2);varmaxTerm=_Reflect$ownKeys$redu2[0];vartermCount=_Reflect$ownKeys$redu2[1];// We found a splitter that was used more than once, so it
vartermCounts=splitTitle.reduce(function(acc,titleText){acc[titleText]=acc[titleText]?acc[titleText]+1:1;returnacc;},{});var_Reflect$ownKeys$redu=_Reflect$ownKeys(termCounts).reduce(function(acc,key){if(acc[1]<termCounts[key]){return[key,termCounts[key]];}returnacc;},[0,0]);var_Reflect$ownKeys$redu2=_slicedToArray(_Reflect$ownKeys$redu,2);varmaxTerm=_Reflect$ownKeys$redu2[0];vartermCount=_Reflect$ownKeys$redu2[1];// We found a splitter that was used more than once, so it
// is probably the breadcrumber. Split our title on that instead.
// Note: max_term should be <= 4 characters, so that " >> "
varnode=this.getContentNode($,title,url,opts);if(nodeIsSufficient(node)){returnthis.cleanAndReturnNode(node,$);}// We didn't succeed on first pass, one by one disable our
// extraction opts and try again.
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator$$(_Reflect$ownKeys$$(opts).filter(function(k){returnopts[k]===true;})),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varkey=_step.value;opts[key]=false;$=cheerio$$.load(html);node=this.getContentNode($,title,url,opts);if(nodeIsSufficient(node)){break;}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnthis.cleanAndReturnNode(node,$);},// Get node given current options
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator(_Reflect$ownKeys(opts).filter(function(k){returnopts[k]===true;})),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varkey=_step.value;opts[key]=false;$=cheerio$$.load(html);node=this.getContentNode($,title,url,opts);if(nodeIsSufficient(node)){break;}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnthis.cleanAndReturnNode(node,$);},// Get node given current options
getContentNode:functiongetContentNode($,title,url,opts){returnextractCleanNode(extractBestNode($,opts),{$:$,cleanConditionally:opts.cleanConditionally,title:title,url:url});},// Once we got here, either we're at our last-resort node, or
// we broke early. Make sure we at least have -something- before we
// move forward.
@ -683,7 +896,7 @@ var bylineRe=/^[\n\s]*By/i;var BYLINE_SELECTORS_RE=[['#byline',bylineRe],['.byli
author=extractFromMeta($,AUTHOR_META_TAGS,metaCache);if(author&&author.length<AUTHOR_MAX_LENGTH){returncleanAuthor(author);}// Second, look through our selectors looking for potential authors.
author=extractFromSelectors($,AUTHOR_SELECTORS,2);if(author&&author.length<AUTHOR_MAX_LENGTH){returncleanAuthor(author);}// Last, use our looser regular-expression based selectors for
// potential authors.
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator$$(BYLINE_SELECTORS_RE),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){var_ref4=_step.value;var_ref3=_slicedToArray$$(_ref4,2);varselector=_ref3[0];varregex=_ref3[1];varnode=$(selector);if(node.length===1){vartext=node.text();if(regex.test(text)){returncleanAuthor(text);}}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnnull;}};// An ordered list of meta tag names that denote
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator(BYLINE_SELECTORS_RE),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){var_ref4=_step.value;var_ref3=_slicedToArray(_ref4,2);varselector=_ref3[0];varregex=_ref3[1];varnode=$(selector);if(node.length===1){vartext=node.text();if(regex.test(text)){returncleanAuthor(text);}}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnnull;}};// An ordered list of meta tag names that denote
// likely date published dates. All attributes
// should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
@ -749,9 +962,9 @@ var GenericLeadImageUrlExtractor={extract:function extract(_ref){var $=_ref.$;va
varimageUrl=extractFromMeta($,LEAD_IMAGE_URL_META_TAGS,metaCache,false);if(imageUrl){cleanUrl=clean$1(imageUrl);if(cleanUrl)returncleanUrl;}// Next, try to find the "best" image via the content.
// We'd rather not have to fetch each image and check dimensions,
// so try to do some analysis and determine them instead.
varimgs=$('img',content).toArray();varimgScores={};imgs.forEach(function(img,index){var$img=$(img);varsrc=$img.attr('src');if(!src)return;varscore=scoreImageUrl(src);score+=scoreAttr($img);score+=scoreByParents($img);score+=scoreBySibling($img);score+=scoreByDimensions($img);score+=scoreByPosition(imgs,index);imgScores[src]=score;});var_Reflect$ownKeys$redu=_Reflect$ownKeys$$(imgScores).reduce(function(acc,key){returnimgScores[key]>acc[1]?[key,imgScores[key]]:acc;},[null,0]);var_Reflect$ownKeys$redu2=_slicedToArray$$(_Reflect$ownKeys$redu,2);vartopUrl=_Reflect$ownKeys$redu2[0];vartopScore=_Reflect$ownKeys$redu2[1];if(topScore>0){cleanUrl=clean$1(topUrl);if(cleanUrl)returncleanUrl;}// If nothing else worked, check to see if there are any really
varimgs=$('img',content).toArray();varimgScores={};imgs.forEach(function(img,index){var$img=$(img);varsrc=$img.attr('src');if(!src)return;varscore=scoreImageUrl(src);score+=scoreAttr($img);score+=scoreByParents($img);score+=scoreBySibling($img);score+=scoreByDimensions($img);score+=scoreByPosition(imgs,index);imgScores[src]=score;});var_Reflect$ownKeys$redu=_Reflect$ownKeys(imgScores).reduce(function(acc,key){returnimgScores[key]>acc[1]?[key,imgScores[key]]:acc;},[null,0]);var_Reflect$ownKeys$redu2=_slicedToArray(_Reflect$ownKeys$redu,2);vartopUrl=_Reflect$ownKeys$redu2[0];vartopScore=_Reflect$ownKeys$redu2[1];if(topScore>0){cleanUrl=clean$1(topUrl);if(cleanUrl)returncleanUrl;}// If nothing else worked, check to see if there are any really
// probable nodes in the doc, like <link rel="image_src" />.
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator$$(LEAD_IMAGE_URL_SELECTORS),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varselector=_step.value;var$node=$(selector).first();varsrc=$node.attr('src');if(src){cleanUrl=clean$1(src);if(cleanUrl)returncleanUrl;}varhref=$node.attr('href');if(href){cleanUrl=clean$1(href);if(cleanUrl)returncleanUrl;}varvalue=$node.attr('value');if(value){cleanUrl=clean$1(value);if(cleanUrl)returncleanUrl;}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnnull;}};functionscoreSimilarity(score,articleUrl,href){// Do this last and only if we have a real candidate, because it's
var_iteratorNormalCompletion=true;var_didIteratorError=false;var_iteratorError=undefined;try{for(var_iterator=_getIterator(LEAD_IMAGE_URL_SELECTORS),_step;!(_iteratorNormalCompletion=(_step=_iterator.next()).done);_iteratorNormalCompletion=true){varselector=_step.value;var$node=$(selector).first();varsrc=$node.attr('src');if(src){cleanUrl=clean$1(src);if(cleanUrl)returncleanUrl;}varhref=$node.attr('href');if(href){cleanUrl=clean$1(href);if(cleanUrl)returncleanUrl;}varvalue=$node.attr('value');if(value){cleanUrl=clean$1(value);if(cleanUrl)returncleanUrl;}}}catch(err){_didIteratorError=true;_iteratorError=err;}finally{try{if(!_iteratorNormalCompletion&&_iterator.return){_iterator.return();}}finally{if(_didIteratorError){throw_iteratorError;}}}returnnull;}};functionscoreSimilarity(score,articleUrl,href){// Do this last and only if we have a real candidate, because it's
// potentially expensive computationally. Compare the link to this
// URL using difflib to get the % similarity of these URLs. On a
// sliding scale, subtract points from this link based on
@ -825,21 +1038,21 @@ var scoredPages=links.reduce(function(possiblePages,link){// Remove any anchor d
// standardizing URLs (it's hard), we're going to do
// some checking with and without a trailing slash
varhref=removeAnchor(link.attribs.href);var$link=$(link);varlinkText=$link.text();if(!shouldScore(href,articleUrl,baseUrl,parsedUrl,linkText,previousUrls)){returnpossiblePages;}// ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
if(!possiblePages[href]){possiblePages[href]={score:0,linkText:linkText,href:href};}else{possiblePages[href].linkText=possiblePages[href].linkText+'|'+linkText;}varpossiblePage=possiblePages[href];varlinkData=makeSig($link,linkText);varpageNum=pageNumFromUrl(href);varscore=scoreBaseUrl(href,baseRegex);score+=scoreNextLinkText(linkData);score+=scoreCapLinks(linkData);score+=scorePrevLink(linkData);score+=scoreByParents$1($link);score+=scoreExtraneousLinks(href);score+=scorePageInLink(pageNum,isWp);score+=scoreLinkText(linkText,pageNum);score+=scoreSimilarity(score,articleUrl,href);possiblePage.score=score;returnpossiblePages;},{});return_Reflect$ownKeys$$(scoredPages).length===0?null:scoredPages;}// Looks for and returns next page url
if(!possiblePages[href]){possiblePages[href]={score:0,linkText:linkText,href:href};}else{possiblePages[href].linkText=possiblePages[href].linkText+'|'+linkText;}varpossiblePage=possiblePages[href];varlinkData=makeSig($link,linkText);varpageNum=pageNumFromUrl(href);varscore=scoreBaseUrl(href,baseRegex);score+=scoreNextLinkText(linkData);score+=scoreCapLinks(linkData);score+=scorePrevLink(linkData);score+=scoreByParents$1($link);score+=scoreExtraneousLinks(href);score+=scorePageInLink(pageNum,isWp);score+=scoreLinkText(linkText,pageNum);score+=scoreSimilarity(score,articleUrl,href);possiblePage.score=score;returnpossiblePages;},{});return_Reflect$ownKeys(scoredPages).length===0?null:scoredPages;}// Looks for and returns next page url
// for multi-page articles
var GenericNextPageUrlExtractor = {
  // Scores every <a href> on the page and returns the most likely
  // next-page URL, or null when no candidate is confident enough.
  extract: function extract(_ref) {
    var $ = _ref.$;
    var url = _ref.url;
    var parsedUrl = _ref.parsedUrl;
    var _ref$previousUrls = _ref.previousUrls;
    var previousUrls = _ref$previousUrls === undefined ? [] : _ref$previousUrls;

    parsedUrl = parsedUrl || URL$$.parse(url);
    var articleUrl = removeAnchor(url);
    var baseUrl = articleBaseUrl(url, parsedUrl);
    var links = $('a[href]').toArray();
    var scoredLinks = scoreLinks({
      links: links,
      articleUrl: articleUrl,
      baseUrl: baseUrl,
      parsedUrl: parsedUrl,
      $: $,
      previousUrls: previousUrls
    });

    // If no links were scored, return null
    if (!scoredLinks) return null;

    // now that we've scored all possible pages,
    // find the biggest one.
    var topPage = _Reflect$ownKeys(scoredLinks).reduce(function (acc, link) {
      var scoredLink = scoredLinks[link];
      return scoredLink.score > acc.score ? scoredLink : acc;
    }, { score: -100 });

    // If the score is less than 50, we're not confident enough to use it,
    // so we fail.
    if (topPage.score >= 50) {
      return topPage.href;
    }
    return null;
  }
};

var CANONICAL_META_SELECTORS = ['og:url'];

// Returns the hostname portion of a URL.
function parseDomain(url) {
  var parsedUrl = URL$$.parse(url);
  var hostname = parsedUrl.hostname;
  return hostname;
}

// Wraps a URL with its parsed domain for the extractor result shape.
function result(url) {
  return { url: url, domain: parseDomain(url) };
}

// Resolves the article's canonical URL: <link rel=canonical>, then og:url
// meta, then falls back to the fetched URL itself.
var GenericUrlExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var url = _ref.url;
    var metaCache = _ref.metaCache;

    var $canonical = $('link[rel=canonical]');
    if ($canonical.length !== 0) {
      var href = $canonical.attr('href');
      if (href) {
        return result(href);
      }
    }

    var metaUrl = extractFromMeta($, CANONICAL_META_SELECTORS, metaCache);
    if (metaUrl) {
      return result(metaUrl);
    }
    return result(url);
  }
};

var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];

// Collapses whitespace and ellipsizes content to maxLength (default 200).
function clean$2(content, $) {
  var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;
  content = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize$$(content, maxLength, { ellipse: '…' });
}

// Builds a short excerpt from description meta tags, falling back to the
// beginning of the extracted content.
var GenericExcerptExtractor = {
  extract: function extract(_ref) {
    var $ = _ref.$;
    var content = _ref.content;
    var metaCache = _ref.metaCache;

    var excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean$2(stripTags(excerpt, $));
    }
    // Fall back to excerpting from the extracted content
    var maxLength = 200;
    var shortContent = content.slice(0, maxLength * 5);
    return clean$2($(shortContent).text(), $, maxLength);
  }
};

// Counts whitespace-separated tokens in the extracted content's first <div>.
var GenericWordCountExtractor = {
  extract: function extract(_ref) {
    var content = _ref.content;
    var $ = cheerio$$.load(content);
    var text = normalizeSpaces($('div').first().text());
    return text.split(/\s/).length;
  }
};

var GenericExtractor = {
  // This extractor is the default for all domains
  domain: '*',
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,
  direction: function direction(_ref) {
    var title = _ref.title;
    return stringDirection$$.getDirection(title);
  },
  // Runs every field extractor against the page and assembles the final
  // parse result. `content` is extracted first so that dependent fields
  // (lead image, dek, excerpt, word count) can reuse it.
  extract: function extract(options) {
    var html = options.html;
    if (html) {
      var $ = cheerio$$.load(html);
      options.$ = $;
    }
    var title = this.title(options);
    var date_published = this.date_published(options);
    var author = this.author(options);
    var content = this.content(_extends$$({}, options, { title: title }));
    var lead_image_url = this.lead_image_url(_extends$$({}, options, { content: content }));
    var dek = this.dek(_extends$$({}, options, { content: content }));
    var next_page_url = this.next_page_url(options);
    var excerpt = this.excerpt(_extends$$({}, options, { content: content }));
    var word_count = this.word_count(_extends$$({}, options, { content: content }));
    var direction = this.direction({ title: title });
    var _url_and_domain = this.url_and_domain(options);
    var url = _url_and_domain.url;
    var domain = _url_and_domain.domain;
    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
      excerpt: excerpt,
      word_count: word_count,
      direction: direction
    };
  }
};

// Picks a custom extractor by exact hostname, then by base domain
// (last two labels), falling back to the GenericExtractor.
function getExtractor(url, parsedUrl) {
  parsedUrl = parsedUrl || URL$$.parse(url);
  var _parsedUrl = parsedUrl;
  var hostname = _parsedUrl.hostname;
  var baseDomain = hostname.split('.').slice(-2).join('.');
  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor;
}

// Remove elements by an array of selectors
functioncleanBySelectors($content,$,_ref){varclean=_ref.clean;if(!clean)return$content;$(clean.join(','),$content).remove();return$content;}// Transform matching elements
// Applies the extractor's `transforms` map to $content: a string value
// renames matching nodes to that tag; a function value is called per node
// and, when it returns a string, the node is converted to that tag.
function transformElements($content, $, _ref2) {
  var transforms = _ref2.transforms;
  if (!transforms) return $content;
  _Reflect$ownKeys(transforms).forEach(function (key) {
    var $matches = $(key, $content);
    var value = transforms[key];
    // If value is a string, convert directly
    if (typeof value === 'string') {
      $matches.each(function (index, node) {
        convertNodeTo($(node), $, transforms[key]);
      });
    } else if (typeof value === 'function') {
      // If value is function, apply function to node
      $matches.each(function (index, node) {
        var result = value($(node), $);
        // If function returns a string, convert node to that value
        if (typeof result === 'string') {
          convertNodeTo($(node), $, result);
        }
      });
    }
  });
  return $content;
}

// Returns the first selector that matches exactly one non-empty node.
// A selector may be a string, or an ['selector', 'attr'] pair whose attr
// must be present and non-blank.
function findMatchingSelector($, selectors) {
  return selectors.find(function (selector) {
    if (Array.isArray(selector)) {
      var _selector = _slicedToArray(selector, 2);
      var s = _selector[0];
      var attr = _selector[1];
      return $(s).length === 1 && $(s).attr(attr) && $(s).attr(attr).trim() !== '';
    }
    return $(selector).length === 1 && $(selector).text().trim() !== '';
  });
}

// Runs one field's extraction options against the document: hardcoded
// string, HTML extraction (with transforms/clean/cleaner), or plain
// text/attribute extraction.
function select(opts) {
  var $ = opts.$;
  var type = opts.type;
  var extractionOpts = opts.extractionOpts;
  var _opts$extractHtml = opts.extractHtml;
  var extractHtml = _opts$extractHtml === undefined ? false : _opts$extractHtml;

  // Skip if there's not extraction for this type
  if (!extractionOpts) return null;

  // If a string is hardcoded for a type (e.g., Wikipedia
  // contributors), return the string
  if (typeof extractionOpts === 'string') return extractionOpts;

  var selectors = extractionOpts.selectors;
  var _extractionOpts$defau = extractionOpts.defaultCleaner;
  var defaultCleaner = _extractionOpts$defau === undefined ? true : _extractionOpts$defau;

  var matchingSelector = findMatchingSelector($, selectors);
  if (!matchingSelector) return null;

  if (extractHtml) {
    var $content = $(matchingSelector);
    // Wrap in div so transformation can take place on root element
    $content.wrap($('<div></div>'));
    $content = $content.parent();
    $content = transformElements($content, $, extractionOpts);
    $content = cleanBySelectors($content, $, extractionOpts);
    $content = Cleaners[type]($content, _extends$$({}, opts, { defaultCleaner: defaultCleaner }));
    return $.html($content);
  }

  var result = void 0;
  // if selector is an array (e.g., ['img', 'src']),
  // extract the attr
  if (Array.isArray(matchingSelector)) {
    var _matchingSelector = _slicedToArray(matchingSelector, 2);
    var selector = _matchingSelector[0];
    var attr = _matchingSelector[1];
    result = $(selector).attr(attr).trim();
  } else {
    result = $(matchingSelector).text().trim();
  }

  // Allow custom extractor to skip default cleaner
  // for this type; defaults to true
  if (defaultCleaner) {
    return Cleaners[type](result, opts);
  }
  return result;
}

// Tries the custom extractor for one field, optionally falling back to the
// generic extractor when nothing matched.
function extractResult(opts) {
  var type = opts.type;
  var extractor = opts.extractor;
  var _opts$fallback = opts.fallback;
  var fallback = _opts$fallback === undefined ? true : _opts$fallback;

  var result = select(_extends$$({}, opts, { extractionOpts: extractor[type] }));

  // If custom parser succeeds, return the result
  if (result) {
    return result;
  }
  // If nothing matches the selector, and fallback is enabled,
  // fall back to the GenericExtractor for this type.
  // NOTE(review): the tail of this function was lost at the next diff-hunk
  // boundary in the corrupted source; restored from the contract implied by
  // the surviving comment and the `fallback` option — verify against upstream.
  if (fallback) return GenericExtractor[type](opts);
  return null;
}
// NOTE(review): the next line is a leftover unified-diff hunk header that was
// fused with the opening of `function template(strings)`. The body of
// `template` between that header and the `.join('\n')` tail below was lost in
// the merge corruption and cannot be reconstructed from this chunk alone —
// restore it from the upstream source; until then this span will not parse.
@ -1132,14 +1354,14 @@ function template(strings) {
}).join('\n');
}
// NOTE(review): removed a corrupted, obsolete duplicate of the
// `_templateObject` extractor-scaffold template here (an older revision whose
// string literal was split across physical lines, producing an unterminated
// string). The current `taggedTemplateLiteral` assignment on the next line
// supersedes it.
var_templateObject= taggedTemplateLiteral(['\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n export const CustomExtractor = {\n domain: \'','\',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n '],['\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n export const CustomExtractor = {\n domain: \'','\',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n ']);
// Renders the custom-extractor scaffold, interpolating the site's hostname
// into the `domain` field.
function extractorTemplate(hostname) {
  return template(_templateObject, hostname);
}
// NOTE(review): removed corrupted, obsolete duplicates of the
// `_templateObject$1` and `_templateObject2` test-scaffold templates here
// (older revisions; the second one's string literal was split across physical
// lines, producing an unterminated string). The current
// `taggedTemplateLiteral` assignments below supersede them.
var_templateObject$1= taggedTemplateLiteral(['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const html =\n fs.readFileSync(\'','\');\n const articleUrl =\n \'','\';\n\n const { ',' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n '],['\n it(\'returns the ','\', async () => {\n // To pass this test, fill out the ',' selector\n // in ','/index.js.\n const html =\n fs.readFileSync(\'','\');\n const articleUrl =\n \'','\';\n\n const { ',' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(',', ',')\n });\n ']);
// Test-suite scaffold for a generated custom extractor. NOTE(review): this
// statement's second string literal was split across two physical lines in
// the corrupted source ("const extractor = " / "getExtractor(url);"); the
// pieces are rejoined here into one valid literal.
var _templateObject2 = taggedTemplateLiteral(['\n  import assert from \'assert\';\n  import fs from \'fs\';\n  import URL from \'url\';\n  import cheerio from \'cheerio\';\n\n  import Mercury from \'mercury\';\n  import getExtractor from \'extractors/get-extractor\';\n  import { excerptContent } from \'utils/text\';\n\n  // Rename CustomExtractor\n  // to fit your publication\n  // (e.g., NYTimesExtractor)\n  describe(\'CustomExtractor\', () => {\n    it(\'is selected properly\', () => {\n      // This test should be passing by default.\n      // It sanity checks that the correct parser\n      // is being selected for URLs from this domain\n      const url =\n        \'', '\';\n      const extractor = getExtractor(url);\n      assert.equal(extractor.domain, URL.parse(url).hostname)\n    })\n\n    ', '\n\n    it(\'returns the content\', async () => {\n      // To pass this test, fill out the content selector\n      // in ', '/index.js.\n      // You may also want to make use of the clean and transform\n      // options.\n      const html =\n        fs.readFileSync(\'', '\');\n      const url =\n        \'', '\';\n\n      const { content } =\n        await Mercury.parse(url, html, { fallback: false });\n\n      const $ = cheerio.load(content || \'\');\n\n      const first13 = excerptContent($(\'*\').first().text(), 13)\n\n      // Update these values with the expected values from\n      // the article.\n      assert.equal(first13, \'Add the first 13 words of the article here\');\n    });\n  });\n  '], ['\n  import assert from \'assert\';\n  import fs from \'fs\';\n  import URL from \'url\';\n  import cheerio from \'cheerio\';\n\n  import Mercury from \'mercury\';\n  import getExtractor from \'extractors/get-extractor\';\n  import { excerptContent } from \'utils/text\';\n\n  // Rename CustomExtractor\n  // to fit your publication\n  // (e.g., NYTimesExtractor)\n  describe(\'CustomExtractor\', () => {\n    it(\'is selected properly\', () => {\n      // This test should be passing by default.\n      // It sanity checks that the correct parser\n      // is being selected for URLs from this domain\n      const url =\n        \'', '\';\n      const extractor = getExtractor(url);\n      assert.equal(extractor.domain, URL.parse(url).hostname)\n    })\n\n    ', '\n\n    it(\'returns the content\', async () => {\n      // To pass this test, fill out the content selector\n      // in ', '/index.js.\n      // You may also want to make use of the clean and transform\n      // options.\n      const html =\n        fs.readFileSync(\'', '\');\n      const url =\n        \'', '\';\n\n      const { content } =\n        await Mercury.parse(url, html, { fallback: false });\n\n      const $ = cheerio.load(content || \'\');\n\n      const first13 = excerptContent($(\'*\').first().text(), 13)\n\n      // Update these values with the expected values from\n      // the article.\n      assert.equal(first13, \'Add the first 13 words of the article here\');\n    });\n  });\n  ']);