feat: update more custom parsers and unit tests and remove unnecessary parser

move-fixtures
Sarah Doire 2 years ago
parent b46ea57a10
commit 8e3ba45ee7

@ -4342,32 +4342,6 @@ var ThoughtcatalogComExtractor = {
clean: ['.tc_mark']
}
};
// Custom extractor configuration for www.nj.com.
// Title, author, publish date and lead image are each described by a
// [selector, attribute] pair pointing at a <meta> tag; the article body is
// selected with `.entry-content`.
var WwwNjComExtractor = {
  domain: 'www.nj.com',

  title: { selectors: [['meta[name="title"]', 'value']] },

  author: { selectors: [['meta[name="article_author"]', 'value']] },

  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    // NOTE(review): presumably the zone used to interpret the date string - confirm.
    timezone: 'America/New_York'
  },

  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },

  content: {
    selectors: ['.entry-content'],
    // No per-node transforms are defined for this site.
    transforms: {},
    // No additional selectors are stripped from the selected content.
    clean: []
  }
};
var WwwInquisitrComExtractor = {
domain: 'www.inquisitr.com',
title: {

291
dist/mercury.js vendored

@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// the src attribute so the images are no longer lazy loaded.
function convertLazyLoadedImages($) {
// Tries to read `str` as JSON and pull out its `src` property.
// Returns the `src` value when it is a string; returns false when `str`
// is not valid JSON, when reading `src` fails, or when `src` is not a string.
var extractSrcFromJSON = function extractSrcFromJSON(str) {
  var candidate;
  try {
    // Property access stays inside the try so that a parsed `null` (whose
    // `.src` lookup throws) is reported as false as well.
    candidate = JSON.parse(str).src;
  } catch (e) {
    return false;
  }
  return typeof candidate === 'string' ? candidate : false;
};
$('img').each(function (_, img) {
var attrs = getAttrs(img);
@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
$(img).attr('srcset', value);
} else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
$(img).attr('src', value);
// Is the value a JSON object? If so, we should attempt to extract the image src from the data.
var existingSrc = extractSrcFromJSON(value);
if (existingSrc) {
$(img).attr('src', existingSrc);
} else {
$(img).attr('src', value);
}
}
});
});
@ -2295,7 +2315,7 @@ var DeadspinExtractor = {
var BroadwayWorldExtractor = {
domain: 'www.broadwayworld.com',
title: {
selectors: ['h1.article-title']
selectors: ['h1[itemprop=headline]', 'h1.article-title']
},
author: {
selectors: ['span[itemprop=author]']
@ -2389,6 +2409,14 @@ var MediumExtractor = {
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Allow drop cap character.
'section span:first-of-type': function sectionSpanFirstOfType($node) {
var $text = $node.html();
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos
iframe: function iframe($node) {
var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@ -2430,7 +2458,7 @@ var MediumExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['span', 'svg']
clean: ['span a', 'svg']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
@ -2807,7 +2835,7 @@ var WwwSbnationComExtractor = {
selectors: [['meta[name="article:published_time"]', 'value']]
},
dek: {
selectors: ['h2.c-entry-summary.p-dek']
selectors: ['p.c-entry-summary.p-dek', 'h2.c-entry-summary.p-dek']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
@ -3005,10 +3033,10 @@ var WwwDmagazineComExtractor = {
var WwwReutersComExtractor = {
domain: 'www.reuters.com',
title: {
selectors: ['h1.article-headline']
selectors: ['h1[class*="ArticleHeader-headline-"]', 'h1.article-headline']
},
author: {
selectors: ['.author']
selectors: [['meta[name="og:article:author"]', 'value'], '.author']
},
date_published: {
selectors: [['meta[name="og:article:published_time"]', 'value']]
@ -3017,7 +3045,7 @@ var WwwReutersComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#article-text'],
selectors: ['div.ArticleBodyWrapper'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@ -3026,7 +3054,7 @@ var WwwReutersComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['#article-byline .author']
clean: ['div[class^="ArticleBody-byline-container-"]', '#article-byline .author']
}
};
@ -3158,63 +3186,54 @@ var NewsNationalgeographicComExtractor = {
}
};
// export const WwwNationalgeographicComExtractor = {
// domain: 'www.nationalgeographic.com',
// title: {
// selectors: ['h1', 'h1.main-title'],
// },
// author: {
// selectors: ['.byline-component__contributors b span'],
// },
// date_published: {
// selectors: [['meta[name="article:published_time"]', 'value']],
// },
// dek: {
// selectors: ['.Article__Headline__Desc', '.article__deck'],
// },
// lead_image_url: {
// selectors: [['meta[name="og:image"]', 'value']],
// },
// content: {
// selectors: ['section.Article__Content', ['.parsys.content', '.__image-lead__'], '.content'],
// // Is there anything in the content you selected that needs transformed
// // before it's consumable content? E.g., unusual lazy loaded images
// transforms: {
// '.parsys.content': ($node, $) => {
// const $imageParent = $node.children().first();
// if ($imageParent.hasClass('imageGroup')) {
// const $dataAttrContainer = $imageParent
// .find('.media--medium__container')
// .children()
// .first();
// const imgPath1 = $dataAttrContainer.data('platform-image1-path');
// const imgPath2 = $dataAttrContainer.data('platform-image2-path');
// if (imgPath2 && imgPath1) {
// $node.prepend(
// $(`<div class="__image-lead__">
// <img src="${imgPath1}"/>
// <img src="${imgPath2}"/>
// </div>`)
// );
// }
// } else {
// const $imgSrc = $node
// .find('.image.parbase.section')
// .find('.picturefill')
// .first()
// .data('platform-src');
// if ($imgSrc) {
// $node.prepend($(`<img class="__image-lead__" src="${$imgSrc}"/>`));
// }
// }
// },
// },
// // Is there anything that is in the result that shouldn't be?
// // The clean selectors will remove anything that matches from
// // the result
// clean: ['.pull-quote.pull-quote--small'],
// },
// };
// Custom extractor configuration for www.nationalgeographic.com.
var WwwNationalgeographicComExtractor = {
  domain: 'www.nationalgeographic.com',

  title: { selectors: ['h1', 'h1.main-title'] },

  author: { selectors: ['.byline-component__contributors b span'] },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },

  dek: { selectors: ['.Article__Headline__Desc', '.article__deck'] },

  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },

  content: {
    selectors: ['section.Article__Content', ['.parsys.content', '.__image-lead__'], '.content'],

    transforms: {
      // Prepends lead-image markup (class `__image-lead__`) to the node.
      //  - First child has class `imageGroup`: read the two image paths from
      //    the data attributes of the first child of
      //    `.media--medium__container` and, only when both are present,
      //    prepend a wrapper <div> holding both images.
      //  - Otherwise: read `platform-src` from the first `.picturefill`
      //    under `.image.parbase.section` and, when present, prepend one <img>.
      '.parsys.content': function parsysContent($node, $) {
        var $firstChild = $node.children().first();

        if (!$firstChild.hasClass('imageGroup')) {
          var leadSrc = $node.find('.image.parbase.section').find('.picturefill').first().data('platform-src');
          if (leadSrc) {
            $node.prepend($("<img class=\"__image-lead__\" src=\"".concat(leadSrc, "\"/>")));
          }
          return;
        }

        var $pathHolder = $firstChild.find('.media--medium__container').children().first();
        var firstPath = $pathHolder.data('platform-image1-path');
        var secondPath = $pathHolder.data('platform-image2-path');
        if (secondPath && firstPath) {
          $node.prepend($("<div class=\"__image-lead__\">\n <img src=\"".concat(firstPath, "\"/>\n <img src=\"").concat(secondPath, "\"/>\n </div>")));
        }
      }
    },

    // Selectors removed from the extracted content.
    clean: ['.pull-quote.pull-quote--small']
  }
};
var WwwLatimesComExtractor = {
domain: 'www.latimes.com',
@ -3602,17 +3621,17 @@ var WwwUsmagazineComExtractor = {
selectors: ['header h1']
},
author: {
selectors: ['a.article-byline.tracked-offpage']
selectors: ['a.author', 'a.article-byline.tracked-offpage']
},
date_published: {
timezone: 'America/New_York',
selectors: ['time.article-published-date']
selectors: [['meta[name="article:published_time"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article-body-inner'],
selectors: ['div.article-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -3659,7 +3678,7 @@ var twofortysevensportsComExtractor = {
selectors: ['title', 'article header h1']
},
author: {
selectors: ['.author']
selectors: ['.article-cnt__author', '.author']
},
date_published: {
selectors: [['time[data-published]', 'data-published']]
@ -3668,7 +3687,7 @@ var twofortysevensportsComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['section.body.article'],
selectors: ['.article-body', 'section.body.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -3802,12 +3821,10 @@ var WwwMacrumorsComExtractor = {
selectors: ['h1', 'h1.title']
},
author: {
selectors: ['.author-url']
selectors: ['article a[rel="author"]', '.author-url']
},
date_published: {
selectors: ['.article .byline'],
// Wednesday January 18, 2017 11:44 am PST
format: 'dddd MMMM D, YYYY h:mm A zz',
selectors: [['time', 'datetime']],
timezone: 'America/Los_Angeles'
},
dek: {
@ -3817,7 +3834,7 @@ var WwwMacrumorsComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.article'],
selectors: ['article', '.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -3992,19 +4009,19 @@ var WwwCinemablendComExtractor = {
var WwwTodayComExtractor = {
domain: 'www.today.com',
title: {
selectors: ['h1.entry-headline']
selectors: ['h1.article-hero-headline__htag', 'h1.entry-headline']
},
author: {
selectors: [['meta[name="author"]', 'value']]
selectors: ['span.byline-name', ['meta[name="author"]', 'value']]
},
date_published: {
selectors: [['meta[name="DC.date.issued"]', 'value']]
selectors: ['time[datetime]', ['meta[name="DC.date.issued"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.entry-container'],
selectors: ['div.article-body__content', '.entry-container'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -4236,33 +4253,6 @@ var ThoughtcatalogComExtractor = {
}
};
// Custom extractor configuration for www.nj.com.
// Title, author, publish date and lead image are each described by a
// [selector, attribute] pair pointing at a <meta> tag; the article body is
// selected with `.entry-content`.
var WwwNjComExtractor = {
  domain: 'www.nj.com',

  title: { selectors: [['meta[name="title"]', 'value']] },

  author: { selectors: [['meta[name="article_author"]', 'value']] },

  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    // NOTE(review): presumably the zone used to interpret the date string - confirm.
    timezone: 'America/New_York'
  },

  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },

  content: {
    selectors: ['.entry-content'],
    // No per-node transforms are defined for this site.
    transforms: {},
    // No additional selectors are stripped from the selected content.
    clean: []
  }
};
var WwwInquisitrComExtractor = {
domain: 'www.inquisitr.com',
title: {
@ -4292,20 +4282,20 @@ var WwwInquisitrComExtractor = {
var WwwNbcnewsComExtractor = {
domain: 'www.nbcnews.com',
title: {
selectors: ['div.article-hed h1']
selectors: ['div.article-hero-headline h1', 'div.article-hed h1']
},
author: {
selectors: ['span.byline_author']
selectors: ['div.article-inline-byline span.byline-name', 'span.byline_author']
},
date_published: {
selectors: [['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], '.flag_article-wrapper time'],
selectors: [['meta[name="article:published"]', 'value'], ['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'], '.flag_article-wrapper time'],
timezone: 'America/New_York'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article-body'],
selectors: ['div.article-body__content', 'div.article-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -4408,13 +4398,13 @@ var ObamawhitehouseArchivesGovExtractor = {
var WwwOpposingviewsComExtractor = {
domain: 'www.opposingviews.com',
title: {
selectors: ['h1.title']
selectors: ['h1.m-detail-header--title', 'h1.title']
},
author: {
selectors: ['div.date span span a']
selectors: [['meta[name="author"]', 'value'], 'div.date span span a']
},
date_published: {
selectors: [['meta[name="publish_date"]', 'value']]
selectors: [['meta[name="published"]', 'value'], ['meta[name="publish_date"]', 'value']]
},
dek: {
selectors: [// enter selectors
@ -4424,7 +4414,7 @@ var WwwOpposingviewsComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.article-content'],
selectors: ['.m-detail--body', '.article-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -4641,13 +4631,13 @@ var IciRadioCanadaCaExtractor = {
timezone: 'America/New_York'
},
dek: {
selectors: ['.bunker-component.lead']
selectors: ['div.lead-container', '.bunker-component.lead']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: [['.main-multimedia-item', '.news-story-content']],
selectors: ['section.document-content-style', ['.main-multimedia-item', '.news-story-content']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -4694,7 +4684,7 @@ var WwwFastcompanyComExtractor = {
selectors: ['h1']
},
author: {
selectors: ['.post__by']
selectors: [['meta[name="author"]', 'value']]
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
@ -4748,7 +4738,7 @@ var NewsMynaviJpExtractor = {
selectors: [['meta[name="og:title"]', 'value']]
},
author: {
selectors: ['main div.article-author a.article-author__name']
selectors: ['a.articleHeader_name', 'main div.article-author a.article-author__name']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
@ -4760,7 +4750,7 @@ var NewsMynaviJpExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['main article div'],
selectors: ['div.article-body', 'main article div'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@ -4813,10 +4803,10 @@ var GithubComExtractor = {
]
},
date_published: {
selectors: [['span[itemprop="dateModified"] relative-time', 'datetime']]
selectors: ['relative-time[datetime]', ['span[itemprop="dateModified"] relative-time', 'datetime']]
},
dek: {
selectors: ['span[itemprop="about"]']
selectors: [['meta[name="description"]', 'value'], 'span[itemprop="about"]']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
@ -4949,7 +4939,7 @@ var BuzzapJpExtractor = {
var WwwAsahiComExtractor = {
domain: 'www.asahi.com',
title: {
selectors: ['.ArticleTitle h1']
selectors: ['main h1', '.ArticleTitle h1']
},
author: {
selectors: [['meta[name="article:author"]', 'value']]
@ -4965,10 +4955,10 @@ var WwwAsahiComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#MainInner div.ArticleBody'],
selectors: ['main'],
defaultCleaner: false,
transforms: {},
clean: ['div.AdMod', 'div.LoginSelectArea']
clean: ['div.AdMod', 'div.LoginSelectArea', 'time', 'div.notPrint']
}
};
@ -5394,10 +5384,10 @@ var GetnewsJpExtractor = {
selectors: ['article h1']
},
author: {
selectors: ['span.prof']
selectors: [['meta[name="article:author"]', 'value'], 'span.prof']
},
date_published: {
selectors: [['ul.cattag-top time', 'datetime']]
selectors: [['meta[name="article:published_time"]', 'value'], ['ul.cattag-top time', 'datetime']]
},
dek: null,
lead_image_url: {
@ -5509,20 +5499,22 @@ var WwwIpaGoJpExtractor = {
var WeeklyAsciiJpExtractor = {
domain: 'weekly.ascii.jp',
title: {
selectors: ['h1[itemprop="headline"]']
selectors: ['article h1', 'h1[itemprop="headline"]']
},
author: {
selectors: ['p.author']
},
date_published: {
selectors: [['meta[name="odate"]', 'value']]
selectors: ['p.date', ['meta[name="odate"]', 'value']],
format: 'YYYY年MM月DD日 HH:mm',
timezone: 'Asia/Tokyo'
},
dek: null,
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div.article'],
selectors: ['div#contents_detail', 'div.article'],
transforms: {},
clean: []
}
@ -5616,7 +5608,7 @@ var WwwRbbtodayComExtractor = {
selectors: [['header time', 'datetime']]
},
dek: {
selectors: ['.arti-summary']
selectors: [['meta[name="description"]', 'value'], '.arti-summary']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
@ -5648,7 +5640,7 @@ var WwwLemondeFrExtractor = {
content: {
selectors: ['.article__content'],
transforms: {},
clean: []
clean: ['figcaption']
}
};
@ -5850,17 +5842,18 @@ var PastebinComExtractor = {
selectors: ['h1']
},
author: {
selectors: ['.paste_box_line2 .t_us + a']
selectors: ['.username', '.paste_box_line2 .t_us + a']
},
date_published: {
selectors: ['.paste_box_line2 .t_da + span'],
timezone: 'America/New_York'
selectors: ['.date', '.paste_box_line2 .t_da + span'],
timezone: 'America/New_York',
format: 'MMMM D, YYYY'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['#selectable .text'],
selectors: ['.source', '#selectable .text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@ -6189,6 +6182,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
WwwChicagotribuneComExtractor: WwwChicagotribuneComExtractor,
WwwVoxComExtractor: WwwVoxComExtractor,
NewsNationalgeographicComExtractor: NewsNationalgeographicComExtractor,
WwwNationalgeographicComExtractor: WwwNationalgeographicComExtractor,
WwwLatimesComExtractor: WwwLatimesComExtractor,
PagesixComExtractor: PagesixComExtractor,
ThefederalistpapersOrgExtractor: ThefederalistpapersOrgExtractor,
@ -6224,7 +6218,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
ScienceflyComExtractor: ScienceflyComExtractor,
HellogigglesComExtractor: HellogigglesComExtractor,
ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
WwwNjComExtractor: WwwNjComExtractor,
WwwInquisitrComExtractor: WwwInquisitrComExtractor,
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
FortuneComExtractor: FortuneComExtractor,
@ -7522,13 +7515,26 @@ var GenericExcerptExtractor = {
}
};
// Counts words in the text of the first <div> found in `content`:
// the markup is loaded with cheerio, the div's text is passed through
// normalizeSpaces, and the result is split on single whitespace characters.
var getWordCount = function getWordCount(content) {
  var $ = cheerio.load(content);
  var firstDivText = $('div').first().text();
  var words = normalizeSpaces(firstDivText).split(/\s/);
  return words.length;
};
// Fallback word counter that works on the raw markup string instead of a
// parsed document: every tag is replaced by a space, whitespace runs are
// collapsed to a single space, and the space-separated tokens are counted.
//
// @param {string} content - HTML (or plain text) whose words are counted.
// @returns {number} Number of words; 0 when only tags/whitespace remain.
var getWordCountAlt = function getWordCountAlt(content) {
  var text = content.replace(/<[^>]*>/g, ' ').replace(/\s+/g, ' ').trim();
  // ''.split(' ') yields [''], which would otherwise be reported as one word.
  if (text === '') return 0;
  return text.split(' ').length;
};
var GenericWordCountExtractor = {
extract: function extract(_ref) {
var content = _ref.content;
var $ = cheerio.load(content);
var $content = $('div').first();
var text = normalizeSpaces($content.text());
return text.split(/\s/).length;
var count = getWordCount(content);
if (count === 1) count = getWordCountAlt(content);
return count;
}
};
@ -7691,7 +7697,8 @@ function select(opts) {
_extractionOpts$defau = extractionOpts.defaultCleaner,
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
allowMultiple = extractionOpts.allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple);
var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, overrideAllowMultiple);
if (!matchingSelector) return null;
function transformAndClean($node) {
@ -7973,7 +7980,7 @@ function _collectAllPages() {
});
return _context.abrupt("return", _objectSpread({}, result, {
total_pages: pages,
pages_rendered: pages,
rendered_pages: pages,
word_count: word_count
}));

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -6,7 +6,7 @@ export const twofortysevensportsComExtractor = {
},
author: {
selectors: ['.author'],
selectors: ['.article-cnt__author', '.author'],
},
date_published: {
@ -18,7 +18,7 @@ export const twofortysevensportsComExtractor = {
},
content: {
selectors: ['section.body.article'],
selectors: ['.article-body', 'section.body.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -69,7 +69,7 @@ describe('twofortysevensportsComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://s3media.247sports.com/Uploads/Assets/149/971/26_4971149.jpg'
'https://s3media.247sports.com/Uploads/Assets/149/971/4971149.jpg?fit=bounds&crop=1200:630,offset-y0.50&width=1200&height=630'
);
});

@ -6,11 +6,14 @@ export const GetnewsJpExtractor = {
},
author: {
selectors: ['span.prof'],
selectors: [['meta[name="article:author"]', 'value'], 'span.prof'],
},
date_published: {
selectors: [['ul.cattag-top time', 'datetime']],
selectors: [
['meta[name="article:published_time"]', 'value'],
['ul.cattag-top time', 'datetime'],
],
},
dek: null,

@ -46,7 +46,7 @@ describe('GetnewsJpExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, 'NeoLネオエル');
assert.equal(author, 'https://getnews.jp/author/neol');
});
it('returns the date_published', async () => {

@ -12,11 +12,17 @@ export const GithubComExtractor = {
},
date_published: {
selectors: [['span[itemprop="dateModified"] relative-time', 'datetime']],
selectors: [
'relative-time[datetime]',
['span[itemprop="dateModified"] relative-time', 'datetime'],
],
},
dek: {
selectors: ['span[itemprop="about"]'],
selectors: [
['meta[name="description"]', 'value'],
'span[itemprop="about"]',
],
},
lead_image_url: {

@ -33,7 +33,10 @@ describe('GithubComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, `steventroughtonsmith/marzipanify`);
assert.equal(
title,
`steventroughtonsmith/marzipanify: Convert an iOS Simulator app bundle to an iOSMac (Marzipan) one (Unsupported & undocumented, WIP)`
);
});
it('returns the author', async () => {
@ -53,7 +56,7 @@ describe('GithubComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2019-03-04T12:37:07.000Z');
assert.equal(date_published, '2020-07-07T05:00:00.000Z');
});
it('returns the dek', async () => {
@ -65,7 +68,7 @@ describe('GithubComExtractor', () => {
// the article.
assert.equal(
dek,
'Convert an iOS Simulator app bundle to an iOSMac (Marzipan) one (Unsupported & undocumented, WIP)'
'Convert an iOS Simulator app bundle to an iOSMac (Marzipan) one (Unsupported & undocumented, WIP) - steventroughtonsmith/marzipanify: Convert an iOS Simulator app bundle to an iOSMac (Marzipan) one (Unsupported & undocumented, WIP)'
);
});
@ -78,7 +81,7 @@ describe('GithubComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
`https://avatars0.githubusercontent.com/u/45212?s=400&v=4`
`https://opengraph.githubassets.com/269d23b56f85a1ea9bd3cf5b2f34ddbce8cfaff7a74c6561f59d84db67b8efc1/steventroughtonsmith/marzipanify`
);
});

@ -16,7 +16,7 @@ export const IciRadioCanadaCaExtractor = {
},
dek: {
selectors: ['.bunker-component.lead'],
selectors: ['div.lead-container', '.bunker-component.lead'],
},
lead_image_url: {
@ -24,7 +24,10 @@ export const IciRadioCanadaCaExtractor = {
},
content: {
selectors: [['.main-multimedia-item', '.news-story-content']],
selectors: [
'section.document-content-style',
['.main-multimedia-item', '.news-story-content'],
],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -34,7 +34,7 @@ describe('IciRadioCanadaCaExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(title, 'Affaire KPMG: un juge se récuse');
assert.equal(title, 'Affaire KPMG : un juge se récuse');
});
it('returns the author', async () => {
@ -69,7 +69,7 @@ describe('IciRadioCanadaCaExtractor', () => {
// the article.
assert.equal(
dek,
"Un juge de la cour de l'impôt se récuse d'un dossier mettant en cause un stratagème du cabinet comptable KPMG. Selon les émissions Enquête et the fifth estate, le juge Bocock avait participé à une soirée cocktail organisée par un cabinet d'avocats lié à l'affaire."
"Un juge de la Cour de l'impôt se récuse d'un dossier mettant en cause un stratagème du cabinet comptable KPMG. Selon les émissions Enquête et The Fifth Estate, le juge Bocock avait participé à une soirée cocktail organisée par un cabinet d'avocats lié à l'affaire."
);
});
@ -82,7 +82,7 @@ describe('IciRadioCanadaCaExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://images.radio-canada.ca/w_635,h_357/v1/ici-info/16x9/randall-bocock-juge.jpg'
'https://images.radio-canada.ca/v1/ici-info/16x9/randall-bocock-juge.jpg?im=Resize=(1250);Composite=(type=URL,url=https://images.radio-canada.ca/v1/assets/elements/16x9/outdated-content-2017.png),gravity=SouthEast,placement=Over,location=(0,0),scale=1'
);
});
@ -106,7 +106,7 @@ describe('IciRadioCanadaCaExtractor', () => {
// the article.
assert.equal(
first13,
"Le juge Randall Bocock se retire d'une cause liée à KPMG Photo :"
"Un texte de Frédéric Zalac d'Enquête Jusquà la semaine dernière, le juge Randall"
);
});
});

@ -74,7 +74,6 @@ export * from './www.americanow.com';
export * from './sciencefly.com';
export * from './hellogiggles.com';
export * from './thoughtcatalog.com';
export * from './www.nj.com';
export * from './www.inquisitr.com';
export * from './www.nbcnews.com';
export * from './fortune.com';

@ -6,7 +6,10 @@ export const NewsMynaviJpExtractor = {
},
author: {
selectors: ['main div.article-author a.article-author__name'],
selectors: [
'a.articleHeader_name',
'main div.article-author a.article-author__name',
],
},
date_published: {
@ -22,7 +25,7 @@ export const NewsMynaviJpExtractor = {
},
content: {
selectors: ['main article div'],
selectors: ['div.article-body', 'main article div'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -46,7 +46,7 @@ describe('NewsMynaviJpExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, '後藤大地');
assert.equal(author, '著者:後藤大地');
});
it('returns the date_published', async () => {
@ -81,7 +81,7 @@ describe('NewsMynaviJpExtractor', () => {
// the article.
assert.equal(
lead_image_url,
`https://news.mynavi.jp/article/20190222-775563/index_images/index.jpg`
`https://news.mynavi.jp/techplus/article/20190222-775563/index_images/index.jpg`
);
});

@ -6,12 +6,13 @@ export const PastebinComExtractor = {
},
author: {
selectors: ['.paste_box_line2 .t_us + a'],
selectors: ['.username', '.paste_box_line2 .t_us + a'],
},
date_published: {
selectors: ['.paste_box_line2 .t_da + span'],
selectors: ['.date', '.paste_box_line2 .t_da + span'],
timezone: 'America/New_York',
format: 'MMMM D, YYYY',
},
lead_image_url: {
@ -19,7 +20,7 @@ export const PastebinComExtractor = {
},
content: {
selectors: ['#selectable .text'],
selectors: ['.source', '#selectable .text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -14,9 +14,7 @@ describe('PostlightComExtractor', () => {
let url;
beforeAll(() => {
url = 'https://postlight.com/insights/three-ways-to-be-the-thermostat';
const html = fs.readFileSync(
'./fixtures/postlight.com/1664999338243.html'
);
const html = fs.readFileSync('./fixtures/postlight.com.html');
result = Mercury.parse(url, { html, fallback: false });
});

@ -7,6 +7,7 @@ export const ThoughtcatalogComExtractor = {
author: {
selectors: [
'cite a',
'div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name',
'h1.writer-name',
],
@ -30,6 +31,6 @@ export const ThoughtcatalogComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.tc_mark'],
clean: ['.tc_mark', 'figcaption'],
},
};

@ -69,7 +69,7 @@ describe('ThoughtcatalogComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://thoughtcatalog.files.wordpress.com/2016/12/31565018766_4494e5f335_o.jpg'
'https://thoughtcatalog.com/wp-content/uploads/2016/12/31565018766_4494e5f335_o.jpg?w=1536&h=768&crop=1'
);
});
@ -93,7 +93,7 @@ describe('ThoughtcatalogComExtractor', () => {
// the article.
assert.equal(
first13,
'herzblut One day you are going to meet someone in your life, that'
'One day you are going to meet someone in your life, that is'
);
});
});

@ -2,7 +2,7 @@ export const WeeklyAsciiJpExtractor = {
domain: 'weekly.ascii.jp',
title: {
selectors: ['h1[itemprop="headline"]'],
selectors: ['article h1', 'h1[itemprop="headline"]'],
},
author: {
@ -10,7 +10,11 @@ export const WeeklyAsciiJpExtractor = {
},
date_published: {
selectors: [['meta[name="odate"]', 'value']],
selectors: ['p.date', ['meta[name="odate"]', 'value']],
format: 'YYYY年MM月DD日 HH:mm',
timezone: 'Asia/Tokyo',
},
dek: null,
@ -20,7 +24,7 @@ export const WeeklyAsciiJpExtractor = {
},
content: {
selectors: ['div.article'],
selectors: ['div#contents_detail', 'div.article'],
transforms: {},

@ -49,7 +49,7 @@ describe('WeeklyAsciiJpExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, `文●オカモトASCII編集部`);
assert.equal(author, `文● オカモトASCII編集部`);
});
it('returns the date_published', async () => {
@ -81,7 +81,7 @@ describe('WeeklyAsciiJpExtractor', () => {
// the article.
assert.equal(
lead_image_url,
`https://ascii.jp/elem/000/001/848/1848427/00-01_1024x1024.jpg`
`https://ascii.jp/img/2019/04/19/1643408/l/59bcecd731732273.jpg`
);
});

@ -66,7 +66,7 @@ describe('WwwAmericanowComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://images.americanow.com:8080/ovi/catalog/downloads/preview/rndr_670x377//2016/12/conway-1482422231.JPG/rndr_670x377.jpg'
'https://americanow.s3-us-west-1.amazonaws.com/an_prod/s3fs-public/admin/ANDefault.jpg'
);
});

@ -2,7 +2,7 @@ export const WwwAsahiComExtractor = {
domain: 'www.asahi.com',
title: {
selectors: ['.ArticleTitle h1'],
selectors: ['main h1', '.ArticleTitle h1'],
},
author: {
@ -24,12 +24,12 @@ export const WwwAsahiComExtractor = {
},
content: {
selectors: ['#MainInner div.ArticleBody'],
selectors: ['main'],
defaultCleaner: false,
transforms: {},
clean: ['div.AdMod', 'div.LoginSelectArea'],
clean: ['div.AdMod', 'div.LoginSelectArea', 'time', 'div.notPrint'],
},
};

@ -4,7 +4,7 @@
export const BroadwayWorldExtractor = {
domain: 'www.broadwayworld.com',
title: {
selectors: ['h1.article-title'],
selectors: ['h1[itemprop=headline]', 'h1.article-title'],
},
author: {

@ -9,7 +9,7 @@ import { excerptContent } from 'utils/text';
const fs = require('fs');
// Rename CustomExtractor
describe('CustomExtractor', () => {
describe('WwwBroadwayWorldComExtractor', () => {
describe('initial test case', () => {
let result;
let url;
@ -59,7 +59,7 @@ describe('CustomExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-10-13T19:35:00.000Z');
assert.equal(date_published, '2016-10-13T15:35:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -71,7 +71,7 @@ describe('CustomExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://images.bwwstatic.com/columnpic7/7B5FD766-A644-E386-19DE07017A3AD79C.jpg'
'https://cloudimages.broadwayworld.com/columnpic7/6807B5FD766-A644-E386-19DE07017A3AD79C.jpg'
);
});

@ -6,7 +6,7 @@ export const WwwFastcompanyComExtractor = {
},
author: {
selectors: ['.post__by'],
selectors: [['meta[name="author"]', 'value']],
},
date_published: {

@ -82,7 +82,7 @@ describe('WwwFastcompanyComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/fc/3067012-poster-p-1-the-only-five-email-folders-your-inbox-will-ever-need.jpg'
'https://images.fastcompany.net/image/upload/w_1280,f_auto,q_auto,fl_lossy/wp-cms/uploads/https://fast-company-res.cloudinary.com/image/upload/fc/3067012-poster-p-1-the-only-five-email-folders-your-inbox-will-ever-need.jpg'
);
});

@ -26,6 +26,6 @@ export const WwwLemondeFrExtractor = {
transforms: {},
clean: [],
clean: ['figcaption'],
},
};

@ -50,7 +50,7 @@ describe('WwwLemondeFrExtractor', () => {
assert.equal(
dek,
'Elle abaisse ses prévisions pour 2019, avec un PIB à 1,4 % pour lensemble de lUE, et à 1,2 % pour la zone euro.'
'Linstitution abaisse ses prévisions pour 2019, avec un PIB à 1,4 % pour lensemble de lUE, et à 1,2 % pour la zone euro.'
);
});
@ -59,7 +59,7 @@ describe('WwwLemondeFrExtractor', () => {
assert.equal(
lead_image_url,
`https://img.lemde.fr/2019/05/07/316/0/3824/1912/1440/720/60/0/d105b14_dfjDE1I-caggQrT4gvHf2nZP.jpg`
`https://img.lemde.fr/2019/05/07/354/570/3246/2164/1440/960/60/0/d105b14_dfjDE1I-caggQrT4gvHf2nZP.jpg`
);
});

@ -2,15 +2,21 @@ export const WwwLifehackerJpExtractor = {
domain: 'www.lifehacker.jp',
title: {
selectors: ['h1.lh-summary-title'],
selectors: ['h1[class^="article_pArticle_Title"]', 'h1.lh-summary-title'],
},
author: {
selectors: ['p.lh-entryDetailInner--credit'],
selectors: [
['meta[name="author"]', 'value'],
'p.lh-entryDetailInner--credit',
],
},
date_published: {
selectors: [['div.lh-entryDetail-header time', 'datetime']],
selectors: [
['meta[name="article:published_time"]', 'value'],
['div.lh-entryDetail-header time', 'datetime'],
],
},
dek: null,
@ -20,7 +26,10 @@ export const WwwLifehackerJpExtractor = {
},
content: {
selectors: ['div.lh-entryDetail-body'],
selectors: [
'div[class^="article_pArticle_Body__"]',
'div.lh-entryDetail-body',
],
transforms: {
'img.lazyload': $node => {

@ -60,7 +60,7 @@ describe('WwwLifehackerJpExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, `2019-03-08T04:00:00.000Z`);
assert.equal(date_published, `2019-03-08T13:00:00.000Z`);
});
it('returns the dek', async () => {
@ -82,7 +82,7 @@ describe('WwwLifehackerJpExtractor', () => {
// the article.
assert.equal(
lead_image_url,
`https://assets.media-platform.com/lifehacker/dist/images/2019/02/28/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%83%E3%83%882019-02-2810.48.32-w960.jpg`
`https://media.loom-app.com/mpp/lifehacker/dist/images/2019/02/28/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%83%E3%83%882019-02-2810.48.32.jpg?w=1280&h=630&f=jpg`
);
});
@ -106,7 +106,7 @@ describe('WwwLifehackerJpExtractor', () => {
// the article.
assert.equal(
first13,
'Image: Amazon.co.jpついつい溜めてしまい、気がつくとかさばって捨てるのにも苦労する新聞紙。そんな新聞紙を捨てる時には、ファインの「新聞ストッカー」が役に立ちます。新聞紙を簡単に、くるくるっとテープでまけちゃうんです。Image:'
'ついつい溜めてしまい、気がつくとかさばって捨てるのにも苦労する新聞紙。そんな新聞紙を捨てる時には、ファインの「新聞ストッカー」が役に立ちます。新聞紙を簡単に、くるくるっとテープでまけちゃうんです。Image: Amazon.co.jp使い方は簡単。テープや包帯を巻くようにぐるぐると巻きつけるだけ。普通だったら紐でまとめますが、下に通して、結んで、切って…と、結構時間がかかります。でも、このアイテムはラップフィルム。テープのように新聞に吸着してくれるので、きつく締めたり、結ぶ必要がありません。サクサクっとまとめられるので、時短になること間違いなしですね。Image:'
);
});
});

@ -6,14 +6,11 @@ export const WwwMacrumorsComExtractor = {
},
author: {
selectors: ['.author-url'],
selectors: ['article a[rel="author"]', '.author-url'],
},
date_published: {
selectors: ['.article .byline'],
// Wednesday January 18, 2017 11:44 am PST
format: 'dddd MMMM D, YYYY h:mm A zz',
selectors: [['time', 'datetime']],
timezone: 'America/Los_Angeles',
},
@ -27,7 +24,7 @@ export const WwwMacrumorsComExtractor = {
},
content: {
selectors: ['.article'],
selectors: ['article', '.article'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -56,7 +56,7 @@ describe('WwwMacrumorsComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2017-01-18T19:44:00.000Z');
assert.equal(date_published, '2017-01-18T19:44:11.000Z');
});
it('returns the dek', async () => {
@ -81,7 +81,7 @@ describe('WwwMacrumorsComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://cdn.macrumors.com/article-new/2015/11/ipad-pro-apple-pencil-screen-800x471.jpg?retina'
'https://images.macrumors.com/t/EN8cNLYLkjyiQXEOq65hBnAVS6k=/1600x/article-new/2014/11/applepencil2.jpg'
);
});

@ -2,15 +2,19 @@ export const WwwNbcnewsComExtractor = {
domain: 'www.nbcnews.com',
title: {
selectors: ['div.article-hed h1'],
selectors: ['div.article-hero-headline h1', 'div.article-hed h1'],
},
author: {
selectors: ['span.byline_author'],
selectors: [
'div.article-inline-byline span.byline-name',
'span.byline_author',
],
},
date_published: {
selectors: [
['meta[name="article:published"]', 'value'],
['.flag_article-wrapper time.timestamp_article[datetime]', 'datetime'],
'.flag_article-wrapper time',
],
@ -23,7 +27,7 @@ export const WwwNbcnewsComExtractor = {
},
content: {
selectors: ['div.article-body'],
selectors: ['div.article-body__content', 'div.article-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -53,7 +53,7 @@ describe('WwwNbcnewsComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-13T18:06:00.000Z');
assert.equal(date_published, '2016-12-13T23:06:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -65,7 +65,7 @@ describe('WwwNbcnewsComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://media1.s-nbcnews.com/j/newscms/2016_45/1792226/161110-nasa-spacex-mbe-430p_ea6b06bb8c83e70502b6de93ee91c78a.nbcnews-fp-1200-800.jpg'
'https://media3.s-nbcnews.com/j/newscms/2016_45/1792226/161110-nasa-spacex-mbe-430p_ea6b06bb8c83e70502b6de93ee91c78a.nbcnews-fp-1200-630.jpg'
);
});

@ -1,34 +0,0 @@
/**
 * Custom extractor configuration for articles served from www.nj.com.
 *
 * Each field lists the selectors tried for that piece of article data.
 * A plain string is a CSS selector; a two-element array pairs a CSS
 * selector with the attribute to read from the matched element.
 */
export const WwwNjComExtractor = {
  domain: 'www.nj.com',

  // Headline, read from the "title" meta tag.
  title: {
    selectors: [['meta[name="title"]', 'value']],
  },

  // Byline, read from the "article_author" meta tag.
  author: {
    selectors: [['meta[name="article_author"]', 'value']],
  },

  // Original publication date from the "article_date_original" meta tag.
  // NOTE(review): the timezone is presumably applied when the extracted
  // date string carries no offset of its own — confirm against the date
  // cleaner.
  date_published: {
    selectors: [['meta[name="article_date_original"]', 'value']],
    timezone: 'America/New_York',
  },

  // Lead image, read from the "og:image" meta tag.
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  // Article body.
  content: {
    selectors: ['.entry-content'],

    // No per-node rewrites (e.g. for lazy-loaded images) are configured.
    transforms: {},

    // No extra selectors are stripped from the extracted content.
    clean: [],
  },
};

@ -1,100 +0,0 @@
import assert from 'assert';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
const fs = require('fs');
// Extraction tests for the www.nj.com custom extractor. The suite parses a
// saved HTML fixture (with `fallback: false`) and checks each extracted field.
describe('WwwNjComExtractor', () => {
  describe('initial test case', () => {
    const articleUrl =
      'http://www.nj.com/essex/index.ssf/2016/12/man_sentenced_for_stealing_millions_from_nj_atms_i.html#incart_river_home';

    // Value returned by Mercury.parse; every async test below awaits it.
    let result;

    beforeAll(() => {
      const html = fs.readFileSync('./fixtures/www.nj.com.html');
      result = Mercury.parse(articleUrl, { html, fallback: false });
    });

    it('is selected properly', () => {
      // Sanity check: the extractor resolved for this URL must be the one
      // registered for the URL's hostname.
      const { domain } = getExtractor(articleUrl);
      const { hostname } = URL.parse(articleUrl);

      assert.equal(domain, hostname);
    });

    it('returns the title', async () => {
      // Driven by the title selectors in
      // ./src/extractors/custom/www.nj.com/index.js.
      const parsed = await result;

      assert.equal(
        parsed.title,
        'Man sentenced for stealing millions in elaborate N.J. ATM skimming scheme'
      );
    });

    it('returns the author', async () => {
      // Driven by the author selectors in
      // ./src/extractors/custom/www.nj.com/index.js.
      const parsed = await result;

      assert.equal(parsed.author, 'Rajeev Dhir | NJ Advance Media for NJ.com');
    });

    it('returns the date_published', async () => {
      // Driven by the date_published selectors in
      // ./src/extractors/custom/www.nj.com/index.js.
      const parsed = await result;

      assert.equal(parsed.date_published, '2016-12-13T21:51:00.000Z');
    });

    it('returns the lead_image_url', async () => {
      // Driven by the lead_image_url selectors in
      // ./src/extractors/custom/www.nj.com/index.js.
      const parsed = await result;

      assert.equal(
        parsed.lead_image_url,
        'http://image.nj.com/home/njo-media/width620/img/njcom_photos/photo/2016/12/08/21671718-large.png'
      );
    });

    it('returns the content', async () => {
      // Driven by the content selectors (plus any clean/transforms options)
      // in ./src/extractors/custom/www.nj.com/index.js.
      const parsed = await result;

      // Load the extracted HTML (or an empty document when nothing was
      // extracted) and excerpt the text of the first matched element.
      const $ = cheerio.load(parsed.content || '');
      const leadingText = $('*')
        .first()
        .text();
      const first13 = excerptContent(leadingText, 13);

      assert.equal(
        first13,
        'NEWARK -- A Romanian native was sentenced to 57 months on Tuesday for'
      );
    });
  });
});

@ -2,15 +2,18 @@ export const WwwOpposingviewsComExtractor = {
domain: 'www.opposingviews.com',
title: {
selectors: ['h1.title'],
selectors: ['h1.m-detail-header--title', 'h1.title'],
},
author: {
selectors: ['div.date span span a'],
selectors: [['meta[name="author"]', 'value'], 'div.date span span a'],
},
date_published: {
selectors: [['meta[name="publish_date"]', 'value']],
selectors: [
['meta[name="published"]', 'value'],
['meta[name="publish_date"]', 'value'],
],
},
dek: {
@ -24,7 +27,7 @@ export const WwwOpposingviewsComExtractor = {
},
content: {
selectors: ['.article-content'],
selectors: ['.m-detail--body', '.article-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -69,7 +69,7 @@ describe('WwwOpposingviewsComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://images.opposingviews.com:8080/ovi/catalog/downloads/preview/rndr_670x377//2016/12/icahn-1482373693.jpg/rndr_670x377.jpg'
'https://www.opposingviews.com/.image/t_share/MTU0MDAxMTEyNDA2ODkzNjUw/investor-icahn-to-advise-trump-on-finances-regulation-promo-image.jpg'
);
});

@ -14,7 +14,7 @@ export const WwwRbbtodayComExtractor = {
},
dek: {
selectors: ['.arti-summary'],
selectors: [['meta[name="description"]', 'value'], '.arti-summary'],
},
lead_image_url: {

@ -32,11 +32,11 @@ describe('WwwRefinery29ComExtractor', () => {
// in ./src/extractors/custom/www.refinery29.com/index.js.
const { title } = await result;
// Update these values with the expected values from
// Update these values with the expected values from
// the article.
assert.equal(
title,
"For Holiday Parties This Year, Let's Get Behind The Conversation-Starter Tee"
'For Holiday Parties This Year, Lets Get Behind The Conversation-Starter Tee'
);
});
@ -50,15 +50,16 @@ describe('WwwRefinery29ComExtractor', () => {
assert.equal(author, 'Connie Wang');
});
it('returns the date_published', async () => {
// To pass this test, fill out the date_published selector
// in ./src/extractors/custom/www.refinery29.com/index.js.
const { date_published } = await result;
// As of 10-7-22, The HTML no longer returns a parseable date
// it('returns the date_published', async () => {
// // To pass this test, fill out the date_published selector
// // in ./src/extractors/custom/www.refinery29.com/index.js.
// const { date_published } = await result;
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-13T01:00:00.000Z');
});
// // Update these values with the expected values from
// // the article.
// assert.equal(date_published, '2016-12-13T01:00:00.000Z');
// });
it('returns the lead_image_url', async () => {
// To pass this test, fill out the lead_image_url selector
@ -69,7 +70,7 @@ describe('WwwRefinery29ComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://s3.r29static.com//bin/entry/fc5/0,213,2000,1050/x,80/1708221/image.jpg'
'https://s1.r29static.com/bin/entry/c60/0,675,2000,1050/x,80/1708221/image.jpg'
);
});

@ -2,11 +2,11 @@ export const WwwReutersComExtractor = {
domain: 'www.reuters.com',
title: {
selectors: ['h1.article-headline'],
selectors: ['h1[class*="ArticleHeader-headline-"]', 'h1.article-headline'],
},
author: {
selectors: ['.author'],
selectors: [['meta[name="og:article:author"]', 'value'], '.author'],
},
date_published: {
@ -18,7 +18,7 @@ export const WwwReutersComExtractor = {
},
content: {
selectors: ['#article-text'],
selectors: ['div.ArticleBodyWrapper', '#article-text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
@ -29,6 +29,9 @@ export const WwwReutersComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['#article-byline .author'],
clean: [
'div[class^="ArticleBody-byline-container-"]',
'#article-byline .author',
],
},
};

@ -46,7 +46,7 @@ describe('WwwReutersComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, 'Howard Schneider and Lindsay Dunsmuir');
assert.equal(author, 'Howard Schneider, Lindsay Dunsmuir');
});
it('returns the date_published', async () => {
@ -56,7 +56,7 @@ describe('WwwReutersComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-14T22:03:42.000Z');
assert.equal(date_published, '2016-12-14T06:02:15.000Z');
});
it('returns the lead_image_url', async () => {
@ -68,7 +68,7 @@ describe('WwwReutersComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://s2.reutersmedia.net/resources/r/?m=02&d=20161214&t=2&i=1165492293&w=&fh=545px&fw=&ll=&pl=&sq=&r=LYNXMPECBD1EH'
'https://static.reuters.com/resources/r/?m=02&d=20161215&t=2&i=1165535950&r=LYNXMPECBD1EH&w=800'
);
});
@ -92,7 +92,7 @@ describe('WwwReutersComExtractor', () => {
// the article.
assert.equal(
first13,
'WASHINGTON The U.S. Federal Reserve raised interest rates on Wednesday and signaled a'
'WASHINGTON (Reuters) - The U.S. Federal Reserve raised interest rates on Wednesday and'
);
});
});

@ -2,21 +2,24 @@ export const WwwRollingstoneComExtractor = {
domain: 'www.rollingstone.com',
title: {
selectors: ['h1.content-title'],
selectors: ['h1.l-article-header__row--title', 'h1.content-title'],
},
author: {
selectors: ['a.content-author.tracked-offpage'],
selectors: ['a.c-byline__link', 'a.content-author.tracked-offpage'],
},
date_published: {
selectors: ['time.content-published-date'],
selectors: [
['meta[name="article:published_time"]', 'value'],
'time.content-published-date',
],
timezone: 'America/New_York',
},
dek: {
selectors: ['.content-description'],
selectors: ['h2.l-article-header__row--lead', '.content-description'],
},
lead_image_url: {
@ -24,7 +27,11 @@ export const WwwRollingstoneComExtractor = {
},
content: {
selectors: [['.lead-container', '.article-content'], '.article-content'],
selectors: [
'.l-article-content',
['.lead-container', '.article-content'],
'.article-content',
],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
@ -33,6 +40,6 @@ export const WwwRollingstoneComExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.module-related'],
clean: ['.c-related-links-wrapper', '.module-related'],
},
};

@ -36,7 +36,7 @@ describe('WwwRollingstoneComExtractor', () => {
// the article.
assert.equal(
title,
"'La La Land': How a Young Filmmaker Resurrected the Hollywood Musical"
'La La Land: How a Young Filmmaker Resurrected the Hollywood Musical'
);
});
@ -57,7 +57,7 @@ describe('WwwRollingstoneComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-09T05:00:00.000Z');
assert.equal(date_published, '2016-12-09T14:50:00.000Z');
});
it('returns the dek', async () => {
@ -82,7 +82,7 @@ describe('WwwRollingstoneComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://img.wennermedia.com/social/rs-la-la-land-3d3a431a-8329-4539-b953-51e2d61a396c.jpg'
'https://www.rollingstone.com/wp-content/uploads/2018/06/rs-la-la-land-3d3a431a-8329-4539-b953-51e2d61a396c.jpg'
);
});
@ -106,7 +106,7 @@ describe('WwwRollingstoneComExtractor', () => {
// the article.
assert.equal(
first13,
"Inside: 'La La Land': How a young filmmaker, his best-friend composer and two"
'Ask most folks what the most iconic thing about Los Angeles is, and'
);
});
});

@ -14,7 +14,7 @@ export const WwwSbnationComExtractor = {
},
dek: {
selectors: ['h2.c-entry-summary.p-dek'],
selectors: ['p.c-entry-summary.p-dek', 'h2.c-entry-summary.p-dek'],
},
lead_image_url: {

@ -81,7 +81,7 @@ describe('WwwSbnationComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://cdn0.vox-cdn.com/thumbor/VfeYqqHzPEOqSIXXAF-yK-1e4LQ=/0x0:1710x962/1600x900/cdn0.vox-cdn.com/uploads/chorus_image/image/52164267/592603330.0.jpeg'
'https://cdn.vox-cdn.com/thumbor/EnCgXG3xHnUSyPoxG2k7uA0NJlo=/0x0:1710x962/1600x900/cdn.vox-cdn.com/uploads/chorus_image/image/52164267/592603330.0.jpeg'
);
});

@ -2,15 +2,15 @@ export const WwwTodayComExtractor = {
domain: 'www.today.com',
title: {
selectors: ['h1.entry-headline'],
selectors: ['h1.article-hero-headline__htag', 'h1.entry-headline'],
},
author: {
selectors: [['meta[name="author"]', 'value']],
selectors: ['span.byline-name', ['meta[name="author"]', 'value']],
},
date_published: {
selectors: [['meta[name="DC.date.issued"]', 'value']],
selectors: ['time[datetime]', ['meta[name="DC.date.issued"]', 'value']],
},
lead_image_url: {
@ -18,7 +18,7 @@ export const WwwTodayComExtractor = {
},
content: {
selectors: ['.entry-container'],
selectors: ['div.article-body__content', '.entry-container'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -57,7 +57,7 @@ describe('WwwTodayComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-22T15:36:00.000Z');
assert.equal(date_published, '2016-12-22T21:36:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -69,7 +69,7 @@ describe('WwwTodayComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://media1.s-nbcnews.com/i/newscms/2016_51/1183946/zsa-zsa-pool-2-today-161222-tease_ef3cb1c171786baa69a3f5db09f3da06.jpg'
'https://media-cldnry.s-nbcnews.com/image/upload/t_social_share_1200x630_center,f_auto,q_auto:best/newscms/2016_51/1183946/zsa-zsa-pool-2-today-161222-tease.jpg'
);
});

@ -6,13 +6,13 @@ export const WwwUsmagazineComExtractor = {
},
author: {
selectors: ['a.article-byline.tracked-offpage'],
selectors: ['a.author', 'a.article-byline.tracked-offpage'],
},
date_published: {
timezone: 'America/New_York',
selectors: ['time.article-published-date'],
selectors: [['meta[name="article:published_time"]', 'value']],
},
lead_image_url: {
@ -20,7 +20,7 @@ export const WwwUsmagazineComExtractor = {
},
content: {
selectors: ['div.article-body-inner'],
selectors: ['div.article-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -70,7 +70,7 @@ describe('WwwUsmagazineComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-07T20:53:00.000Z');
assert.equal(date_published, '2016-12-07T20:53:36.000Z');
});
it('returns the lead_image_url', async () => {
@ -89,7 +89,7 @@ describe('WwwUsmagazineComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://img.wennermedia.com/social/lady-gaga-taylor-kinney-9662aa39-cb01-4b53-9aa0-7aa8c6e3e94f.jpg'
'https://i0.wp.com/www.usmagazine.com/wp-content/uploads/lady-gaga-taylor-kinney-9662aa39-cb01-4b53-9aa0-7aa8c6e3e94f.jpg?crop=0px%2C0px%2C1500px%2C788px&resize=1200%2C630&ssl=1&quality=82&strip=all'
);
});
@ -117,7 +117,7 @@ describe('WwwUsmagazineComExtractor', () => {
// the article.
assert.equal(
first13,
'Taylor Kinney and Lady Gaga arrive at the 37th Annual Kennedy Center Honors'
'The Little Monsters are praying for a reunion. Lady Gaga shared a photo'
);
});
});

@ -82,7 +82,7 @@ describe('WwwVoxComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://cdn0.vox-cdn.com/thumbor/RuJTDlBH9LAp_9uFqYfnPzWXhj0=/0x175:2500x1564/1080x600/cdn0.vox-cdn.com/uploads/chorus_image/image/52223131/628656068.0.jpeg'
'https://cdn.vox-cdn.com/thumbor/E4zA3smy2FKHl9BxfsfKjkNHfFc=/0x166:2500x1572/1600x900/cdn.vox-cdn.com/uploads/chorus_image/image/52223131/628656068.0.jpeg'
);
});

Loading…
Cancel
Save