feat: update more parsers and add correct bloomberg html files

move-fixtures
Sarah Doire 2 years ago
parent 8543db6017
commit 7b75d04501

58
dist/mercury.js vendored

@ -2205,14 +2205,14 @@ var WikiaExtractor = {
var LittleThingsExtractor = {
domain: 'www.littlethings.com',
title: {
selectors: ['h1.post-title']
selectors: ['h1[class*="PostHeader"]', 'h1.post-title']
},
author: {
selectors: [['meta[name="author"]', 'value']]
selectors: ['div[class^="PostHeader__ScAuthorNameSection"]', ['meta[name="author"]', 'value']]
},
content: {
selectors: [// enter content selectors
'.mainContentIntro', '.content-wrapper'],
'section[class*="PostMainArticle"]', '.mainContentIntro', '.content-wrapper'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
@ -2238,11 +2238,11 @@ var PoliticoExtractor = {
['meta[name="og:title"]', 'value']]
},
author: {
selectors: ['.story-main-content .byline .vcard']
selectors: ['.story-meta__authors .vcard', '.story-main-content .byline .vcard']
},
content: {
selectors: [// enter content selectors
'.story-main-content', '.content-group', '.story-core', '.story-text'],
['p.story-text__paragraph '], '.story-main-content', '.content-group', '.story-core', '.story-text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: [],
@ -2252,7 +2252,7 @@ var PoliticoExtractor = {
clean: ['figcaption']
},
date_published: {
selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
selectors: ['.story-meta__details time[datetime]', ['.story-main-content .timestamp time[datetime]', 'datetime']]
},
lead_image_url: {
selectors: [// enter lead_image_url selectors
@ -2815,9 +2815,7 @@ var WwwTheguardianComExtractor = {
selectors: ['#maincontent', '.content__article-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
'h2': 'h4'
},
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
@ -2860,24 +2858,24 @@ var WwwBloombergComExtractor = {
selectors: [// normal articles
'.lede-headline', // /graphics/ template
'h1.article-title', // /news/ template
'h1.lede-text-only__hed']
'h1[class^="headline"]', 'h1.lede-text-only__hed']
},
author: {
selectors: [['meta[name="parsely-author"]', 'value'], '.byline-details__link', // /graphics/ template
'.bydek', // /news/ template
'.author']
'.author', 'p[class*="author"]']
},
date_published: {
selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value']]
selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value'], ['meta[name="parsely-pub-date"]', 'content']]
},
dek: {
selectors: []
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
selectors: [['meta[name="og:image"]', 'value'], ['meta[name="og:image"]', 'content']]
},
content: {
selectors: ['.article-body__content', // /graphics/ template
selectors: ['.article-body__content', '.body-content', // /graphics/ template
['section.copy-block'], // /news/ template
'.body-copy'],
// Is there anything in the content you selected that needs transformed
@ -3459,21 +3457,22 @@ var WwwMentalflossComExtractor = {
var AbcnewsGoComExtractor = {
domain: 'abcnews.go.com',
title: {
selectors: ['.article-header h1']
selectors: ['div[class*="Article_main__body"] h1', '.article-header h1']
},
author: {
selectors: ['.authors'],
selectors: ['.ShareByline span:nth-child(2)', '.authors'],
clean: ['.author-overlay', '.by-text']
},
date_published: {
selectors: ['.timestamp'],
selectors: ['.ShareByline', '.timestamp'],
format: 'MMMM D, YYYY h:mm a',
timezone: 'America/New_York'
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.article-copy'],
selectors: ['article', '.article-copy'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -3735,19 +3734,19 @@ var UproxxComExtractor = {
var WwwEonlineComExtractor = {
domain: 'www.eonline.com',
title: {
selectors: ['h1.article__title']
selectors: ['h1.article-detail__title', 'h1.article__title']
},
author: {
selectors: ['.entry-meta__author a']
selectors: ['.article-detail__meta__author', '.entry-meta__author a']
},
date_published: {
selectors: [['meta[itemprop="datePublished"]', 'value']]
selectors: [['meta[name="article:published_time"]', 'value'], ['meta[itemprop="datePublished"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: [['.post-content section, .post-content div.post-content__image']],
selectors: [['.article-detail__main-content section'], ['.post-content section, .post-content div.post-content__image']],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
@ -4155,19 +4154,19 @@ var ScienceflyComExtractor = {
var HellogigglesComExtractor = {
domain: 'hellogiggles.com',
title: {
selectors: ['.title']
selectors: [['meta[name="og:title"]', 'value'], '.title']
},
author: {
selectors: ['.author-link']
selectors: ['.byline-wrapper span.author_name', '.author-link']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
selectors: [['meta[property="article:published_time"]', 'content'], ['meta[name="article:published_time"]', 'value']]
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.entry-content'],
selectors: ['.main-content', '.entry-content'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -4472,10 +4471,11 @@ var GothamistComExtractor = {
selectors: ['h1', '.entry-header h1']
},
author: {
selectors: ['.author']
// Several elements on the page carry the article-metadata and byline-author classes; the main article's byline is the one inside the .article-metadata element that is the 3rd child of the l-container element
selectors: ['.article-metadata:nth-child(3) .byline-author', '.author']
},
date_published: {
selectors: ['abbr', 'abbr.published'],
selectors: [['meta[name="article:published_time"]', 'value'], 'abbr', 'abbr.published'],
timezone: 'America/New_York'
},
dek: {
@ -4485,7 +4485,7 @@ var GothamistComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.entry-body'],
selectors: ['.article-body', '.entry-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -2,16 +2,17 @@ export const AbcnewsGoComExtractor = {
domain: 'abcnews.go.com',
title: {
selectors: ['.article-header h1'],
selectors: ['div[class*="Article_main__body"] h1', '.article-header h1'],
},
author: {
selectors: ['.authors'],
selectors: ['.ShareByline span:nth-child(2)', '.authors'],
clean: ['.author-overlay', '.by-text'],
},
date_published: {
selectors: ['.timestamp'],
selectors: ['.ShareByline', '.timestamp'],
format: 'MMMM D, YYYY h:mm a',
timezone: 'America/New_York',
},
@ -20,7 +21,7 @@ export const AbcnewsGoComExtractor = {
},
content: {
selectors: ['.article-copy'],
selectors: ['article', '.article-copy'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -36,7 +36,7 @@ describe('AbcnewsGoComExtractor', () => {
// the article.
assert.equal(
title,
"Hillary Clinton: Putin's Alleged Involvement in Democratic Hack Stems From Longtime Grudge"
"Hillary Clinton: Putin's Alleged Involvement in Democratic Hack Stems From 'Personal Beef'"
);
});
@ -47,7 +47,7 @@ describe('AbcnewsGoComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, 'Josh Haskell David Caplan PATRICK REEVELL');
assert.equal(author, 'JOSH HASKELL, DAVID CAPLAN and PATRICK REEVELL');
});
it('returns the date_published', async () => {
@ -57,7 +57,7 @@ describe('AbcnewsGoComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-16T17:37:00.000Z');
assert.equal(date_published, '2016-12-16T21:19:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -69,7 +69,7 @@ describe('AbcnewsGoComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://a.abcnews.com/images/Politics/AP-hillary-clinton-01-as-161216_16x9_992.jpg'
'https://s.abcnews.com/images/Politics/AP-hillary-clinton-01-as-161216_16x9_992.jpg'
);
});
@ -93,7 +93,7 @@ describe('AbcnewsGoComExtractor', () => {
// the article.
assert.equal(
first13,
"Hillary Clinton has an explanation for Vladimir Putin's alleged involvement in the hacking"
"— -- Hillary Clinton has an explanation for Vladimir Putin's alleged involvement in"
);
});
});

@ -14,11 +14,16 @@ export const GothamistComExtractor = {
},
author: {
selectors: ['.author'],
// Several elements on the page carry the article-metadata and byline-author classes; the main article's byline is the one inside the .article-metadata element that is the 3rd child of the l-container element
selectors: ['.article-metadata:nth-child(3) .byline-author', '.author'],
},
date_published: {
selectors: ['abbr', 'abbr.published'],
selectors: [
['meta[name="article:published_time"]', 'value'],
'abbr',
'abbr.published',
],
timezone: 'America/New_York',
},
@ -32,7 +37,7 @@ export const GothamistComExtractor = {
},
content: {
selectors: ['.entry-body'],
selectors: ['.article-body', '.entry-body'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images

@ -44,9 +44,11 @@ describe('GothamistComExtractor', () => {
// in ./src/extractors/custom/gothamist.com/index.js.
const { author } = await result;
console.log(author.toString());
// Update these values with the expected values from
// the article.
assert.equal(author, 'Nathan Tempey');
assert.equal(author, 'Nathan\xa0Tempey');
});
it('returns the date_published', async () => {
@ -78,7 +80,7 @@ describe('GothamistComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://gothamist.com/assets_c/2017/03/030717FalloutShelter80NY-5-thumb-640xauto-989222.jpg'
'https://cms.prod.nypr.digital/images/42584/fill-1200x650/'
);
});
@ -102,7 +104,7 @@ describe('GothamistComExtractor', () => {
// the article.
assert.equal(
first13,
'The basement at 80 New York Avenue in Crown Heights is one of'
"You've seen the placards around town, beside the front door on apartment buildings,"
);
});
});

@ -10,6 +10,7 @@ export const WwwBloombergComExtractor = {
'h1.article-title',
// /news/ template
'h1[class^="headline"]',
'h1.lede-text-only__hed',
],
},
@ -24,6 +25,7 @@ export const WwwBloombergComExtractor = {
// /news/ template
'.author',
'p[class*="author"]',
],
},
@ -33,6 +35,7 @@ export const WwwBloombergComExtractor = {
['time[datetime]', 'datetime'],
['meta[name="date"]', 'value'],
['meta[name="parsely-pub-date"]', 'value'],
['meta[name="parsely-pub-date"]', 'content'],
],
},
@ -41,12 +44,16 @@ export const WwwBloombergComExtractor = {
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']],
selectors: [
['meta[name="og:image"]', 'value'],
['meta[name="og:image"]', 'content'],
],
},
content: {
selectors: [
'.article-body__content',
'.body-content',
// /graphics/ template
['section.copy-block'],

@ -69,7 +69,7 @@ describe('WwwBloombergComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/-1x-1.jpg'
'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/1200x675.jpg'
);
});

@ -2,15 +2,18 @@ export const WwwEonlineComExtractor = {
domain: 'www.eonline.com',
title: {
selectors: ['h1.article__title'],
selectors: ['h1.article-detail__title', 'h1.article__title'],
},
author: {
selectors: ['.entry-meta__author a'],
selectors: ['.article-detail__meta__author', '.entry-meta__author a'],
},
date_published: {
selectors: [['meta[itemprop="datePublished"]', 'value']],
selectors: [
['meta[name="article:published_time"]', 'value'],
['meta[itemprop="datePublished"]', 'value'],
],
},
lead_image_url: {
@ -19,6 +22,7 @@ export const WwwEonlineComExtractor = {
content: {
selectors: [
['.article-detail__main-content section'],
['.post-content section, .post-content div.post-content__image'],
],

@ -57,7 +57,7 @@ describe('WwwEonlineComExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-12-12T06:00:00.000Z');
assert.equal(date_published, '2016-12-12T14:00:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -69,7 +69,7 @@ describe('WwwEonlineComExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://akns-images.eonline.com/eol_images/Entire_Site/2016117/rs_300x300-161207101544-600.ryan-gosling-gq.12716.jpg?downsize=600:*&crop=600:315;left,top'
'https://akns-images.eonline.com/eol_images/Entire_Site/2016117/rs_600x600-161207101544-600.ryan-gosling-gq.12716.jpg?fit=around%7C1080:1080&output-quality=90&crop=1080:1080;center,top'
);
});
@ -93,7 +93,7 @@ describe('WwwEonlineComExtractor', () => {
// the article.
assert.equal(
first13,
"Ryan Gosling's most cherished role won't win him any Hollywood awards.With his musical"
"Ryan Gosling's most cherished role won't win him any Hollywood awards. With his"
);
});
});

@ -5,6 +5,7 @@ export const LittleThingsExtractor = {
domain: 'www.littlethings.com',
title: {
selectors: [
'h1[class*="PostHeader"]',
'h1.post-title',
// enter title selectors
],
@ -12,6 +13,7 @@ export const LittleThingsExtractor = {
author: {
selectors: [
'div[class^="PostHeader__ScAuthorNameSection"]',
['meta[name="author"]', 'value'],
// enter author selectors
],
@ -20,6 +22,7 @@ export const LittleThingsExtractor = {
content: {
selectors: [
// enter content selectors
'section[class*="PostMainArticle"]',
'.mainContentIntro',
'.content-wrapper',
],

@ -14,7 +14,8 @@ describe('LittleThingsExtractor', () => {
let result;
let url;
beforeAll(() => {
url = 'http://www.littlethings.com/diy-pineapple-lamp/';
url =
'https://www.littlethings.com/lifestyle/amazon-has-a-private-food-brand-that-just-launched-100-new-products-for-fall/';
const html = fs.readFileSync('./fixtures/www.littlethings.com.html');
result = Mercury.parse(url, { html, fallback: false });
});
@ -38,7 +39,7 @@ describe('LittleThingsExtractor', () => {
// the article.
assert.equal(
title,
'Snip The Stems Off Plastic Spoons To Make A Quirky Pineapple Lamp'
'Amazon Has A Private Food Brand That Just Launched 100 New Products For Fall'
);
});
@ -49,7 +50,7 @@ describe('LittleThingsExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(author, 'Laura Caseley');
assert.equal(author, 'Bethany Braun-Silva');
});
it('returns the lead_image_url', async () => {
@ -61,7 +62,7 @@ describe('LittleThingsExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://cdn1.littlethings.com/app/uploads/2016/09/pineapple-b-thumb-1.jpg'
'https://images.ctfassets.net/f60q1anpxzid/6qQL1s7jrdjipeebu7wkNB/80ba587f55cc9f3673582f493dc611f1/04764069-41b3-48d3-be5a-4cde1c41.jpeg?w=1800&q=50&fm=jpg&fl=progressive'
);
});
@ -85,7 +86,7 @@ describe('LittleThingsExtractor', () => {
// the article.
assert.equal(
first13,
'Every room needs light, and so lamps are pretty much a necessity for'
"One of the best things about fall is all the food offerings. What's"
);
});
});

@ -11,12 +11,16 @@ export const PoliticoExtractor = {
},
author: {
selectors: ['.story-main-content .byline .vcard'],
selectors: [
'.story-meta__authors .vcard',
'.story-main-content .byline .vcard',
],
},
content: {
selectors: [
// enter content selectors
['p.story-text__paragraph '],
'.story-main-content',
'.content-group',
'.story-core',
@ -34,7 +38,10 @@ export const PoliticoExtractor = {
},
date_published: {
selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']],
selectors: [
'.story-meta__details time[datetime]',
['.story-main-content .timestamp time[datetime]', 'datetime'],
],
},
lead_image_url: {

@ -57,7 +57,7 @@ describe('PoliticoExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-10-04T09:07:00.000Z');
assert.equal(date_published, '2016-10-04T10:07:00.000Z');
});
it('returns the lead_image_url', async () => {
@ -69,7 +69,7 @@ describe('PoliticoExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://static.politico.com/0f/e7/5ee9a89044d1a01f74140bcd5b9e/caucus-vp-preview.jpg'
'https://static.politico.com/0f/e7/5ee9a89044d1a01f74140bcd5b9e/caucus-vp-preview.jpg'
);
});

Loading…
Cancel
Save