Bugfix: New Yorker and Wired extractors (#604)

* www.newyorker.com: add updated fixtures and fix extractors

* www.wired.com: add updated fixtures and fix extractors

Co-authored-by: John Holdun <john@johnholdun.com>
pull/599/head
Joe Moon 2 years ago committed by GitHub
parent 99062da034
commit fb44ab0244
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -5,6 +5,7 @@ export const NewYorkerExtractor = {
domain: 'www.newyorker.com', domain: 'www.newyorker.com',
title: { title: {
selectors: [ selectors: [
'h1[class^="content-header"]',
'h1[class^="ArticleHeader__hed"]', 'h1[class^="ArticleHeader__hed"]',
['meta[name="og:title"]', 'value'], ['meta[name="og:title"]', 'value'],
], ],
@ -12,13 +13,17 @@ export const NewYorkerExtractor = {
author: { author: {
selectors: [ selectors: [
['meta[name="author"]', 'value'],
'div[class^="ArticleContributors"] a[rel="author"]', 'div[class^="ArticleContributors"] a[rel="author"]',
'article header div[class*="Byline__multipleContributors"]', 'article header div[class*="Byline__multipleContributors"]',
], ],
}, },
content: { content: {
selectors: ['main[class^="Layout__content"]'], selectors: [
'article.article.main-content',
'main[class^="Layout__content"]',
],
// Is there anything in the content you selected that needs to be transformed // Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images // before it's consumable content? E.g., unusual lazy loaded images
@ -31,8 +36,10 @@ export const NewYorkerExtractor = {
}, },
date_published: { date_published: {
selectors: [['meta[name="pubdate"]', 'value']], selectors: [
format: 'YYYYMMDD', 'time.content-header__publish-date',
['meta[name="pubdate"]', 'value'],
],
timezone: 'America/New_York', timezone: 'America/New_York',
}, },
@ -41,7 +48,7 @@ export const NewYorkerExtractor = {
}, },
dek: { dek: {
selectors: ['h2[class^="ArticleHeader__dek"]'], selectors: ['div.content-header__dek', 'h2[class^="ArticleHeader__dek"]'],
}, },
next_page_url: null, next_page_url: null,

@ -16,7 +16,7 @@ describe('NewYorkerExtractor', () => {
url = url =
'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing'; 'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing';
const html = fs.readFileSync( const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557138180688.html' './fixtures/www.newyorker.com/1611473608343.html'
); );
result = Mercury.parse(url, { html, fallback: false }); result = Mercury.parse(url, { html, fallback: false });
}); });
@ -73,7 +73,7 @@ describe('NewYorkerExtractor', () => {
// the article. // the article.
assert.equal( assert.equal(
lead_image_url, lead_image_url,
'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1200,h_630,c_limit/Hutchinson-Quantum-Computing.jpg' 'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1280,c_limit/Hutchinson-Quantum-Computing.jpg'
); );
}); });
@ -97,7 +97,7 @@ describe('NewYorkerExtractor', () => {
// the article. // the article.
assert.equal( assert.equal(
first13, first13,
'In a laboratory in Shanghai, researchers work on developing a quantum computer—a new' 'Given the recent ubiquity of cyber-scandals—Colin Powells stolen e-mails, Simone Biless leaked medical'
); );
}); });
}); });
@ -109,7 +109,7 @@ describe('NewYorkerExtractor', () => {
url = url =
'http://www.newyorker.com/magazine/2016/12/05/lessons-from-my-mother'; 'http://www.newyorker.com/magazine/2016/12/05/lessons-from-my-mother';
const html = fs.readFileSync( const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557145645680.html' './fixtures/www.newyorker.com/1611475571383.html'
); );
result = Mercury.parse(url, { html, fallback: false }); result = Mercury.parse(url, { html, fallback: false });
}); });

@ -5,6 +5,7 @@ export const WiredExtractor = {
domain: 'www.wired.com', domain: 'www.wired.com',
title: { title: {
selectors: [ selectors: [
'h1.content-header__hed',
'h1.post-title', 'h1.post-title',
// enter title selectors // enter title selectors
], ],
@ -12,6 +13,7 @@ export const WiredExtractor = {
author: { author: {
selectors: [ selectors: [
['meta[name="author"]', 'value'],
'a[rel="author"]', 'a[rel="author"]',
// enter author selectors // enter author selectors
], ],
@ -19,6 +21,7 @@ export const WiredExtractor = {
content: { content: {
selectors: [ selectors: [
'article.article.main-content',
'article.content', 'article.content',
// enter content selectors // enter content selectors
], ],
@ -34,7 +37,10 @@ export const WiredExtractor = {
}, },
date_published: { date_published: {
selectors: [['meta[itemprop="datePublished"]', 'value']], selectors: [
'time.content-header__publish-date',
['meta[itemprop="datePublished"]', 'value'],
],
}, },
lead_image_url: { lead_image_url: {

@ -17,7 +17,7 @@ describe('WiredExtractor', () => {
url = url =
'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/'; 'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
const html = fs.readFileSync( const html = fs.readFileSync(
'./fixtures/www.wired.com/1475256747028.html' './fixtures/www.wired.com/1611475755063.html'
); );
result = Mercury.parse(url, { html, fallback: false }); result = Mercury.parse(url, { html, fallback: false });
}); });
@ -62,7 +62,7 @@ describe('WiredExtractor', () => {
// Update these values with the expected values from // Update these values with the expected values from
// the article. // the article.
assert.equal(date_published, '2016-09-30T07:00:12.000Z'); assert.equal(date_published.split('T')[0], '2016-09-30');
}); });
it('returns the lead_image_url', async () => { it('returns the lead_image_url', async () => {
@ -74,7 +74,7 @@ describe('WiredExtractor', () => {
// the article. // the article.
assert.equal( assert.equal(
lead_image_url, lead_image_url,
'https://www.wired.com/wp-content/uploads/2016/09/Rosetta_impact-1-1200x630.jpg' 'https://media.wired.com/photos/5926b676af95806129f50602/191:100/w_1280,c_limit/Rosetta_impact-1.jpg'
); );
}); });
@ -98,7 +98,7 @@ describe('WiredExtractor', () => {
// the article. // the article.
assert.equal( assert.equal(
first13, first13,
'Today, the European Space Agencys Rosetta spacecraft will engage its thrusters for one' "Today, the European Space Agency's Rosetta spacecraft will engage its thrusters for one"
); );
}); });
}); });

Loading…
Cancel
Save