Bugfix new yorker wired extractors (#604)

* www.newyorker.com: add updated fixtures and fix extractors

* www.wired.com: add updated fixtures and fix extractors

Co-authored-by: John Holdun <john@johnholdun.com>
pull/599/head
Joe Moon 2 years ago committed by GitHub
parent 99062da034
commit fb44ab0244
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -5,6 +5,7 @@ export const NewYorkerExtractor = {
domain: 'www.newyorker.com',
title: {
selectors: [
'h1[class^="content-header"]',
'h1[class^="ArticleHeader__hed"]',
['meta[name="og:title"]', 'value'],
],
@ -12,13 +13,17 @@ export const NewYorkerExtractor = {
author: {
selectors: [
['meta[name="author"]', 'value'],
'div[class^="ArticleContributors"] a[rel="author"]',
'article header div[class*="Byline__multipleContributors"]',
],
},
content: {
selectors: ['main[class^="Layout__content"]'],
selectors: [
'article.article.main-content',
'main[class^="Layout__content"]',
],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
@ -31,8 +36,10 @@ export const NewYorkerExtractor = {
},
date_published: {
selectors: [['meta[name="pubdate"]', 'value']],
format: 'YYYYMMDD',
selectors: [
'time.content-header__publish-date',
['meta[name="pubdate"]', 'value'],
],
timezone: 'America/New_York',
},
@ -41,7 +48,7 @@ export const NewYorkerExtractor = {
},
dek: {
selectors: ['h2[class^="ArticleHeader__dek"]'],
selectors: ['div.content-header__dek', 'h2[class^="ArticleHeader__dek"]'],
},
next_page_url: null,

@ -16,7 +16,7 @@ describe('NewYorkerExtractor', () => {
url =
'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing';
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557138180688.html'
'./fixtures/www.newyorker.com/1611473608343.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
@ -73,7 +73,7 @@ describe('NewYorkerExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1200,h_630,c_limit/Hutchinson-Quantum-Computing.jpg'
'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1280,c_limit/Hutchinson-Quantum-Computing.jpg'
);
});
@ -97,7 +97,7 @@ describe('NewYorkerExtractor', () => {
// the article.
assert.equal(
first13,
'In a laboratory in Shanghai, researchers work on developing a quantum computer—a new'
'Given the recent ubiquity of cyber-scandals—Colin Powells stolen e-mails, Simone Biless leaked medical'
);
});
});
@ -109,7 +109,7 @@ describe('NewYorkerExtractor', () => {
url =
'http://www.newyorker.com/magazine/2016/12/05/lessons-from-my-mother';
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557145645680.html'
'./fixtures/www.newyorker.com/1611475571383.html'
);
result = Mercury.parse(url, { html, fallback: false });
});

@ -5,6 +5,7 @@ export const WiredExtractor = {
domain: 'www.wired.com',
title: {
selectors: [
'h1.content-header__hed',
'h1.post-title',
// enter title selectors
],
@ -12,6 +13,7 @@ export const WiredExtractor = {
author: {
selectors: [
['meta[name="author"]', 'value'],
'a[rel="author"]',
// enter author selectors
],
@ -19,6 +21,7 @@ export const WiredExtractor = {
content: {
selectors: [
'article.article.main-content',
'article.content',
// enter content selectors
],
@ -34,7 +37,10 @@ export const WiredExtractor = {
},
date_published: {
selectors: [['meta[itemprop="datePublished"]', 'value']],
selectors: [
'time.content-header__publish-date',
['meta[itemprop="datePublished"]', 'value'],
],
},
lead_image_url: {

@ -17,7 +17,7 @@ describe('WiredExtractor', () => {
url =
'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
const html = fs.readFileSync(
'./fixtures/www.wired.com/1475256747028.html'
'./fixtures/www.wired.com/1611475755063.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
@ -62,7 +62,7 @@ describe('WiredExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-09-30T07:00:12.000Z');
assert.equal(date_published.split('T')[0], '2016-09-30');
});
it('returns the lead_image_url', async () => {
@ -74,7 +74,7 @@ describe('WiredExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://www.wired.com/wp-content/uploads/2016/09/Rosetta_impact-1-1200x630.jpg'
'https://media.wired.com/photos/5926b676af95806129f50602/191:100/w_1280,c_limit/Rosetta_impact-1.jpg'
);
});
@ -98,7 +98,7 @@ describe('WiredExtractor', () => {
// the article.
assert.equal(
first13,
'Today, the European Space Agencys Rosetta spacecraft will engage its thrusters for one'
"Today, the European Space Agency's Rosetta spacecraft will engage its thrusters for one"
);
});
});

Loading…
Cancel
Save