fix: new yorker extractor (#414)

* fix: new yorker extractor

* fix: date_published selector

* fix: remove footer from content

* feat: add additional selector for title

* feat: support article with multiple authors
pull/420/head
Wajeeh Zantout 5 years ago committed by Toufic Mouallem
parent e66ad8b81c
commit 7c8de71c52

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -4,15 +4,21 @@
export const NewYorkerExtractor = {
domain: 'www.newyorker.com',
title: {
selectors: ['h1.title'],
selectors: [
'h1[class^="ArticleHeader__hed"]',
['meta[name="og:title"]', 'value'],
],
},
author: {
selectors: ['.contributors'],
selectors: [
'div[class^="ArticleContributors"] a[rel="author"]',
'article header div[class*="Byline__multipleContributors"]',
],
},
content: {
selectors: ['div#articleBody', 'div.articleBody'],
selectors: ['main[class^="Layout__content"]'],
// Is there anything in the content you selected that needs to be
// transformed before it's consumable content? E.g., unusual lazy loaded images
@ -21,15 +27,12 @@ export const NewYorkerExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [],
clean: ['footer[class^="ArticleFooter__footer"]'],
},
date_published: {
selectors: [
['meta[name="article:published_time"]', 'value'],
['time[itemProp="datePublished"]', 'content'],
],
selectors: [['meta[name="pubdate"]', 'value']],
format: 'YYYYMMDD',
timezone: 'America/New_York',
},
@ -38,7 +41,7 @@ export const NewYorkerExtractor = {
},
dek: {
selectors: ['.dek', 'h2.dek'],
selectors: ['h2[class^="ArticleHeader__dek"]'],
},
next_page_url: null,

@ -16,7 +16,7 @@ describe('NewYorkerExtractor', () => {
url =
'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing';
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1475248565793.html'
'./fixtures/www.newyorker.com/1557138180688.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
@ -61,7 +61,7 @@ describe('NewYorkerExtractor', () => {
// Update these values with the expected values from
// the article.
assert.equal(date_published, '2016-09-26T18:04:22.000Z');
assert.equal(date_published.split('T')[0], '2016-09-26');
});
it('returns the lead_image_url', async () => {
@ -73,7 +73,7 @@ describe('NewYorkerExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'http://www.newyorker.com/wp-content/uploads/2016/09/Hutchinson-Quantum-Computing-1200x630-1474903563.jpg'
'https://media.newyorker.com/photos/59097a5e8b51cf59fc4239f5/16:9/w_1200,h_630,c_limit/Hutchinson-Quantum-Computing.jpg'
);
});
@ -109,7 +109,7 @@ describe('NewYorkerExtractor', () => {
url =
'http://www.newyorker.com/magazine/2016/12/05/lessons-from-my-mother';
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1480713300334.html'
'./fixtures/www.newyorker.com/1557145645680.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
@ -126,7 +126,29 @@ describe('NewYorkerExtractor', () => {
it('returns the date for magazine content', async () => {
const { date_published } = await result;
assert.equal(date_published, '2016-11-28T05:00:00.000Z');
assert.equal(date_published.split('T')[0], '2016-11-28');
});
});
// Suite covering an article whose byline lists more than one contributor.
// It parses a saved HTML fixture once and then checks the extracted `author`.
describe('article with multiple authors', () => {
// Holds the value returned by Mercury.parse; it is awaited inside the test.
let result;
let url;
beforeAll(() => {
url =
'https://www.newyorker.com/humor/daily-shouts/teas-you-should-probably-get-rid-of-already';
// Read the stored fixture from disk so the HTML is supplied directly to the parser.
const html = fs.readFileSync(
'./fixtures/www.newyorker.com/1557834611707.html'
);
// `fallback: false` is passed explicitly — presumably this limits the result
// to what the custom extractor finds; confirm against Mercury's options.
result = Mercury.parse(url, { html, fallback: false });
});
it('returns multiple authors', async () => {
const { author } = await result;
// NOTE(review): the expected string runs the byline, the illustration
// credit, and the date together with no separators. It looks like the
// selector captures the whole contributors block — confirm this is intended.
assert.equal(
author,
'Ysabel YatesIllustration by Claire LordonMay 10, 2019'
);
});
});
});

Loading…
Cancel
Save