feat: update nytimes extractor (#506)

* feat: update custom extractor for nytimes.com
pull/523/head
Wajeeh Zantout 5 years ago committed by GitHub
parent 3fb8526c4a
commit b0e708aac6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -2,15 +2,25 @@ export const NYTimesExtractor = {
domain: 'www.nytimes.com',
title: {
selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline'],
selectors: [
'h1.g-headline',
'h1[itemprop="headline"]',
'h1.headline',
'h1 .balancedHeadline',
],
},
author: {
selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline'],
selectors: [
['meta[name="author"]', 'value'],
'.g-byline',
'.byline',
['meta[name="byl"]', 'value'],
],
},
content: {
selectors: ['div.g-blocks', 'article#story'],
selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],
transforms: {
'img.g-lazy': $node => {

@@ -14,9 +14,9 @@ describe('NYTimesExtractor', () => {
let url;
beforeAll(() => {
url =
'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';
'https://www.nytimes.com/2016/09/20/nyregion/nyc-nj-explosions-ahmad-khan-rahami.html';
const html = fs.readFileSync(
'./fixtures/www.nytimes.com/1474318141888.html'
'./fixtures/www.nytimes.com/1571224616991.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
@@ -76,7 +76,7 @@ describe('NYTimesExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://static01.nyt.com/images/2016/09/20/nyregion/20MANHUNT1/20MANHUNT1-facebookJumbo.jpg'
'https://static01.nyt.com/images/2016/09/20/nyregion/Manhunt/Manhunt-facebookJumbo-v2.jpg'
);
});
@@ -100,34 +100,38 @@ describe('NYTimesExtractor', () => {
// the article.
assert.equal(
first13,
'The man believed to be responsible for the explosion in Manhattan on Saturday'
'The man who the police said sowed terror across two states, setting off'
);
});
});
it('works with a feature story', async () => {
const html = fs.readFileSync(
'./fixtures/www.nytimes.com/1474061823854.html'
'./fixtures/www.nytimes.com/1571223287888.html'
);
const uri =
'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';
const { content, title, author } = await Mercury.parse(uri, { html });
const $ = cheerio.load(content);
const text = $('*')
.first()
.text()
.trim()
.slice(0, 20);
const text = excerptContent(
$('*')
.first()
.text(),
13
);
assert.equal(title, 'I, Too, Sing America');
assert.equal(author, 'The New York Times');
assert.equal(text, 'T he Smithsonians N');
assert.equal(
text,
'T he Smithsonians National Museum of African American History and Culture opens on'
);
});
it('returns the title on most recent articles', async () => {
const html = fs.readFileSync(
'./fixtures/www.nytimes.com/1539194812689.html'
'./fixtures/www.nytimes.com/1571223477873.html'
);
const uri =
'https://www.nytimes.com/2018/10/09/us/politics/nikki-haley-united-nations.html';

Loading…
Cancel
Save