fix: incorrect parsing on theatlantic.com (#475)

* fix: incorrect parsing on theatlantic.com

* chore: updating theatlantic.com tests & fixtures

* chore: removing script data from minified fixture
pull/469/head^2
Michael Ashley 5 years ago committed by GitHub
parent 5e33263d25
commit 0686ee7956
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

@ -3,18 +3,15 @@
export const TheAtlanticExtractor = {
domain: 'www.theatlantic.com',
title: {
selectors: ['h1.hed'],
selectors: ['h1', '.c-article-header__hed'],
},
author: {
selectors: ['article#article .article-cover-extra .metadata .byline a'],
selectors: [['meta[name="author"]', 'value'], '.c-byline__author'],
},
content: {
selectors: [
['.article-cover figure.lead-img', '.article-body'],
'.article-body',
],
selectors: ['article', '.article-body'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
@ -23,14 +20,29 @@ export const TheAtlanticExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.partner-box', '.callout'],
clean: [
'.partner-box',
'.callout',
'.c-article-writer__image',
'.c-article-writer__content',
'.c-letters-cta__text',
'.c-footer__logo',
'.c-recirculation-link',
'.twitter-tweet',
],
},
dek: {
selectors: [['meta[name="description"]', 'value']],
},
date_published: {
selectors: [['time[itemProp="datePublished"]', 'datetime']],
selectors: [['time[itemprop="datePublished"]', 'datetime']],
},
lead_image_url: null,
lead_image_url: {
selectors: [['img[itemprop="url"]', 'src']],
},
next_page_url: null,

@ -35,7 +35,8 @@ describe('AtlanticExtractor', () => {
// selectors in ./src/extractors/custom/www.theatlantic.com/index.js. This test is just
// a stub; you can add more fields to test as much of
// your parser as possible.
const { content, title, author } = await result;
const { content, title, author, dek, lead_image_url } = await result;
const $ = cheerio.load(content);
const text = $('*')
.first()
@ -48,7 +49,15 @@ describe('AtlanticExtractor', () => {
'Why New Yorkers Received a Push Alert About a Manhunt'
);
assert.equal(author, 'Kaveh Waddell');
assert.equal(text, 'New York police offi');
assert.equal(text, 'The city has never b');
assert.equal(
dek,
'The city has never before used the emergency system the way it did Monday morning.'
);
assert.equal(
lead_image_url,
'https://cdn.theatlantic.com/assets/media/img/mt/2016/09/RTSO9RP/lead_720_405.jpg?mod=1533691849'
);
});
});
});

Loading…
Cancel
Save