You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/custom/www.nytimes.com/index.js

69 lines
1.3 KiB
JavaScript

/**
 * Custom extractor configuration for articles on www.nytimes.com.
 *
 * Each field lists CSS selectors in the order given. An entry written as
 * `[selector, attribute]` pairs a selector with an attribute name
 * (NOTE(review): presumably the extractor reads that attribute instead of the
 * element's text; confirm against the Mercury custom-extractor docs).
 */
export const NYTimesExtractor = {
  domain: 'www.nytimes.com',

  title: {
    selectors: [
      'h1[data-testid="headline"]',
      'h1.g-headline',
      'h1[itemprop="headline"]',
      'h1.headline',
      'h1 .balancedHeadline',
    ],
  },

  author: {
    selectors: [
      ['meta[name="author"]', 'value'],
      '.g-byline',
      '.byline',
      ['meta[name="byl"]', 'value'],
    ],
  },

  content: {
    selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],

    transforms: {
      /**
       * Fills in the `{{size}}` placeholder in a lazy image's `src` with a
       * fixed width so the URL becomes concrete.
       *
       * @param {object} $node - Wrapped `img.g-lazy` element exposing
       *   `attr(name)` / `attr(name, value)` (presumably a cheerio node).
       * @returns {undefined} The node is updated in place.
       */
      'img.g-lazy': $node => {
        const src = $node.attr('src');

        // A lazy image may have no `src` attribute at all; calling
        // `.replace` on undefined would throw, so leave such nodes alone.
        if (typeof src !== 'string') {
          return;
        }

        const width = 640;
        $node.attr('src', src.replace('{{size}}', String(width)));
      },
    },

    // Selectors for page furniture (ads, promos, comments, hidden elements)
    // that is not article content. NOTE(review): presumably these are
    // stripped from the extracted content; confirm against the Mercury docs.
    clean: [
      '.ad',
      'header#story-header',
      '.story-body-1 .lede.video',
      '.visually-hidden',
      '#newsletter-promo',
      '.promo',
      '.comments-button',
      '.hidden',
      '.comments',
      '.supplemental',
      '.nocontent',
      '.story-footer-links',
    ],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
      ['meta[name="article:published"]', 'value'],
    ],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  // No site-specific rules are defined for these fields.
  dek: null,
  next_page_url: null,
  excerpt: null,
};