You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
1.5 KiB
JavaScript
68 lines
1.5 KiB
JavaScript
export const NYMagExtractor = {
|
|
domain: 'nymag.com',
|
|
content: {
|
|
// Order by most likely. Extractor will stop on first occurrence
|
|
selectors: [
|
|
'div.article-content',
|
|
'section.body',
|
|
'article.article',
|
|
],
|
|
|
|
// Selectors to remove from the extracted content
|
|
clean: [
|
|
'.ad',
|
|
'.single-related-story',
|
|
],
|
|
|
|
// Object of tranformations to make on matched elements
|
|
// Each key is the selector, each value is the tag to
|
|
// transform to.
|
|
// If a function is given, it should return a string
|
|
// to convert to or nothing (in which case it will not perform
|
|
// the transformation.
|
|
transforms: {
|
|
// Convert h1s to h2s
|
|
h1: 'h2',
|
|
|
|
// Convert lazy-loaded noscript images to figures
|
|
noscript: ($node, $) => {
|
|
const $children = $.browser ? $($node.text()) : $node.children();
|
|
if ($children.length === 1 && $children.get(0) !== undefined &&
|
|
$children.get(0).tagName.toLowerCase() === 'img') {
|
|
return 'figure';
|
|
}
|
|
|
|
return null;
|
|
},
|
|
},
|
|
},
|
|
|
|
title: {
|
|
selectors: [
|
|
'h1.lede-feature-title',
|
|
'h1.headline-primary',
|
|
'h1',
|
|
],
|
|
},
|
|
|
|
author: {
|
|
selectors: [
|
|
'.by-authors',
|
|
'.lede-feature-author',
|
|
],
|
|
},
|
|
|
|
dek: {
|
|
selectors: [
|
|
'.lede-feature-teaser',
|
|
],
|
|
},
|
|
|
|
date_published: {
|
|
selectors: [
|
|
['time.article-timestamp[datetime]', 'datetime'],
|
|
'time.article-timestamp',
|
|
],
|
|
},
|
|
};
|