You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
61 lines
1.3 KiB
JavaScript
61 lines
1.3 KiB
JavaScript
/**
 * Extractor configuration for pages served from nymag.com.
 *
 * Each field (`content`, `title`, `author`, `datePublished`) carries a
 * `selectors` list of CSS selectors, ordered from most to least likely.
 * `content` additionally lists selectors to strip (`clean`) and tag
 * rewrites to apply to matched elements (`transforms`).
 */
const NYMagExtractor = {
  domain: 'nymag.com',

  content: {
    // Order by most likely. Extractor will stop on first occurrence
    selectors: [
      'div.article-content',
      'section.body',
      'article.article',
    ],

    // Selectors to remove from the extracted content
    clean: [
      '.ad',
      '.single-related-story',
    ],

    // Object of transformations to make on matched elements.
    // Each key is the selector, each value is the tag to transform to.
    // If a function is given, it should return a string to convert to,
    // or nothing (in which case it will not perform the transformation).
    transforms: {
      // Convert h1s to h2s
      h1: 'h2',

      /**
       * Convert lazy-loaded noscript images to figures.
       *
       * @param {object} $node - Wrapped <noscript> element exposing
       *   `children()`, whose result has `length` and `get(index)`.
       * @returns {string|null} 'figure' when the node's only child is an
       *   <img>, otherwise null (no transformation).
       */
      noscript: ($node) => {
        const $children = $node.children();
        if ($children.length !== 1) {
          return null;
        }

        // A DOM reports HTML tag names in upper case ('IMG'), while the
        // previous check only matched lower-case 'img'. Compare
        // case-insensitively so both representations are recognised, and
        // tolerate a child that exposes no string tagName at all.
        const { tagName } = $children.get(0);
        if (typeof tagName === 'string' && tagName.toLowerCase() === 'img') {
          return 'figure';
        }

        return null;
      },
    },
  },

  title: {
    selectors: [
      'h1.headline-primary',
      'h1',
    ],
  },

  author: {
    selectors: [
      '.by-authors',
    ],
  },

  datePublished: {
    selectors: [
      'time.article-timestamp[datetime]',
      'time.article-timestamp',
    ],
  },
};

export default NYMagExtractor;