You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mercury-parser/src/extractors/custom/wikipedia.org/index.js

49 lines
918 B
JavaScript

/**
 * Custom extractor definition for pages on wikipedia.org.
 *
 * This is a plain configuration object: CSS selectors for the article
 * content, title and publication date, a fixed author string, and the DOM
 * transforms / cleanup selectors applied to the infobox. How each field is
 * consumed is decided by the parser that loads this object, which is not
 * visible in this file.
 */
const WikipediaExtractor = {
  domain: 'wikipedia.org',

  content: {
    selectors: ['#mw-content-text'],

    // NOTE(review): presumably opts out of the parser's default cleaning
    // pass for this domain — confirm against the consumer.
    defaultCleaner: false,

    // Turn the top infobox into an image with a caption.
    // NOTE(review): string values look like replacement tag names and the
    // entries appear to rely on declaration order — confirm before reordering.
    transforms: {
      '.infobox img': ($node) => {
        const $infobox = $node.parents('.infobox');
        const hasImageChild = $infobox.children('img').length !== 0;

        // Only the first image is hoisted: once children('img') is no longer
        // empty, later images are left where they are.
        if (hasImageChild) {
          return;
        }
        $infobox.prepend($node);
      },
      '.infobox caption': 'figcaption',
      '.infobox': 'figure',
    },

    // Selectors stripped from the extracted content.
    clean: ['.mw-editsection', 'figure tr, figure td, figure tbody', '#toc'],
  },

  // Fixed byline; no selector lookup is configured for the author.
  author: 'Wikipedia Contributors',

  title: {
    selectors: ['h2.title'],
  },

  date_published: {
    selectors: ['#footer-info-lastmod'],
  },
};

export default WikipediaExtractor;