feat: add option to allow custom extractors to skip default cleaners

This commit is contained in:
Adam Pash 2016-09-15 14:50:51 -04:00
parent 97a0728ecf
commit f0f216c7b9
2 changed files with 11 additions and 2 deletions

View File

@ -5,6 +5,8 @@ const WikipediaExtractor = {
'#mw-content-text', '#mw-content-text',
], ],
defaultCleaner: false,
// transform top infobox to an image with caption // transform top infobox to an image with caption
transforms: { transforms: {
'.infobox img': ($node) => { '.infobox img': ($node) => {

View File

@ -51,7 +51,7 @@ export function select(opts) {
// contributors), return the string // contributors), return the string
if (typeof extractionOpts === 'string') return extractionOpts; if (typeof extractionOpts === 'string') return extractionOpts;
const { selectors } = extractionOpts; const { selectors, defaultCleaner = true } = extractionOpts;
const matchingSelector = selectors.find(selector => $(selector).length === 1 && $(selector).text().trim() !== ''); const matchingSelector = selectors.find(selector => $(selector).length === 1 && $(selector).text().trim() !== '');
@ -88,9 +88,16 @@ export function select(opts) {
// otherwise use the text of the node // otherwise use the text of the node
result = $(matchingSelector).text(); result = $(matchingSelector).text();
} }
// Allow a custom extractor to skip the default cleaner for this type
// by setting `defaultCleaner: false`; the option defaults to true
if (defaultCleaner) {
return Cleaners[type](result, opts); return Cleaners[type](result, opts);
} }
return result;
}
function extractResult(opts) { function extractResult(opts) {
const { type, extractor } = opts; const { type, extractor } = opts;