From f0f216c7b97725525701dfd9681d6f2a65325a2b Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Thu, 15 Sep 2016 14:50:51 -0400 Subject: [PATCH] feat: add option to allow custom extractors to skip default cleaners --- src/extractors/custom/wikipedia.org/index.js | 2 ++ src/extractors/root-extractor.js | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/extractors/custom/wikipedia.org/index.js b/src/extractors/custom/wikipedia.org/index.js index a30ce35b..5d8d59ce 100644 --- a/src/extractors/custom/wikipedia.org/index.js +++ b/src/extractors/custom/wikipedia.org/index.js @@ -5,6 +5,8 @@ const WikipediaExtractor = { '#mw-content-text', ], + defaultCleaner: false, + // transform top infobox to an image with caption transforms: { '.infobox img': ($node) => { diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index 4d71f3cb..9198824b 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -51,7 +51,7 @@ export function select(opts) { // contributors), return the string if (typeof extractionOpts === 'string') return extractionOpts; - const { selectors } = extractionOpts; + const { selectors, defaultCleaner = true } = extractionOpts; const matchingSelector = selectors.find(selector => $(selector).length === 1 && $(selector).text().trim() !== ''); @@ -88,7 +88,14 @@ export function select(opts) { // otherwise use the text of the node result = $(matchingSelector).text(); } - return Cleaners[type](result, opts); + + // Allow custom extractor to skip default cleaner + // for this type; defaults to true + if (defaultCleaner) { + return Cleaners[type](result, opts); + } + + return result; } function extractResult(opts) {