mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
feat: add option to allow custom extractors to skip default cleaners
This commit is contained in:
parent
97a0728ecf
commit
f0f216c7b9
@ -5,6 +5,8 @@ const WikipediaExtractor = {
|
|||||||
'#mw-content-text',
|
'#mw-content-text',
|
||||||
],
|
],
|
||||||
|
|
||||||
|
defaultCleaner: false,
|
||||||
|
|
||||||
// transform top infobox to an image with caption
|
// transform top infobox to an image with caption
|
||||||
transforms: {
|
transforms: {
|
||||||
'.infobox img': ($node) => {
|
'.infobox img': ($node) => {
|
||||||
|
@ -51,7 +51,7 @@ export function select(opts) {
|
|||||||
// contributors), return the string
|
// contributors), return the string
|
||||||
if (typeof extractionOpts === 'string') return extractionOpts;
|
if (typeof extractionOpts === 'string') return extractionOpts;
|
||||||
|
|
||||||
const { selectors } = extractionOpts;
|
const { selectors, defaultCleaner = true } = extractionOpts;
|
||||||
|
|
||||||
const matchingSelector = selectors.find(selector => $(selector).length === 1 && $(selector).text().trim() !== '');
|
const matchingSelector = selectors.find(selector => $(selector).length === 1 && $(selector).text().trim() !== '');
|
||||||
|
|
||||||
@ -88,9 +88,16 @@ export function select(opts) {
|
|||||||
// otherwise use the text of the node
|
// otherwise use the text of the node
|
||||||
result = $(matchingSelector).text();
|
result = $(matchingSelector).text();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allow custom extractor to skip default cleaner
|
||||||
|
// for this type; defaults to true
|
||||||
|
if (defaultCleaner) {
|
||||||
return Cleaners[type](result, opts);
|
return Cleaners[type](result, opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
function extractResult(opts) {
|
function extractResult(opts) {
|
||||||
const { type, extractor } = opts;
|
const { type, extractor } = opts;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user