You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/root-extractor.js

273 lines
7.2 KiB
JavaScript

import Cleaners from 'cleaners';
import { convertNodeTo, makeLinksAbsolute } from 'utils/dom';
import GenericExtractor from './generic';
// Strip every element matched by the `clean` selector list from $content.
// The selectors are merged into a single comma-separated query scoped to
// $content; when no `clean` list is supplied nothing is touched.
// Returns the same $content it was given.
export function cleanBySelectors($content, $, { clean }) {
  if (clean) {
    const combinedSelector = clean.join(',');
    $(combinedSelector, $content).remove();
  }
  return $content;
}
// Apply the `transforms` map to $content. Each own key of `transforms` is
// used as a selector scoped to $content; its value decides what happens to
// every matched node:
//   - string:   the node is handed to convertNodeTo with that string
//   - function: called as fn($(node), $); if it returns a string, the node
//               is handed to convertNodeTo with the returned string
// Values of any other type are ignored. Returns the same $content.
export function transformElements($content, $, { transforms }) {
  if (transforms) {
    for (const selector of Reflect.ownKeys(transforms)) {
      const $found = $(selector, $content);
      const rule = transforms[selector];

      if (typeof rule === 'string') {
        $found.each((_, el) => {
          convertNodeTo($(el), $, rule);
        });
      } else if (typeof rule === 'function') {
        $found.each((_, el) => {
          const outcome = rule($(el), $);
          // Only a string return value triggers a conversion
          if (typeof outcome === 'string') {
            convertNodeTo($(el), $, outcome);
          }
        });
      }
    }
  }
  return $content;
}
// Return the first entry of `selectors` that matches the document, or
// undefined when none does. An entry matches as follows:
//   - array entry with extractHtml set: every selector in the array must
//     match at least one element
//   - array entry otherwise: read as [selector, attr]; the attribute read
//     from the match must be a non-blank string
//   - string entry: the text of the match must be non-blank
// For the attribute and text forms, exactly one element must match unless
// allowMultiple is truthy.
function findMatchingSelector($, selectors, extractHtml, allowMultiple) {
  return selectors.find(selector => {
    if (Array.isArray(selector)) {
      if (extractHtml) {
        return selector.every(s => $(s).length > 0);
      }

      const [s, attr] = selector;
      // Query once and reuse the result instead of re-running the
      // selector for each part of the check
      const $found = $(s);
      if (!allowMultiple && $found.length !== 1) return false;

      const value = $found.attr(attr);
      // Type-check before trimming so a non-string value is treated as
      // "no match" rather than throwing
      return typeof value === 'string' && value.trim() !== '';
    }

    const $found = $(selector);
    if (!allowMultiple && $found.length !== 1) return false;
    return $found.text().trim() !== '';
  });
}
// Run one custom extraction rule (`opts.extractionOpts`) for the field
// `opts.type` against the document `opts.$`.
//
// Returns:
//   - null when there is no rule, the rule has no selectors, or no
//     selector matches
//   - the rule itself when it is a hardcoded string
//   - with opts.extractHtml: the serialized HTML of the match, or an array
//     of HTML strings (one per child) when the rule sets allowMultiple
//   - otherwise: the trimmed text or attribute value of the match (an
//     array of them when the rule sets allowMultiple), passed through
//     Cleaners[type] unless the rule sets defaultCleaner to false
export function select(opts) {
  const { $, type, extractionOpts, extractHtml = false } = opts;

  // Skip if there's not extraction for this type
  if (!extractionOpts) return null;

  // If a string is hardcoded for a type (e.g., Wikipedia
  // contributors), return the string
  if (typeof extractionOpts === 'string') return extractionOpts;

  const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;

  // A rule object without selectors has nothing to match; report "no
  // result" instead of failing on `selectors.find`
  if (!selectors) return null;

  // lead_image_url may match several elements even when the rule does not
  // ask for multiple results
  const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;

  const matchingSelector = findMatchingSelector(
    $,
    selectors,
    extractHtml,
    overrideAllowMultiple
  );

  if (!matchingSelector) return null;

  // Apply link absolutizing, the rule's `clean` selectors and the rule's
  // `transforms` to the matched node(s), in that order
  function transformAndClean($node) {
    makeLinksAbsolute($node, $, opts.url || '');
    cleanBySelectors($node, $, extractionOpts);
    transformElements($node, $, extractionOpts);
    return $node;
  }

  function selectHtml() {
    // If the selector type requests html as its return type
    // transform and clean the element with provided selectors
    let $content;

    // If matching selector is an array, we're considering this a
    // multi-match selection, which allows the parser to choose several
    // selectors to include in the result. Note that all selectors in the
    // array must match in order for this selector to trigger
    if (Array.isArray(matchingSelector)) {
      $content = $(matchingSelector.join(','));
      const $wrapper = $('<div></div>');
      $content.each((_, element) => {
        $wrapper.append(element);
      });
      $content = $wrapper;
    } else {
      $content = $(matchingSelector);
    }

    // Wrap in div so transformation can take place on root element
    $content.wrap($('<div></div>'));
    $content = $content.parent();
    $content = transformAndClean($content);

    // The HTML cleaner always runs when one exists; the rule's
    // defaultCleaner flag is passed along to it
    if (Cleaners[type]) {
      Cleaners[type]($content, { ...opts, defaultCleaner });
    }

    if (allowMultiple) {
      return $content
        .children()
        .toArray()
        .map(el => $.html($(el)));
    }

    return $.html($content);
  }

  if (extractHtml) {
    return selectHtml();
  }

  let $match;
  let result;

  // if selector is an array (e.g., ['img', 'src']),
  // extract the attr
  if (Array.isArray(matchingSelector)) {
    const [selector, attr, transform] = matchingSelector;
    $match = transformAndClean($(selector));
    result = $match.map((_, el) => {
      const raw = $(el).attr(attr);
      // Only the first match was checked for this attribute, so later
      // matches may lack it. Returning null makes a cheerio/jQuery-style
      // .map() leave the element out instead of crashing on .trim()
      if (typeof raw !== 'string') return null;
      const item = raw.trim();
      return transform ? transform(item) : item;
    });
  } else {
    $match = transformAndClean($(matchingSelector));
    result = $match.map((_, el) =>
      $(el)
        .text()
        .trim()
    );
  }

  result = allowMultiple ? result.toArray() : result[0];

  // Allow custom extractor to skip default cleaner
  // for this type; defaults to true
  if (defaultCleaner && Cleaners[type]) {
    return Cleaners[type](result, { ...opts, ...extractionOpts });
  }

  return result;
}
// Run `select` once for every own key of `extend`, using the key as the
// field type and its value as the extraction rule. Returns an object
// mapping each key to whatever `select` returned for it.
export function selectExtendedTypes(extend, opts) {
  const results = {};
  Reflect.ownKeys(extend).forEach(t => {
    // Own keys are unique, so no "already computed" check is needed. A
    // truthiness check on results[t] would also wrongly skip names that a
    // plain object inherits, such as `constructor` or `toString`.
    results[t] = select({ ...opts, type: t, extractionOpts: extend[t] });
  });
  return results;
}
// Extract one field (`opts.type`): try the custom extractor's rule for
// that field first, and if it yields nothing truthy, fall back to the
// GenericExtractor method of the same name. Passing `fallback: false`
// disables the generic step, in which case null is returned.
function extractResult(opts) {
  const { type, extractor, fallback = true } = opts;

  const customResult = select({ ...opts, extractionOpts: extractor[type] });
  if (customResult) return customResult;

  // The custom rule produced nothing usable
  return fallback ? GenericExtractor[type](opts) : null;
}
// Entry point that runs a (custom) extractor over a document.
const RootExtractor = {
  // Extract all fields using `extractor` (GenericExtractor by default).
  //   - An extractor whose domain is '*' is the generic one: its own
  //     extract(opts) result is returned as-is.
  //   - With opts.contentOnly, only `content` is extracted (using
  //     opts.extractedTitle as the title) and `{ content }` is returned.
  //   - Otherwise every standard field is extracted via extractResult,
  //     and fields from extractor.extend are merged in last, so they win
  //     over standard fields of the same name.
  extract(extractor = GenericExtractor, opts) {
    const { contentOnly, extractedTitle } = opts;

    // This is the generic extractor. Run its extract method
    if (extractor.domain === '*') return extractor.extract(opts);

    const context = { ...opts, extractor };
    const run = fields => extractResult({ ...context, ...fields });

    if (contentOnly) {
      return {
        content: run({
          type: 'content',
          extractHtml: true,
          title: extractedTitle,
        }),
      };
    }

    const extendedResults = extractor.extend
      ? selectExtendedTypes(extractor.extend, context)
      : {};

    // Order matters: later fields receive earlier results as input
    const title = run({ type: 'title' });
    const date_published = run({ type: 'date_published' });
    const author = run({ type: 'author' });
    const next_page_url = run({ type: 'next_page_url' });
    const content = run({ type: 'content', extractHtml: true, title });
    const lead_image_url = run({ type: 'lead_image_url', content });
    const excerpt = run({ type: 'excerpt', content });
    const dek = run({ type: 'dek', content, excerpt });
    const word_count = run({ type: 'word_count', content });
    const direction = run({ type: 'direction', title });

    const located = run({ type: 'url_and_domain' }) || {
      url: null,
      domain: null,
    };
    const { url, domain } = located;

    return {
      title,
      content,
      author,
      date_published,
      lead_image_url,
      dek,
      next_page_url,
      url,
      domain,
      excerpt,
      word_count,
      direction,
      ...extendedResults,
    };
  },
};
export default RootExtractor;