You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/generic/content/extract-best-node.js

39 lines
956 B
JavaScript

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import {
stripUnlikelyCandidates,
convertToParagraphs,
} from 'utils/dom';
import {
scoreContent,
findTopCandidate,
} from './scoring';
// Using a variety of scoring techniques, extract the content most
// likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
// match certain criteria first. (Like, does this element have a
// classname of "comment")
//
// If weight_nodes is True, use classNames and IDs to determine the
// worthiness of nodes.
//
// Returns a cheerio object $
export default function extractBestNode($, opts) {
// clone the node so we can get back to our
// initial parsed state if needed
// TODO Do I need this? AP
// let $root = $.root().clone()
if (opts.stripUnlikelyCandidates) {
$ = stripUnlikelyCandidates($);
}
$ = convertToParagraphs($);
$ = scoreContent($, opts.weightNodes);
const $topCandidate = findTopCandidate($);
return $topCandidate;
}