You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/cleaners/content.js

95 lines
3.1 KiB
JavaScript

import {
cleanAttributes,
cleanHeaders,
cleanHOnes,
cleanImages,
cleanTags,
removeEmpty,
rewriteTopLevel,
stripJunkTags,
makeLinksAbsolute,
} from 'utils/dom';
/**
 * Run the article-content cleaning passes over `article` and hand the same
 * node back to the caller.
 *
 * @param {*} article - Node holding the extracted article content. It is
 *   passed to every cleaning helper and is the value returned.
 * @param {Object} options - Cleaning options (required; `$` has no default).
 * @param {*} options.$ - DOM handle forwarded to the helpers
 *   (presumably a cheerio-style instance; confirm against utils/dom).
 * @param {boolean} [options.cleanConditionally=true] - Forwarded to cleanTags.
 * @param {string} [options.title=''] - Forwarded to cleanHeaders.
 * @param {string} [options.url=''] - Forwarded to makeLinksAbsolute.
 * @returns {*} The `article` argument that was passed in.
 */
export default function extractCleanNode(
  article,
  { $, cleanConditionally = true, title = '', url = '' }
) {
  // The passes run strictly in the order listed below.
  const passes = [
    // Top-level nodes such as body or html become divs, which avoids later
    // complications with multiple body tags.
    () => rewriteTopLevel(article, $),

    // Small images and spacer images are dropped.
    () => cleanImages(article, $),

    // Tags like <title> are dropped. This is mostly for cleanliness,
    // not security.
    () => stripJunkTags(article, $),

    // H1s are typically the article title, which the title extractor
    // should pick up instead. Fewer than 3 of them: strip them.
    // Otherwise they are turned into H2s.
    () => cleanHOnes(article, $),

    // Header clean-up; receives the article title.
    () => cleanHeaders(article, $, title),

    // Links are made absolute using the article url.
    () => makeLinksAbsolute(article, $, url),

    // Style or align attributes are removed.
    () => cleanAttributes(article),

    // ULs and OLs used to be cleaned here, but too many in-article lists
    // were being removed. A better way to detect menus specifically and
    // remove them is still needed.
    () => cleanTags(article, $, cleanConditionally),

    // Empty paragraph nodes are removed last.
    () => removeEmpty(article, $),
  ];

  for (const runPass of passes) {
    runPass();
  }

  return article;
}
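// NOTE(review): everything below is commented-out Python, apparently kept as
// a reference for the header-cleaning logic. Confirm it is still needed.
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')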
// for header in headers:
// drop_header = False
//
// # Remove any headers that are before any p tags in the
// # document. This probably means that it was part of the title, a
// # subtitle or something else extraneous like a datestamp or byline,
// # all of which should be handled by other metadata handling.
// no_previous_ps = int(header.xpath("count(preceding::p[1])")) == 0
// if no_previous_ps:
// similar_header_count = int(doc.xpath('count(.//%s)' % header.tag))
// if similar_header_count < 3:
// drop_header = True
//
// # Remove any headers that match the title exactly.
// if inner_text(header) == self.title:
// drop_header = True
//
// # If this header has a negative weight, it's probably junk.
// # Get rid of it.
// if self.get_weight(header) < 0:
// drop_header = True
//
// if drop_header:
// try:
// header.drop_tree()
// except AssertionError:
// # No parent exists for this node, so just blank it out.
// header.text = ''
//
// if clean_conditionally:
// # We used to clean UL's and OL's here, but it was leading to
// # too many in-article lists being removed. Consider a better
// # way to detect menus particularly and remove them.
// self._clean_conditionally(doc, ['ul', 'ol', 'table', 'div'])
//
// return doc