You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/utils/dom/clean-headers.js

40 lines
1.4 KiB
JavaScript

import { HEADER_TAG_LIST } from './constants'
import { normalizeSpaces } from '../text'
import { getWeight } from 'extractors/generic/content/scoring'
/**
 * Remove header elements from an article that are unlikely to be real content.
 *
 * Every element matched by HEADER_TAG_LIST within $article is removed when
 * any of the following holds (evaluated in this order, stopping at the
 * first match):
 *   1. `prevAll('p')` finds no <p> ahead of it,
 *   2. its space-normalized text is strictly equal to `title`,
 *   3. `getWeight` reports a negative weight for it.
 *
 * @param {Object} $article - Context handed to `$` when selecting headers.
 * @param {Function} $ - jQuery-style selector function (presumably cheerio;
 *   confirm against callers).
 * @param {string} [title=''] - Title to compare header text against. It is
 *   compared as given; only the header text is normalized here.
 * @returns {Function} The same `$` that was passed in.
 */
export default function cleanHeaders($article, $, title = '') {
  const $candidates = $(HEADER_TAG_LIST, $article)

  $candidates.each((_position, node) => {
    const $node = $(node)

    // A header sitting ahead of every paragraph is most likely a title,
    // subtitle, datestamp or byline -- metadata that is handled elsewhere.
    const precedesAllParagraphs = () => $node.prevAll('p').length === 0

    // A header that merely repeats the title adds nothing to the body.
    const repeatsTitle = () => normalizeSpaces($node.text()) === title

    // A negative weight marks the header as probable junk.
    const isNegativelyWeighted = () => getWeight($node) < 0

    // The predicates are lazy so that later checks only run when the
    // earlier ones did not already decide to drop the header.
    if (precedesAllParagraphs() || repeatsTitle() || isNegativelyWeighted()) {
      $node.remove()
    }
  })

  return $
}
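// Legacy reference: fragment of the original Python implementation of the
// negative-weight check, kept for comparison with the JavaScript port above.
// # If this header has a negative weight, it's probably junk.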
// # Get rid of it.
// if self.get_weight(header) < 0:
// drop_header = True
//
// if drop_header:
// try:
// header.drop_tree()
// except AssertionError:
// # No parent exists for this node, so just blank it out.
// header.text = ''