You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/generic/dek/extractor.js

52 lines
1.7 KiB
JavaScript

// import {
// DEK_META_TAGS,
// DEK_SELECTORS,
// DEK_URL_RES,
// } from './constants';
// import { cleanDek } from 'cleaners';
// import {
// extractFromMeta,
// extractFromSelectors,
// } from 'utils/dom';
// Currently there is only one selector for
// deks. We should simply return null here
// until we have a more robust generic option.
// Below is the original source for this, for reference.
const GenericDekExtractor = {
// extract({ $, content, metaCache }) {
extract() {
return null;
},
};
export default GenericDekExtractor;
// def extract_dek(self):
// # First, check to see if we have a matching meta tag that we can make
// # use of.
// dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
// if not dek:
// # Second, look through our CSS/XPath selectors. This may return
// # an HTML fragment.
// dek = self.extract_from_selectors('dek',
// constants.DEK_SELECTORS,
// text_only=False)
//
// if dek:
// # Make sure our dek isn't in the first few thousand characters
// # of the content, otherwise it's just the start of the article
// # and not a true dek.
// content = self.extract_content()
// content_chunk = normalize_spaces(strip_tags(content[:2000]))
// dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
//
// # 80% or greater similarity means the dek was very similar to some
// # of the starting content, so we skip it.
// if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
// return dek
//
// return None