mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
basic structure
parent 155efb3833
commit 7f95b9f44f
183
bundle.js
Normal file
@@ -0,0 +1,183 @@
'use strict';

function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; }

var fs = _interopDefault(require('fs'));

// Hints that an element is a photo or photo caption, checked against class names.
const PHOTO_HINTS = [
  'figure',
  'photo',
  'image',
  'caption'
]
const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')

// URL fragments suggesting an image is a good lead-image candidate.
const POSITIVE_LEAD_IMAGE_URL_HINTS = [
  'upload',
  'wp-content',
  'large',
  'photo',
  'wp-image',
]
const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')

// URL fragments suggesting an image is a spacer, icon, ad, or other
// non-content asset.
const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
  'spacer',
  'sprite',
  'blank',
  'throbber',
  'gradient',
  'tile',
  'bg',
  'background',
  'icon',
  'social',
  'header',
  'hdr',
  'advert',
  'spinner',
  'loader',
  'loading',
  'default',
  'rating',
  'share',
  'facebook',
  'twitter',
  'theme',
  'promo',
  'ads',
  'wp-includes',
]
const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')

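// Editor's note: a minimal sketch (not part of this commit) of how these URL
// hint regexes could rank a lead-image candidate. The function name
// `scoreImageUrl` and the point values are illustrative assumptions.
function scoreImageUrl(url) {
  let score = 0
  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) score += 20
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(url)) score -= 20
  return score
}
// e.g. scoreImageUrl('/wp-content/photo-large.jpg') ===  20
//      scoreImageUrl('/assets/spacer.gif')          === -20
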
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
const POSITIVE_SCORE_HINTS = [
  'article',
  'articlecontent',
  'instapaper_body',
  'blog',
  'body',
  'content',
  'entry-content-asset',
  'entry',
  'hentry',
  'main',
  'Normal',
  'page',
  'pagination',
  'permalink',
  'post',
  'story',
  'text',
  '[-_]copy', // usatoday
  '\\Bcopy'   // escaped so the regex keeps \B: match "copy" only when preceded by another word character
]

// The above list, joined into a matching regular expression
const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')

// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
const NEGATIVE_SCORE_HINTS = [
  'adbox',
  'advert',
  'author',
  'bio',
  'bookmark',
  'bottom',
  'byline',
  'clear',
  'com-',
  'combx',
  'comment',
  'comment\\B', // escaped so the regex keeps \B: match "comment" only when followed by another word character
  'contact',
  'copy',
  'credit',
  'crumb',
  'date',
  'deck',
  'excerpt',
  'featured', // tnr.com has a featured_content which throws us off
  'foot',
  'footer',
  'footnote',
  'graf',
  'head',
  'info',
  'infotext', // newscientist.com copyright
  'instapaper_ignore',
  'jump',
  'linebreak',
  'link',
  'masthead',
  'media',
  'meta',
  'modal',
  'outbrain', // slate.com junk
  'promo',
  'pr_', // autoblog - press release
  'related',
  'respond',
  'roundcontent', // lifehacker restricted content warning
  'scroll',
  'secondary',
  'share',
  'shopping',
  'shoutbox',
  'side',
  'sidebar',
  'sponsor',
  'stamp',
  'sub',
  'summary',
  'tags',
  'tools',
  'widget'
]
// The above list, joined into a matching regular expression
const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')

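// Editor's note: a sketch of how these score regexes are meant to be used,
// mirroring the Python get_weight() kept as reference in
// extractor/generic/content-extractor.js. `node` is assumed to be a
// DOM-style element; the point values follow that reference code.
function getWeight(node) {
  const id = node.id || ''
  const className = node.className || ''
  let score = 0

  if (id) {
    if (NEGATIVE_SCORE_RE.test(id)) score -= 25
    if (POSITIVE_SCORE_RE.test(id)) score += 25
  }

  if (className) {
    // Only weight the class on negative/positive if the ID didn't match.
    if (score === 0) {
      if (NEGATIVE_SCORE_RE.test(className)) score -= 25
      if (POSITIVE_SCORE_RE.test(className)) score += 25
    }

    // Try to keep photos: figure/photo/image/caption classes get a bump.
    if (PHOTO_HINTS_RE.test(className)) score += 10

    // entry-content-asset is explicitly valuable per the publisher guidelines.
    if (className.includes('entry-content-asset')) score += 25
  }

  return score
}
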
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
const EXTRANEOUS_LINK_HINTS = [
  'print',
  'archive',
  'comment',
  'discuss',
  'e-mail',
  'email',
  'share',
  'reply',
  'all',
  'login',
  'sign',
  'single',
  'adx',
  'entry-unrelated'
]
const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')

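// Editor's note: a sketch of the intended check, rejecting a candidate
// next-page link when its text or href trips the extraneous-hint regex.
// The function name and signature are assumptions for illustration.
function couldBeNextPageLink(linkText, href) {
  return !EXTRANEOUS_LINK_HINTS_RE.test(linkText) &&
         !EXTRANEOUS_LINK_HINTS_RE.test(href)
}
// e.g. couldBeNextPageLink('Next Page', '/story?page=2') === true
//      couldBeNextPageLink('Print', '/story?print=true') === false
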
// Placeholder content extractor; the real implementation is being ported
// in extractor/generic/content-extractor.js.
const GenericContentExtractor = {
  parse: (html) => {
    return html
  }
}

const GenericExtractor = {
  parse: (html) => {
    return {
      content: GenericContentExtractor.parse(html)
    }
  }
}

const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
const result = GenericExtractor.parse(html)
console.log(result)
821
extractor/generic/content-extractor.js
Normal file
@@ -0,0 +1,821 @@
import CONSTANTS from './constants'

// Placeholder parse; the Python implementation below is being ported to JS.
const GenericContentExtractor = {
  parse: (html) => {
    return html
  }
}
// The original Python implementation, kept here as a porting reference:
//
// import logging
//
// from copy import deepcopy
// from . import constants
// from utils.dom.attribmap import AttribMap
// import lxml.html
// from lxml.html import builder as E
// from utils.dom import (
//     extract_by_selector,
//     inner_text,
//     link_density,
//     node_to_html,
// )
// from utils.text import (has_sentence_end, normalize_spaces)
// from utils.html import strip_tags
//
// logger = logging.getLogger(__name__)
// # Shortcut
// RE_NAMESPACE = {'re': constants.RE_NS}
//
// class GenericContentExtractor(object):
//     """Article content extraction is a beast. For clarity's sake, it is
//     broken out into its own component.
//     """
//
//     def __init__(self, resource, flags=None, title=None):
//         self.resource = resource
//         self.title = title
//
//         if flags:
//             self.flags = flags
//         else:
//             self.flags = {
//                 "strip_unlikely_candidates": True,
//                 "weight_nodes": True,
//                 "clean_conditionally": True,
//             }
//
//     def extract(self, return_type="html"):
//         """Extract the content for this resource - initially, pass in our
//         most restrictive flags which will return the highest quality
//         content. On each failure, retry with slightly more lax flags.
//
//         :param return_type: string. If "node", should return the content
//             as an LXML node rather than as an HTML string.
//
//         Flags:
//         strip_unlikely_candidates: Remove any elements that match
//             non-article-like criteria first. (Like, does this element
//             have a classname of "comment")
//
//         weight_nodes: Modify an element's score based on whether it has
//             certain classNames or IDs. Examples: Subtract if a node has
//             a className of 'comment', add if a node has an ID of
//             'entry-content'.
//
//         clean_conditionally: Clean the node to return of some
//             superfluous content. Things like forms, ads, etc.
//         """
//         extraction_flags = ['strip_unlikely_candidates', 'weight_nodes', 'clean_conditionally']
//
//         # Cascade through our extraction-specific flags in an ordered fashion,
//         # turning them off as we try to extract content.
//         clean_conditionally = self.flags.get('clean_conditionally', False)
//         node = self.extract_clean_node(
//             self._extract_best_node(),
//             clean_conditionally=clean_conditionally)
//
//         if not self.node_is_sufficient(node):
//             # We didn't succeed on the first pass; one by one, disable our
//             # extraction flags and try again.
//             for flag in extraction_flags:
//                 self.flags[flag] = False
//                 clean_conditionally = self.flags.get(
//                     'clean_conditionally',
//                     False
//                 )
//                 node = self.extract_clean_node(
//                     self._extract_best_node(),
//                     clean_conditionally=clean_conditionally
//                 )
//
//                 # If we found a good node, break out of the flag loop.
//                 if self.node_is_sufficient(node):
//                     break
//
//         # Once we get here, either we're at our last-resort node, or
//         # we broke early. Make sure we at least have -something- before we
//         # move forward.
//         if node is None:
//             return None
//
//         # Remove our scoring information from our content
//         if 'score' in node.attrib:
//             del node.attrib['score']
//         for scored_node in node.xpath('.//*[@score]'):
//             del scored_node.attrib['score']
//
//         if return_type == "html":
//             return normalize_spaces(node_to_html(node))
//         else:
//             return node
//
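// Editor's note: a JS sketch of the cascade described in the docstring
// above. Start with every flag on; after each insufficient result, turn one
// more flag off and retry. extractBestNode, extractCleanNode, and
// nodeIsSufficient are assumed helpers standing in for the Python methods;
// none of them exist in this commit yet.
function extractWithCascade(resource, flags) {
  const extractionFlags = ['stripUnlikelyCandidates', 'weightNodes', 'cleanConditionally']
  let node = extractCleanNode(extractBestNode(resource, flags), flags)

  if (!nodeIsSufficient(node)) {
    for (const flag of extractionFlags) {
      // Loosen one flag at a time, most restrictive first.
      flags[flag] = false
      node = extractCleanNode(extractBestNode(resource, flags), flags)
      if (nodeIsSufficient(node)) break
    }
  }

  // May still be null here; callers must check before serializing.
  return node
}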
//     def node_is_sufficient(self, node):
//         """Given a node, determine if it is article-like enough to return."""
//         return (isinstance(node, lxml.html.HtmlElement) and
//                 len(inner_text(node)) >= 100)
//
//     def _extract_best_node(self):
//         """Using a variety of scoring techniques, extract the content most
//         likely to be article text.
//
//         If strip_unlikely_candidates is True, remove any elements that
//         match certain criteria first. (Like, does this element have a
//         classname of "comment")
//
//         If weight_nodes is True, use classNames and IDs to determine the
//         worthiness of nodes.
//
//         Returns an lxml node.
//         """
//         # deep clone the node so we can get back to our initial parsed state
//         # if needed
//         # TODO: Performance improvements here? Deepcopy is known to be slow.
//         # Can we avoid this somehow?
//         root = deepcopy(self.resource)
//
//         if self.flags['strip_unlikely_candidates']:
//             self._strip_unlikely_candidates(root)
//
//         self._convert_to_paragraphs(root)
//         self._score_content(root, weight_nodes=self.flags['weight_nodes'])
//
//         # print structure(root)
//
//         top_candidate = self._find_top_candidate(root)
//
//         return top_candidate
//
//     def get_weight(self, node):
//         """Get the score of a node based on its className and id."""
//         score = 0
//
//         if node.get('id'):
//             if constants.NEGATIVE_SCORE_RE.search(node.get('id')):
//                 score -= 25
//             if constants.POSITIVE_SCORE_RE.search(node.get('id')):
//                 score += 25
//
//         if node.get('class'):
//             # Only score classes on negative/positive if the ID didn't match.
//             if score == 0:
//                 if constants.NEGATIVE_SCORE_RE.search(node.get('class')):
//                     score -= 25
//                 if constants.POSITIVE_SCORE_RE.search(node.get('class')):
//                     score += 25
//
//             # Try to keep photos if we can.
//             if constants.PHOTO_HINTS_RE.search(node.get('class')):
//                 score += 10
//
//             # Bonus for entry-content-asset, which is explicitly denoted to be
//             # more valuable to Readability in the publisher guidelines.
//             if 'entry-content-asset' in node.get('class'):
//                 score += 25
//
//         return score
//
//     # The removal is implemented as a blacklist and whitelist; this test finds
//     # blacklisted elements that aren't whitelisted. We do this all in one
//     # expression, both because it's only one pass, and because this skips the
//     # serialization for whitelisted nodes.
//     candidates_blacklist = '|'.join(constants.UNLIKELY_CANDIDATES_BLACKLIST)
//     candidates_whitelist = '|'.join(constants.UNLIKELY_CANDIDATES_WHITELIST)
//
//     # Note: Regular expressions appear to be about 3 times as fast as looping
//     # over each key and matching with contains().
//     #
//     # TODO: Consider mapping all classnames and ids to hashes and using set
//     # intersections for performance.
//     candidates_xpath = (
//         './/*['
//         'not(self::a) and '
//         're:test(concat(@id, " ", @class), "%s", "i") and '
//         'not( re:test(concat(@id, " ", @class), "%s", "i"))'
//         ']'
//     ) % (candidates_blacklist, candidates_whitelist)
//
//     def _strip_unlikely_candidates(self, doc):
//         """Loop through the provided document and remove any non-link nodes
//         that are unlikely candidates for article content.
//
//         Links are ignored because there are very often links to content
//         that are identified as non-body-content, but may be inside
//         article-like content.
//
//         :param doc: an LXML doc to strip nodes from
//         :return node: The node itself (even though the conversion happens
//             by reference)
//         """
//         unlikely_candidates = doc.xpath(self.candidates_xpath,
//                                         namespaces=RE_NAMESPACE)
//
//         for node in unlikely_candidates:
//             node.drop_tree()
//
//         return doc
//
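// Editor's note: a JS sketch of the blacklist-minus-whitelist test that
// candidates_xpath encodes above, applied to one element's combined id and
// class string. The hint words below are placeholder assumptions standing in
// for constants.UNLIKELY_CANDIDATES_BLACKLIST/WHITELIST.
const UNLIKELY_RE = new RegExp(['ad-break', 'agegate', 'comment', 'pagination', 'sidebar'].join('|'), 'i')
const LIKELY_RE = new RegExp(['article', 'body', 'column', 'main', 'shadow'].join('|'), 'i')

function isUnlikelyCandidate(node) {
  const matchString = (node.id || '') + ' ' + (node.className || '')
  // Links are never stripped, matching the not(self::a) guard in the xpath.
  return node.tagName !== 'A' &&
         UNLIKELY_RE.test(matchString) &&
         !LIKELY_RE.test(matchString)
}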
//     def _convert_to_paragraphs(self, doc):
//         """Loop through the provided doc, and convert any p-like elements to
//         actual paragraph tags.
//
//         Things fitting these criteria:
//           * Multiple consecutive <br /> tags.
//           * <div /> tags without block-level elements inside of them.
//           * <span /> tags that are not children of <p /> or <div /> tags.
//
//         :param doc: An LXML node to search through.
//         :return: an LXML node of the element, cleaned up.
//             (By-reference mutation, though. Returned just for convenience.)
//         """
//         # Convert every doubled-<br /> to a paragraph tag.
//         self._brs_to_paragraphs(doc)
//
//         # Convert every shallow <div /> to a paragraph tag. Ignore divs that
//         # contain other block level elements.
//         inner_block_tags = './/' + ' or .//'.join(constants.DIV_TO_P_BLOCK_TAGS)
//         shallow_divs = doc.xpath('.//div[not(%s)]' % inner_block_tags)
//
//         for div in shallow_divs:
//             div.tag = 'p'
//
//         # Convert every span tag that has no ancestor p or div tag within its
//         # family tree to a P as well.
//         p_like_spans = doc.xpath('.//span[not(ancestor::p or ancestor::div)]')
//         for span in p_like_spans:
//             span.tag = 'p'
//
//         # If, after all of this, we have no P tags at all, we are probably
//         # dealing with some very ugly content that is separated by single BR
//         # tags. Convert them individually to P tags.
//         if int(doc.xpath('count(//p)')) == 0:
//             self._brs_to_paragraphs(doc, min_consecutive=1)
//
//         # Remove font and center tags, which are ugly and annoying.
//         for fonttag in doc.xpath('.//font | .//center'):
//             fonttag.drop_tag()
//
//         ### DO WE EVEN NEED THIS?? -Chris ###
//
//         # # Due to the way the paras are inserted, the first paragraph does not
//         # # get captured. Since this first para can contain all sorts of random
//         # # junk (links, drop caps, images) it's not easy to regex our way to
//         # # victory so we do it via dom. - Karl G
//         # try:
//         #     first = node.xpath('.//p[@class = "rdb_br"][position() = 1]')[0]
//         # except IndexError:
//         #     pass
//         # else:
//         #     parent = first.getparent()
//         #     breaker = None
//         #     if parent is None:
//         #         parent = node
//         #     para = E.P({'class': 'rdb_br firstp'})
//         #     has_predecessors = False
//         #     for sibling in first.itersiblings(preceding=True):
//         #         has_predecessors = True
//         #         if sibling.tag in ['p', 'div']:
//         #             breaker = sibling
//         #             break
//         #         para.insert(0, sibling)
//         #
//         #     if (not has_predecessors and parent.text is not None and
//         #             parent.text.strip() != ""):
//         #         para.text = parent.text
//         #         parent.text = ''
//         #     else:
//         #         para.text = (para.text or '') + (parent.tail or '')
//         #
//         #     parent.tail = ''
//         #     if breaker is None:
//         #         parent.insert(0, para)
//         #     else:
//         #         parent.insert(parent.index(breaker) + 1, para)
//
//         return doc
//
//     def _brs_to_paragraphs(self, doc, min_consecutive=2):
//         """Given an LXML document, convert consecutive <br /> tags into
//         <p /> tags instead.
//
//         :param doc: An LXML document to convert within.
//         :param min_consecutive: Integer, the minimum number of consecutive
//             <br /> tags that must exist for them to be converted to <p />
//             tags. Must be at least 1.
//
//         A word to the wise: This is deceptively tricky, as break tags
//         don't behave like normal XML should. Make sure you test
//         thoroughly if you make any changes to this code.
//         """
//         brs = doc.xpath('.//br')
//
//         # Loop through all of our break tags, looking for consecutive
//         # <br />s with no content in between them. If found, replace them
//         # with a single P tag.
//         for br in brs:
//             # Generate a list of all the breaks in a row, with no text in
//             # between them.
//             joined_brs = []
//             cur_br = br
//             while True:
//                 joined_brs.append(cur_br)
//
//                 if cur_br.tail:
//                     break
//
//                 next = cur_br.getnext()
//                 next_is_br = next is not None and next.tag.lower() == 'br'
//
//                 if next_is_br:
//                     cur_br = next
//                 else:
//                     break
//
//             if len(joined_brs) < min_consecutive:
//                 continue
//
//             last_br = joined_brs[-1]
//
//             # Now loop through following siblings, until we hit a block
//             # tag or the end, and append them to this P if they are not
//             # block tags (BRs are appended too).
//             self._paragraphize(last_br)
//
//             # Drop every break that we no longer need because of the P.
//             # The last BR has been turned into a P tag.
//             for joined_br in joined_brs:
//                 if joined_br is not last_br:
//                     joined_br.drop_tag()
//
//         # If we had any new p tags that are already inside a P tag, resolve
//         # those by paragraphizing them, which will append their block-level
//         # contents.
//         for fix_count in xrange(1000):
//             # Find the first p that contains another p, and paragraphize it.
//             # We do this in a loop because we're modifying the dom as we go.
//             try:
//                 parent_p = doc.xpath('//p[./p][1]')[0]
//                 self._paragraphize(parent_p)
//             except IndexError:
//                 break
//         else:
//             # We exhausted our loop, which means we've looped too many times
//             # for it to be reasonable. Log a warning.
//             logger.warning("Bailing on p parent fix due to crazy "
//                            "looping for url %s" % self.resource.url)
//
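// Editor's note: a JS sketch of the double-<br /> conversion documented
// above, assuming a browser-style DOM (e.g. via jsdom). It collapses each
// run of at least minConsecutive <br /> tags (separated only by whitespace)
// into a single empty <p>; the inline-sibling adoption that _paragraphize
// performs is omitted for brevity.
function brsToParagraphs(doc, minConsecutive = 2) {
  for (const br of Array.from(doc.querySelectorAll('br'))) {
    if (!br.parentNode) continue // already consumed by an earlier run

    // Gather this <br /> plus any immediately following <br />s, allowing
    // only whitespace text nodes in between.
    const run = [br]
    let sibling = br.nextSibling
    while (sibling) {
      if (sibling.nodeType === 3 && !sibling.textContent.trim()) {
        sibling = sibling.nextSibling
      } else if (sibling.nodeName === 'BR') {
        run.push(sibling)
        sibling = sibling.nextSibling
      } else {
        break
      }
    }

    if (run.length < minConsecutive) continue

    // Replace the whole run with a single paragraph break.
    run[0].parentNode.replaceChild(doc.createElement('p'), run[0])
    run.slice(1).forEach(extra => extra.remove())
  }
}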
//     def _paragraphize(self, node):
//         """Given a node, turn it into a P if it is not already a P, and
//         make sure it conforms to the constraints of a P tag (i.e. does
//         not contain any other block tags).
//
//         If the node is a <br />, it treats the following inline siblings
//         as if they were its children.
//
//         :param node: The node to paragraphize
//         """
//         is_br = (node.tag.lower() == 'br')
//
//         if is_br and node.tail:
//             node.text = node.tail
//             node.tail = None
//
//         node.tag = 'p'
//
//         if is_br:
//             sibling = node.getnext()
//             while True:
//                 if (sibling is None or (
//                         sibling.tag in constants.BLOCK_LEVEL_TAGS and
//                         sibling.tag != 'br'
//                 )):
//                     break
//
//                 next_sibling = sibling.getnext()
//                 node.append(sibling)
//                 sibling = next_sibling
//
//         else:
//             children = node.getchildren()
//             i = 0
//             il = len(children)
//             # Manual index looping so we still have access to the index afterward.
//             while i < il:
//                 child = children[i]
//                 if (child is None or
//                         (child.tag in constants.BLOCK_LEVEL_TAGS and
//                          child.tag != 'br')):
//                     break
//                 i = i + 1
//
//             # This means we encountered a block-level tag within our P,
//             # so we should pop the rest down to siblings.
//             if i < il:
//                 for j in xrange(i, il):
//                     node.addnext(children[j])
//
//     ### --- SCORING --- ###
//
//     def _get_score(self, node, weight_nodes=True):
//         """Get a node's score. If weight_nodes is true, weight classes when
//         getting the score as well.
//         """
//         score = node.get('score')
//         if score is None:
//             score = self._score_node(node)
//             if weight_nodes:
//                 score += self.get_weight(node)
//             parent = node.getparent()
//             if parent is not None:
//                 self._set_score(parent, self._get_score(parent) + .25 * score)
//         else:
//             score = float(score)
//         return score
//
//     def _set_score(self, node, val):
//         """Set the score of a node to val."""
//         return node.set('score', str(val))
//
//     def _add_score(self, node, val, weight_nodes=True):
//         return self._set_score(node, self._get_score(node, weight_nodes) + val)
//
//     def _score_content(self, doc, weight_nodes=True):
//         """Score content. Parents get the full value of their children's
//         content score; grandparents get half.
//         """
//         # First, look for special hNews-based selectors and give them a big
//         # boost, if they exist.
//         for selector in constants.HNEWS_CONTENT_SELECTORS:
//             # Not self.resource.extract_by_selector because our doc is a copy
//             # of the resource doc.
//             nodes = extract_by_selector(doc, selector,
//                                         AttribMap(doc))
//             for node in nodes:
//                 self._add_score(node, 80)
//
//         paras = doc.xpath('.//p | .//pre')
//
//         # If we don't have any paragraphs at all, we can't score based on
//         # paragraphs, so return without modifying anything else.
//         if len(paras) == 0:
//             return doc
//
//         for para in paras:
//             # Don't score invalid tags
//             if not isinstance(para.tag, basestring):
//                 continue
//
//             # The raw score for this paragraph, before we add any parent/child
//             # scores.
//             raw_score = self._score_node(para)
//             self._set_score(para, self._get_score(para, weight_nodes))
//
//             parent = para.getparent()
//             if parent is not None:
//                 if parent.tag == 'span':
//                     parent.tag = 'div'
//
//                 # Add the individual content score to the parent node.
//                 self._add_score(parent, raw_score, weight_nodes=weight_nodes)
//
//                 grandparent = parent.getparent()
//                 if grandparent is not None:
//                     if grandparent.tag == 'span':
//                         grandparent.tag = 'div'
//
//                     # Add half of the individual content score to the
//                     # grandparent.
//                     gp_score = raw_score / 2.0
//                     self._add_score(grandparent, gp_score, weight_nodes=weight_nodes)
//
//         return doc
//
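// Editor's note: a JS sketch of the propagation rule from _score_content
// above: a paragraph's raw score feeds its parent at full value and its
// grandparent at half. getScore/setScore stand in for the score-attribute
// helpers and are assumptions here, as is the DOM-style parentElement.
function propagateScore(para, rawScore) {
  const parent = para.parentElement
  if (!parent) return

  setScore(parent, getScore(parent) + rawScore)

  const grandparent = parent.parentElement
  if (grandparent) {
    setScore(grandparent, getScore(grandparent) + rawScore / 2)
  }
}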
//     def _score_node(self, node):
//         """Score an individual node. Has some smarts for paragraphs; otherwise
//         it currently just scores based on tag.
//         """
//         score = 0
//
//         if node.tag in ['p', 'li', 'span', 'pre']:
//             score += self._score_paragraph(node)
//         if node.tag in ['div']:
//             score += 5
//         elif node.tag in ['pre', 'td', 'blockquote', 'ol', 'ul', 'dl']:
//             score += 3
//         elif node.tag in ['address', 'form']:
//             score -= 3
//         elif node.tag in ['th']:
//             score -= 5
//
//         return score
//
//     def _score_paragraph(self, node):
//         """Score a paragraph using various methods. Things like number of
//         commas, etc. Higher is better.
//         """
//         # Start with a point for the paragraph itself as a base.
//         score = 1
//         text = inner_text(node)
//         text_len = len(text)
//
//         if text_len == 0:
//             if node.getparent() is not None and len(node.getchildren()) == 0:
//                 node.drop_tree()
//             return 0
//
//         # If this paragraph is less than 25 characters, don't count it.
//         if text_len < 25:
//             return 0
//
//         # Add points for any commas within this paragraph.
//         score += text.count(',')
//
//         # For every 50 characters in this paragraph, add another point. Up
//         # to 3 points.
//         chunk_count = (text_len / 50)
//         if chunk_count > 0:
//             length_bonus = 0
//             if node.tag in ('pre', 'p'):
//                 length_bonus += chunk_count - 2
//             else:
//                 length_bonus += chunk_count - 1.25
//             score += min(max(length_bonus, 0), 3)
//
//         # Articles can end with short paragraphs when people are being clever
//         # but they can also end with short paragraphs setting up lists of junk
//         # that we strip. This negative tweaks junk setup paragraphs just below
//         # the cutoff threshold.
//         if text.endswith(':'):
//             score -= 1
//
//         return score
//
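// Editor's note: the same paragraph heuristics as the reference
// _score_paragraph above, sketched in JS over plain text: one base point,
// one per comma, a length bonus capped at 3 points, and a penalty for
// colon-terminated list lead-ins. The pre/p vs. other-tag distinction is
// simplified away here.
function scoreParagraph(text) {
  if (text.length < 25) return 0

  // Base point for the paragraph, plus one point per comma.
  let score = 1 + (text.match(/,/g) || []).length

  // For every 50 characters, another point, up to 3.
  const chunkCount = Math.floor(text.length / 50)
  score += Math.min(Math.max(chunkCount - 2, 0), 3)

  // Colon-terminated paragraphs often just set up lists of junk.
  if (text.endsWith(':')) score -= 1

  return score
}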
//     ### ------- TOP CANDIDATE EXTRACTION ------ ###
//
//     def _find_top_candidate(self, root):
//         # After we've calculated scores, loop through all of the possible
//         # candidate nodes we found and find the one with the highest score.
//         top_candidate = None
//         top_candidate_score = 0
//         # Note: .//* is faster than .//*[@score], believe it or not.
//         for candidate in root.xpath('.//*'):
//
//             if candidate.tag in constants.NON_TOP_CANDIDATE_TAGS:
//                 continue
//
//             candidate_score = self._get_score(candidate)
//             if top_candidate is None or candidate_score > top_candidate_score:
//                 top_candidate = candidate
//                 top_candidate_score = self._get_score(top_candidate)
//
//         # If we still have no candidate, just use the body.
//         if top_candidate is None or len(inner_text(top_candidate)) < 250:
//             to_ret = root.find('body')
//             if to_ret is None:
//                 to_ret = root.xpath('.')[0]
//         elif top_candidate.getparent() is not None:
//             # Now that we have a top_candidate, look through its siblings
//             # to see if any of them are decently scored. If they are, they
//             # may be split parts of the content (like two divs, a preamble and
//             # a body). Example:
//             # http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
//             to_ret = E.DIV()
//             sibling_score_threshold = max(10, top_candidate_score * 0.2)
//             for child in top_candidate.getparent().iterchildren():
//                 if not isinstance(child.tag, basestring):
//                     continue
//
//                 if self._get_score(child):
//                     append = False
//
//                     if child == top_candidate:
//                         to_ret.append(child)
//                         continue
//
//                     density = link_density(child)
//                     content_bonus = 0
//
//                     # If the sibling has a very low link density, give a small
//                     # bonus.
//                     if density < 0.05:
//                         content_bonus += 20
//
//                     # If it's high, give it a penalty.
//                     if density >= 0.5:
//                         content_bonus -= 20
//
//                     # If sibling nodes and the top candidate have the exact same
//                     # className, give a bonus.
//                     if child.get('class', False) == top_candidate.get('class'):
//                         content_bonus += top_candidate_score * 0.2
//
//                     sibling_score = self._get_score(child) + content_bonus
//                     if sibling_score >= sibling_score_threshold:
//                         append = True
//                     elif child.tag == 'p':
//                         child_content = child.text_content()
//                         child_content_len = len(child_content)
//
//                         if child_content_len > 80 and density < 0.25:
//                             append = True
//                         elif (child_content_len <= 80 and density == 0 and
//                                 has_sentence_end(child_content)):
//                             append = True
//
//                     if append:
//                         to_ret.append(child)
//         else:
//             to_ret = top_candidate
//
//         return to_ret
//
//     def extract_clean_node(self, article, clean_conditionally=False):
//         """Clean our article content, returning a new, cleaned node."""
//         doc = deepcopy(article)
//
//         # Rewrite the tag name to div if it's a top level node like body or
//         # html to avoid later complications with multiple body tags.
//         if doc.tag in ['html', 'body']:
//             doc.tag = 'div'
//
//         for img in doc.xpath('.//img'):
//             try:
//                 img_height = int(img.attrib.get('height', 20))
//                 img_width = int(img.attrib.get('width', 20))
//                 if img_height < 10 or img_width < 10:
//                     # Remove images that explicitly have very small heights or
//                     # widths, because they are most likely shims or icons,
//                     # which aren't very useful for reading.
//                     img.drop_tree()
//                 elif 'height' in img.attrib:
//                     # Don't ever specify a height on images, so that we can
//                     # scale with respect to width without screwing up the
//                     # aspect ratio.
//                     del img.attrib['height']
//             except:
//                 pass
//
//         # Drop certain tags like <title>, etc.
//         # This is -mostly- for cleanliness, not security. The lxml Cleaner
//         # method in Resource does most of the security stuff for us.
//         for tag in doc.xpath('.//' + ' | .//'.join(constants.STRIP_OUTPUT_TAGS)):
//             tag.drop_tree()
//
//         # Drop spacer images.
//         spacer_path = './/img[re:test(@src, "trans|transparent|spacer|blank", "i")]'
//         for tag in doc.xpath(spacer_path, namespaces={'re': constants.RE_NS}):
//             tag.drop_tree()
//
//         # H1 tags are typically the article title, which should be extracted
//         # by the title extractor instead. If there are fewer than 3 of them,
//         # strip them. Otherwise, turn 'em into H2s.
//         hOnes = doc.xpath('.//h1')
//         if len(hOnes) < 3:
//             for e in hOnes:
//                 e.drop_tree()
//         else:
//             for e in hOnes:
//                 e.tag = 'h2'
//
//         headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
//         for header in headers:
//             drop_header = False
//
//             # Remove any headers that come before any p tags in the
//             # document. This probably means they were part of the title, a
//             # subtitle or something else extraneous like a datestamp or byline,
//             # all of which should be handled by other metadata handling.
//             no_previous_ps = int(header.xpath("count(preceding::p[1])")) == 0
//             if no_previous_ps:
//                 similar_header_count = int(doc.xpath('count(.//%s)' % header.tag))
//                 if similar_header_count < 3:
//                     drop_header = True
//
//             # Remove any headers that match the title exactly.
//             if inner_text(header) == self.title:
//                 drop_header = True
//
//             # If this header has a negative weight, it's probably junk.
//             # Get rid of it.
//             if self.get_weight(header) < 0:
//                 drop_header = True
//
//             if drop_header:
//                 try:
//                     header.drop_tree()
//                 except AssertionError:
//                     # No parent exists for this node, so just blank it out.
//                     header.text = ''
//
//         for tag in doc.xpath('.//*[@style or @align]'):
//             try:
//                 del tag.attrib['style']
//             except KeyError:
//                 pass
//             try:
//                 del tag.attrib['align']
//             except KeyError:
//                 pass
//
//         for para in doc.xpath('.//p'):
//             # We have a blank tag.
//             if (len(inner_text(para)) < 3 and
//                     len(para.xpath('.//img')) == 0 and
//                     len(para.xpath('.//iframe')) == 0):
//                 para.drop_tree()
//
//         if clean_conditionally:
//             # We used to clean ULs and OLs here, but it was leading to
//             # too many in-article lists being removed. Consider a better
//             # way to detect menus in particular and remove them.
//             self._clean_conditionally(doc, ['ul', 'ol', 'table', 'div'])
//
//         return doc
//
//     def _clean_conditionally(self, doc, tags):
//         """Given a doc, clean it of some superfluous content specified by
//         tags. Things like forms, ads, etc.
//
//         Tags is an array of tag names to search through (like div, form,
//         etc.)
//
//         Return this same doc.
//         """
//         for node in doc.xpath('.//' + ' | .//'.join(tags)):
//
//             node_is_list = node.tag in ('ul', 'ol')
//
//             weight = self._get_score(node)
//             if node.getparent() is None:
//                 continue
//             if weight < 0:
//                 node.drop_tree()
//             else:
//                 node_content = inner_text(node)
//                 if node_content.count(',') < 10:
//                     remove_node = False
//                     p_count = int(node.xpath('count(.//p)'))
//                     img_count = int(node.xpath('count(.//img)'))
//                     input_count = int(node.xpath('count(.//input)'))
//                     script_count = int(node.xpath('count(.//script)'))
//                     density = link_density(node)
//                     content_length = len(inner_text(node))
//
//                     # Looks like a form, too many inputs.
//                     if input_count > (p_count / 3):
//                         remove_node = True
//
//                     # Content is too short, and there are no images, so
//                     # this is probably junk content.
//                     elif content_length < 25 and img_count == 0:
//                         remove_node = True
//
//                     # Too high a link density; this is probably a menu or
//                     # something similar.
//                     elif (weight < 25 and
//                             density > 0.2 and
//                             content_length > 75):
//                         remove_node = True
//
//                     # Too high a link density, despite the score being
//                     # high.
//                     elif weight >= 25 and density > 0.5:
//                         remove_node = True
//                         # Don't remove the node if it's a list and the
//                         # previous sibling ends with a colon, though. That
//                         # means it's probably content.
//                         if node_is_list:
//                             previous_sibling = node.getprevious()
//                             if (previous_sibling is not None and
//                                     inner_text(previous_sibling)[-1:] == ':'):
//                                 remove_node = False
//
//                     # Too many script tags, not enough content.
//                     elif script_count > 0 and len(node_content) < 150:
//                         remove_node = True
//
//                     # Explicitly save entry-content-asset tags, which are
//                     # noted as valuable in the Publisher guidelines. For now
//                     # this works everywhere. We may want to consider making
//                     # this less of a sure-thing later.
//                     if 'entry-content-asset' in node.get('class', ''):
//                         remove_node = False
//
//                     if remove_node:
//                         node.drop_tree()
//         return doc
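// Editor's note: _clean_conditionally leans heavily on link_density() from
// utils.dom. A JS sketch of the usual definition (the share of a node's
// text that lives inside anchors), assuming a DOM-style element:
function linkDensity(node) {
  const textLength = node.textContent.length
  if (textLength === 0) return 0

  let linkLength = 0
  for (const anchor of node.querySelectorAll('a')) {
    linkLength += anchor.textContent.length
  }
  return linkLength / textLength
}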

export default GenericContentExtractor
12
extractor/generic/index.js
Normal file
@@ -0,0 +1,12 @@
import GenericContentExtractor from './content-extractor.js'

const GenericExtractor = {
  parse: (html) => {
    return {
      content: GenericContentExtractor.parse(html)
    }
  }
}

export default GenericExtractor