# -*- coding: utf8 -*-

"""Scoring of nodes and content for our parsing."""

from __future__ import absolute_import
from __future__ import division, print_function

import re
import logging

from hashlib import md5
from lxml.etree import tostring
from ._compat import to_bytes
from .utils import normalize_whitespace

# A series of regular expressions matched against class/id attributes to
# help determine whether a node is a potential candidate or not.
CLS_UNLIKELY = re.compile(
    "combx|comment|community|disqus|extra|foot|header|menu|remark|rss|"
    "shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|"
    "tweet|twitter|social|breadcrumb",
    re.IGNORECASE
)
CLS_MAYBE = re.compile(
    "and|article|body|column|main|shadow|entry",
    re.IGNORECASE
)
CLS_WEIGHT_POSITIVE = re.compile(
    "article|body|content|entry|main|page|pagination|post|text|blog|story",
    re.IGNORECASE
)
CLS_WEIGHT_NEGATIVE = re.compile(
    "combx|comment|com-|contact|foot|footer|footnote|head|masthead|media|meta|"
    "outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|"
    "tool|widget",
    re.IGNORECASE
)

logger = logging.getLogger("breadability")


def check_node_attributes(pattern, node, *attributes):
    """
    Searches the given attributes of the node for a match against the
    pattern. Returns True as soon as any attribute matches.
    """
    for attribute_name in attributes:
        attribute = node.get(attribute_name)
        if attribute is not None and pattern.search(attribute):
            return True

    return False


def generate_hash_id(node):
    """
    Generates a hash_id for the node in question.

    :param node: lxml etree node
    """
    try:
        content = tostring(node)
    except Exception:
        logger.exception("Hash generation failed")
        # Fall back to the node's repr, which is at least stable for the
        # lifetime of the node.
        content = to_bytes(repr(node))

    hash_id = md5(content).hexdigest()
    return hash_id[:8]


def get_link_density(node, node_text=None):
    """
    Computes the ratio of the text in links contained in the node to the
    text of the node itself. It is computed from the number of characters
    in each.

    :parameter Element node: HTML element in which link density is computed.
    :parameter string node_text: Text content of the given node, if it was
        obtained before.
    :returns float: Returns a value 0 <= density <= 1, where 0 means
        no links and 1 means the node contains only links.
    """
    if node_text is None:
        node_text = node.text_content()
    node_text = normalize_whitespace(node_text.strip())

    text_length = len(node_text)
    if text_length == 0:
        return 0.0

    links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
    # Give 50 bonus chars worth of length for each img.
    # Tweaking this 50 down a notch should help if we hit false positives.
    img_bonuses = 50 * len(node.findall(".//img"))
    links_length = max(0, links_length - img_bonuses)

    return links_length / text_length


def _get_normalized_text_length(node):
    return len(normalize_whitespace(node.text_content().strip()))


def get_class_weight(node):
    """
    Computes the weight of an element according to its class/id, using the
    positive/negative regular expressions defined above.
    """
    weight = 0
    if check_node_attributes(CLS_WEIGHT_NEGATIVE, node, "class"):
        weight -= 25
    if check_node_attributes(CLS_WEIGHT_POSITIVE, node, "class"):
        weight += 25

    if check_node_attributes(CLS_WEIGHT_NEGATIVE, node, "id"):
        weight -= 25
    if check_node_attributes(CLS_WEIGHT_POSITIVE, node, "id"):
        weight += 25

    return weight


def is_unlikely_node(node):
    """
    Short helper for checking unlikely status.

    If the class or id matches the unlikely pattern, and there's no
    class/id matching the maybe pattern, the node is a removal candidate.
    """
    unlikely = check_node_attributes(CLS_UNLIKELY, node, "class", "id")
    maybe = check_node_attributes(CLS_MAYBE, node, "class", "id")

    return bool(unlikely and not maybe and node.tag != "body")
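
# Example (a hedged sketch for illustration only, not part of the library
# API): a node whose class hits CLS_UNLIKELY with no CLS_MAYBE counterweight
# is flagged, while get_link_density quantifies how link-heavy a node is.
#
#     from lxml.html import fragment_fromstring
#     sidebar = fragment_fromstring("<div class='sidebar'>links...</div>")
#     is_unlikely_node(sidebar)   # True: "sidebar" matches CLS_UNLIKELY
#
#     node = fragment_fromstring(
#         "<div>plain text <a href='#'>linked text</a></div>")
#     get_link_density(node)      # 11 link chars / 22 chars total == 0.5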
""" unlikely = check_node_attributes(CLS_UNLIKELY, node, "class", "id") maybe = check_node_attributes(CLS_MAYBE, node, "class", "id") return bool(unlikely and not maybe and node.tag != "body") def score_candidates(nodes): """Given a list of potential nodes, find some initial scores to start""" MIN_HIT_LENTH = 25 candidates = {} for node in nodes: logger.debug("* Scoring candidate %s %r", node.tag, node.attrib) # if the node has no parent it knows of then it ends up creating a # body & html tag to parent the html fragment parent = node.getparent() if parent is None: logger.debug("Skipping candidate - parent node is 'None'.") continue grand = parent.getparent() if grand is None: logger.debug("Skipping candidate - grand parent node is 'None'.") continue # if paragraph is < `MIN_HIT_LENTH` characters don't even count it inner_text = node.text_content().strip() if len(inner_text) < MIN_HIT_LENTH: logger.debug( "Skipping candidate - inner text < %d characters.", MIN_HIT_LENTH) continue # initialize readability data for the parent # add parent node if it isn't in the candidate list if parent not in candidates: candidates[parent] = ScoredNode(parent) if grand not in candidates: candidates[grand] = ScoredNode(grand) # add a point for the paragraph itself as a base content_score = 1 if inner_text: # add 0.25 points for any commas within this paragraph commas_count = inner_text.count(",") content_score += commas_count * 0.25 logger.debug("Bonus points for %d commas.", commas_count) # subtract 0.5 points for each double quote within this paragraph double_quotes_count = inner_text.count('"') content_score += double_quotes_count * -0.5 logger.debug( "Penalty points for %d double-quotes.", double_quotes_count) # for every 100 characters in this paragraph, add another point # up to 3 points length_points = len(inner_text) / 100 content_score += min(length_points, 3.0) logger.debug("Bonus points for length of text: %f", length_points) # add the score to the parent logger.debug( "Bonus points for parent %s %r with score %f: %f", parent.tag, parent.attrib, candidates[parent].content_score, content_score) candidates[parent].content_score += content_score # the grand node gets half logger.debug( "Bonus points for grand %s %r with score %f: %f", grand.tag, grand.attrib, candidates[grand].content_score, content_score / 2.0) candidates[grand].content_score += content_score / 2.0 if node not in candidates: candidates[node] = ScoredNode(node) candidates[node].content_score += content_score for candidate in candidates.values(): adjustment = 1.0 - get_link_density(candidate.node) candidate.content_score *= adjustment logger.debug( "Link density adjustment for %s %r: %f", candidate.node.tag, candidate.node.attrib, adjustment) return candidates class ScoredNode(object): """ We need Scored nodes we use to track possible article matches We might have a bunch of these so we use __slots__ to keep memory usage down. 
""" __slots__ = ('node', 'content_score') def __init__(self, node): """Given node, set an initial score and weigh based on css and id""" self.node = node self.content_score = 0 if node.tag in ('div', 'article'): self.content_score = 5 if node.tag in ('pre', 'td', 'blockquote'): self.content_score = 3 if node.tag in ('address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li', 'form'): self.content_score = -3 if node.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): self.content_score = -5 self.content_score += get_class_weight(node) @property def hash_id(self): return generate_hash_id(self.node) def __repr__(self): if self.node is None: return "" % self.content_score return "".format( self.node.tag, self.node.attrib, self.content_score )