breadability/scoring.bak.py

"""Handle dealing with scoring nodes and content for our parsing."""
import re
from hashlib import md5
from lxml.etree import tounicode
from breadability.logconfig import LNODE
from breadability.logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
'pager|perma|popup|tweet|twitter'), re.I)
CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
'page|pagination|post|text|blog|story'), re.I)
CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
'sidebar|sponsor|shopping|tags|tool|widget'), re.I)


def check_node_attr(node, attr, checkset):
    """Check whether the node's attribute matches the given regex checkset."""
    value = node.get(attr) or ""
    if checkset.search(value):
        return True
    else:
        return False


def generate_hash_id(node):
    """Generate a hash_id for the node in question.

    :param node: lxml etree node

    """
    content = tounicode(node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception as e:
        LOG.error("BOOM! " + str(e))
    return hashed.hexdigest()[0:8]


def get_link_density(node, node_text=None):
    """Generate a value for the ratio of link text in the node.

    :param node: parsed elementtree node
    :param node_text: if we already have the text_content() make this easier
        on us.
    :returns float:

    """
    link_length = sum([len(a.text_content()) or 0
                       for a in node.findall(".//a")])
    # For each img, give 50 bonus chars worth of length.
    # Tweaking this 50 down a notch should help if we hit false positives.
    link_length = max(link_length -
                      sum([50 for img in node.findall(".//img")]), 0)
    if node_text:
        text_length = len(node_text)
    else:
        text_length = len(node.text_content())
    return float(link_length) / max(text_length, 1)
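
# Worked example (illustrative, not in the original file): a node whose text
# is 200 characters long, with 100 of those characters inside <a> tags and a
# single <img>, gets a density of max(100 - 50, 0) / max(200, 1) == 0.25.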


def get_class_weight(node):
    """Get an element's class/id weight.

    We're using regex matches against the class and id attributes to check
    for positive and negative hints.

    """
    weight = 0
    if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):
        weight = weight - 25
    if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):
        weight = weight + 25
    if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):
        weight = weight - 25
    if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):
        weight = weight + 25
    return weight
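
# Worked example (illustrative, not in the original file): a node such as
# <div class="article" id="comments"> picks up +25 for the positive class
# match and -25 for the negative id match, for a net weight of 0.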


def is_unlikely_node(node):
    """Short helper for checking unlikely status.

    If the class or id is in the unlikely list, and there's not also a
    class/id in the likely list, then it might need to be removed.

    """
    unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \
        check_node_attr(node, 'id', CLS_UNLIKELY)
    maybe = check_node_attr(node, 'class', CLS_MAYBE) or \
        check_node_attr(node, 'id', CLS_MAYBE)

    if unlikely and not maybe and node.tag != 'body':
        return True
    else:
        return False
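
# Worked example (illustrative, not in the original file): a <div class="sidebar">
# matches the unlikely set and nothing in the maybe set, so it is flagged;
# <div class="sidebar main"> also matches "main" and is therefore kept.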


def score_candidates(nodes):
    """Given a list of potential nodes, find some initial scores to start."""
    MIN_HIT_LENGTH = 25
    candidates = {}

    for node in nodes:
        LNODE.log(node, 1, "Scoring Node")
        content_score = 0

        # If the node has no parent it knows of, then lxml ends up creating a
        # body and html tag to parent the html fragment.
        parent = node.getparent()
        grand = parent.getparent() if parent is not None else None
        innertext = node.text_content()

        if parent is None or grand is None:
            LNODE.log(
                node, 1,
                "Skipping candidate because parent/grand are none")
            continue

        # If this paragraph is less than 25 characters, don't even count it.
        if innertext and len(innertext) < MIN_HIT_LENGTH:
            LNODE.log(
                node, 1,
                "Skipping candidate because not enough content.")
            continue

        # Initialize readability data for the parent.
        # If the parent node isn't in the candidate list, add it.
        if parent not in candidates:
            candidates[parent] = ScoredNode(parent)

        if grand not in candidates:
            candidates[grand] = ScoredNode(grand)

        # Add a point for the paragraph itself as a base.
        content_score += 1

        if innertext:
            # Add 0.25 points for any commas within this paragraph.
            content_score += innertext.count(',') * 0.25
            LNODE.log(node, 1,
                      "Bonus points for ,: " + str(innertext.count(',')))

            # Subtract 0.5 points for each double quote within this paragraph.
            content_score += innertext.count('"') * (-0.5)
            LNODE.log(node, 1,
                      'Penalty points for ": ' + str(innertext.count('"')))

            # For every 100 characters in this paragraph, add another point.
            # Up to 3 points.
            length_points = len(innertext) / 100
            if length_points > 3:
                content_score += 3
            else:
                content_score += length_points
            LNODE.log(
                node, 1,
                "Length/content points: {0} : {1}".format(length_points,
                                                          content_score))

        # Add the score to the parent.
        LNODE.log(node, 1, "From this current node.")
        candidates[parent].content_score += content_score
        LNODE.log(
            candidates[parent].node,
            1,
            "Giving parent bonus points: " + str(
                candidates[parent].content_score))

        # The grandparent gets half.
        LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
        candidates[grand].content_score += (content_score / 2.0)
        LNODE.log(
            candidates[grand].node,
            1,
            "Giving grand bonus points: " + str(
                candidates[grand].content_score))

    # Scale each candidate down by how link-heavy it is.
    for candidate in candidates.values():
        adjustment = 1 - get_link_density(candidate.node)
        LNODE.log(
            candidate.node,
            1,
            "Getting link density adjustment: {0} * {1} ".format(
                candidate.content_score, adjustment))
        candidate.content_score = candidate.content_score * (adjustment)

    return candidates
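
# Worked example (illustrative, not in the original file): a 250-character
# paragraph containing two commas and no quotes contributes
# 1 (base) + 2 * 0.25 (commas) + 2 (length points, 250 // 100) = 3.5 points
# to its parent and half of that (1.75) to its grandparent, before the
# link-density adjustment above is applied.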


class ScoredNode(object):
    """A scored node we use to track possible article matches.

    We might have a bunch of these so we use __slots__ to keep memory usage
    down.

    """
    __slots__ = ['node', 'content_score']

    def __repr__(self):
        """Helpful representation of our Scored Node."""
        return "{0}: {1:0.1F}\t{2}".format(
            self.hash_id,
            self.content_score,
            self.node)

    def __init__(self, node):
        """Given a node, set an initial score and weigh based on css and id."""
        self.node = node
        content_score = 0

        if node.tag in ['div', 'article']:
            content_score = 5
        if node.tag in ['pre', 'td', 'blockquote']:
            content_score = 3
        if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
                        'form']:
            content_score = -3
        if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
            content_score = -5

        content_score += get_class_weight(node)
        self.content_score = content_score

    @property
    def hash_id(self):
        return generate_hash_id(self.node)
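

# Illustrative usage sketch (not part of the original module): score the
# paragraphs of a tiny document and print the resulting candidates. This
# assumes lxml and the module's breadability.logconfig dependency are
# importable; the markup below is invented purely for the demo.
if __name__ == '__main__':
    from lxml.html import document_fromstring

    demo_html = (
        '<html><body>'
        '<div class="article"><div class="content">'
        '<p>' + 'Real article text, long enough to be counted. ' * 3 + '</p>'
        '<p>Another paragraph, with commas, to earn bonus points.</p>'
        '</div></div>'
        '<div class="sidebar"><p><a href="#">related link</a></p></div>'
        '</body></html>')
    doc = document_fromstring(demo_html)

    # Paragraph nodes are the usual scoring inputs; their parents and
    # grandparents accumulate the scores.
    for scored in score_candidates(doc.findall('.//p')).values():
        print(scored)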