You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
238 lines
7.7 KiB
Python
238 lines
7.7 KiB
Python
"""Handle dealing with scoring nodes and content for our parsing."""
|
|
import re
|
|
from hashlib import md5
|
|
from lxml.etree import tounicode
|
|
|
|
from breadability.logconfig import LNODE
|
|
from breadability.logconfig import LOG
|
|
|
|
# Case-insensitive regular expressions that are searched against a node's
# ``class`` and ``id`` attribute values to help decide whether the node is a
# potential content candidate or not.

# Names hinting that a node is page chrome rather than article content.
CLS_UNLIKELY = re.compile(
    'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
    'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|perma|'
    'popup|tweet|twitter',
    re.IGNORECASE)

# Names that keep an otherwise "unlikely" node in play (see
# is_unlikely_node).
CLS_MAYBE = re.compile(
    'and|article|body|column|main|shadow',
    re.IGNORECASE)

# Names that raise a node's class/id weight (see get_class_weight).
CLS_WEIGHT_POSITIVE = re.compile(
    'article|body|content|entry|hentry|main|page|pagination|post|text|'
    'blog|story',
    re.IGNORECASE)

# Names that lower a node's class/id weight (see get_class_weight).
CLS_WEIGHT_NEGATIVE = re.compile(
    'combx|comment|com-|contact|foot|footer|footnote|head|masthead|media|'
    'meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|'
    'tags|tool|widget',
    re.IGNORECASE)
|
|
|
|
|
|
def check_node_attr(node, attr, checkset):
    """Report whether one of a node's attributes matches a pattern.

    :param node: object exposing ``get(name)``, e.g. an lxml element
    :param attr: name of the attribute to inspect, such as 'class' or 'id'
    :param checkset: compiled regular expression to search the value with
    :returns bool: True when the pattern is found anywhere in the attribute
        value. A missing or empty attribute is searched as "".

    """
    attr_value = node.get(attr) or ""
    return checkset.search(attr_value) is not None
|
|
|
|
|
|
def generate_hash_id(node):
    """Generate a hash_id for the node in question.

    The id is the first 8 hex digits of the MD5 digest of the node's
    serialized markup, encoded as UTF-8.

    :param node: lxml etree node
    :returns str: 8 character hexadecimal string

    """
    content = tounicode(node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception as e:
        # Best effort: a failure is logged rather than raised, and the id
        # then comes from the digest of whatever was hashed so far (nothing).
        LOG.error("BOOM! " + str(e))

    return hashed.hexdigest()[0:8]
|
|
|
|
|
|
def get_link_density(node, node_text=None):
    """Compute how much of a node's text sits inside links.

    :param node: parsed element tree node
    :param node_text: the node's text_content(), if the caller already has
        it, so it does not need to be extracted again.
    :returns float: length of the link text (after the image discount)
        divided by the length of the node's text.

    """
    anchor_chars = sum(len(anchor.text_content())
                       for anchor in node.findall(".//a"))

    # For each img, give 50 bonus chars worth of length.
    # Tweaking this 50 down a notch should help if we hit false positives.
    image_bonus = 50 * len(node.findall(".//img"))
    link_length = max(anchor_chars - image_bonus, 0)

    # A missing or empty node_text falls back to the node's own text.
    text_length = len(node_text if node_text else node.text_content())

    # Guard against dividing by zero for nodes without any text.
    return float(link_length) / max(text_length, 1)
|
|
|
|
|
|
def get_class_weight(node):
    """Get an element's class/id weight.

    The ``class`` and ``id`` attributes are each searched with the negative
    and positive weight patterns. Every negative match subtracts 25 and
    every positive match adds 25, so the result lies between -50 and 50.

    :param node: node whose attributes are inspected
    :returns int: the accumulated weight

    """
    score = 0
    for attr_name in ('class', 'id'):
        if check_node_attr(node, attr_name, CLS_WEIGHT_NEGATIVE):
            score -= 25
        if check_node_attr(node, attr_name, CLS_WEIGHT_POSITIVE):
            score += 25
    return score
|
|
|
|
|
|
def is_unlikely_node(node):
    """Short helper for checking unlikely status.

    A node is unlikely when its class or id matches the unlikely pattern,
    neither of them matches the "maybe" pattern, and the node is not the
    ``body`` tag. Such a node might need to be removed.

    :param node: node to inspect
    :returns bool: True when the node looks like non-content

    """
    attr_names = ('class', 'id')
    flagged = any(check_node_attr(node, name, CLS_UNLIKELY)
                  for name in attr_names)
    rescued = any(check_node_attr(node, name, CLS_MAYBE)
                  for name in attr_names)

    return flagged and not rescued and node.tag != 'body'
|
|
|
|
|
|
def score_candidates(nodes):
    """Given a list of potential nodes, find some initial scores to start.

    Every node that has both a parent and a grandparent, and whose text is
    not shorter than 25 characters, earns a content score built from a base
    point, its commas, its double quotes and its length. The full score is
    added to the node's parent and half of it to the grandparent. Finally
    each candidate's score is scaled by ``1 - link density``.

    :param nodes: iterable of nodes exposing ``getparent()`` and
        ``text_content()``
    :returns dict: maps each parent/grandparent node to its ScoredNode

    """
    MIN_HIT_LENGTH = 25
    candidates = {}

    for node in nodes:
        LNODE.log(node, 1, "Scoring Node")

        content_score = 0
        # if the node has no parent it knows of, then it ends up creating a
        # body and html tag to parent the html fragment.
        parent = node.getparent()
        grand = parent.getparent() if parent is not None else None
        innertext = node.text_content()

        if parent is None or grand is None:
            LNODE.log(
                node, 1,
                "Skipping candidate because parent/grand are none")
            continue

        # If this paragraph is less than 25 characters, don't even count it.
        # NOTE(review): completely empty text is NOT skipped by this test and
        # still earns the base point below - confirm that this is intended.
        if innertext and len(innertext) < MIN_HIT_LENGTH:
            LNODE.log(
                node, 1,
                "Skipping candidate because not enough content.")
            continue

        # Initialize readability data for the parent.
        # if the parent node isn't in the candidate list, add it
        if parent not in candidates:
            candidates[parent] = ScoredNode(parent)

        if grand not in candidates:
            candidates[grand] = ScoredNode(grand)

        # Add a point for the paragraph itself as a base.
        content_score += 1

        if innertext:
            # Add 0.25 points for any commas within this paragraph
            comma_count = innertext.count(',')
            content_score += comma_count * 0.25
            LNODE.log(node, 1,
                "Bonus points for ,: " + str(comma_count))

            # Subtract 0.5 points for each double quote within this paragraph
            quote_count = innertext.count('"')
            content_score += quote_count * (-0.5)
            LNODE.log(node, 1,
                'Penalty points for ": ' + str(quote_count))

            # For every 100 characters in this paragraph, add another point.
            # Up to 3 points. Floor division keeps this a whole number of
            # points on Python 3 as well as on Python 2.
            length_points = len(innertext) // 100
            content_score += min(length_points, 3)
            LNODE.log(
                node, 1,
                "Length/content points: {0} : {1}".format(length_points,
                                                          content_score))

        # Add the score to the parent.
        LNODE.log(node, 1, "From this current node.")
        candidates[parent].content_score += content_score
        LNODE.log(
            candidates[parent].node,
            1,
            "Giving parent bonus points: " + str(
                candidates[parent].content_score))

        # The grandparent gets half.
        LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
        candidates[grand].content_score += (content_score / 2.0)
        # Log against the grandparent: it is the node that got these points.
        LNODE.log(
            candidates[grand].node,
            1,
            "Giving grand bonus points: " + str(
                candidates[grand].content_score))

    # Scale every candidate down by the share of its text that is links.
    for candidate in candidates.values():
        adjustment = 1 - get_link_density(candidate.node)
        LNODE.log(
            candidate.node,
            1,
            "Getting link density adjustment: {0} * {1} ".format(
                candidate.content_score, adjustment))
        candidate.content_score = candidate.content_score * (adjustment)

    return candidates
|
|
|
|
|
|
class ScoredNode(object):
    """A node paired with the content score used to rank article matches.

    We might have a bunch of these so we use __slots__ to keep memory usage
    down.

    """
    __slots__ = ['node', 'content_score']

    def __init__(self, node):
        """Given node, set an initial score and weigh based on css and id

        The starting score depends on the node's tag and is then adjusted by
        the class/id weight of the node.

        :param node: node to wrap and score

        """
        self.node = node

        tag = node.tag
        if tag in ('div', 'article'):
            base_score = 5
        elif tag in ('pre', 'td', 'blockquote'):
            base_score = 3
        elif tag in ('address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li', 'form'):
            base_score = -3
        elif tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            base_score = -5
        else:
            base_score = 0

        self.content_score = base_score + get_class_weight(node)

    def __repr__(self):
        """Helpful representation of our Scored Node"""
        template = "{0}: {1:0.1F}\t{2}"
        return template.format(self.hash_id, self.content_score, self.node)

    @property
    def hash_id(self):
        """Short hash identifying the wrapped node, recomputed per access."""
        return generate_hash_id(self.node)
|