Refactored file 'scoring.py'

pull/21/head
Mišo Belica 11 years ago
parent dcb7c18fd5
commit 18b5c9b447

@ -11,29 +11,45 @@ from hashlib import md5
from lxml.etree import tostring
from ._py3k import to_bytes
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
'pager|perma|popup|tweet|twitter'), re.IGNORECASE)
CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.IGNORECASE)
CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
'page|pagination|post|text|blog|story'), re.IGNORECASE)
CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
'sidebar|sponsor|shopping|tags|tool|widget'), re.IGNORECASE)
# Searched (case-insensitively) in a node's "class" and "id" attributes by
# is_unlikely_node(); a hit flags the node as unlikely to hold main content.
# NOTE(review): "pagination" is listed here and also in CLS_WEIGHT_POSITIVE
# below - confirm that the overlap is intended.
CLS_UNLIKELY = re.compile(
"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|"
"sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|tweet|"
"twitter",
re.IGNORECASE
)
# Searched in the same "class"/"id" attributes by is_unlikely_node(); a hit
# here overrides a CLS_UNLIKELY hit, so the node is not treated as unlikely.
# NOTE(review): the pattern is applied with `search`, so "and" also matches
# inside longer words (e.g. "brand") - confirm that this is intended.
CLS_MAYBE = re.compile(
"and|article|body|column|main|shadow",
re.IGNORECASE
)
# get_class_weight() adds 25 to a node's weight when its "class" matches this
# pattern and another 25 when its "id" matches.
CLS_WEIGHT_POSITIVE = re.compile(
"article|body|content|entry|hentry|main|page|pagination|post|text|blog|"
"story",
re.IGNORECASE
)
# get_class_weight() subtracts 25 from a node's weight when its "class" matches
# this pattern and another 25 when its "id" matches.
CLS_WEIGHT_NEGATIVE = re.compile(
"combx|comment|com-|contact|foot|footer|footnote|head|masthead|media|meta|"
"outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|"
"widget",
re.IGNORECASE
)
logger = logging.getLogger("breadability")
def check_node_attribute(node, attribute_name, pattern):
    """
    Tells whether the node's attribute matches the given pattern.

    Returns False when the node has no such attribute, otherwise the
    boolean outcome of searching the pattern in the attribute value.
    """
    value = node.get(attribute_name)
    return value is not None and bool(pattern.search(value))
def check_node_attributes(pattern, node, *attributes):
    """
    Checks the listed attributes of the node against the pattern.

    Returns True as soon as the pattern is found in the value of any of
    the named attributes. Attributes the node does not have are skipped.
    Returns False when nothing matches or no attribute names are given.
    """
    # Generator keeps the lookups lazy, so attributes after the first
    # match are never read - same as the early return of a plain loop.
    values = (node.get(name) for name in attributes)
    return any(
        value is not None and pattern.search(value)
        for value in values
    )
def generate_hash_id(node):
"""
@ -74,14 +90,14 @@ def get_class_weight(node):
"""
weight = 0
if check_node_attribute(node, 'class', CLS_WEIGHT_NEGATIVE):
if check_node_attributes(CLS_WEIGHT_NEGATIVE, node, "class"):
weight -= 25
if check_node_attribute(node, 'class', CLS_WEIGHT_POSITIVE):
if check_node_attributes(CLS_WEIGHT_POSITIVE, node, "class"):
weight += 25
if check_node_attribute(node, 'id', CLS_WEIGHT_NEGATIVE):
if check_node_attributes(CLS_WEIGHT_NEGATIVE, node, "id"):
weight -= 25
if check_node_attribute(node, 'id', CLS_WEIGHT_POSITIVE):
if check_node_attributes(CLS_WEIGHT_POSITIVE, node, "id"):
weight += 25
return weight
@ -94,13 +110,10 @@ def is_unlikely_node(node):
If the class or id are in the unlikely list, and there's not also a
class/id in the likely list then it might need to be removed.
"""
unlikely = check_node_attribute(node, 'class', CLS_UNLIKELY) or \
check_node_attribute(node, 'id', CLS_UNLIKELY)
maybe = check_node_attribute(node, 'class', CLS_MAYBE) or \
check_node_attribute(node, 'id', CLS_MAYBE)
unlikely = check_node_attributes(CLS_UNLIKELY, node, "class", "id")
maybe = check_node_attributes(CLS_MAYBE, node, "class", "id")
return bool(unlikely and not maybe and node.tag != 'body')
return bool(unlikely and not maybe and node.tag != "body")
def score_candidates(nodes):
@ -111,62 +124,62 @@ def score_candidates(nodes):
for node in nodes:
logger.debug("Scoring Node")
content_score = 0
# if the node has no parent it knows of, then it ends up creating a
# body and html tag to parent the html fragment.
# if the node has no parent it knows of
# then it ends up creating a body & html tag to parent the html fragment
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text_content()
if parent is None:
logger.debug("Skipping node - parent node is none.")
continue
if parent is None or grand is None:
logger.debug("Skipping candidate because parent/grand are none")
grand = parent.getparent()
if grand is None:
logger.debug("Skipping node - grand parent node is none.")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
# if paragraph is < `MIN_HIT_LENTH` characters don't even count it
inner_text = node.text_content().strip()
if len(inner_text) < MIN_HIT_LENTH:
logger.debug("Skipping candidate because inner text is shorter than %d characters.", MIN_HIT_LENTH)
continue
# Initialize readability data for the parent.
# if the parent node isn't in the candidate list, add it
# initialize readability data for the parent
# add parent node if it isn't in the candidate list
if parent not in candidates:
candidates[parent] = ScoredNode(parent)
if grand not in candidates:
candidates[grand] = ScoredNode(grand)
# Add a point for the paragraph itself as a base.
content_score += 1
# add a point for the paragraph itself as a base
content_score = 1
if innertext:
# Add 0.25 points for any commas within this paragraph
content_score += innertext.count(',') * 0.25
logger.debug("Bonus points for ,: " + str(innertext.count(',')))
if inner_text:
# add 0.25 points for any commas within this paragraph
commas_count = inner_text.count(",")
content_score += commas_count * 0.25
logger.debug("Bonus points for commas: %d", commas_count)
# Subtract 0.5 points for each double quote within this paragraph
content_score += innertext.count('"') * (-0.5)
logger.debug('Penalty points for ": ' + str(innertext.count('"')))
# subtract 0.5 points for each double quote within this paragraph
double_quotes_count = inner_text.count('"')
content_score += double_quotes_count * -0.5
logger.debug("Penalty points for double-quotes: %d", double_quotes_count)
# For every 100 characters in this paragraph, add another point.
# Up to 3 points.
length_points = len(innertext) // 100
# for every 100 characters in this paragraph, add another point
# up to 3 points
length_points = len(inner_text) // 100
content_score += min(length_points, 3)
logger.debug("Length/content points: %r : %r", length_points,
content_score)
logger.debug("Length/content points: %d : %f", length_points, content_score)
# Add the score to the parent.
logger.debug("From this current node.")
# add the score to the parent
candidates[parent].content_score += content_score
logger.debug("Giving parent bonus points: %r", candidates[parent].content_score)
# The grandparent gets half.
logger.debug("Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
logger.debug("Giving grand bonus points: %r", candidates[grand].content_score)
logger.debug("Giving parent bonus points: %f", candidates[parent].content_score)
# the grand node gets half
candidates[grand].content_score += content_score / 2.0
logger.debug("Giving grand bonus points: %f", candidates[grand].content_score)
for candidate in candidates.values():
adjustment = 1 - get_link_density(candidate.node)
logger.debug("Getting link density adjustment: %r * %r",
candidate.content_score, adjustment)
logger.debug("Getting link density adjustment: %f * %f", candidate.content_score, adjustment)
candidate.content_score = candidate.content_score * adjustment
return candidates

@ -15,7 +15,7 @@ except ImportError:
from breadability._py3k import to_unicode
from breadability.readable import Article
from breadability.scoring import check_node_attribute
from breadability.scoring import check_node_attributes
from breadability.scoring import get_class_weight
from breadability.scoring import ScoredNode
from breadability.scoring import score_candidates
@ -61,33 +61,33 @@ class TestCheckNodeAttr(unittest.TestCase):
"""
def test_has_class(self):
"""Verify that a node has a class in our set."""
test_re = re.compile('test1|test2', re.I)
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('class', 'test2 comment')
self.assertTrue(check_node_attribute(test_node, 'class', test_re))
self.assertTrue(check_node_attributes(test_pattern, test_node, 'class'))
def test_has_id(self):
"""Verify that a node has an id in our set."""
test_re = re.compile('test1|test2', re.I)
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test2')
self.assertTrue(check_node_attribute(test_node, 'id', test_re))
self.assertTrue(check_node_attributes(test_pattern, test_node, 'id'))
def test_lacks_class(self):
"""Verify that a node does not have a class in our set."""
test_re = re.compile('test1|test2', re.I)
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('class', 'test4 comment')
self.assertFalse(check_node_attribute(test_node, 'class', test_re))
self.assertFalse(check_node_attributes(test_pattern, test_node, 'class'))
def test_lacks_id(self):
"""Verify that a node does not have an id in our set."""
test_re = re.compile('test1|test2', re.I)
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test4')
self.assertFalse(check_node_attribute(test_node, 'id', test_re))
self.assertFalse(check_node_attributes(test_pattern, test_node, 'id'))
class TestLinkDensity(unittest.TestCase):

Loading…
Cancel
Save