Refactored file 'scoring.py'

11 years ago · 18b5c9b447
parent dcb7c18fd5
commit 18b5c9b447
2 changed files with 82 additions and 69 deletions
--- a/breadability/scoring.py
+++ b/breadability/scoring.py
@ -11,29 +11,45 @@ from hashlib import md5
 from lxml.etree import tostring
 from ._py3k import to_bytes

+
 # A series of sets of attributes we check to help in determining if a node is
 # a potential candidate or not.
-CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
-    'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
-    'pager|perma|popup|tweet|twitter'), re.IGNORECASE)
-CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.IGNORECASE)
-CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
-    'page|pagination|post|text|blog|story'), re.IGNORECASE)
-CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
-    'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
-    'sidebar|sponsor|shopping|tags|tool|widget'), re.IGNORECASE)
+CLS_UNLIKELY = re.compile(
+    "combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|"
+    "sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|tweet|"
+    "twitter",
+    re.IGNORECASE
+)
+CLS_MAYBE = re.compile(
+    "and|article|body|column|main|shadow",
+    re.IGNORECASE
+)
+CLS_WEIGHT_POSITIVE = re.compile(
+    "article|body|content|entry|hentry|main|page|pagination|post|text|blog|"
+    "story",
+    re.IGNORECASE
+)
+CLS_WEIGHT_NEGATIVE = re.compile(
+    "combx|comment|com-|contact|foot|footer|footnote|head|masthead|media|meta|"
+    "outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|"
+    "widget",
+    re.IGNORECASE
+)

 logger = logging.getLogger("breadability")


-def check_node_attribute(node, attribute_name, pattern):
-    attribute = node.get(attribute_name)
-
-    if attribute is None:
-        return False
-    else:
-        return bool(pattern.search(attribute))
+def check_node_attributes(pattern, node, *attributes):
+    """
+    Searches match in attributes against given pattern and if
+    finds the match against any of them returns True.
+    """
+    for attribute_name in attributes:
+        attribute = node.get(attribute_name)
+        if attribute is not None and pattern.search(attribute):
+            return True

+    return False

 def generate_hash_id(node):
    """
@ -74,14 +90,14 @@ def get_class_weight(node):
    """
    weight = 0

-    if check_node_attribute(node, 'class', CLS_WEIGHT_NEGATIVE):
+    if check_node_attributes(CLS_WEIGHT_NEGATIVE, node, "class"):
        weight -= 25
-    if check_node_attribute(node, 'class', CLS_WEIGHT_POSITIVE):
+    if check_node_attributes(CLS_WEIGHT_POSITIVE, node, "class"):
        weight += 25

-    if check_node_attribute(node, 'id', CLS_WEIGHT_NEGATIVE):
+    if check_node_attributes(CLS_WEIGHT_NEGATIVE, node, "id"):
        weight -= 25
-    if check_node_attribute(node, 'id', CLS_WEIGHT_POSITIVE):
+    if check_node_attributes(CLS_WEIGHT_POSITIVE, node, "id"):
        weight += 25

    return weight
@ -94,13 +110,10 @@ def is_unlikely_node(node):
    If the class or id are in the unlikely list, and there's not also a
    class/id in the likely list then it might need to be removed.
    """
-    unlikely = check_node_attribute(node, 'class', CLS_UNLIKELY) or \
-        check_node_attribute(node, 'id', CLS_UNLIKELY)
-
-    maybe = check_node_attribute(node, 'class', CLS_MAYBE) or \
-        check_node_attribute(node, 'id', CLS_MAYBE)
+    unlikely = check_node_attributes(CLS_UNLIKELY, node, "class", "id")
+    maybe = check_node_attributes(CLS_MAYBE, node, "class", "id")

-    return bool(unlikely and not maybe and node.tag != 'body')
+    return bool(unlikely and not maybe and node.tag != "body")


 def score_candidates(nodes):
@ -111,62 +124,62 @@ def score_candidates(nodes):
    for node in nodes:
        logger.debug("Scoring Node")

-        content_score = 0
-        # if the node has no parent it knows of, then it ends up creating a
-        # body and html tag to parent the html fragment.
+        # if the node has no parent it knows of
+        # then it ends up creating a body & html tag to parent the html fragment
        parent = node.getparent()
-        grand = parent.getparent() if parent is not None else None
-        innertext = node.text_content()
+        if parent is None:
+            logger.debug("Skipping node - parent node is none.")
+            continue

-        if parent is None or grand is None:
-            logger.debug("Skipping candidate because parent/grand are none")
+        grand = parent.getparent()
+        if grand is None:
+            logger.debug("Skipping node - grand parent node is none.")
            continue

-        # If this paragraph is less than 25 characters, don't even count it.
-        if innertext and len(innertext) < MIN_HIT_LENTH:
+        # if paragraph is < `MIN_HIT_LENTH` characters don't even count it
+        inner_text = node.text_content().strip()
+        if len(inner_text) < MIN_HIT_LENTH:
            logger.debug("Skipping candidate because inner text is shorter than %d characters.", MIN_HIT_LENTH)
            continue

-        # Initialize readability data for the parent.
-        # if the parent node isn't in the candidate list, add it
+        # initialize readability data for the parent
+        # add parent node if it isn't in the candidate list
        if parent not in candidates:
            candidates[parent] = ScoredNode(parent)

        if grand not in candidates:
            candidates[grand] = ScoredNode(grand)

-        # Add a point for the paragraph itself as a base.
-        content_score += 1
+        # add a point for the paragraph itself as a base
+        content_score = 1

-        if innertext:
-            # Add 0.25 points for any commas within this paragraph
-            content_score += innertext.count(',') * 0.25
-            logger.debug("Bonus points for ,: " + str(innertext.count(',')))
+        if inner_text:
+            # add 0.25 points for any commas within this paragraph
+            commas_count = inner_text.count(",")
+            content_score += commas_count * 0.25
+            logger.debug("Bonus points for commas: %d", commas_count)

-            # Subtract 0.5 points for each double quote within this paragraph
-            content_score += innertext.count('"') * (-0.5)
-            logger.debug('Penalty points for ": ' + str(innertext.count('"')))
+            # subtract 0.5 points for each double quote within this paragraph
+            double_quotes_count = inner_text.count('"')
+            content_score += double_quotes_count * -0.5
+            logger.debug("Penalty points for double-quotes: %d", double_quotes_count)

-            # For every 100 characters in this paragraph, add another point.
-            # Up to 3 points.
-            length_points = len(innertext) // 100
+            # for every 100 characters in this paragraph, add another point
+            # up to 3 points
+            length_points = len(inner_text) // 100
            content_score += min(length_points, 3)
-            logger.debug("Length/content points: %r : %r", length_points,
-                content_score)
+            logger.debug("Length/content points: %d : %f", length_points, content_score)

-        # Add the score to the parent.
-        logger.debug("From this current node.")
+        # add the score to the parent
        candidates[parent].content_score += content_score
-        logger.debug("Giving parent bonus points: %r", candidates[parent].content_score)
-        # The grandparent gets half.
-        logger.debug("Giving grand bonus points")
-        candidates[grand].content_score += (content_score / 2.0)
-        logger.debug("Giving grand bonus points: %r", candidates[grand].content_score)
+        logger.debug("Giving parent bonus points: %f", candidates[parent].content_score)
+        # the grand node gets half
+        candidates[grand].content_score += content_score / 2.0
+        logger.debug("Giving grand bonus points: %f", candidates[grand].content_score)

    for candidate in candidates.values():
        adjustment = 1 - get_link_density(candidate.node)
-        logger.debug("Getting link density adjustment: %r * %r",
-            candidate.content_score, adjustment)
+        logger.debug("Getting link density adjustment: %f * %f", candidate.content_score, adjustment)
        candidate.content_score = candidate.content_score * adjustment

    return candidates
--- a/tests/test_scoring.py
+++ b/tests/test_scoring.py
@ -15,7 +15,7 @@ except ImportError:

 from breadability._py3k import to_unicode
 from breadability.readable import Article
-from breadability.scoring import check_node_attribute
+from breadability.scoring import check_node_attributes
 from breadability.scoring import get_class_weight
 from breadability.scoring import ScoredNode
 from breadability.scoring import score_candidates
@ -61,33 +61,33 @@ class TestCheckNodeAttr(unittest.TestCase):
    """
    def test_has_class(self):
        """Verify that a node has a class in our set."""
-        test_re = re.compile('test1|test2', re.I)
+        test_pattern = re.compile('test1|test2', re.I)
        test_node = fragment_fromstring('<div/>')
        test_node.set('class', 'test2 comment')

-        self.assertTrue(check_node_attribute(test_node, 'class', test_re))
+        self.assertTrue(check_node_attributes(test_pattern, test_node, 'class'))

    def test_has_id(self):
        """Verify that a node has an id in our set."""
-        test_re = re.compile('test1|test2', re.I)
+        test_pattern = re.compile('test1|test2', re.I)
        test_node = fragment_fromstring('<div/>')
        test_node.set('id', 'test2')

-        self.assertTrue(check_node_attribute(test_node, 'id', test_re))
+        self.assertTrue(check_node_attributes(test_pattern, test_node, 'id'))

    def test_lacks_class(self):
        """Verify that a node does not have a class in our set."""
-        test_re = re.compile('test1|test2', re.I)
+        test_pattern = re.compile('test1|test2', re.I)
        test_node = fragment_fromstring('<div/>')
        test_node.set('class', 'test4 comment')
-        self.assertFalse(check_node_attribute(test_node, 'class', test_re))
+        self.assertFalse(check_node_attributes(test_pattern, test_node, 'class'))

    def test_lacks_id(self):
        """Verify that a node does not have an id in our set."""
-        test_re = re.compile('test1|test2', re.I)
+        test_pattern = re.compile('test1|test2', re.I)
        test_node = fragment_fromstring('<div/>')
        test_node.set('id', 'test4')
-        self.assertFalse(check_node_attribute(test_node, 'id', test_re))
+        self.assertFalse(check_node_attributes(test_pattern, test_node, 'id'))


 class TestLinkDensity(unittest.TestCase):