Update to add link density scoring adjustments, prep for sibling checks

12 years ago · 5b3ef916ef
parent e843940549
commit 5b3ef916ef
2 changed files with 98 additions and 37 deletions
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -6,19 +6,16 @@ from lxml.html import fragment_fromstring
 from breadability.document import OriginalDocument
 from breadability.utils import cached_property

-
-RegexList = namedtuple('RegexList', ['unlikely', 'maybe'])
-
-
-READABLERE = RegexList(
-    unlikely=(re.compile(
-        'combx|comment|community|disqus|extra|foot|header|menu|'
-        'remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination'
-        '|pager|popup|tweet|twitter', re.I)),
-    maybe=(re.compile('and|article|body|column|main|shadow', re.I)),
-)
-
-
+# A series of sets of attributes we check to help in determining if a node is
+# a potential candidate or not.
+CLS_UNLIKELY = set([
+    'combx', 'comment', 'community', 'disqus', 'extra', 'foot', 'header',
+    'menu', '' 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
+    'agegate', 'pagination' '', 'pager', 'popup', 'tweet', 'twitter',
+])
+CLS_MAYBE = set([
+    'and', 'article', 'body', 'column', 'main', 'shadow',
+])
 CLS_WEIGHT_POSITIVE = set(['article', 'body', 'content', 'entry', 'hentry',
    'main', 'page', 'pagination', 'post', 'text', 'blog', 'story'])
 CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
@ -27,6 +24,15 @@ CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
    'tool', 'widget'])


+def check_node_attr(node, attr, checkset):
+    attr = node.get(attr) or ""
+    check = set(attr.lower().split(' '))
+    if check.intersection(checkset):
+        return True
+    else:
+        return False
+
+
 def drop_tag(doc, *tags):
    """Helper to just remove any nodes that match this html tag passed in

@ -83,6 +89,18 @@ def transform_misused_divs_into_paragraphs(doc):
    return doc


+def get_link_density(node):
+    """Return the ratio of link text length to total text length.
+
+    :param node: parsed ElementTree node
+    :returns float:
+
+    """
+    link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
+    text_length = len(node.text_content())
+    return float(link_length) / max(text_length, 1)
+
+
 ###### SCORING


@ -93,19 +111,15 @@ def get_class_weight(node):

    """
    weight = 0
-    cls = set(node.get('class', default="").split(' '))
-    ids = node.get('id', default="None")
-    if cls:
-        if cls.intersection(CLS_WEIGHT_NEGATIVE):
-            weight = weight - 25
-        if cls.intersection(CLS_WEIGHT_POSITIVE):
-            weight = weight + 25
-
-    if ids:
-        if ids in CLS_WEIGHT_NEGATIVE:
-            weight = weight - 25
-        if ids in CLS_WEIGHT_POSITIVE:
-            weight = weight + 25
+    if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):
+        weight = weight - 25
+    if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):
+        weight = weight + 25
+
+    if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):
+        weight = weight - 25
+    if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):
+        weight = weight + 25

    return weight

@ -153,6 +167,10 @@ def score_candidates(nodes):
        if grand is not None:
            candidates[grand].content_score += content_score

+        for candidate in candidates.values():
+            candidate.content_score = candidate.content_score * (1 -
+                    get_link_density(candidate.node))
+
    return candidates


@ -167,16 +185,25 @@ def process(doc):
    scorable_node_tags = ['p', 'td', 'pre']
    nodes_to_score = []

-    def is_unlikely_node(n):
-        """Short helper for checking unlikely status."""
-        if READABLERE.unlikely.match(nodeid):
-            if not READABLERE.maybe.match(nodeid):
-                if node.tag != "body":
-                    return True
+    def is_unlikely_node(node):
+        """Short helper for checking unlikely status.
+
+        If the class or id is in the unlikely list, and there is not also a
+        class/id in the maybe list, then the node might need to be removed.
+
+        """
+        unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \
+            check_node_attr(node, 'id', CLS_UNLIKELY)
+
+        maybe = check_node_attr(node, 'class', CLS_MAYBE) or \
+            check_node_attr(node, 'id', CLS_MAYBE)
+
+        if unlikely and not maybe and node.tag != 'body':
+            return True
+        else:
+            return False

    for node in doc.getiterator():
-        # if the id or clsas show up in the unlikely list, mark for removal
-        nodeid = "%s%s" % (node.get('class', ''), node.get('id', ''))
        if is_unlikely_node(node):
            unlikely.append(node)

@ -191,9 +218,16 @@ def process(doc):


 class CandidateNode(object):
+    """Candidate nodes are used to track possible article matches.
+
+    We might have a bunch of these so we use __slots__ to keep memory usage
+    down.
+
+    """
    __slots__ = ['node', 'content_score']

    def __init__(self, node):
+        """Given node, set an initial score and weigh based on css and id"""
        self.node = node
        content_score = 0
        if node.tag == 'div':
--- a/src/breadability/tests/test_readable.py
+++ b/src/breadability/tests/test_readable.py
@ -6,6 +6,7 @@ from unittest import TestCase
 from breadability.readable import Article
 from breadability.readable import CandidateNode
 from breadability.readable import get_class_weight
+from breadability.readable import get_link_density
 from breadability.readable import score_candidates
 from breadability.readable import transform_misused_divs_into_paragraphs
 from breadability.tests import load_snippet
@ -36,9 +37,8 @@ class TestReadableDocument(TestCase):

        """
        doc = Article(load_snippet('document_min.html'))
-        self.assertEqual(doc.readable.tag, 'html')
-        found_body = doc.readable.find('.//body')
-        self.assertEqual(found_body.get('id'), 'readabilityBody')
+        self.assertEqual(doc.readable.tag, 'div')
+        self.assertEqual(doc.readable.get('id'), 'readabilityBody')

    def test_body_doesnt_exist(self):
        """If we can't find a body, then we create one.
@ -191,3 +191,30 @@ class TestScoringNodes(TestCase):
        # one of these should have a decent score
        scores = sorted([c.content_score for c in candidates.values()])
        self.assertTrue(scores[-1] > 100)
+
+class TestLinkDensityScoring(TestCase):
+    """Link density will adjust our candidate scores."""
+
+    def test_link_density(self):
+        """Test that we get a link density"""
+        doc = document_fromstring(load_article('ars/ars.001.html'))
+        for node in doc.getiterator():
+            if node.tag in ['p', 'td', 'pre']:
+                density = get_link_density(node)
+
+                # the density must be between 0, 1
+                self.assertTrue(density >= 0.0 and density <= 1.0)
+
+
+class TestSiblings(TestCase):
+    """Siblings will be included if their content is related."""
+
+    def test_bad_siblings_not_counted(self):
+        """"""
+
+        assert False, "TBD"
+
+    def test_good_siblings_counted(self):
+        """"""
+
+        assert False, "TBD"