Update tests for scoring; return a div or html doc depending on the content found

pull/4/merge
Richard Harding 12 years ago
parent 60ab4a96b0
commit 8e96cb7844

@ -1,4 +1,5 @@
import chardet
import logging
import re
from lxml.etree import tostring
from lxml.etree import tounicode
@ -8,6 +9,7 @@ from lxml.html import HTMLParser
from breadability.utils import cached_property
LOG = logging.getLogger(__name__)
utf8_parser = HTMLParser(encoding='utf-8')

@ -1,6 +1,6 @@
import re
from collections import namedtuple
from operator import attrgetter
from lxml.etree import tounicode
from lxml.html import fragment_fromstring
from breadability.document import OriginalDocument
@ -21,6 +21,13 @@ READABLERE = RegexList(
negative=()
)
# Class/id tokens that suggest a node holds real article content; matching
# one of these earns the node a scoring bonus.
CLS_WEIGHT_POSITIVE = {
    'article', 'blog', 'body', 'content', 'entry', 'hentry', 'main',
    'page', 'pagination', 'post', 'story', 'text',
}

# Class/id tokens that suggest boilerplate/page chrome (comments, footers,
# sidebars, ads); matching one of these earns the node a scoring penalty.
CLS_WEIGHT_NEGATIVE = {
    'com-', 'combx', 'comment', 'contact', 'foot', 'footer', 'footnote',
    'masthead', 'media', 'meta', 'outbrain', 'promo', 'related', 'scroll',
    'shopping', 'shoutbox', 'sidebar', 'sponsor', 'tags', 'tool', 'widget',
}
def drop_tag(doc, *tags):
[[n.drop_tree() for n in doc.iterfind(".//" + tag)]
@ -35,11 +42,15 @@ def build_base_document(html):
"""
found_body = html.find('.//body')
if found_body is not None:
# remove any CSS and set our own
found_body.set('id', 'readabilityBody')
return found_body
if found_body is None:
fragment = fragment_fromstring('<div/>')
fragment.set('id', 'readabilityBody')
fragment.append(html)
return fragment
else:
found_body.set('id', 'readabilityBody')
return html
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@ -68,6 +79,78 @@ def transform_misused_divs_into_paragraphs(doc):
return doc
###### SCORING
def get_class_weight(node):
    """Get an element's class/id weight.

    We're using sets to help efficiently check for existence of matches.

    :param node: an lxml element exposing ``.get(attr, default=...)``
    :return: int adjustment; +/-25 per class-set match and +/-25 per id
        match, so the result lies in -50..50.
    """
    weight = 0

    # A node may carry several whitespace-separated classes; any overlap
    # with the positive/negative token sets adjusts the weight once per set.
    # str.split() (no argument) also handles tabs and repeated spaces.
    cls = set(node.get('class', default="").split())
    if cls.intersection(CLS_WEIGHT_NEGATIVE):
        weight = weight - 25
    if cls.intersection(CLS_WEIGHT_POSITIVE):
        weight = weight + 25

    # The id is a single token. An empty-string default keeps a missing id
    # from matching anything (the previous default was the literal string
    # "None", a misleading sentinel that only worked because "None" happens
    # not to appear in either weight set).
    ids = node.get('id', default="")
    if ids in CLS_WEIGHT_NEGATIVE:
        weight = weight - 25
    if ids in CLS_WEIGHT_POSITIVE:
        weight = weight + 25

    return weight
def score_candidates(nodes):
    """Given a list of potential nodes, find some initial scores to start.

    Each scorable node contributes a content score to its parent and (at
    half value) its grandparent; both ancestors are wrapped in
    CandidateNode containers on first sight.

    :param nodes: iterable of lxml elements (paragraph-like nodes)
    :return: dict mapping ancestor elements to their CandidateNode
    """
    MIN_HIT_LENGTH = 25
    candidates = {}

    for node in nodes:
        parent = node.getparent()
        grand = parent.getparent() if parent is not None else None

        # Without both ancestors there is nowhere to accumulate a score.
        if parent is None or grand is None:
            continue

        innertext = node.text

        # If this paragraph is less than 25 characters, don't even count it.
        if innertext and len(innertext) < MIN_HIT_LENGTH:
            continue

        # If an ancestor isn't in the candidate list yet, add it.
        if parent not in candidates:
            candidates[parent] = CandidateNode(parent)
        if grand not in candidates:
            candidates[grand] = CandidateNode(grand)

        # Add a point for the paragraph itself as a base.
        content_score = 1

        # Add points for any commas within this paragraph.
        content_score += innertext.count(',') if innertext else 0

        # For every 100 characters in this paragraph, add another point,
        # up to 3 points. (Fixes the old code, which used `%` instead of
        # integer division and then *overwrote* the accumulated score
        # instead of adding the capped bonus to it.)
        if innertext:
            content_score += min(len(innertext) // 100, 3)

        # Add the score to the parent; the grandparent gets half, as the
        # original comment intended.
        candidates[parent].content_score += content_score
        candidates[grand].content_score += content_score / 2.0

    return candidates
def process(doc):
"""Process this doc to make it readable.
@ -83,21 +166,44 @@ def process(doc):
# NOTE(review): this span is a unified-diff rendering, not runnable code.
# Both the removed pre-rename lines (using `n`) and the added post-rename
# lines (using `node`) appear below, and the hunk header above marks a gap
# in the function body — confirm against the real file before editing.
"""Short helper for checking unlikely status."""
if READABLERE.unlikely.match(nodeid):
if not READABLERE.maybe.match(nodeid):
if n.tag != "body":
if node.tag != "body":
return True
for n in doc.getiterator():
for node in doc.getiterator():
# if the id or class show up in the unlikely list, mark for removal
nodeid = "%s%s" % (n.get('class', ''), n.get('id', ''))
if is_unlikely_node(n):
unlikely.append(n)
nodeid = "%s%s" % (node.get('class', ''), node.get('id', ''))
if is_unlikely_node(node):
unlikely.append(node)
if n.tag in scorable_node_tags:
nodes_to_score.append(n)
if node.tag in scorable_node_tags:
nodes_to_score.append(node)
# process our clean up instructions
[n.drop_tree() for n in unlikely]
return doc
# post-change code path: score the collected nodes and return candidates
candidates = score_candidates(nodes_to_score)
return candidates
class CandidateNode(object):
    """A dom node paired with the running content score used in scoring.

    The score is seeded from the node's tag (container tags such as ``div``
    score up; list, form, and heading tags score down) plus the class/id
    weight from ``get_class_weight``.
    """
    __slots__ = ['node', 'content_score']

    def __init__(self, node):
        self.node = node
        # The tag sets are mutually exclusive, so use an elif chain rather
        # than re-testing every set after a match has already been found.
        if node.tag == 'div':
            content_score = 5
        elif node.tag in ('pre', 'td', 'blockquote'):
            content_score = 3
        elif node.tag in ('address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
                          'form'):
            content_score = -3
        elif node.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            content_score = -5
        else:
            content_score = 0
        # Class/id hints adjust the seed by +/-25 points per match.
        content_score += get_class_weight(node)
        self.content_score = content_score
class Article(object):
@ -110,10 +216,19 @@ class Article(object):
def readable(self):
"""The readable parsed article"""
doc = self.orig.html
doc = build_base_document(doc)
doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
doc = transform_misused_divs_into_paragraphs(doc)
doc = process(doc)
candidates = process(doc)
if candidates:
# right now we return the highest scoring candidate content
by_score = sorted([c for c in candidates.values()],
key=attrgetter('content_score'), reverse=True)
doc = build_base_document(by_score[0].node)
else:
doc = build_base_document(doc)
return doc

@ -7,3 +7,8 @@ TEST_DIR = path.dirname(__file__)
def load_snippet(filename):
    """Helper to fetch in the content of a test snippet.

    Uses a context manager so the file handle is closed deterministically
    instead of leaking until garbage collection.
    """
    with open(path.join(TEST_DIR, 'test_snippets', filename)) as f:
        return f.read()
def load_article(filename):
    """Helper to fetch in the content of a test article.

    Uses a context manager so the file handle is closed deterministically
    instead of leaking until garbage collection.
    """
    with open(path.join(TEST_DIR, 'test_articles', filename)) as f:
        return f.read()

@ -1,10 +1,15 @@
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from unittest import TestCase
from breadability.readable import Article
from breadability.readable import CandidateNode
from breadability.readable import get_class_weight
from breadability.readable import score_candidates
from breadability.readable import transform_misused_divs_into_paragraphs
from breadability.tests import load_snippet
from breadability.tests import load_article
class TestReadableDocument(TestCase):
@ -13,8 +18,8 @@ class TestReadableDocument(TestCase):
def test_load_doc(self):
"""We get back an element tree from our original doc"""
doc = Article(load_snippet('document_min.html'))
# We get back the document as a body tag currently by default.
self.assertEqual(doc.readable.tag, 'body')
# We get back the document as a div tag currently by default.
self.assertEqual(doc.readable.tag, 'html')
def test_doc_no_scripts_styles(self):
"""Step #1 remove all scripts from the document"""
@ -31,8 +36,9 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
self.assertEqual(doc.readable.tag, 'html')
found_body = doc.readable.find('.//body')
self.assertEqual(found_body.get('id'), 'readabilityBody')
def test_body_doesnt_exist(self):
"""If we can't find a body, then we create one.
@ -41,8 +47,9 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_no_body.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
self.assertEqual(doc.readable.tag, 'html')
found_body = doc.readable.find('.//body')
self.assertEqual(found_body.get('id'), 'readabilityBody')
def test_bare_content(self):
"""If the document is just pure content, no html tags we should be ok
@ -51,7 +58,7 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.tag, 'div')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
@ -111,3 +118,76 @@ class TestCleaning(TestCase):
transform_misused_divs_into_paragraphs(test_doc2)),
u'<html><body><p>simple<a href="">link</a></p></body></html>'
)
class TestCandidateNodes(TestCase):
    """Candidate nodes are scoring containers we use."""

    def test_candidate_scores(self):
        """We should be getting back objects with some scores.

        Replaces four copy-pasted loops with one data-driven table mapping
        sample markup to the seed score its tag should earn.
        """
        expected_scores = {
            '<div/>': 5,
            '<pre/>': 3,
            '<td/>': 3,
            '<blockquote/>': 3,
            '<address/>': -3,
            '<ol/>': -3,
            '<h1/>': -5,
            '<h2/>': -5,
            '<h3/>': -5,
            '<h4/>': -5,
        }
        for markup, score in expected_scores.items():
            node = fragment_fromstring(markup)
            self.assertEqual(CandidateNode(node).content_score, score)
class TestClassWeights(TestCase):
    """Certain ids and classes get us bonus points."""

    def _weight_for(self, markup):
        """Parse *markup* into a node and return its class/id weight."""
        return get_class_weight(fragment_fromstring(markup))

    def test_positive_class(self):
        """Some classes get us bonus points."""
        self.assertEqual(self._weight_for('<p class="article">'), 25)

    def test_positive_ids(self):
        """Some ids get us bonus points."""
        self.assertEqual(self._weight_for('<p id="content">'), 25)

    def test_negative_class(self):
        """Some classes get us negative points."""
        self.assertEqual(self._weight_for('<p class="comment">'), -25)

    def test_negative_ids(self):
        """Some ids get us negative points."""
        self.assertEqual(self._weight_for('<p id="media">'), -25)
class TestScoringNodes(TestCase):
    """We take our list of potential nodes and score them up."""

    def test_we_get_candidates(self):
        """Processing candidates should get us a list of nodes to try out."""
        # We'll start out using our first real test document.
        doc = document_fromstring(load_article('ars/ars.001.html'))
        # Collect the scorable paragraph-like nodes with a comprehension
        # instead of a manual append loop.
        test_nodes = [node for node in doc.getiterator()
                      if node.tag in ('p', 'td', 'pre')]
        candidates = score_candidates(test_nodes)
        # This might change as we tweak our algorithm, but if it does
        # change, it signifies we need to look at what we changed.
        self.assertEqual(len(candidates), 8)
        # One of these should have a decent score.
        max_score = max(c.content_score for c in candidates.values())
        self.assertTrue(max_score > 100)

Loading…
Cancel
Save