Add support for sibling detection, need to figure out how to test it well still

12 years ago · 19a38a2cea
parent 4455ec226d
commit 19a38a2cea
3 changed files with 67 additions and 10 deletions
--- a/src/breadability/client.py
+++ b/src/breadability/client.py
@ -58,7 +58,7 @@ def main():
        #     print m.html.encode(enc, 'replace')
        # else:
        #     print doc.summary().encode(enc, 'replace')
-        print unicode(doc)
+        print doc
    finally:
        target.close()

--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -69,6 +69,18 @@ def build_base_document(html):
    return found_body


+def get_link_density(node):
+    """Generate a value for the number of links in the node.
+
+    :param node: pared elementree node
+    :returns float:
+
+    """
+    link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
+    text_length = len(node.text_content())
+    return float(link_length) / max(text_length, 1)
+
+
 def transform_misused_divs_into_paragraphs(doc):
    """Turn all divs that don't have children block level elements into p's

@ -96,16 +108,58 @@ def transform_misused_divs_into_paragraphs(doc):
    return doc


-def get_link_density(node):
-    """Generate a value for the number of links in the node.
+def check_siblings(candidate_node, candidate_list):
+    """Look through siblings for content that might also be related.

-    :param node: pared elementree node
-    :returns float:
+    Things like preambles, content split by ads that we removed, etc.

    """
-    link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
-    text_length = len(node.text_content())
-    return float(link_length) / max(text_length, 1)
+    candidate_css = candidate_node.node.get('class')
+    potential_target = candidate_node.content_score * 0.2
+    sibling_target_score = potential_target if potential_target > 10 else 10
+    parent = candidate_node.node.getparent()
+    siblings = parent.getchildren() if parent is not None else []
+
+    for sibling in siblings:
+        append = False
+        content_bonus = 0;
+
+        if sibling is candidate_node.node:
+            append = True
+
+        # Give a bonus if sibling nodes and top candidates have the example
+        # same classname
+        if candidate_css and sibling.get('class') == candidate_css:
+            content_bonus += candidate_node.content_score * 0.2;
+
+        if sibling in candidate_list:
+            adjusted_score = candidate_list[sibling].content_score + \
+                content_bonus
+
+            if adjusted_score >= sibling_target_score:
+                append = True
+
+        if sibling.tag == 'p':
+            link_density = get_link_density(sibling)
+            content = sibling.text_content()
+            content_length  = len(content)
+
+            if content_length > 80 and link_density < 0.25:
+                append = true;
+            elif content_length < 80 and link_density == 0:
+                if ". " in content:
+                    append = True
+
+        if append:
+            if sibling.tag not in ['div', 'p']:
+                # We have a node that isn't a common block level element, like
+                # a form or td tag. Turn it into a div so it doesn't get
+                # filtered out later by accident.
+                sibling.tag = 'div'
+
+            candidate_node.node.append(sibling)
+
+    return candidate_node


 ###### SCORING
@ -277,7 +331,11 @@ class Article(object):
            by_score = sorted([c for c in candidates.values()],
                key=attrgetter('content_score'), reverse=True)

-            doc = build_base_document(by_score[0].node)
+            # since we have several candidates, check the winner's siblings
+            # for extra content
+            winner = by_score[0]
+            updated_winner = check_siblings(winner, candidates)
+            doc = build_base_document(updated_winner.node)
        else:
            doc = build_base_document(doc)

--- a/src/breadability/tests/test_readable.py
+++ b/src/breadability/tests/test_readable.py
@ -36,7 +36,6 @@ class TestReadableDocument(TestCase):
        No sense processing anything other than the body content.

        """
-        print "MIN DOCUMENT"
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable.tag, 'div')
        self.assertEqual(doc.readable.get('id'), 'readabilityBody')