Add some cleaning/post-processing of our target

- Starting to look decent
- Still need to port their cleanConditionally but going to have to think on
  that (a rough sketch of the Arc90 rules follows the diff)
- Removes spare paragraphs (sketched below), does some other cleaning tweaks
Richard Harding, 12 years ago
commit ccac04e567 (parent 19a38a2cea), branch pull/4/merge

@@ -2,6 +2,7 @@ import re
 from operator import attrgetter
 from lxml.etree import tounicode
 from lxml.etree import tostring
+from lxml.html.clean import Cleaner
 from lxml.html import fragment_fromstring
 from lxml.html import fromstring
 from breadability.document import OriginalDocument
@@ -24,6 +25,13 @@ CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
     'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
     'tool', 'widget'])
 
+html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
+                       style=True, links=True, meta=False, add_nofollow=False,
+                       page_structure=False, processing_instructions=True,
+                       embedded=False, frames=False, forms=False,
+                       annoying_tags=False, remove_tags=None,
+                       remove_unknown_tags=False, safe_attrs_only=False)
+
 
 def check_node_attr(node, attr, checkset):
     attr = node.get(attr) or ""
@@ -40,8 +48,10 @@ def drop_tag(doc, *tags):
 
     :param *tags: one or more html tag strings to remove e.g. style, script
     """
-    [[n.drop_tree() for n in doc.iterfind(".//" + tag)]
-        for tag in tags]
+    for tag in tags:
+        found = doc.iterfind(".//" + tag)
+        if found:
+            [n.drop_tree() for n in found]
     return doc
@@ -122,7 +132,7 @@ def check_siblings(candidate_node, candidate_list):
 
     for sibling in siblings:
         append = False
-        content_bonus = 0;
+        content_bonus = 0
 
         if sibling is candidate_node.node:
             append = True
@@ -130,7 +140,7 @@ def check_siblings(candidate_node, candidate_list):
         # Give a bonus if sibling nodes and top candidates have the
         # same classname
         if candidate_css and sibling.get('class') == candidate_css:
-            content_bonus += candidate_node.content_score * 0.2;
+            content_bonus += candidate_node.content_score * 0.2
 
         if sibling in candidate_list:
             adjusted_score = candidate_list[sibling].content_score + \
@@ -142,10 +152,10 @@ def check_siblings(candidate_node, candidate_list):
         if sibling.tag == 'p':
             link_density = get_link_density(sibling)
             content = sibling.text_content()
-            content_length = len(content)
+            content_length = len(content)
 
             if content_length > 80 and link_density < 0.25:
-                append = true;
+                append = True
             elif content_length < 80 and link_density == 0:
                 if ". " in content:
                     append = True
@@ -235,6 +245,77 @@ def score_candidates(nodes):
 
     return candidates
 
+
+def prep_article(doc):
+    """Once we've found our target article we want to clean it up.
+
+    Clean out:
+    - inline styles
+    - forms
+    - strip empty <p>
+    - extra tags
+
+    """
+    def clean_document(candidate):
+        """Remove the style attribute on every element."""
+        clean_list = ['object', 'h1']
+        keep_keywords = ['youtube', 'blip.tv', 'vimeo']
+
+        # If there is only one h2, they are probably using it as a header and
+        # not a subheader, so remove it since we already have a header.
+        if len(candidate.node.findall('.//h2')) == 1:
+            clean_list.append('h2')
+
+        for n in candidate.node.getiterator():
+            # clean out any inline style properties
+            n.set('style', '')
+
+            # remove all of the following tags
+            # Clean a node of all elements of type "tag".
+            # (Unless it's a youtube/vimeo video. People love movies.)
+            is_embed = True if n.tag in ['object', 'embed'] else False
+            if n.tag in clean_list:
+                allow = False
+
+                # Allow youtube and vimeo videos through as people usually
+                # want to see those.
+                if is_embed:
+                    # if this object or embed has any of the keywords in the
+                    # html from here on out, then let it live.
+                    node_str = tounicode(n)
+                    for key in keep_keywords:
+                        if not allow and key in node_str:
+                            allow = True
+
+                if not allow:
+                    n.drop_tree()
+
+            if n.tag in ['h1', 'h2', 'h3', 'h4']:
+                # clean headings
+                # if the heading has no css weight or a high link density,
+                # remove it
+                if get_class_weight(n) < 0 or get_link_density(n) > .33:
+                    n.drop_tree()
+
+            # clean out extra <p>
+            if n.tag == 'p':
+                # if the p has no children and has no content...well then
+                # down with it.
+                if not n.getchildren() and len(n.text_content()) < 5:
+                    n.drop_tree()
+
+        return candidate
+
+    def clean_conditionally(doc, clean_el):
+        """Remove the clean_el if it looks like bad content based on rules."""
+        def clean_objects():
+            pass
+
+    doc = clean_document(doc)
+    return doc
+
+
 def process(doc):
     """Process this doc to make it readable.
@@ -322,7 +403,9 @@ class Article(object):
     def readable(self):
         """The readable parsed article"""
         doc = self.orig.html
-        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
+        # cleaning doesn't return, just wipes in place
+        html_cleaner(doc)
+        doc = drop_tag(doc, 'noscript')
         doc = transform_misused_divs_into_paragraphs(doc)
         candidates = process(doc)
@@ -335,13 +418,16 @@
             # for extra content
             winner = by_score[0]
             updated_winner = check_siblings(winner, candidates)
+            doc = prep_article(updated_winner)
             doc = build_base_document(updated_winner.node)
         else:
+            doc = prep_article(doc)
             doc = build_base_document(doc)
 
         return doc
"""
Algorithm notes for

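The cleanConditionally port that the commit message flags as pending is only stubbed out in this diff (clean_conditionally just passes). For reference, a rough sketch of the Arc90 readability.js heuristics it would need to port; the thresholds are recalled from readability.js, the signature differs from the stub's, and get_class_weight/get_link_density are assumed to behave like the helpers in this module — a sketch, not the eventual implementation:

    # Rough sketch (NOT the commit's code) of readability.js's
    # cleanConditionally rules; thresholds recalled from Arc90's JS,
    # helpers assumed to match this module's.
    def clean_conditionally(node):
        weight = get_class_weight(node)
        if weight < 0:
            # negative css class/id weight: drop outright
            node.drop_tree()
            return

        content = node.text_content()
        if content.count(',') >= 10:
            # plenty of commas reads like real prose; keep it
            return

        # few commas: fall back to structural counts
        p_count = len(node.findall('.//p'))
        img_count = len(node.findall('.//img'))
        li_count = len(node.findall('.//li')) - 100
        input_count = len(node.findall('.//input'))
        link_density = get_link_density(node)

        if (img_count > p_count or
                (li_count > p_count and node.tag not in ('ul', 'ol')) or
                input_count > p_count / 3 or
                (len(content) < 25 and (img_count == 0 or img_count > 2)) or
                (weight < 25 and link_density > 0.2) or
                (weight >= 25 and link_density > 0.5)):
            node.drop_tree()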