Cleanups

11 years ago · c9e087d077
parent e0c87223ae
commit c9e087d077
1 changed file with 24 additions and 27 deletions
--- a/readability/readable.py
+++ b/readability/readable.py
@ -107,11 +107,11 @@ def document_from_fragment(fragment, return_fragment):


 def check_siblings(candidate_node, candidate_list):
-    """Look through siblings for content that might also be related.
-
+    """
+    Looks through siblings for content that might also be related.
    Things like preambles, content split by ads that we removed, etc.
    """
-    candidate_css = candidate_node.node.get('class')
+    candidate_css = candidate_node.node.get("class")
    potential_target = candidate_node.content_score * 0.2
    sibling_target_score = potential_target if potential_target > 10 else 10
    parent = candidate_node.node.getparent()
@ -122,22 +122,20 @@ def check_siblings(candidate_node, candidate_list):
        content_bonus = 0

        if sibling is candidate_node.node:
-            logger.debug('Sibling is the node so append')
            append = True

        # Give a bonus if sibling nodes and top candidates have the exact
        # same class name
-        if candidate_css and sibling.get('class') == candidate_css:
+        if candidate_css and sibling.get("class") == candidate_css:
            content_bonus += candidate_node.content_score * 0.2

        if sibling in candidate_list:
-            adjusted_score = candidate_list[sibling].content_score + \
-                content_bonus
+            adjusted_score = candidate_list[sibling].content_score + content_bonus

            if adjusted_score >= sibling_target_score:
                append = True

-        if sibling.tag == 'p':
+        if sibling.tag == "p":
            link_density = get_link_density(sibling)
            content = sibling.text_content()
            content_length = len(content)
@ -149,12 +147,12 @@ def check_siblings(candidate_node, candidate_list):
                    append = True

        if append:
-            logger.debug('Sibling being appended')
-            if sibling.tag not in ('div', 'p'):
+            logger.debug("Sibling appended: %s %r", sibling.tag, sibling.attrib)
+            if sibling.tag not in ("div", "p"):
                # We have a node that isn't a common block level element, like
                # a form or td tag. Turn it into a div so it doesn't get
                # filtered out later by accident.
-                sibling.tag = 'div'
+                sibling.tag = "div"

            candidate_node.node.append(sibling)

@ -162,30 +160,30 @@ def check_siblings(candidate_node, candidate_list):


 def clean_document(node):
-    """Clean up the final document we return as the readable article"""
+    """Cleans up the final document we return as the readable article."""
    if node is None or len(node) == 0:
        return

-    logger.debug("Processing doc")
-    clean_list = ['object', 'h1']
+    logger.debug("Cleaning document.")
+    clean_list = ["object", "h1"]
    to_drop = []

    # If there is only one h2, they are probably using it as a header and
    # not a subheader, so remove it since we already have a header.
-    if len(node.findall('.//h2')) == 1:
-        logger.debug('Adding H2 to list of nodes to clean.')
-        clean_list.append('h2')
+    if len(node.findall(".//h2")) == 1:
+        logger.debug("Adding H2 to list of nodes to clean.")
+        clean_list.append("h2")

    for n in node.iter():
        logger.debug("Cleaning iter node: %s %r", n.tag, n.attrib)
        # clean out any in-line style properties
-        if 'style' in n.attrib:
-            n.set('style', '')
+        if "style" in n.attrib:
+            n.set("style", "")

        # remove all of the following tags
        # Clean a node of all elements of type "tag".
        # (Unless it's a youtube/vimeo video. People love movies.)
-        is_embed = bool(n.tag in ('object', 'embed'))
+        is_embed = bool(n.tag in ("object", "embed"))
        if n.tag in clean_list:
            allow = False

@ -196,23 +194,23 @@ def clean_document(node):
                    allow = True

            if not allow:
-                logger.debug("Dropping Node")
+                logger.debug("Dropping Node %s %r", n.tag, n.attrib)
                to_drop.append(n)

-        if n.tag in ('h1', 'h2', 'h3', 'h4'):
+        if n.tag in ("h1", "h2", "h3", "h4"):
            # clean headings
            # if the heading has no css weight or a high link density,
            # remove it
-            if get_class_weight(n) < 0 or get_link_density(n) > .33:
-                logger.debug("Dropping <hX>, it's insignificant")
+            if get_class_weight(n) < 0 or get_link_density(n) > 0.33:
+                logger.debug("Dropping <%s>, it's insignificant", n.tag)
                to_drop.append(n)

        # clean out extra <p>
-        if n.tag == 'p':
+        if n.tag == "p":
            # if the p has no children and has no content...well then down
            # with it.
            if not n.getchildren() and len(n.text_content()) < 5:
-                logger.debug('Dropping extra <p>')
+                logger.debug("Dropping extra <p>")
                to_drop.append(n)

        # finally try out the conditional cleaning of the target node
@ -434,7 +432,6 @@ class Article(object):
        # for extra content
        winner = best_candidates[0]
        updated_winner = check_siblings(winner, self.candidates)
-        logger.debug('Begin final prep of article')
        updated_winner.node = prep_article(updated_winner.node)
        if updated_winner.node is not None:
            dom = build_base_document(updated_winner.node, self._return_fragment)