Cleanups for function 'clean_document'

2024-11-10 07:10:43 +00:00 · 2013-03-27 00:13:31 +01:00 · 2013-03-27 00:13:31 +01:00 · c9afc38c49
commit c9afc38c49
parent 5c20673d45
2 changed files with 17 additions and 29 deletions
--- a/readability/readable.py
+++ b/readability/readable.py
@ -16,7 +16,7 @@ from .document import OriginalDocument
 from .annotated_text import AnnotatedTextHandler
 from .scoring import (score_candidates, get_link_density, get_class_weight,
    is_unlikely_node)
-from .utils import cached_property
+from .utils import cached_property, shrink_text


 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
@ -162,49 +162,33 @@ def check_siblings(candidate_node, candidate_list):
 def clean_document(node):
    """Cleans up the final document we return as the readable article."""
    if node is None or len(node) == 0:
-        return
+        return None

    logger.debug("Cleaning document.")
-    clean_list = ["object"]
    to_drop = []

    for n in node.iter():
-        logger.debug("Cleaning iter node: %s %r", n.tag, n.attrib)
+        logger.debug("Cleaning node: %s %r", n.tag, n.attrib)
        # clean out any in-line style properties
        if "style" in n.attrib:
            n.set("style", "")

-        # remove all of the following tags
-        # Clean a node of all elements of type "tag".
-        # (Unless it's a youtube/vimeo video. People love movies.)
-        is_embed = bool(n.tag in ("object", "embed"))
-        if n.tag in clean_list:
-            allow = False
-
-            # Allow youtube and vimeo videos through as people usually
-            # want to see those.
-            if is_embed:
-                if ok_embedded_video(n):
-                    allow = True
-
-            if not allow:
-                logger.debug("Dropping Node %s %r", n.tag, n.attrib)
-                to_drop.append(n)
+        # remove embended objects unless it's wanted video
+        if n.tag in ("object", "embed") and not ok_embedded_video(n):
+            logger.debug("Dropping node %s %r", n.tag, n.attrib)
+            to_drop.append(n)

+        # clean headings with bad css or high link density
        if n.tag in ("h1", "h2", "h3", "h4"):
-            # clean headings
-            # if the heading has no css weight or a high link density,
-            # remove it
            if get_class_weight(n) < 0 or get_link_density(n) > 0.33:
                logger.debug("Dropping <%s>, it's insignificant", n.tag)
                to_drop.append(n)

-        # clean out extra <p>
-        if n.tag == "p":
-            # if the p has no children and has no content...well then down
-            # with it.
-            if not n.getchildren() and len(n.text_content()) < 5:
-                logger.debug("Dropping extra <p>")
+        # drop block element without content and children
+        if n.tag in ("div", "p"):
+            text_content = shrink_text(n.text_content())
+            if len(text_content) < 5 and not n.getchildren():
+                logger.debug("Dropping %s %r without content.", n.tag, n.attrib)
                to_drop.append(n)

        # finally try out the conditional cleaning of the target node
--- a/readability/utils.py
+++ b/readability/utils.py
@ -14,6 +14,10 @@ def is_blank(text):
    return not text or text.isspace()


+def shrink_text(text):
+    return normalize_whitespace(text.strip())
+
+
 MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
 def normalize_whitespace(text):
    """