diff --git a/readability/readable.py b/readability/readable.py index 1abe4b5..570e111 100644 --- a/readability/readable.py +++ b/readability/readable.py @@ -16,7 +16,7 @@ from .document import OriginalDocument from .annotated_text import AnnotatedTextHandler from .scoring import (score_candidates, get_link_density, get_class_weight, is_unlikely_node) -from .utils import cached_property +from .utils import cached_property, shrink_text html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, @@ -162,49 +162,33 @@ def check_siblings(candidate_node, candidate_list): def clean_document(node): """Cleans up the final document we return as the readable article.""" if node is None or len(node) == 0: - return + return None logger.debug("Cleaning document.") - clean_list = ["object"] to_drop = [] for n in node.iter(): - logger.debug("Cleaning iter node: %s %r", n.tag, n.attrib) + logger.debug("Cleaning node: %s %r", n.tag, n.attrib) # clean out any in-line style properties if "style" in n.attrib: n.set("style", "") - # remove all of the following tags - # Clean a node of all elements of type "tag". - # (Unless it's a youtube/vimeo video. People love movies.) - is_embed = bool(n.tag in ("object", "embed")) - if n.tag in clean_list: - allow = False - - # Allow youtube and vimeo videos through as people usually - # want to see those. 
- if is_embed: - if ok_embedded_video(n): - allow = True - - if not allow: - logger.debug("Dropping Node %s %r", n.tag, n.attrib) - to_drop.append(n) + # remove embedded objects unless it's a wanted video + if n.tag in ("object", "embed") and not ok_embedded_video(n): + logger.debug("Dropping node %s %r", n.tag, n.attrib) + to_drop.append(n) + # clean headings with bad css or high link density if n.tag in ("h1", "h2", "h3", "h4"): - # clean headings - # if the heading has no css weight or a high link density, - # remove it if get_class_weight(n) < 0 or get_link_density(n) > 0.33: logger.debug("Dropping <%s>, it's insignificant", n.tag) to_drop.append(n) - # clean out extra <p>
- if n.tag == "p": - # if the p has no children and has no content...well then down - # with it. - if not n.getchildren() and len(n.text_content()) < 5: - logger.debug("Dropping extra <p>") + # drop block element without content and children + if n.tag in ("div", "p"): + text_content = shrink_text(n.text_content()) + if len(text_content) < 5 and not n.getchildren(): + logger.debug("Dropping %s %r without content.", n.tag, n.attrib) to_drop.append(n) # finally try out the conditional cleaning of the target node diff --git a/readability/utils.py b/readability/utils.py index c259b1e..8fb55ff 100644 --- a/readability/utils.py +++ b/readability/utils.py @@ -14,6 +14,10 @@ def is_blank(text): return not text or text.isspace() +def shrink_text(text): + return normalize_whitespace(text.strip()) + + MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE) def normalize_whitespace(text): """