mirror of
https://github.com/bookieio/breadability
synced 2024-11-10 07:10:43 +00:00
Cleanups for function 'clean_document'
This commit is contained in:
parent
5c20673d45
commit
c9afc38c49
@ -16,7 +16,7 @@ from .document import OriginalDocument
|
||||
from .annotated_text import AnnotatedTextHandler
|
||||
from .scoring import (score_candidates, get_link_density, get_class_weight,
|
||||
is_unlikely_node)
|
||||
from .utils import cached_property
|
||||
from .utils import cached_property, shrink_text
|
||||
|
||||
|
||||
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
|
||||
@ -162,49 +162,33 @@ def check_siblings(candidate_node, candidate_list):
|
||||
def clean_document(node):
|
||||
"""Cleans up the final document we return as the readable article."""
|
||||
if node is None or len(node) == 0:
|
||||
return
|
||||
return None
|
||||
|
||||
logger.debug("Cleaning document.")
|
||||
clean_list = ["object"]
|
||||
to_drop = []
|
||||
|
||||
for n in node.iter():
|
||||
logger.debug("Cleaning iter node: %s %r", n.tag, n.attrib)
|
||||
logger.debug("Cleaning node: %s %r", n.tag, n.attrib)
|
||||
# clean out any in-line style properties
|
||||
if "style" in n.attrib:
|
||||
n.set("style", "")
|
||||
|
||||
# remove all of the following tags
|
||||
# Clean a node of all elements of type "tag".
|
||||
# (Unless it's a youtube/vimeo video. People love movies.)
|
||||
is_embed = bool(n.tag in ("object", "embed"))
|
||||
if n.tag in clean_list:
|
||||
allow = False
|
||||
|
||||
# Allow youtube and vimeo videos through as people usually
|
||||
# want to see those.
|
||||
if is_embed:
|
||||
if ok_embedded_video(n):
|
||||
allow = True
|
||||
|
||||
if not allow:
|
||||
logger.debug("Dropping Node %s %r", n.tag, n.attrib)
|
||||
to_drop.append(n)
|
||||
# remove embended objects unless it's wanted video
|
||||
if n.tag in ("object", "embed") and not ok_embedded_video(n):
|
||||
logger.debug("Dropping node %s %r", n.tag, n.attrib)
|
||||
to_drop.append(n)
|
||||
|
||||
# clean headings with bad css or high link density
|
||||
if n.tag in ("h1", "h2", "h3", "h4"):
|
||||
# clean headings
|
||||
# if the heading has no css weight or a high link density,
|
||||
# remove it
|
||||
if get_class_weight(n) < 0 or get_link_density(n) > 0.33:
|
||||
logger.debug("Dropping <%s>, it's insignificant", n.tag)
|
||||
to_drop.append(n)
|
||||
|
||||
# clean out extra <p>
|
||||
if n.tag == "p":
|
||||
# if the p has no children and has no content...well then down
|
||||
# with it.
|
||||
if not n.getchildren() and len(n.text_content()) < 5:
|
||||
logger.debug("Dropping extra <p>")
|
||||
# drop block element without content and children
|
||||
if n.tag in ("div", "p"):
|
||||
text_content = shrink_text(n.text_content())
|
||||
if len(text_content) < 5 and not n.getchildren():
|
||||
logger.debug("Dropping %s %r without content.", n.tag, n.attrib)
|
||||
to_drop.append(n)
|
||||
|
||||
# finally try out the conditional cleaning of the target node
|
||||
|
@ -14,6 +14,10 @@ def is_blank(text):
|
||||
return not text or text.isspace()
|
||||
|
||||
|
||||
def shrink_text(text):
|
||||
return normalize_whitespace(text.strip())
|
||||
|
||||
|
||||
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
|
||||
def normalize_whitespace(text):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user