Less code - fewer bugs (I hope)

Mišo Belica 12 years ago
parent 1a5970b238
commit 4e3227521e

@ -25,6 +25,7 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
remove_unknown_tags=False, safe_attrs_only=False)
SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
@ -34,34 +35,10 @@ NULL_DOCUMENT = """
SCORABLE_TAGS = ('div', 'p', 'td', 'pre', 'article')
logger = logging.getLogger("breadability")
def is_bad_link(node):
Helper to determine if the link is something to clean out
We've hit articles with many links that should be cleaned out
because they're just there to pollute the space. See tests for examples.
if node.tag != 'a':
return False
name = node.get('name')
href = node.get('href')
if name and not href:
return True
if href:
url_bits = href.split('#')
if len(url_bits) == 2 and len(url_bits[1]) > 25:
return True
return False
def ok_embedded_video(node):
"""Check if this embed/video is an ok one to count."""
good_keywords = ('youtube', 'blip.tv', 'vimeo')
@ -74,64 +51,50 @@ def ok_embedded_video(node):
return False
def build_base_document(html, fragment=True):
"""Return a base document with the body as root.
def build_base_document(dom, return_fragment=True):
Builds a base document with the body as root.
:param html: Parsed Element object
:param fragment: Should we return a <div> doc fragment or
a full <html> doc.
:param dom: Parsed lxml tree (Document Object Model).
:param bool return_fragment: If True only <div> fragment is returned.
Otherwise full HTML document is returned.
if html.tag == 'body':
html.tag = 'div'
found_body = html
body_element = dom.find(".//body")
if body_element is None:
fragment = fragment_fromstring('<div id="readabilityBody"/>')
found_body = html.find('.//body')
body_element.tag = "div"
body_element.set("id", "readabilityBody")
fragment = body_element
if found_body is None:
frag = fragment_fromstring('<div/>')
frag.set('id', 'readabilityBody')
return document_from_fragment(fragment, return_fragment)
if not fragment:
output = fromstring(NULL_DOCUMENT)
insert_point = output.find('.//body')
output = frag
found_body.tag = 'div'
found_body.set('id', 'readabilityBody')
if not fragment:
output = fromstring(NULL_DOCUMENT)
insert_point = output.find('.//body')
output = found_body
def build_error_document(dom, return_fragment=True):
Builds an empty error document with the body as root.
output.doctype = "<!DOCTYPE html>"
return output
:param bool return_fragment: If True only <div> fragment is returned.
Otherwise full HTML document is returned.
fragment = fragment_fromstring(
'<div id="readabilityBody" class="parsing-error"/>')
return document_from_fragment(fragment, return_fragment)
def build_error_document(html, fragment=True):
"""Return an empty error document with the body as root.
:param fragment: Should we return a <div> doc fragment or
a full <html> doc.
frag = fragment_fromstring('<div/>')
frag.set('id', 'readabilityBody')
frag.set('class', 'parsing-error')
if not fragment:
output = fromstring(NULL_DOCUMENT)
insert_point = output.find('.//body')
def document_from_fragment(fragment, return_fragment):
if return_fragment:
document = fragment
output = frag
document = fromstring(NULL_DOCUMENT)
body_element = document.find(".//body")
output.doctype = "<!DOCTYPE html>"
return output
document.doctype = "<!DOCTYPE html>"
return document
def check_siblings(candidate_node, candidate_list):
@ -342,33 +305,55 @@ def prep_article(doc):
return clean_document(doc)
def find_candidates(doc):
"""Find candidate nodes for the readable version of the article.
def find_candidates(document):
Finds candidate nodes for the readable version of the article.
Here we're going to remove unlikely nodes, find scores on the rest, and
Here we're going to remove unlikely nodes, find scores on the rest,
clean up and return the final best match.
nodes_to_score = []
should_remove = []
nodes_to_score = set()
should_remove = set()
for node in doc.iter():
for node in document.iter():
if is_unlikely_node(node):
logger.debug('We should drop unlikely: ' + str(node))
if node.tag == 'a' and is_bad_link(node):
logger.debug('We should drop bad link: ' + str(node))
if node.tag in SCORABLE_TAGS and node not in nodes_to_score:
logger.debug("We should drop unlikely: %s", str(node))
elif is_bad_link(node):
logger.debug("We should drop bad link: %s", str(node))
elif node.tag in SCORABLE_TAGS:
return score_candidates(nodes_to_score), should_remove
def is_bad_link(node):
Helper to determine if the node is a link that is useless.
We've hit articles with many links that should be cleaned out
because they're just there to pollute the space. See tests for examples.
if node.tag != "a":
return False
name = node.get("name")
href = node.get("href")
if name and not href:
return True
if href:
href_parts = href.split("#")
if len(href_parts) == 2 and len(href_parts[1]) > 25:
return True
return False
class Article(object):
"""Parsed readable object"""
_should_drop = []
_should_drop = ()
def __init__(self, html, url=None, fragment=True):
"""Create the Article we're going to use.
@ -401,15 +386,14 @@ class Article(object):
def candidates(self):
"""Generate the list of candidates from the doc."""
doc = self.dom
if doc is not None and len(doc):
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
return candidates
"""Generates list of candidates from the DOM."""
dom = self.dom
if dom is None or len(dom) == 0:
return None
candidates, self._should_drop = find_candidates(dom)
return candidates
def readable(self):
return tounicode(self.readable_dom)
@ -420,51 +404,48 @@ class Article(object):
def _readable(self):
"""The readable parsed article"""
if self.candidates:
logger.debug('Candidates found')
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
key=attrgetter('content_score'), reverse=True)
# since we have several candidates, check the winner's siblings
# for extra content
winner = by_score[0]
logger.debug('Selected winning node: ' + str(winner))
updated_winner = check_siblings(winner, self.candidates)
logger.debug('Begin final prep of article')
updated_winner.node = prep_article(updated_winner.node)
if updated_winner.node is not None:
doc = build_base_document(updated_winner.node, self.fragment)
logger.warning('Had candidates but failed to find a cleaned winning doc.')
doc = self._handle_no_candidates()
if not self.candidates:
logger.warning("No candidates found in document.")
return self._handle_no_candidates()
# cleanup by removing the should_drop we spotted.
# right now we return the highest scoring candidate content
best_candidates = sorted((c for c in self.candidates.values()),
key=attrgetter("content_score"), reverse=True)
printer = PrettyPrinter(indent=2)
# since we have several candidates, check the winner's siblings
# for extra content
winner = best_candidates[0]
updated_winner = check_siblings(winner, self.candidates)
logger.debug('Begin final prep of article')
updated_winner.node = prep_article(updated_winner.node)
if updated_winner.node is not None:
doc = build_base_document(updated_winner.node, self.fragment)
logger.warning('No candidates found: using document.')
logger.debug('Begin final prep of article')
logger.warning('Had candidates but failed to find a cleaned winning doc.')
doc = self._handle_no_candidates()
return doc
def _handle_no_candidates(self):
"""If we fail to find a good candidate we need to find something else."""
If we fail to find a good candidate we need to find something else.
# since we've not found a good candidate, fall back to the whole document
if self.dom is not None and len(self.dom):
# cleanup by removing the should_drop we spotted.
doc = prep_article(self.dom)
doc = build_base_document(doc, self.fragment)
dom = prep_article(self.dom)
return build_base_document(dom, self.fragment)
logger.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc
logger.warning("No document to use.")
return build_error_document(self.fragment)
def leaf_div_elements_into_paragraphs(document):
