@@ -25,6 +25,7 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        remove_unknown_tags=False, safe_attrs_only=False)
 
 
+SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
 NULL_DOCUMENT = """
 <html>
 <head>
@@ -34,34 +35,10 @@ NULL_DOCUMENT = """
 </body>
 </html>
 """
 
-SCORABLE_TAGS = ('div', 'p', 'td', 'pre', 'article')
-
 logger = logging.getLogger("breadability")
 
 
-def is_bad_link(node):
-    """
-    Helper to determine if the link is something to clean out
-
-    We've hit articles with many multiple links that should be cleaned out
-    because they're just there to pollute the space. See tests for examples.
-    """
-    if node.tag != 'a':
-        return False
-
-    name = node.get('name')
-    href = node.get('href')
-    if name and not href:
-        return True
-
-    if href:
-        url_bits = href.split('#')
-        if len(url_bits) == 2 and len(url_bits[1]) > 25:
-            return True
-
-    return False
-
-
 def ok_embedded_video(node):
     """Check if this embed/video is an ok one to count."""
     good_keywords = ('youtube', 'blip.tv', 'vimeo')
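Review note: every drop decision below goes through the module-level "breadability" logger, so they are easy to surface while debugging. A minimal sketch, assuming nothing beyond the standard library logging setup:

```python
import logging

# Surface the cleaner's decisions ("We should drop unlikely: ...", etc.)
# while an Article is being parsed.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("breadability").setLevel(logging.DEBUG)
```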
@@ -74,64 +51,50 @@ def ok_embedded_video(node):
     return False
 
 
-def build_base_document(html, fragment=True):
-    """Return a base document with the body as root.
-
-    :param html: Parsed Element object
-    :param fragment: Should we return a <div> doc fragment or
-        a full <html> doc.
-    """
-    if html.tag == 'body':
-        html.tag = 'div'
-        found_body = html
-    else:
-        found_body = html.find('.//body')
-
-    if found_body is None:
-        frag = fragment_fromstring('<div/>')
-        frag.set('id', 'readabilityBody')
-        frag.append(html)
-
-        if not fragment:
-            output = fromstring(NULL_DOCUMENT)
-            insert_point = output.find('.//body')
-            insert_point.append(frag)
-        else:
-            output = frag
-    else:
-        found_body.tag = 'div'
-        found_body.set('id', 'readabilityBody')
-
-        if not fragment:
-            output = fromstring(NULL_DOCUMENT)
-            insert_point = output.find('.//body')
-            insert_point.append(found_body)
-        else:
-            output = found_body
-
-    output.doctype = "<!DOCTYPE html>"
-    return output
+def build_base_document(dom, return_fragment=True):
+    """
+    Builds a base document with the body as root.
+
+    :param dom: Parsed lxml tree (Document Object Model).
+    :param bool return_fragment: If True only <div> fragment is returned.
+        Otherwise full HTML document is returned.
+    """
+    body_element = dom.find(".//body")
+
+    if body_element is None:
+        fragment = fragment_fromstring('<div id="readabilityBody"/>')
+        fragment.append(dom)
+    else:
+        body_element.tag = "div"
+        body_element.set("id", "readabilityBody")
+        fragment = body_element
+
+    return document_from_fragment(fragment, return_fragment)
 
 
-def build_error_document(html, fragment=True):
-    """Return an empty erorr document with the body as root.
-
-    :param fragment: Should we return a <div> doc fragment or
-        a full <html> doc.
-    """
-    frag = fragment_fromstring('<div/>')
-    frag.set('id', 'readabilityBody')
-    frag.set('class', 'parsing-error')
-
-    if not fragment:
-        output = fromstring(NULL_DOCUMENT)
-        insert_point = output.find('.//body')
-        insert_point.append(frag)
-    else:
-        output = frag
-
-    output.doctype = "<!DOCTYPE html>"
-    return output
+def build_error_document(dom, return_fragment=True):
+    """
+    Builds an empty error document with the body as root.
+
+    :param bool return_fragment: If True only <div> fragment is returned.
+        Otherwise full HTML document is returned.
+    """
+    fragment = fragment_fromstring(
+        '<div id="readabilityBody" class="parsing-error"/>')
+
+    return document_from_fragment(fragment, return_fragment)
+
+
+def document_from_fragment(fragment, return_fragment):
+    if return_fragment:
+        document = fragment
+    else:
+        document = fromstring(NULL_DOCUMENT)
+        body_element = document.find(".//body")
+        body_element.append(fragment)
+
+    document.doctype = "<!DOCTYPE html>"
+    return document
 
 
 def check_siblings(candidate_node, candidate_list):
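Review note: document_from_fragment() collapses the two duplicated NULL_DOCUMENT branches of the old functions nicely. One observation: the new build_error_document() accepts dom as its first parameter but never reads it. A hedged usage sketch of the refactored helpers, assuming they stay importable from breadability.readable:

```python
from lxml.etree import tounicode
from lxml.html import fromstring

from breadability.readable import build_base_document

dom = fromstring("<html><body><p>Some content.</p></body></html>")

# return_fragment=True: the <body> is re-tagged as <div id="readabilityBody">
# and returned directly.
fragment = build_base_document(dom, return_fragment=True)
print(tounicode(fragment))
# <div id="readabilityBody"><p>Some content.</p></div>

# return_fragment=False: the same fragment is appended into the <body> of
# the NULL_DOCUMENT shell instead.
document = build_base_document(
    fromstring("<html><body><p>Some content.</p></body></html>"),
    return_fragment=False)
print(tounicode(document))
```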
@@ -342,33 +305,55 @@ def prep_article(doc):
     return clean_document(doc)
 
 
-def find_candidates(doc):
-    """Find cadidate nodes for the readable version of the article.
+def find_candidates(document):
+    """
+    Finds candidate nodes for the readable version of the article.
 
-    Here's we're going to remove unlikely nodes, find scores on the rest, and
+    Here we're going to remove unlikely nodes, find scores on the rest,
     clean up and return the final best match.
     """
-    nodes_to_score = []
-    should_remove = []
+    nodes_to_score = set()
+    should_remove = set()
 
-    for node in doc.iter():
+    for node in document.iter():
         if is_unlikely_node(node):
-            logger.debug('We should drop unlikely: ' + str(node))
-            should_remove.append(node)
-            continue
-        if node.tag == 'a' and is_bad_link(node):
-            logger.debug('We should drop bad link: ' + str(node))
-            should_remove.append(node)
-            continue
-        if node.tag in SCORABLE_TAGS and node not in nodes_to_score:
-            nodes_to_score.append(node)
+            logger.debug("We should drop unlikely: %s", str(node))
+            should_remove.add(node)
+        elif is_bad_link(node):
+            logger.debug("We should drop bad link: %s", str(node))
+            should_remove.add(node)
+        elif node.tag in SCORABLE_TAGS:
+            nodes_to_score.add(node)
 
     return score_candidates(nodes_to_score), should_remove
 
 
+def is_bad_link(node):
+    """
+    Helper to determine if the node is a link that is useless.
+
+    We've hit articles with many links that should be cleaned out because
+    they're just there to pollute the space. See tests for examples.
+    """
+    if node.tag != "a":
+        return False
+
+    name = node.get("name")
+    href = node.get("href")
+    if name and not href:
+        return True
+
+    if href:
+        href_parts = href.split("#")
+        if len(href_parts) == 2 and len(href_parts[1]) > 25:
+            return True
+
+    return False
+
+
 class Article(object):
     """Parsed readable object"""
-    _should_drop = []
+    _should_drop = ()
 
     def __init__(self, html, url=None, fragment=True):
         """Create the Article we're going to use.
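Review note: the old loop guarded the helper with node.tag == 'a'; is_bad_link() now runs on every node, which works because the tag check lives inside the helper. The switch to sets also makes the old `node not in nodes_to_score` membership test unnecessary, and changing the `_should_drop` class default from a list to a tuple avoids the shared-mutable-class-attribute pitfall. A quick hedged check of the is_bad_link() rules, assuming the function stays importable from breadability.readable:

```python
from lxml.html import fragment_fromstring

from breadability.readable import is_bad_link

named_anchor = fragment_fromstring('<a name="section-3"></a>')
long_fragment = fragment_fromstring('<a href="/page#%s">x</a>' % ("f" * 30))
regular_link = fragment_fromstring('<a href="https://example.com/">x</a>')

assert is_bad_link(named_anchor)      # name= with no href: in-page anchor
assert is_bad_link(long_fragment)     # fragment id longer than 25 chars
assert not is_bad_link(regular_link)  # ordinary links are kept
```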
@@ -401,15 +386,14 @@ class Article(object):
     @cached_property
     def candidates(self):
-        """Generate the list of candidates from the doc."""
-        doc = self.dom
-        if doc is not None and len(doc):
-            candidates, should_drop = find_candidates(doc)
-            self._should_drop = should_drop
-            return candidates
-        else:
+        """Generates list of candidates from the DOM."""
+        dom = self.dom
+        if dom is None or len(dom) == 0:
             return None
 
+        candidates, self._should_drop = find_candidates(dom)
+        return candidates
+
     @cached_property
     def readable(self):
         return tounicode(self.readable_dom)
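Review note: candidates is a cached_property, so find_candidates() runs at most once per instance, and _should_drop is populated as a side effect of that first access. A small hedged sketch of the memoization, with the Article import assumed:

```python
from breadability.readable import Article

article = Article(
    "<html><body><div><p>Enough text to score.</p></div></body></html>")

first = article.candidates   # runs find_candidates(), sets _should_drop
second = article.candidates  # served from the property cache
assert first is second
```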
@@ -420,51 +404,48 @@ class Article(object):
     def _readable(self):
         """The readable parsed article"""
-        if self.candidates:
-            logger.debug('Candidates found')
-            pp = PrettyPrinter(indent=2)
-
-            # cleanup by removing the should_drop we spotted.
-            drop_nodes_with_parents(self._should_drop)
-
-            # right now we return the highest scoring candidate content
-            by_score = sorted([c for c in self.candidates.values()],
-                              key=attrgetter('content_score'), reverse=True)
-            logger.debug(pp.pformat(by_score))
-
-            # since we have several candidates, check the winner's siblings
-            # for extra content
-            winner = by_score[0]
-            logger.debug('Selected winning node: ' + str(winner))
-            updated_winner = check_siblings(winner, self.candidates)
-            logger.debug('Begin final prep of article')
-            updated_winner.node = prep_article(updated_winner.node)
-            if updated_winner.node is not None:
-                doc = build_base_document(updated_winner.node, self.fragment)
-            else:
-                logger.warning('Had candidates but failed to find a cleaned winning doc.')
-                doc = self._handle_no_candidates()
-        else:
-            logger.warning('No candidates found: using document.')
-            logger.debug('Begin final prep of article')
-            doc = self._handle_no_candidates()
+        if not self.candidates:
+            logger.warning("No candidates found in document.")
+            return self._handle_no_candidates()
+
+        # cleanup by removing the should_drop we spotted.
+        drop_nodes_with_parents(self._should_drop)
+
+        # right now we return the highest scoring candidate content
+        best_candidates = sorted((c for c in self.candidates.values()),
+                                 key=attrgetter("content_score"), reverse=True)
+
+        printer = PrettyPrinter(indent=2)
+        logger.debug(printer.pformat(best_candidates))
+
+        # since we have several candidates, check the winner's siblings
+        # for extra content
+        winner = best_candidates[0]
+        updated_winner = check_siblings(winner, self.candidates)
+        logger.debug('Begin final prep of article')
+        updated_winner.node = prep_article(updated_winner.node)
+        if updated_winner.node is not None:
+            doc = build_base_document(updated_winner.node, self.fragment)
+        else:
+            logger.warning('Had candidates but failed to find a cleaned winning doc.')
+            doc = self._handle_no_candidates()
 
         return doc
 
     def _handle_no_candidates(self):
-        """If we fail to find a good candidate we need to find something else."""
+        """
+        If we fail to find a good candidate we need to find something else.
+        """
         # since we've not found a good candidate we should help this
         if self.dom is not None and len(self.dom):
             # cleanup by removing the should_drop we spotted.
             drop_nodes_with_parents(self._should_drop)
 
-            doc = prep_article(self.dom)
-            doc = build_base_document(doc, self.fragment)
+            dom = prep_article(self.dom)
+            return build_base_document(dom, self.fragment)
         else:
-            logger.warning('No document to use.')
-            doc = build_error_document(self.fragment)
-
-        return doc
+            logger.warning("No document to use.")
+            return build_error_document(self.fragment)
 
 
 def leaf_div_elements_into_paragraphs(document):
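Review note: _handle_no_candidates() now returns from each branch instead of threading a doc variable through, which reads better. Worth flagging: build_error_document(self.fragment) passes the fragment flag into the unused dom slot, so return_fragment keeps its default of True on that path. A hedged sketch of the error path, assuming the helper is importable from breadability.readable:

```python
from lxml.etree import tounicode

from breadability.readable import build_error_document

# The first positional argument is the (unused) ``dom``; passing None makes
# the intent explicit in a way the call site above does not.
error_div = build_error_document(None)
print(tounicode(error_div))
# <div id="readabilityBody" class="parsing-error"></div>
```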