From 4e3227521e2291ddf7dd7713f0fbce4b6173c1fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mi=C5=A1o=20Belica?=
Date: Fri, 15 Mar 2013 01:40:41 +0100
Subject: [PATCH] Less code - fewer bugs (I hope)

---
 breadability/readable.py | 233 ++++++++++++++++++---------------------
 1 file changed, 107 insertions(+), 126 deletions(-)

diff --git a/breadability/readable.py b/breadability/readable.py
index d6aec16..1539858 100644
--- a/breadability/readable.py
+++ b/breadability/readable.py
@@ -25,6 +25,7 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        remove_unknown_tags=False, safe_attrs_only=False)
+SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
 
 NULL_DOCUMENT = """
 <html>
     <head>
         <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
@@ -34,34 +35,10 @@ NULL_DOCUMENT = """
     </head>
     <body>
     </body>
 </html>
 """
-SCORABLE_TAGS = ('div', 'p', 'td', 'pre', 'article')
 
 logger = logging.getLogger("breadability")
 
 
-def is_bad_link(node):
-    """
-    Helper to determine if the link is something to clean out
-
-    We've hit articles with many multiple links that should be cleaned out
-    because they're just there to pollute the space. See tests for examples.
-    """
-    if node.tag != 'a':
-        return False
-
-    name = node.get('name')
-    href = node.get('href')
-    if name and not href:
-        return True
-
-    if href:
-        url_bits = href.split('#')
-        if len(url_bits) == 2 and len(url_bits[1]) > 25:
-            return True
-
-    return False
-
-
 def ok_embedded_video(node):
     """Check if this embed/video is an ok one to count."""
     good_keywords = ('youtube', 'blip.tv', 'vimeo')
@@ -74,64 +51,50 @@ def ok_embedded_video(node):
     return False
 
 
-def build_base_document(html, fragment=True):
-    """Return a base document with the body as root.
+def build_base_document(dom, return_fragment=True):
+    """
+    Builds a base document with the body as root.
 
-    :param html: Parsed Element object
-    :param fragment: Should we return a <div> doc fragment or
-        a full doc.
+    :param dom: Parsed lxml tree (Document Object Model).
+    :param bool return_fragment: If True, only the <div> fragment is returned.
+        Otherwise the full HTML document is returned.
     """
-    if html.tag == 'body':
-        html.tag = 'div'
-        found_body = html
+    body_element = dom.find(".//body")
+
+    if body_element is None:
+        fragment = fragment_fromstring('<div id="readabilityBody"/>')
+        fragment.append(dom)
     else:
-        found_body = html.find('.//body')
+        body_element.tag = "div"
+        body_element.set("id", "readabilityBody")
+        fragment = body_element
 
-    if found_body is None:
-        frag = fragment_fromstring('<div/>')
-        frag.set('id', 'readabilityBody')
-        frag.append(html)
+    return document_from_fragment(fragment, return_fragment)
 
-        if not fragment:
-            output = fromstring(NULL_DOCUMENT)
-            insert_point = output.find('.//body')
-            insert_point.append(frag)
-        else:
-            output = frag
-    else:
-        found_body.tag = 'div'
-        found_body.set('id', 'readabilityBody')
 
-        if not fragment:
-            output = fromstring(NULL_DOCUMENT)
-            insert_point = output.find('.//body')
-            insert_point.append(found_body)
-        else:
-            output = found_body
-
-    output.doctype = ""
-    return output
 
+def build_error_document(return_fragment=True):
+    """
+    Builds an empty error document with the body as root.
+
+    :param bool return_fragment: If True, only the <div> fragment is returned.
+        Otherwise the full HTML document is returned.
+    """
+    fragment = fragment_fromstring(
+        '<div id="readabilityBody" class="parsing-error"/>')
+
+    return document_from_fragment(fragment, return_fragment)
 
 
-def build_error_document(html, fragment=True):
-    """Return an empty erorr document with the body as root.
-
-    :param fragment: Should we return a <div> doc fragment or
-        a full doc.
-    """
-    frag = fragment_fromstring('<div/>')
-    frag.set('id', 'readabilityBody')
-    frag.set('class', 'parsing-error')
-
-    if not fragment:
-        output = fromstring(NULL_DOCUMENT)
-        insert_point = output.find('.//body')
-        insert_point.append(frag)
+def document_from_fragment(fragment, return_fragment):
+    if return_fragment:
+        document = fragment
     else:
-        output = frag
+        document = fromstring(NULL_DOCUMENT)
+        body_element = document.find(".//body")
+        body_element.append(fragment)
 
-    output.doctype = ""
-    return output
+    document.doctype = ""
+    return document
 
 
 def check_siblings(candidate_node, candidate_list):
@@ -342,33 +305,55 @@ def prep_article(doc):
     return clean_document(doc)
 
 
-def find_candidates(doc):
-    """Find cadidate nodes for the readable version of the article.
+def find_candidates(document):
+    """
+    Finds candidate nodes for the readable version of the article.
 
-    Here's we're going to remove unlikely nodes, find scores on the rest, and
+    Here we're going to remove unlikely nodes, find scores on the rest,
     clean up and return the final best match.
     """
-    nodes_to_score = []
-    should_remove = []
+    nodes_to_score = set()
+    should_remove = set()
 
-    for node in doc.iter():
+    for node in document.iter():
         if is_unlikely_node(node):
-            logger.debug('We should drop unlikely: ' + str(node))
-            should_remove.append(node)
-            continue
-        if node.tag == 'a' and is_bad_link(node):
-            logger.debug('We should drop bad link: ' + str(node))
-            should_remove.append(node)
-            continue
-        if node.tag in SCORABLE_TAGS and node not in nodes_to_score:
-            nodes_to_score.append(node)
+            logger.debug("We should drop unlikely: %s", str(node))
+            should_remove.add(node)
+        elif is_bad_link(node):
+            logger.debug("We should drop bad link: %s", str(node))
+            should_remove.add(node)
+        elif node.tag in SCORABLE_TAGS:
+            nodes_to_score.add(node)
 
     return score_candidates(nodes_to_score), should_remove
 
 
+def is_bad_link(node):
+    """
+    Helper to determine if the node is a link that is useless.
+
+    We've hit articles with many links that should be cleaned out because
+    they're just there to pollute the space. See tests for examples.
+    """
+    if node.tag != "a":
+        return False
+
+    name = node.get("name")
+    href = node.get("href")
+    if name and not href:
+        return True
+
+    if href:
+        href_parts = href.split("#")
+        if len(href_parts) == 2 and len(href_parts[1]) > 25:
+            return True
+
+    return False
+
+
 class Article(object):
     """Parsed readable object"""
-    _should_drop = []
+    _should_drop = ()
 
     def __init__(self, html, url=None, fragment=True):
         """Create the Article we're going to use.
@@ -401,15 +386,14 @@ class Article(object):
 
     @cached_property
     def candidates(self):
-        """Generate the list of candidates from the doc."""
-        doc = self.dom
-        if doc is not None and len(doc):
-            candidates, should_drop = find_candidates(doc)
-            self._should_drop = should_drop
-            return candidates
-        else:
+        """Generates a list of candidates from the DOM."""
+        dom = self.dom
+        if dom is None or len(dom) == 0:
             return None
 
+        candidates, self._should_drop = find_candidates(dom)
+        return candidates
+
     @cached_property
     def readable(self):
         return tounicode(self.readable_dom)
@@ -420,51 +404,48 @@ def _readable(self):
         """The readable parsed article"""
-        if self.candidates:
-            logger.debug('Candidates found')
-            pp = PrettyPrinter(indent=2)
-
-            # cleanup by removing the should_drop we spotted.
-            drop_nodes_with_parents(self._should_drop)
-
-            # right now we return the highest scoring candidate content
-            by_score = sorted([c for c in self.candidates.values()],
-                              key=attrgetter('content_score'), reverse=True)
-            logger.debug(pp.pformat(by_score))
-
-            # since we have several candidates, check the winner's siblings
-            # for extra content
-            winner = by_score[0]
-            logger.debug('Selected winning node: ' + str(winner))
-            updated_winner = check_siblings(winner, self.candidates)
-            logger.debug('Begin final prep of article')
-            updated_winner.node = prep_article(updated_winner.node)
-            if updated_winner.node is not None:
-                doc = build_base_document(updated_winner.node, self.fragment)
-            else:
-                logger.warning('Had candidates but failed to find a cleaned winning doc.')
-                doc = self._handle_no_candidates()
+        if not self.candidates:
+            logger.warning("No candidates found in document.")
+            return self._handle_no_candidates()
+
+        # cleanup by removing the should_drop we spotted.
+        drop_nodes_with_parents(self._should_drop)
+
+        # right now we return the highest scoring candidate content
+        best_candidates = sorted((c for c in self.candidates.values()),
+                                 key=attrgetter("content_score"), reverse=True)
+
+        printer = PrettyPrinter(indent=2)
+        logger.debug(printer.pformat(best_candidates))
+
+        # since we have several candidates, check the winner's siblings
+        # for extra content
+        winner = best_candidates[0]
+        updated_winner = check_siblings(winner, self.candidates)
+        logger.debug('Begin final prep of article')
+        updated_winner.node = prep_article(updated_winner.node)
+        if updated_winner.node is not None:
+            doc = build_base_document(updated_winner.node, self.fragment)
         else:
-            logger.warning('No candidates found: using document.')
-            logger.debug('Begin final prep of article')
+            logger.warning('Had candidates but failed to find a cleaned winning doc.')
             doc = self._handle_no_candidates()
 
         return doc
 
     def _handle_no_candidates(self):
-        """If we fail to find a good candidate we need to find something else."""
+        """
+        If we fail to find a good candidate we need to find something else.
+        """
         # since we've not found a good candidate we're should help this
         if self.dom is not None and len(self.dom):
             # cleanup by removing the should_drop we spotted.
             drop_nodes_with_parents(self._should_drop)
 
-            doc = prep_article(self.dom)
-            doc = build_base_document(doc, self.fragment)
+            dom = prep_article(self.dom)
+            return build_base_document(dom, self.fragment)
         else:
-            logger.warning('No document to use.')
-            doc = build_error_document(self.fragment)
-
-        return doc
+            logger.warning("No document to use.")
+            return build_error_document(self.fragment)
 
 
 def leaf_div_elements_into_paragraphs(document):
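
Reviewer note (not part of the patch): a minimal sketch of the two paths
through the new document_from_fragment() helper. The sample fragment is made
up for illustration; the full-document path assumes the NULL_DOCUMENT
skeleton defined at the top of the module.

    from lxml.html import fragment_fromstring, tostring
    from breadability.readable import document_from_fragment

    fragment = fragment_fromstring('<div id="readabilityBody"><p>Hi</p></div>')

    # return_fragment=True: the <div> fragment itself is the document root.
    print(tostring(document_from_fragment(fragment, True)))

    # return_fragment=False: the fragment is appended into the <body> of a
    # document built from NULL_DOCUMENT, and the full document is returned.
    print(tostring(document_from_fragment(fragment, False)))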
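
Reviewer note (not part of the patch): is_bad_link() moved next to its only
caller, find_candidates(), which now calls it for every node because the
node.tag != "a" guard lives inside the helper. A quick illustration of what
it flags; the anchor elements are made up for the example.

    from lxml.html import fragment_fromstring
    from breadability.readable import is_bad_link

    # An <a> with a name attribute but no href is flagged as useless.
    assert is_bad_link(fragment_fromstring('<a name="note1">note</a>'))

    # An <a> whose URL fragment is longer than 25 characters is flagged too.
    assert is_bad_link(fragment_fromstring(
        '<a href="/post#comment-0123456789abcdefghijklmnop">link</a>'))

    # Ordinary links and non-anchor elements pass through.
    assert not is_bad_link(fragment_fromstring('<a href="/about">about</a>'))
    assert not is_bad_link(fragment_fromstring('<p>paragraph</p>'))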
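
Reviewer note (not part of the patch): an end-to-end sketch of the public
entry point these changes sit behind. The HTML and URL are illustrative only.

    from breadability.readable import Article

    html = (
        "<html><body>"
        "<div id='post'>"
        "<p>A long enough paragraph of real article text to score.</p>"
        "<p>Another paragraph of article text for the scorer.</p>"
        "</div>"
        "</body></html>"
    )

    # fragment=True (the default) yields just the <div id="readabilityBody">
    # fragment; fragment=False wraps it in the NULL_DOCUMENT skeleton.
    article = Article(html, url="http://example.com/post", fragment=True)
    print(article.readable)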