Add better handling around xml parsing issues

- Fixes #9 with empty/non-parsable docs
- Fixes #8 and removes kwargs for the decode statements.
- Fixes #7 by checking if the node has a parent before dropping.
pull/11/head
Richard Harding 12 years ago
parent fe9364295f
commit 3984e04668

@ -6,6 +6,16 @@
News
====
0.1.8
------
* Release date: Aug 27th 2012*
* Add code/tests for an empty document.
* Fixes #9 to handle xml parsing issues.
0.1.7
------

@ -6,7 +6,7 @@ README = open(os.path.join(here, 'README.rst')).read()
NEWS = open(os.path.join(here, 'NEWS.txt')).read()
version = '0.1.7'
version = '0.1.8'
install_requires = [
# List your project dependencies here.
# For more details, see:

@ -1,3 +1,3 @@
VERSION = '0.1.7'
VERSION = '0.1.8'
import client
from scripts import newtest

@ -4,6 +4,7 @@ import chardet
import re
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.etree import XMLSyntaxError
from lxml.html import document_fromstring
from lxml.html import HTMLParser
@ -54,10 +55,14 @@ def build_doc(page):
else:
enc = get_encoding(page)
page_unicode = page.decode(enc, 'replace')
doc = document_fromstring(
page_unicode.encode('utf-8', 'replace'),
parser=utf8_parser)
return doc
try:
doc = document_fromstring(
page_unicode.encode('utf-8', 'replace'),
parser=utf8_parser)
return doc
except XMLSyntaxError, exc:
LOG.error('Failed to parse: ' + str(exc))
raise ValueError('Failed to parse document contents.')
class OriginalDocument(object):
@ -80,6 +85,7 @@ class OriginalDocument(object):
"""Generate an lxml document from our html."""
html = replace_multi_br_to_paragraphs(html)
doc = build_doc(html)
# doc = html_cleaner.clean_html(doc)
base_href = self.url
if base_href:

@ -11,7 +11,6 @@ from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import generate_hash_id
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
@ -124,6 +123,28 @@ def build_base_document(html, fragment=True):
return output
def build_error_document(html, fragment=True):
    """Return an empty error document with the body as root.

    Used as a fallback "readable" result when the source html could not
    be parsed at all: the returned node carries the ``parsing-error``
    class so callers/tests can detect the failure.

    :param html: currently ignored; accepted only so the signature
        mirrors ``build_base_document``.
        NOTE(review): ``Article.readable`` calls
        ``build_error_document(self.fragment)``, so the fragment flag
        lands in ``html`` and ``fragment`` keeps its default — verify
        that call site is intentional.
    :param fragment: Should we return a <div> doc fragment or a full
        <html> doc.
    """
    frag = fragment_fromstring('<div/>')
    frag.set('id', 'readabilityBody')
    frag.set('class', 'parsing-error')

    if not fragment:
        output = fromstring(BASE_DOC)
        insert_point = output.find('.//body')
        insert_point.append(frag)
    else:
        output = frag

    # NOTE(review): the diff rendering stripped indentation; this line is
    # assumed to run for both branches — confirm against the repository.
    output.doctype = "<!DOCTYPE html>"
    return output
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@ -209,6 +230,9 @@ def check_siblings(candidate_node, candidate_list):
def clean_document(node):
"""Clean up the final document we return as the readable article"""
if node is None or len(node) == 0:
return
LNODE.log(node, 2, "Processing doc")
clean_list = ['object', 'h1']
to_drop = []
@ -383,6 +407,7 @@ def find_candidates(doc):
class Article(object):
"""Parsed readable object"""
_should_drop = []
def __init__(self, html, url=None, fragment=True):
"""Create the Article we're going to use.
@ -406,20 +431,26 @@ class Article(object):
@cached_property(ttl=600)
def doc(self):
"""The doc is the parsed xml tree of the given html."""
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
return doc
try:
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
return doc
except ValueError:
return None
@cached_property(ttl=600)
def candidates(self):
"""Generate the list of candidates from the doc."""
doc = self.doc
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
return candidates
if doc is not None and len(doc):
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
return candidates
else:
return None
@cached_property(ttl=600)
def readable(self):
@ -433,7 +464,8 @@ class Article(object):
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
[n.drop_tree() for n in self._should_drop
if n.getparent() is not None]
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
@ -452,9 +484,13 @@ class Article(object):
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
# since we've not found a good candidate we should help this
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
if self.doc is not None and len(self.doc):
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
else:
LOG.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc

@ -62,6 +62,14 @@ class TestReadableDocument(TestCase):
self.assertEqual(doc._readable.get('id'), 'readabilityBody')
def test_no_content(self):
    """An empty source string must yield the stub parsing-error doc."""
    article = Article('')
    readable = article._readable
    self.assertEqual(readable.tag, 'div')
    self.assertEqual(readable.get('id'), 'readabilityBody')
    self.assertEqual(readable.get('class'), 'parsing-error')
class TestCleaning(TestCase):
"""Test out our cleaning processing we do."""

Loading…
Cancel
Save