Add better handling around xml parsing issues

- Fixes #9 with empty/non-parsable docs
- Fixes #8 and removes kwargs for the decode statements.
- Fixes #7 by checking if the node has a parent before dropping.
pull/11/head
Richard Harding 12 years ago
parent fe9364295f
commit 3984e04668

@ -6,6 +6,16 @@
News
====
0.1.8
------
* Release date: Aug 27th 2012*
* Add code/tests for an empty document.
* Fixes #9 to handle xml parsing issues.
0.1.7
------

@ -6,7 +6,7 @@ README = open(os.path.join(here, 'README.rst')).read()
NEWS = open(os.path.join(here, 'NEWS.txt')).read()
version = '0.1.7'
version = '0.1.8'
install_requires = [
# List your project dependencies here.
# For more details, see:

@ -1,3 +1,3 @@
VERSION = '0.1.7'
VERSION = '0.1.8'
import client
from scripts import newtest

@ -4,6 +4,7 @@ import chardet
import re
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.etree import XMLSyntaxError
from lxml.html import document_fromstring
from lxml.html import HTMLParser
@ -54,10 +55,14 @@ def build_doc(page):
else:
enc = get_encoding(page)
page_unicode = page.decode(enc, 'replace')
doc = document_fromstring(
page_unicode.encode('utf-8', 'replace'),
parser=utf8_parser)
return doc
try:
doc = document_fromstring(
page_unicode.encode('utf-8', 'replace'),
parser=utf8_parser)
return doc
except XMLSyntaxError, exc:
LOG.error('Failed to parse: ' + str(exc))
raise ValueError('Failed to parse document contents.')
class OriginalDocument(object):
@ -80,6 +85,7 @@ class OriginalDocument(object):
"""Generate an lxml document from our html."""
html = replace_multi_br_to_paragraphs(html)
doc = build_doc(html)
# doc = html_cleaner.clean_html(doc)
base_href = self.url
if base_href:

@ -11,7 +11,6 @@ from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import generate_hash_id
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
@ -124,6 +123,28 @@ def build_base_document(html, fragment=True):
return output
def build_error_document(html, fragment=True):
    """Return an empty error document with the body as root.

    Used as a fallback "readable" result when the source html could not
    be parsed at all: the returned node carries the ``parsing-error``
    class so callers/tests can detect the failure.

    :param html: currently ignored; accepted only so the signature
        mirrors ``build_base_document``.
        NOTE(review): ``Article.readable`` calls
        ``build_error_document(self.fragment)``, so the fragment flag
        lands in ``html`` and ``fragment`` keeps its default — verify
        that call site is intentional.
    :param fragment: Should we return a <div> doc fragment or a full
        <html> doc.
    """
    frag = fragment_fromstring('<div/>')
    frag.set('id', 'readabilityBody')
    frag.set('class', 'parsing-error')

    if not fragment:
        output = fromstring(BASE_DOC)
        insert_point = output.find('.//body')
        insert_point.append(frag)
    else:
        output = frag

    # NOTE(review): the diff rendering stripped indentation; this line is
    # assumed to run for both branches — confirm against the repository.
    output.doctype = "<!DOCTYPE html>"
    return output
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@ -209,6 +230,9 @@ def check_siblings(candidate_node, candidate_list):
def clean_document(node):
"""Clean up the final document we return as the readable article"""
if node is None or len(node) == 0:
return
LNODE.log(node, 2, "Processing doc")
clean_list = ['object', 'h1']
to_drop = []
@ -383,6 +407,7 @@ def find_candidates(doc):
class Article(object):
"""Parsed readable object"""
_should_drop = []
def __init__(self, html, url=None, fragment=True):
"""Create the Article we're going to use.
@ -406,20 +431,26 @@ class Article(object):
@cached_property(ttl=600)
def doc(self):
"""The doc is the parsed xml tree of the given html."""
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
return doc
try:
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
return doc
except ValueError:
return None
@cached_property(ttl=600)
def candidates(self):
"""Generate the list of candidates from the doc."""
doc = self.doc
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
return candidates
if doc is not None and len(doc):
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
return candidates
else:
return None
@cached_property(ttl=600)
def readable(self):
@ -433,7 +464,8 @@ class Article(object):
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
[n.drop_tree() for n in self._should_drop
if n.getparent() is not None]
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
@ -452,9 +484,13 @@ class Article(object):
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
# since we've not found a good candidate we should help this
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
if self.doc is not None and len(self.doc):
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
else:
LOG.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc

@ -62,6 +62,14 @@ class TestReadableDocument(TestCase):
self.assertEqual(doc._readable.get('id'), 'readabilityBody')
def test_no_content(self):
    """An empty source string must yield the stub parsing-error doc."""
    article = Article('')
    readable = article._readable
    self.assertEqual(readable.tag, 'div')
    self.assertEqual(readable.get('id'), 'readabilityBody')
    self.assertEqual(readable.get('class'), 'parsing-error')
class TestCleaning(TestCase):
"""Test out our cleaning processing we do."""

Loading…
Cancel
Save