Refactored file 'document.py'

Removed non-intuitive parts and dead code that was not covered by tests. Gave objects better names. Improved test coverage.
11 years ago · dcb7c18fd5
parent 03ff0be266
commit dcb7c18fd5
2 changed files with 65 additions and 56 deletions
--- a/breadability/document.py
+++ b/breadability/document.py
@ -15,67 +15,60 @@ from ._py3k import unicode, to_string, to_bytes
 from .utils import cached_property


-utf8_parser = HTMLParser(encoding='utf-8')
 logger = logging.getLogger("breadability")


-def get_encoding(page):
-    encoding = 'utf-8'
-    text = re.sub(to_bytes('</?[^>]*>\s*'), to_bytes(' '), page)
+def determine_encoding(page):
+    encoding = "utf8"
+    text = re.sub(to_bytes(r"</?[^>]*>\s*"), to_bytes(" "), page)

-    # don't veture to guess
+    # don't venture to guess
    if not text.strip() or len(text) < 10:
        return encoding

-    try:
-        diff = text.decode(encoding, 'ignore').encode(encoding)
-        sizes = len(diff), len(text)
+    # try to enforce UTF-8
+    diff = text.decode(encoding, "ignore").encode(encoding)
+    sizes = len(diff), len(text)

-        # 99% of utf-8
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
-            return encoding
-    except UnicodeDecodeError:
-        pass
+    # 99% of UTF-8
+    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
+        return encoding

+    # try to detect encoding
    encoding_detector = charade.detect(text)
-    encoding = encoding_detector['encoding']
-
-    if not encoding:
-        encoding = 'utf-8'
-    elif encoding == 'MacCyrillic':
-        encoding = 'cp1251'
+    if encoding_detector["encoding"]:
+        encoding = encoding_detector["encoding"]

    return encoding

+
 MULTIPLE_BR_TAGS_PATTERN = re.compile(r"(?:<br[^>]*>\s*){2,}", re.IGNORECASE)
 def replace_multi_br_to_paragraphs(html):
    """Converts multiple <br> tags into paragraphs."""
-    logger.debug('Replacing multiple <br/> to <p>')
+    logger.debug("Replacing multiple <br/> to <p>")

-    return MULTIPLE_BR_TAGS_PATTERN.sub('</p><p>', html)
+    return MULTIPLE_BR_TAGS_PATTERN.sub("</p><p>", html)


-def build_doc(page):
-    """Requires that the `page` not be None"""
-    if page is None:
-        logger.error("Page content is None, can't build_doc")
-        return ''
+UTF8_PARSER = HTMLParser(encoding="utf8")
+def build_document(html_content, base_href=None):
+    """Requires that the `html_content` not be None"""
+    assert html_content is not None

-    if isinstance(page, unicode):
-        page_unicode = page
-    else:
-        encoding = get_encoding(page)
-        page_unicode = page.decode(encoding, 'replace')
+    if isinstance(html_content, unicode):
+        html_content = html_content.encode("utf8", "replace")

    try:
-        doc = document_fromstring(
-            page_unicode.encode('utf-8', 'replace'),
-            parser=utf8_parser)
-        return doc
+        document = document_fromstring(html_content, parser=UTF8_PARSER)
    except XMLSyntaxError:
-        msg = 'Failed to parse document contents.'
-        logger.exception(msg)
-        raise ValueError(msg)
+        raise ValueError("Failed to parse document contents.")
+
+    if base_href:
+        document.make_links_absolute(base_href, resolve_base_href=True)
+    else:
+        document.resolve_base_href()
+
+    return document


 class OriginalDocument(object):
@ -94,19 +87,11 @@ class OriginalDocument(object):
        return tounicode(self.html)

    def _parse(self, html):
-        """Generate an lxml document from our html."""
+        """Generate an lxml document from html."""
        html = replace_multi_br_to_paragraphs(html)
-        doc = build_doc(html)
-
-        # doc = html_cleaner.clean_html(doc)
-        base_href = self.url
-        if base_href:
-            logger.debug('Making links absolute')
-            doc.make_links_absolute(base_href, resolve_base_href=True)
-        else:
-            doc.resolve_base_href()
+        document = build_document(html, self.url)

-        return doc
+        return document

    @cached_property
    def html(self):
@ -121,8 +106,8 @@ class OriginalDocument(object):
    @cached_property
    def title(self):
        """Pull the title attribute out of the parsed document"""
-        titleElem = self.html.find('.//title')
-        if titleElem is None or titleElem.text is None:
-            return ''
+        title_element = self.html.find(".//title")
+        if title_element is None or title_element.text is None:
+            return ""
        else:
-            return titleElem.text
+            return title_element.text.strip()
--- a/tests/test_orig_document.py
+++ b/tests/test_orig_document.py
@ -9,8 +9,8 @@ except ImportError:
    import unittest

 from collections import defaultdict
-from breadability._py3k import to_unicode
-from breadability.document import OriginalDocument, get_encoding
+from breadability._py3k import to_unicode, to_bytes
+from breadability.document import OriginalDocument, determine_encoding
 from .utils import load_snippet


@ -51,6 +51,30 @@ class TestOriginalDocument(unittest.TestCase):
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertIsNone(doc.html.find('.//br'))

+    def test_empty_title(self):
+        """An empty <title> element yields an empty title"""
+        document = OriginalDocument("<html><head><title></title></head><body></body></html>")
+        self.assertEqual(document.title, "")
+
+    def test_title_only_with_tags(self):
+        """A <title> containing only tags (no text) yields an empty title"""
+        document = OriginalDocument("<html><head><title><em></em></title></head><body></body></html>")
+        self.assertEqual(document.title, "")
+
+    def test_no_title(self):
+        """A document without a <title> element yields an empty title"""
+        document = OriginalDocument("<html><head></head><body></body></html>")
+        self.assertEqual(document.title, "")
+
    def test_encoding(self):
        text = to_unicode("ľščťžýáíéäúňôůě").encode("iso-8859-2")
-        encoding = get_encoding(text)
+        encoding = determine_encoding(text)
+
+    def test_encoding_short(self):
+        text = to_unicode("ľščťžýáíé").encode("iso-8859-2")
+        encoding = determine_encoding(text)
+        self.assertEqual(encoding, "utf8")
+
+        text = to_bytes("ľščťžýáíé")
+        encoding = determine_encoding(text)
+        self.assertEqual(encoding, "utf8")