Better decoding page into unicode

- Fixes #22 - Fixes #23 Prepare for release
10 years ago · e2f3391dc3
parent 5cb028ec93
commit e2f3391dc3
6 changed files with 71 additions and 31 deletions
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -3,6 +3,10 @@
 Changelog for breadability
 ==========================

+0.1.18 (April 6th 2014)
+-----------------------
+- Improved decoding of the page into Unicode.
+
 0.1.17 (Jan 22nd 2014)
 ----------------------
 - More log quieting down to INFO vs WARN
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ bin/python:

 .PHONY: deps
 deps: venv
-	pip install -r requirements.txt
+	$(PIP) install -r requirements.txt

 .PHONY: clean_venv
 clean_venv:
--- a/breadability/document.py
+++ b/breadability/document.py
@@ -23,7 +23,10 @@ from ._compat import (
    unicode,
    unicode_compatible,
 )
-from .utils import cached_property
+from .utils import (
+    cached_property,
+    ignored,
+)


 logger = logging.getLogger("breadability")
@@ -31,30 +34,46 @@ logger = logging.getLogger("breadability")

 TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
 UTF8_PARSER = HTMLParser(encoding="utf8")
+CHARSET_META_TAG_PATTERN = re.compile(
+    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""",
+    re.IGNORECASE
+)


-def determine_encoding(page):
-    encoding = "utf8"
-    text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
-
-    # don't venture to guess
-    if not text.strip() or len(text) < 10:
-        return encoding
-
-    # try enforce UTF-8
-    diff = text.decode(encoding, "ignore").encode(encoding)
+def decode_html(html):
+    """
+    Converts bytes stream containing an HTML page into Unicode.
+    Tries to guess character encoding from meta tag or by "charade" library.
+    """
+    if isinstance(html, unicode):
+        return html
+
+    match = CHARSET_META_TAG_PATTERN.search(html)
+    if match:
+        declared_encoding = match.group(1).decode("ASCII")
+        # treat an unknown encoding as if none was declared at all
+        with ignored(LookupError):
+            return html.decode(declared_encoding, "ignore")
+
+    # try strict UTF-8 decoding first
+    with ignored(UnicodeDecodeError):
+        return html.decode("utf8")
+
+    text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
+    diff = text.decode("utf8", "ignore").encode("utf8")
    sizes = len(diff), len(text)

-    # 99% of UTF-8
+    # 99% of text is UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
-        return encoding
+        return html.decode("utf8", "ignore")

    # try detect encoding
+    encoding = "utf8"
    encoding_detector = charade.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]

-    return encoding
+    return html.decode(encoding, "ignore")


 BREAK_TAGS_PATTERN = re.compile(
@@ -88,7 +107,7 @@ def build_document(html_content, base_href=None):
    assert html_content is not None

    if isinstance(html_content, unicode):
-        html_content = html_content.encode("utf8", "replace")
+        html_content = html_content.encode("utf8", "xmlcharrefreplace")

    try:
        document = document_fromstring(html_content, parser=UTF8_PARSER)
@@ -125,8 +144,7 @@ class OriginalDocument(object):
        """Parsed HTML document from the input."""
        html = self._html
        if not isinstance(html, unicode):
-            encoding = determine_encoding(html)
-            html = html.decode(encoding)
+            html = decode_html(html)

        html = convert_breaks_to_paragraphs(html)
        document = build_document(html, self._url)
--- a/breadability/utils.py
+++ b/breadability/utils.py
@@ -5,6 +5,18 @@ from __future__ import division, print_function, unicode_literals

 import re

+try:
+    from contextlib import ignored
+except ImportError:
+    from contextlib import contextmanager
+
+    @contextmanager
+    def ignored(*exceptions):
+        try:
+            yield
+        except tuple(exceptions):
+            pass
+

 MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ import sys
 from os.path import abspath, dirname, join
 from setuptools import setup, find_packages

-VERSION = "0.1.17"
+VERSION = "0.1.18"

 VERSION_SUFFIX = "%d.%d" % sys.version_info[:2]
 CURRENT_DIRECTORY = abspath(dirname(__file__))
--- a/tests/test_orig_document.py
+++ b/tests/test_orig_document.py
@@ -4,10 +4,15 @@ from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals

 from collections import defaultdict
-from breadability._compat import to_unicode, to_bytes
+from breadability._compat import (
+    to_unicode,
+    to_bytes,
+    unicode,
+)
+
 from breadability.document import (
    convert_breaks_to_paragraphs,
-    determine_encoding,
+    decode_html,
    OriginalDocument,
 )
 from .compat import unittest
@@ -19,7 +24,8 @@ class TestOriginalDocument(unittest.TestCase):

    def test_convert_br_tags_to_paragraphs(self):
        returned = convert_breaks_to_paragraphs(
-            "<div>HI<br><br>How are you?<br><br> \t \n  <br>Fine\n I guess</div>")
+            ("<div>HI<br><br>How are you?<br><br> \t \n  <br>"
+             "Fine\n I guess</div>"))

        self.assertEqual(
            returned,
@@ -69,12 +75,14 @@ class TestOriginalDocument(unittest.TestCase):

    def test_empty_title(self):
        """We convert all <br/> tags to <p> tags"""
-        document = OriginalDocument("<html><head><title></title></head><body></body></html>")
+        document = OriginalDocument(
+            "<html><head><title></title></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_title_only_with_tags(self):
        """We convert all <br/> tags to <p> tags"""
-        document = OriginalDocument("<html><head><title><em></em></title></head><body></body></html>")
+        document = OriginalDocument(
+            "<html><head><title><em></em></title></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_no_title(self):
@@ -84,13 +92,11 @@ class TestOriginalDocument(unittest.TestCase):

    def test_encoding(self):
        text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
-        determine_encoding(text)
+        html = decode_html(text)
+        self.assertEqual(type(html), unicode)

    def test_encoding_short(self):
-        text = "ľščťžýáíé".encode("iso-8859-2")
-        encoding = determine_encoding(text)
-        self.assertEqual(encoding, "utf8")
-
        text = to_bytes("ľščťžýáíé")
-        encoding = determine_encoding(text)
-        self.assertEqual(encoding, "utf8")
+        html = decode_html(text)
+        self.assertEqual(type(html), unicode)
+        self.assertEqual(html, "ľščťžýáíé")