From e2f3391dc30a2f30fbcda08d80df92d7699a525e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mi=C5=A1o=20Belica?= <miso.belica@gmail.com>
Date: Sat, 29 Mar 2014 15:41:23 +0100
Subject: [PATCH] Better decoding page into unicode

- Fixes #22
- Fixes #23

Prepare for release
---
 CHANGELOG.rst               |  4 +++
 Makefile                    |  2 +-
 breadability/document.py    | 52 +++++++++++++++++++++++++------------
 breadability/utils.py       | 12 +++++++++
 setup.py                    |  2 +-
 tests/test_orig_document.py | 30 ++++++++++++---------
 6 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 8108de8..e64af1a 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -3,6 +3,10 @@
 Changelog for breadability
 ==========================
 
+0.1.18 (April 6th 2014)
+-----------------------
+- Improved decoding of the page into Unicode.
+
 0.1.17 (Jan 22nd 2014)
 ----------------------
 - More log quieting down to INFO vs WARN
diff --git a/Makefile b/Makefile
index 45a8ee3..6a237de 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ bin/python:
 
 .PHONY: deps
 deps: venv
-	pip install -r requirements.txt
+	$(PIP) install -r requirements.txt
 
 .PHONY: clean_venv
 clean_venv:
diff --git a/breadability/document.py b/breadability/document.py
index 8c08523..6dbdb8c 100644
--- a/breadability/document.py
+++ b/breadability/document.py
@@ -23,7 +23,10 @@ from ._compat import (
     unicode,
     unicode_compatible,
 )
-from .utils import cached_property
+from .utils import (
+    cached_property,
+    ignored,
+)
 
 
 logger = logging.getLogger("breadability")
@@ -31,30 +34,46 @@ logger = logging.getLogger("breadability")
 
 TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
 UTF8_PARSER = HTMLParser(encoding="utf8")
+CHARSET_META_TAG_PATTERN = re.compile(
+    br"""<meta[^>]+charset=["']?([^'"/>\s]+)""",
+    re.IGNORECASE
+)
 
 
-def determine_encoding(page):
-    encoding = "utf8"
-    text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
-
-    # don't venture to guess
-    if not text.strip() or len(text) < 10:
-        return encoding
-
-    # try enforce UTF-8
-    diff = text.decode(encoding, "ignore").encode(encoding)
+def decode_html(html):
+    """
+    Converts bytes stream containing an HTML page into Unicode.
+    Tries to guess character encoding from meta tag or by "charade" library.
+    """
+    if isinstance(html, unicode):
+        return html
+
+    match = CHARSET_META_TAG_PATTERN.search(html)
+    if match:
+        declared_encoding = match.group(1).decode("ASCII")
+        # treat an unknown encoding as if it wasn't declared at all
+        with ignored(LookupError):
+            return html.decode(declared_encoding, "ignore")
+
+    # try strict UTF-8 decoding first
+    with ignored(UnicodeDecodeError):
+        return html.decode("utf8")
+
+    text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
+    diff = text.decode("utf8", "ignore").encode("utf8")
     sizes = len(diff), len(text)
 
-    # 99% of UTF-8
+    # 99% of text is UTF-8
     if abs(len(text) - len(diff)) < max(sizes) * 0.01:
-        return encoding
+        return html.decode("utf8", "ignore")
 
     # try detect encoding
+    encoding = "utf8"
     encoding_detector = charade.detect(text)
     if encoding_detector["encoding"]:
         encoding = encoding_detector["encoding"]
 
-    return encoding
+    return html.decode(encoding, "ignore")
 
 
 BREAK_TAGS_PATTERN = re.compile(
@@ -88,7 +107,7 @@ def build_document(html_content, base_href=None):
     assert html_content is not None
 
     if isinstance(html_content, unicode):
-        html_content = html_content.encode("utf8", "replace")
+        html_content = html_content.encode("utf8", "xmlcharrefreplace")
 
     try:
         document = document_fromstring(html_content, parser=UTF8_PARSER)
@@ -125,8 +144,7 @@ class OriginalDocument(object):
         """Parsed HTML document from the input."""
         html = self._html
         if not isinstance(html, unicode):
-            encoding = determine_encoding(html)
-            html = html.decode(encoding)
+            html = decode_html(html)
 
         html = convert_breaks_to_paragraphs(html)
         document = build_document(html, self._url)
diff --git a/breadability/utils.py b/breadability/utils.py
index 7385d9e..70a9778 100644
--- a/breadability/utils.py
+++ b/breadability/utils.py
@@ -5,6 +5,18 @@ from __future__ import division, print_function, unicode_literals
 
 import re
 
+try:
+    from contextlib import ignored
+except ImportError:
+    from contextlib import contextmanager
+
+    @contextmanager
+    def ignored(*exceptions):
+        try:
+            yield
+        except tuple(exceptions):
+            pass
+
 
 MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
 
diff --git a/setup.py b/setup.py
index 0a938d4..3268c86 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ import sys
 from os.path import abspath, dirname, join
 from setuptools import setup, find_packages
 
-VERSION = "0.1.17"
+VERSION = "0.1.18"
 
 VERSION_SUFFIX = "%d.%d" % sys.version_info[:2]
 CURRENT_DIRECTORY = abspath(dirname(__file__))
diff --git a/tests/test_orig_document.py b/tests/test_orig_document.py
index 8d2bcc7..cff46aa 100644
--- a/tests/test_orig_document.py
+++ b/tests/test_orig_document.py
@@ -4,10 +4,15 @@ from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals
 
 from collections import defaultdict
-from breadability._compat import to_unicode, to_bytes
+from breadability._compat import (
+    to_unicode,
+    to_bytes,
+    unicode,
+)
+
 from breadability.document import (
     convert_breaks_to_paragraphs,
-    determine_encoding,
+    decode_html,
     OriginalDocument,
 )
 from .compat import unittest
@@ -19,7 +24,8 @@ class TestOriginalDocument(unittest.TestCase):
 
     def test_convert_br_tags_to_paragraphs(self):
         returned = convert_breaks_to_paragraphs(
-            "<div>HI<br><br>How are you?<br><br> \t \n  <br>Fine\n I guess</div>")
+            ("<div>HI<br><br>How are you?<br><br> \t \n  <br>"
+             "Fine\n I guess</div>"))
 
         self.assertEqual(
             returned,
@@ -69,12 +75,14 @@ class TestOriginalDocument(unittest.TestCase):
 
     def test_empty_title(self):
         """We convert all <br/> tags to <p> tags"""
-        document = OriginalDocument("<html><head><title></title></head><body></body></html>")
+        document = OriginalDocument(
+            "<html><head><title></title></head><body></body></html>")
         self.assertEqual(document.title, "")
 
     def test_title_only_with_tags(self):
         """We convert all <br/> tags to <p> tags"""
-        document = OriginalDocument("<html><head><title><em></em></title></head><body></body></html>")
+        document = OriginalDocument(
+            "<html><head><title><em></em></title></head><body></body></html>")
         self.assertEqual(document.title, "")
 
     def test_no_title(self):
@@ -84,13 +92,11 @@ class TestOriginalDocument(unittest.TestCase):
 
     def test_encoding(self):
         text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
-        determine_encoding(text)
+        html = decode_html(text)
+        self.assertEqual(type(html), unicode)
 
     def test_encoding_short(self):
-        text = "ľščťžýáíé".encode("iso-8859-2")
-        encoding = determine_encoding(text)
-        self.assertEqual(encoding, "utf8")
-
         text = to_bytes("ľščťžýáíé")
-        encoding = determine_encoding(text)
-        self.assertEqual(encoding, "utf8")
+        html = decode_html(text)
+        self.assertEqual(type(html), unicode)
+        self.assertEqual(html, "ľščťžýáíé")