From e2f3391dc30a2f30fbcda08d80df92d7699a525e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=A1o=20Belica?= Date: Sat, 29 Mar 2014 15:41:23 +0100 Subject: [PATCH] Better decoding page into unicode - Fixes #22 - Fixes #23 Prepare for release --- CHANGELOG.rst | 4 +++ Makefile | 2 +- breadability/document.py | 52 +++++++++++++++++++++++++------------ breadability/utils.py | 12 +++++++++ setup.py | 2 +- tests/test_orig_document.py | 30 ++++++++++++--------- 6 files changed, 71 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8108de8..e64af1a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -3,6 +3,10 @@ Changelog for breadability ========================== +0.1.18 (April 6th 2014) +----------------------- +- Improved decoding of the page into Unicode. + 0.1.17 (Jan 22nd 2014) ---------------------- - More log quieting down to INFO vs WARN diff --git a/Makefile b/Makefile index 45a8ee3..6a237de 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ bin/python: .PHONY: deps deps: venv - pip install -r requirements.txt + $(PIP) install -r requirements.txt .PHONY: clean_venv clean_venv: diff --git a/breadability/document.py b/breadability/document.py index 8c08523..6dbdb8c 100644 --- a/breadability/document.py +++ b/breadability/document.py @@ -23,7 +23,10 @@ from ._compat import ( unicode, unicode_compatible, ) -from .utils import cached_property +from .utils import ( + cached_property, + ignored, +) logger = logging.getLogger("breadability") @@ -31,30 +34,46 @@ logger = logging.getLogger("breadability") TAG_MARK_PATTERN = re.compile(to_bytes(r"]*>\s*")) UTF8_PARSER = HTMLParser(encoding="utf8") +CHARSET_META_TAG_PATTERN = re.compile( + br"""]+charset=["']?([^'"/>\s]+)""", + re.IGNORECASE +) -def determine_encoding(page): - encoding = "utf8" - text = TAG_MARK_PATTERN.sub(to_bytes(" "), page) - - # don't venture to guess - if not text.strip() or len(text) < 10: - return encoding - - # try enforce UTF-8 - diff = text.decode(encoding, 
"ignore").encode(encoding) +def decode_html(html): + """ + Converts bytes stream containing an HTML page into Unicode. + Tries to guess character encoding from meta tag or by "charade" library. + """ + if isinstance(html, unicode): + return html + + match = CHARSET_META_TAG_PATTERN.search(html) + if match: + declared_encoding = match.group(1).decode("ASCII") + # treat an unknown encoding as if it wasn't found at all + with ignored(LookupError): + return html.decode(declared_encoding, "ignore") + + # try to enforce UTF-8 first + with ignored(UnicodeDecodeError): + return html.decode("utf8") + + text = TAG_MARK_PATTERN.sub(to_bytes(" "), html) + diff = text.decode("utf8", "ignore").encode("utf8") sizes = len(diff), len(text) - # 99% of UTF-8 + # 99% of text is UTF-8 if abs(len(text) - len(diff)) < max(sizes) * 0.01: - return encoding + return html.decode("utf8", "ignore") # try detect encoding + encoding = "utf8" encoding_detector = charade.detect(text) if encoding_detector["encoding"]: encoding = encoding_detector["encoding"] - return encoding + return html.decode(encoding, "ignore") BREAK_TAGS_PATTERN = re.compile( @@ -88,7 +107,7 @@ def build_document(html_content, base_href=None): assert html_content is not None if isinstance(html_content, unicode): - html_content = html_content.encode("utf8", "replace") + html_content = html_content.encode("utf8", "xmlcharrefreplace") try: document = document_fromstring(html_content, parser=UTF8_PARSER) @@ -125,8 +144,7 @@ class OriginalDocument(object): """Parsed HTML document from the input.""" html = self._html if not isinstance(html, unicode): - encoding = determine_encoding(html) - html = html.decode(encoding) + html = decode_html(html) html = convert_breaks_to_paragraphs(html) document = build_document(html, self._url) diff --git a/breadability/utils.py b/breadability/utils.py index 7385d9e..70a9778 100644 --- a/breadability/utils.py +++ b/breadability/utils.py @@ -5,6 +5,18 @@ from __future__ import division, 
print_function, unicode_literals import re +try: + from contextlib import ignored +except ImportError: + from contextlib import contextmanager + + @contextmanager + def ignored(*exceptions): + try: + yield + except tuple(exceptions): + pass + MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE) diff --git a/setup.py b/setup.py index 0a938d4..3268c86 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import sys from os.path import abspath, dirname, join from setuptools import setup, find_packages -VERSION = "0.1.17" +VERSION = "0.1.18" VERSION_SUFFIX = "%d.%d" % sys.version_info[:2] CURRENT_DIRECTORY = abspath(dirname(__file__)) diff --git a/tests/test_orig_document.py b/tests/test_orig_document.py index 8d2bcc7..cff46aa 100644 --- a/tests/test_orig_document.py +++ b/tests/test_orig_document.py @@ -4,10 +4,15 @@ from __future__ import absolute_import from __future__ import division, print_function, unicode_literals from collections import defaultdict -from breadability._compat import to_unicode, to_bytes +from breadability._compat import ( + to_unicode, + to_bytes, + unicode, +) + from breadability.document import ( convert_breaks_to_paragraphs, - determine_encoding, + decode_html, OriginalDocument, ) from .compat import unittest @@ -19,7 +24,8 @@ class TestOriginalDocument(unittest.TestCase): def test_convert_br_tags_to_paragraphs(self): returned = convert_breaks_to_paragraphs( - "
HI

How are you?

\t \n
Fine\n I guess
") + ("
HI

How are you?

\t \n
" + "Fine\n I guess
")) self.assertEqual( returned, @@ -69,12 +75,14 @@ class TestOriginalDocument(unittest.TestCase): def test_empty_title(self): """We convert all
tags to

tags""" - document = OriginalDocument("") + document = OriginalDocument( + "") self.assertEqual(document.title, "") def test_title_only_with_tags(self): """We convert all
tags to

tags""" - document = OriginalDocument("<em></em>") + document = OriginalDocument( + "<em></em>") self.assertEqual(document.title, "") def test_no_title(self): @@ -84,13 +92,11 @@ class TestOriginalDocument(unittest.TestCase): def test_encoding(self): text = "ľščťžýáíéäúňôůě".encode("iso-8859-2") - determine_encoding(text) + html = decode_html(text) + self.assertEqual(type(html), unicode) def test_encoding_short(self): - text = "ľščťžýáíé".encode("iso-8859-2") - encoding = determine_encoding(text) - self.assertEqual(encoding, "utf8") - text = to_bytes("ľščťžýáíé") - encoding = determine_encoding(text) - self.assertEqual(encoding, "utf8") + html = decode_html(text) + self.assertEqual(type(html), unicode) + self.assertEqual(html, "ľščťžýáíé")