pull/24/merge
Mišo Belica 10 years ago
commit 2b59b34f39

@ -3,6 +3,10 @@
Changelog for breadability
==========================
0.1.18 (?? ?? 2014)
----------------------
- Improved decoding of the page into Unicode.
0.1.17 (Jan 22nd 2014)
----------------------
- More log quieting down to INFO vs WARN

@ -23,7 +23,7 @@ from ._compat import (
unicode,
unicode_compatible,
)
from .utils import cached_property
from .utils import cached_property, ignored
logger = logging.getLogger("breadability")
@ -31,30 +31,46 @@ logger = logging.getLogger("breadability")
# Pattern matching one HTML tag (opening or closing, from "<" through ">")
# together with any whitespace that directly follows it.
# NOTE(review): presumably to_bytes() turns the pattern into bytes so it can
# be applied to undecoded pages -- confirm against ._compat.
TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
# Parser instance created once at import time with encoding="utf8".
UTF8_PARSER = HTMLParser(encoding="utf8")
# Case-insensitive bytes pattern that finds "charset=" inside a "<meta ...>"
# tag. Group 1 is the charset value: an optional opening quote is skipped and
# the value runs until the next quote, "/", ">" or whitespace character.
CHARSET_META_TAG_PATTERN = re.compile(
br"""<meta[^>]+charset=["']?([^'"/>\s]+)""",
re.IGNORECASE
)
# Guess the character encoding of a raw (bytes) HTML page.
# NOTE(review): only the leading statements of this function are visible in
# this fragment; what happens to `diff` afterwards is not shown here --
# confirm against the full source before relying on this description.
def determine_encoding(page):
# default answer, returned when no guess is attempted
encoding = "utf8"
# replace every tag with a single space so only the text content is examined
text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
# don't venture to guess: blank or very short (< 10 bytes) text keeps the default
if not text.strip() or len(text) < 10:
return encoding
# try enforce UTF-8: round-trip the text through UTF-8, silently dropping
# bytes that do not decode; `diff` is what survives the round trip
diff = text.decode(encoding, "ignore").encode(encoding)
def decode_html(html):
    """
    Convert a byte string containing an HTML page into Unicode.

    The encoding is chosen in this order:

    1. the charset declared in a ``<meta>`` tag, if Python knows that codec;
    2. strict UTF-8;
    3. lenient UTF-8, when at least 99% of the tag-stripped text survives
       a UTF-8 round trip;
    4. whatever the "charade" library detects, falling back to UTF-8.

    Undecodable bytes are dropped (``"ignore"``) in every lenient step, so
    the function always returns a Unicode string and never raises because
    of the page content.

    :param html: Page as bytes; a Unicode string is returned unchanged.
    :returns: The page decoded into a Unicode string.
    """
    if isinstance(html, unicode):
        return html

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        # LookupError: the declared codec is unknown to Python.
        # UnicodeDecodeError: the declared name itself is not plain ASCII.
        # In both cases proceed as if no charset had been declared at all.
        with ignored(LookupError, UnicodeDecodeError):
            declared_encoding = match.group(1).decode("ASCII")
            return html.decode(declared_encoding, "ignore")

    # try to enforce UTF-8 first; a strict decode either succeeds completely
    # or raises, in which case we fall through to the heuristics below
    with ignored(UnicodeDecodeError):
        return html.decode("utf8")

    # measure only the text content: every tag is replaced by one space
    text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
    # bytes that survive a lenient UTF-8 round trip
    diff = text.decode("utf8", "ignore").encode("utf8")
    sizes = len(diff), len(text)

    # 99% of text is UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return html.decode("utf8", "ignore")

    # try to detect the encoding; the detector may report no encoding at all
    encoding = charade.detect(text)["encoding"] or "utf8"
    # the detector can name a codec that Python does not provide
    with ignored(LookupError):
        return html.decode(encoding, "ignore")

    return html.decode("utf8", "ignore")
BREAK_TAGS_PATTERN = re.compile(
@ -88,7 +104,7 @@ def build_document(html_content, base_href=None):
assert html_content is not None
if isinstance(html_content, unicode):
html_content = html_content.encode("utf8", "replace")
html_content = html_content.encode("utf8", "xmlcharrefreplace")
try:
document = document_fromstring(html_content, parser=UTF8_PARSER)
@ -125,8 +141,7 @@ class OriginalDocument(object):
"""Parsed HTML document from the input."""
html = self._html
if not isinstance(html, unicode):
encoding = determine_encoding(html)
html = html.decode(encoding)
html = decode_html(html)
html = convert_breaks_to_paragraphs(html)
document = build_document(html, self._url)

@ -5,6 +5,18 @@ from __future__ import division, print_function, unicode_literals
import re
try:
    # ``contextlib.ignored`` only existed in pre-release builds of
    # Python 3.4; the released name is ``contextlib.suppress``.
    from contextlib import suppress as ignored
except ImportError:
    from contextlib import contextmanager

    @contextmanager
    def ignored(*exceptions):
        """
        Context manager that silently swallows the given exception types.

        Any exception raised inside the ``with`` body that is an instance
        of one of ``exceptions`` ends the body and is discarded; every
        other exception propagates unchanged.
        """
        try:
            yield
        except exceptions:
            pass
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

@ -4,10 +4,10 @@ from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from collections import defaultdict
from breadability._compat import to_unicode, to_bytes
from breadability._compat import to_unicode, to_bytes, unicode
from breadability.document import (
convert_breaks_to_paragraphs,
determine_encoding,
decode_html,
OriginalDocument,
)
from .compat import unittest
@ -84,13 +84,11 @@ class TestOriginalDocument(unittest.TestCase):
def test_encoding(self):
    """Non-UTF-8 bytes without a declared charset still decode to Unicode."""
    text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
    # Only the result type is checked: the exact characters depend on
    # which encoding the detector settles on for this short sample.
    html = decode_html(text)
    self.assertEqual(type(html), unicode)
def test_encoding_short(self):
    """A short UTF-8 byte string is decoded back to the original text."""
    text = to_bytes("ľščťžýáíé")
    html = decode_html(text)
    self.assertEqual(type(html), unicode)
    self.assertEqual(html, "ľščťžýáíé")

Loading…
Cancel
Save