Better decoding page into unicode

- Fixes #22
- Fixes #23

Prepare for release
pull/26/head
Mišo Belica 10 years ago committed by Richard Harding
parent 5cb028ec93
commit e2f3391dc3

@ -3,6 +3,10 @@
Changelog for breadability
==========================
0.1.18 (April 6th 2014)
-----------------------
- Improved decoding of the page into Unicode.
0.1.17 (Jan 22nd 2014)
----------------------
- More log quieting down to INFO vs WARN

@ -27,7 +27,7 @@ bin/python:
.PHONY: deps
deps: venv
pip install -r requirements.txt
$(PIP) install -r requirements.txt
.PHONY: clean_venv
clean_venv:

@ -23,7 +23,10 @@ from ._compat import (
unicode,
unicode_compatible,
)
from .utils import cached_property
from .utils import (
cached_property,
ignored,
)
logger = logging.getLogger("breadability")
@ -31,30 +34,46 @@ logger = logging.getLogger("breadability")
TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))
UTF8_PARSER = HTMLParser(encoding="utf8")
CHARSET_META_TAG_PATTERN = re.compile(
br"""<meta[^>]+charset=["']?([^'"/>\s]+)""",
re.IGNORECASE
)
def determine_encoding(page):
encoding = "utf8"
text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)
# don't venture to guess
if not text.strip() or len(text) < 10:
return encoding
# try enforce UTF-8
diff = text.decode(encoding, "ignore").encode(encoding)
def decode_html(html):
"""
Converts bytes stream containing an HTML page into Unicode.
Tries to guess the character encoding from the meta tag or with the "charade" library.
"""
if isinstance(html, unicode):
return html
match = CHARSET_META_TAG_PATTERN.search(html)
if match:
declared_encoding = match.group(1).decode("ASCII")
# treat an unknown declared encoding as if none had been declared at all
with ignored(LookupError):
return html.decode(declared_encoding, "ignore")
# try strict UTF-8 first
with ignored(UnicodeDecodeError):
return html.decode("utf8")
text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
diff = text.decode("utf8", "ignore").encode("utf8")
sizes = len(diff), len(text)
# 99% of UTF-8
# 99% of text is UTF-8
if abs(len(text) - len(diff)) < max(sizes) * 0.01:
return encoding
return html.decode("utf8", "ignore")
# try detect encoding
encoding = "utf8"
encoding_detector = charade.detect(text)
if encoding_detector["encoding"]:
encoding = encoding_detector["encoding"]
return encoding
return html.decode(encoding, "ignore")
BREAK_TAGS_PATTERN = re.compile(
@ -88,7 +107,7 @@ def build_document(html_content, base_href=None):
assert html_content is not None
if isinstance(html_content, unicode):
html_content = html_content.encode("utf8", "replace")
html_content = html_content.encode("utf8", "xmlcharrefreplace")
try:
document = document_fromstring(html_content, parser=UTF8_PARSER)
@ -125,8 +144,7 @@ class OriginalDocument(object):
"""Parsed HTML document from the input."""
html = self._html
if not isinstance(html, unicode):
encoding = determine_encoding(html)
html = html.decode(encoding)
html = decode_html(html)
html = convert_breaks_to_paragraphs(html)
document = build_document(html, self._url)

@ -5,6 +5,18 @@ from __future__ import division, print_function, unicode_literals
import re
try:
    # ``contextlib.ignored`` only ever existed in Python 3.4 pre-releases;
    # the released name is ``contextlib.suppress``. Alias it so callers can
    # keep writing ``with ignored(SomeError): ...``.
    from contextlib import suppress as ignored
except ImportError:
    # Interpreters older than 3.4: provide an equivalent context manager.
    from contextlib import contextmanager

    @contextmanager
    def ignored(*exceptions):
        """
        Context manager that silently swallows the given exception types.

        Any exception raised inside the ``with`` body that is an instance
        of one of ``exceptions`` is discarded and execution continues after
        the ``with`` statement; every other exception propagates unchanged.
        """
        try:
            yield
        except exceptions:
            pass
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

@ -3,7 +3,7 @@ import sys
from os.path import abspath, dirname, join
from setuptools import setup, find_packages
VERSION = "0.1.17"
VERSION = "0.1.18"
VERSION_SUFFIX = "%d.%d" % sys.version_info[:2]
CURRENT_DIRECTORY = abspath(dirname(__file__))

@ -4,10 +4,15 @@ from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from collections import defaultdict
from breadability._compat import to_unicode, to_bytes
from breadability._compat import (
to_unicode,
to_bytes,
unicode,
)
from breadability.document import (
convert_breaks_to_paragraphs,
determine_encoding,
decode_html,
OriginalDocument,
)
from .compat import unittest
@ -19,7 +24,8 @@ class TestOriginalDocument(unittest.TestCase):
def test_convert_br_tags_to_paragraphs(self):
returned = convert_breaks_to_paragraphs(
"<div>HI<br><br>How are you?<br><br> \t \n <br>Fine\n I guess</div>")
("<div>HI<br><br>How are you?<br><br> \t \n <br>"
"Fine\n I guess</div>"))
self.assertEqual(
returned,
@ -69,12 +75,14 @@ class TestOriginalDocument(unittest.TestCase):
def test_empty_title(self):
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument("<html><head><title></title></head><body></body></html>")
document = OriginalDocument(
"<html><head><title></title></head><body></body></html>")
self.assertEqual(document.title, "")
def test_title_only_with_tags(self):
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument("<html><head><title><em></em></title></head><body></body></html>")
document = OriginalDocument(
"<html><head><title><em></em></title></head><body></body></html>")
self.assertEqual(document.title, "")
def test_no_title(self):
@ -84,13 +92,11 @@ class TestOriginalDocument(unittest.TestCase):
def test_encoding(self):
text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
determine_encoding(text)
html = decode_html(text)
self.assertEqual(type(html), unicode)
def test_encoding_short(self):
text = "ľščťžýáíé".encode("iso-8859-2")
encoding = determine_encoding(text)
self.assertEqual(encoding, "utf8")
text = to_bytes("ľščťžýáíé")
encoding = determine_encoding(text)
self.assertEqual(encoding, "utf8")
html = decode_html(text)
self.assertEqual(type(html), unicode)
self.assertEqual(html, "ľščťžýáíé")

Loading…
Cancel
Save