Use 'charade' for detecting encoding

pull/21/head
Mišo Belica 11 years ago
parent 544220e9a3
commit 3322681166

@ -4,15 +4,16 @@
from __future__ import absolute_import
import chardet
import re
import charade
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.etree import XMLSyntaxError
from lxml.html import document_fromstring
from lxml.html import HTMLParser
from ._py3k import unicode, to_string
from ._py3k import unicode, to_string, to_bytes
from .logconfig import LOG
from .utils import cached_property
@ -21,7 +22,7 @@ utf8_parser = HTMLParser(encoding='utf-8')
def get_encoding(page):
text = re.sub('</?[^>]*>\s*', ' ', page)
text = re.sub(to_bytes('</?[^>]*>\s*'), to_bytes(' '), page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return enc # can't guess
@ -33,7 +34,7 @@ def get_encoding(page):
return enc
except UnicodeDecodeError:
pass
res = chardet.detect(text)
res = charade.detect(text)
enc = res['encoding']
# print '->', enc, "%.2f" % res['confidence']
if enc == 'MacCyrillic':

@ -1,6 +1,4 @@
chardet
charade
lxml
coverage
nose
pep8
pylint

@ -8,17 +8,13 @@ NEWS = open(os.path.join(here, 'NEWS.txt')).read()
version = '0.1.11'
install_requires = [
# List your project dependencies here.
# For more details, see:
# http://packages.python.org/distribute/setuptools.html#declaring-dependencies
'chardet',
'charade',
'lxml',
]
tests_require = [
'coverage',
'nose',
'pep8',
'pylint',
]

@ -1,4 +1,4 @@
from collections import defaultdict
# -*- coding: utf8 -*-
try:
# Python < 2.7
@ -6,13 +6,13 @@ try:
except ImportError:
import unittest
from collections import defaultdict
from breadability._py3k import to_unicode
from breadability.document import OriginalDocument
from breadability.document import OriginalDocument, get_encoding
from utils import load_snippet
class TestOriginalDocument(unittest.TestCase):
"""Verify we can process html into a document to work off of."""
def test_readin_min_document(self):
@ -48,3 +48,7 @@ class TestOriginalDocument(unittest.TestCase):
"""We convert all <br/> tags to <p> tags"""
doc = OriginalDocument(load_snippet('document_min.html'))
self.assertIsNone(doc.html.find('.//br'))
def test_encoding(self):
text = to_unicode("ľščťžýáíéäúňôůě").encode("iso-8859-2")
encoding = get_encoding(text)

Loading…
Cancel
Save