Use 'charade' for detecting encoding

11 years ago · 3322681166
parent 544220e9a3
commit 3322681166
4 changed files with 14 additions and 15 deletions
--- a/breadability/document.py
+++ b/breadability/document.py
@@ -4,15 +4,16 @@

 from __future__ import absolute_import

-import chardet
 import re
+import charade
+
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.etree import XMLSyntaxError
 from lxml.html import document_fromstring
 from lxml.html import HTMLParser

-from ._py3k import unicode, to_string
+from ._py3k import unicode, to_string, to_bytes
 from .logconfig import LOG
 from .utils import cached_property

@@ -21,7 +22,7 @@ utf8_parser = HTMLParser(encoding='utf-8')


 def get_encoding(page):
-    text = re.sub('</?[^>]*>\s*', ' ', page)
+    text = re.sub(to_bytes('</?[^>]*>\s*'), to_bytes(' '), page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # can't guess
@@ -33,7 +34,7 @@ def get_encoding(page):
            return enc
    except UnicodeDecodeError:
        pass
-    res = chardet.detect(text)
+    res = charade.detect(text)
    enc = res['encoding']
    # print '->', enc, "%.2f" % res['confidence']
    if enc == 'MacCyrillic':
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,4 @@
-chardet
+charade
 lxml
 coverage
 nose
-pep8
-pylint
--- a/setup.py
+++ b/setup.py
@@ -8,17 +8,13 @@ NEWS = open(os.path.join(here, 'NEWS.txt')).read()

 version = '0.1.11'
 install_requires = [
-    # List your project dependencies here.
-    # For more details, see:
    # http://packages.python.org/distribute/setuptools.html#declaring-dependencies
-    'chardet',
+    'charade',
    'lxml',
 ]
 tests_require = [
    'coverage',
    'nose',
-    'pep8',
-    'pylint',
 ]


--- a/tests/test_orig_document.py
+++ b/tests/test_orig_document.py
@@ -1,4 +1,4 @@
-from collections import defaultdict
+# -*- coding: utf8 -*-

 try:
    # Python < 2.7
@@ -6,13 +6,13 @@ try:
 except ImportError:
    import unittest

+from collections import defaultdict
 from breadability._py3k import to_unicode
-from breadability.document import OriginalDocument
+from breadability.document import OriginalDocument, get_encoding
 from utils import load_snippet


 class TestOriginalDocument(unittest.TestCase):
-
    """Verify we can process html into a document to work off of."""

    def test_readin_min_document(self):
@@ -48,3 +48,7 @@ class TestOriginalDocument(unittest.TestCase):
        """We convert all <br/> tags to <p> tags"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertIsNone(doc.html.find('.//br'))
+
+    def test_encoding(self):
+        text = to_unicode("ľščťžýáíéäúňôůě").encode("iso-8859-2")
+        encoding = get_encoding(text)