Fixes checking of declared encodings in get_encoding.

In Python 3, .decode() on bytes requires the name of the encoding to be a str, which means we have to convert the extracted encoding before we can use it.
10 years ago · 386e48d29b
parent 046d2c10c3
commit 386e48d29b
1 changed files with 13 additions and 7 deletions
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,5 +1,6 @@
 import re
 import chardet
 import sys
 def get_encoding(page):
    # Regex for XML and HTML Meta charset declaration
@@ -12,13 +13,18 @@ def get_encoding(page):
            xml_re.findall(page))
    # Try any declared encodings
-    if len(declared_encodings) > 0:
+    for declared_encoding in declared_encodings:
-        for declared_encoding in declared_encodings:
+        try:
-            try:
+            if sys.version_info[0] == 3:
-                page.decode(custom_decode(declared_encoding))
+                # declared_encoding will actually be bytes but .decode() only
-                return custom_decode(declared_encoding)
+                # accepts `str` type. Decode blindly with ascii because no one should
-            except UnicodeDecodeError:
+                # ever use non-ascii characters in the name of an encoding.
-                pass
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
            page.decode(custom_decode(declared_encoding))
            return custom_decode(declared_encoding)
        except UnicodeDecodeError:
            pass
    # Fallback to chardet if declared encodings fail
    text = re.sub(b'</?[^>]*>\s*', b' ', page)