Sorted out unicode issues, thanks to Lee Semel.

13 years ago · c2ec1d1c38
parent 45781a600f
commit c2ec1d1c38
3 changed files with 10 additions and 11 deletions
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -3,17 +3,15 @@ import chardet
 def get_encoding(page):
    text = re.sub('</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
-        return 'ascii'
+        return enc # can't guess
    try:
        enc = 'utf-8'
        diff = text.decode(enc, 'ignore').encode(enc)
        sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
            #print '->', enc, '100%'
            return enc
    except UnicodeDecodeError:
        #import traceback;traceback.print_exc()
        pass
    res = chardet.detect(text)
    enc = res['encoding']
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -10,12 +10,12 @@ logging.getLogger().setLevel(logging.DEBUG)
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 def build_doc(page):
-    if type(page) != unicode:
+    if isinstance(page, unicode):
-        enc = get_encoding(page)
+        page_unicode = page
-        page_enc = page.decode(enc, 'replace')
    else:
-        page_enc = page
+        enc = get_encoding(page)
-    doc = lxml.html.document_fromstring(page_enc.encode('utf-8'), parser=utf8_parser)
+        page_unicode = page.decode(enc, 'replace')
+    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc
 def js_re(src, pattern, flags, repl):
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -496,8 +496,9 @@ def main():
 		file = urllib.urlopen(options.url)
 	else:
 		file = open(args[0])
+        enc = sys.stdout.encoding or 'utf-8'
 	try:
-		print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
+		print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
 	finally:
 		file.close()