From c2ec1d1c382bf48847856f08ad2c3f20373942b5 Mon Sep 17 00:00:00 2001
From: Yuri Baburov
Date: Thu, 30 Jun 2011 11:51:16 +0700
Subject: [PATCH] Sorted out unicode issues, thanks to Lee Semel.

---
 readability/encoding.py    |    8 +++-----
 readability/htmls.py       |   10 +++++-----
 readability/readability.py |    3 ++-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index e7bf503..b83dc28 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -3,17 +3,15 @@ import chardet
 
 def get_encoding(page):
     text = re.sub('</?[^>]*>\s*', ' ', page)
+    enc = 'utf-8'
     if not text.strip() or len(text) < 10:
-        return 'ascii'
+        return enc # can't guess
     try:
-        enc = 'utf-8'
         diff = text.decode(enc, 'ignore').encode(enc)
         sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
-            #print '->', enc, '100%'
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
             return enc
     except UnicodeDecodeError:
-        #import traceback;traceback.print_exc()
         pass
     res = chardet.detect(text)
     enc = res['encoding']
diff --git a/readability/htmls.py b/readability/htmls.py
index d11f891..97aa55b 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -10,12 +10,12 @@ logging.getLogger().setLevel(logging.DEBUG)
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
-    if type(page) != unicode:
-        enc = get_encoding(page)
-        page_enc = page.decode(enc, 'replace')
+    if isinstance(page, unicode):
+        page_unicode = page
     else:
-        page_enc = page
-    doc = lxml.html.document_fromstring(page_enc.encode('utf-8'), parser=utf8_parser)
+        enc = get_encoding(page)
+        page_unicode = page.decode(enc, 'replace')
+    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
     return doc
 
 def js_re(src, pattern, flags, repl):
diff --git a/readability/readability.py b/readability/readability.py
index 50e191d..94d43fe 100644
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -496,8 +496,9 @@ def main():
         file = urllib.urlopen(options.url)
     else:
         file = open(args[0])
+    enc = sys.stdout.encoding or 'utf-8'
     try:
-        print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
+        print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
     finally:
         file.close()
 