From 318f25c577649d06cd147c10fd8fb5e2d8af13e9 Mon Sep 17 00:00:00 2001
From: Yuri Baburov
Date: Thu, 10 Oct 2013 02:57:53 +0700
Subject: [PATCH] Minor fix in encoding guessing. Claiming it v0.3.0.1

---
 readability/htmls.py | 16 +++-------------
 setup.py             |  2 +-
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/readability/htmls.py b/readability/htmls.py
index edaaa52..f6efac6 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -11,20 +11,10 @@ def build_doc(page):
     if isinstance(page, unicode):
         page_unicode = page
     else:
-        enc = get_encoding(page)
-        if enc:
-            page_unicode = page.decode(enc, 'replace')
-            encoding = enc
-        else:
-            try:
-                #try utf-8
-                page_unicode = page.decode('utf-8', 'strict')
-                encoding = 'utf-8'
-            except UnicodeDecodeError:
-                page_unicode = page.decode('utf-8', 'replace')
-                encoding = 'utf-8'
+        enc = get_encoding(page) or 'utf-8'
+        page_unicode = page.decode(enc, 'replace')
     doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc, encoding
+    return doc, enc
 
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
diff --git a/setup.py b/setup.py
index 3558177..f11fadd 100755
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@ else:
 
 setup(
     name="readability-lxml",
-    version="0.3",
+    version="0.3.0.1",
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast python port of arc90's readability tool",