diff --git a/readability/htmls.py b/readability/htmls.py
index 7d32137..d11f891 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -10,9 +10,21 @@ logging.getLogger().setLevel(logging.DEBUG)
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
-    enc = get_encoding(page)
-    page_enc = page.decode(enc, 'replace').encode('utf-8')
-    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
+    """Parse ``page`` into an lxml HTML document and return it.
+
+    ``page`` may already be a ``unicode`` object, in which case it is used
+    as-is. Anything else is decoded with the encoding returned by
+    ``get_encoding(page)``, replacing undecodable bytes. The text is then
+    encoded as UTF-8, matching the encoding ``utf8_parser`` is built with.
+    """
+    # isinstance() also accepts unicode subclasses, which an exact
+    # type() comparison would wrongly send down the byte-decoding path.
+    if isinstance(page, unicode):
+        page_unicode = page
+    else:
+        enc = get_encoding(page)
+        page_unicode = page.decode(enc, 'replace')
+    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8'), parser=utf8_parser)
     return doc
 
 def js_re(src, pattern, flags, repl):