From f3d0a8d8428ee0512094b6de2f5e1dcf3c63cef2 Mon Sep 17 00:00:00 2001
From: Lee Semel
Date: Tue, 28 Jun 2011 00:54:36 +0800
Subject: [PATCH] Allow passing unicode objects

build_doc() used to call page.decode() unconditionally, i.e. it was
written for byte strings only. Accept unicode objects as well:

  * byte strings are still decoded with the encoding returned by
    get_encoding(), replacing undecodable bytes;
  * unicode input is used as-is;
  * in both cases the text is encoded to UTF-8 before it is handed to
    the UTF-8 lxml parser.

The check uses isinstance() rather than an exact type() comparison so
that subclasses of unicode also take the unicode path instead of being
run through .decode().
---
Note: the post-image blob id on the "index" line below was not
recomputed after switching the type check to isinstance().

 readability/htmls.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/readability/htmls.py b/readability/htmls.py
index 7d32137..d11f891 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -10,9 +10,12 @@ logging.getLogger().setLevel(logging.DEBUG)
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
-    enc = get_encoding(page)
-    page_enc = page.decode(enc, 'replace').encode('utf-8')
-    doc = lxml.html.document_fromstring(page_enc, parser=utf8_parser)
+    if isinstance(page, unicode):
+        page_enc = page
+    else:
+        enc = get_encoding(page)
+        page_enc = page.decode(enc, 'replace')
+    doc = lxml.html.document_fromstring(page_enc.encode('utf-8'), parser=utf8_parser)
     return doc
 
 def js_re(src, pattern, flags, repl):