Sorted out unicode issues, thanks to Lee Semel.

pull/9/head
Yuri Baburov 13 years ago
parent 45781a600f
commit c2ec1d1c38

@@ -3,17 +3,15 @@ import chardet
def get_encoding(page): def get_encoding(page):
text = re.sub('</?[^>]*>\s*', ' ', page) text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10: if not text.strip() or len(text) < 10:
return 'ascii' return enc # can't guess
try: try:
enc = 'utf-8'
diff = text.decode(enc, 'ignore').encode(enc) diff = text.decode(enc, 'ignore').encode(enc)
sizes = len(diff), len(text) sizes = len(diff), len(text)
if abs(len(text) - len(diff)) < max(sizes) * 0.01: if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
#print '->', enc, '100%'
return enc return enc
except UnicodeDecodeError: except UnicodeDecodeError:
#import traceback;traceback.print_exc()
pass pass
res = chardet.detect(text) res = chardet.detect(text)
enc = res['encoding'] enc = res['encoding']

@@ -10,12 +10,12 @@ logging.getLogger().setLevel(logging.DEBUG)
utf8_parser = lxml.html.HTMLParser(encoding='utf-8') utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
def build_doc(page): def build_doc(page):
if type(page) != unicode: if isinstance(page, unicode):
enc = get_encoding(page) page_unicode = page
page_enc = page.decode(enc, 'replace')
else: else:
page_enc = page enc = get_encoding(page)
doc = lxml.html.document_fromstring(page_enc.encode('utf-8'), parser=utf8_parser) page_unicode = page.decode(enc, 'replace')
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
return doc return doc
def js_re(src, pattern, flags, repl): def js_re(src, pattern, flags, repl):

@@ -496,8 +496,9 @@ def main():
file = urllib.urlopen(options.url) file = urllib.urlopen(options.url)
else: else:
file = open(args[0]) file = open(args[0])
enc = sys.stdout.encoding or 'utf-8'
try: try:
print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore') print Document(file.read(), debug=options.verbose).summary().encode(enc, 'replace')
finally: finally:
file.close() file.close()

Loading…
Cancel
Save