Defaulting to utf-8 when chardet returns None

On articles like this one, chardet.detect() reports the encoding as None:
http://news.zing.vn/nhip-song-tre/thay-giao-gay-sot-tung-bo-luat-tinh-yeu/a291427.html
This causes an exception (AttributeError) later on, when encoding.lower() is called on that None value.
This commit is contained in:
Nathan Breit 2014-12-18 18:48:22 -08:00
parent 0c2f29ed0d
commit 75e2e0cb3a

View File

@ -26,7 +26,7 @@ def get_encoding(page):
if not text.strip() or len(text) < 10:
return enc # can't guess
res = chardet.detect(text)
enc = res['encoding']
enc = res['encoding'] or 'utf-8'
#print '->', enc, "%.2f" % res['confidence']
enc = custom_decode(enc)
return enc
@ -45,4 +45,4 @@ def custom_decode(encoding):
if encoding in alternates:
return alternates[encoding]
else:
return encoding
return encoding