Defaulting to utf-8 when chardet returns None
On articles like this one, chardet.detect() reports an encoding of None: http://news.zing.vn/nhip-song-tre/thay-giao-gay-sot-tung-bo-luat-tinh-yeu/a291427.html — this causes exceptions later on, when encoding.lower() is called.
This commit is contained in:
parent
0c2f29ed0d
commit
75e2e0cb3a
@ -26,7 +26,7 @@ def get_encoding(page):
|
||||
if not text.strip() or len(text) < 10:
|
||||
return enc # can't guess
|
||||
res = chardet.detect(text)
|
||||
enc = res['encoding']
|
||||
enc = res['encoding'] or 'utf-8'
|
||||
#print '->', enc, "%.2f" % res['confidence']
|
||||
enc = custom_decode(enc)
|
||||
return enc
|
||||
@ -45,4 +45,4 @@ def custom_decode(encoding):
|
||||
if encoding in alternates:
|
||||
return alternates[encoding]
|
||||
else:
|
||||
return encoding
|
||||
return encoding
|
||||
|
Loading…
Reference in New Issue
Block a user