|
|
@ -3,17 +3,15 @@ import chardet
|
|
|
|
|
|
|
|
|
|
|
|
def get_encoding(page):
|
|
|
|
def get_encoding(page):
|
|
|
|
text = re.sub('</?[^>]*>\s*', ' ', page)
|
|
|
|
text = re.sub('</?[^>]*>\s*', ' ', page)
|
|
|
|
|
|
|
|
enc = 'utf-8'
|
|
|
|
if not text.strip() or len(text) < 10:
|
|
|
|
if not text.strip() or len(text) < 10:
|
|
|
|
return 'ascii'
|
|
|
|
return enc # can't guess
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
enc = 'utf-8'
|
|
|
|
|
|
|
|
diff = text.decode(enc, 'ignore').encode(enc)
|
|
|
|
diff = text.decode(enc, 'ignore').encode(enc)
|
|
|
|
sizes = len(diff), len(text)
|
|
|
|
sizes = len(diff), len(text)
|
|
|
|
if abs(len(text) - len(diff)) < max(sizes) * 0.01:
|
|
|
|
if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
|
|
|
|
#print '->', enc, '100%'
|
|
|
|
|
|
|
|
return enc
|
|
|
|
return enc
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
#import traceback;traceback.print_exc()
|
|
|
|
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
res = chardet.detect(text)
|
|
|
|
res = chardet.detect(text)
|
|
|
|
enc = res['encoding']
|
|
|
|
enc = res['encoding']
|
|
|
|