Added code to check declared encodings first; the declaration checks are taken
from kennethreitz/requests/utils.py. I also added some superset encodings for cases I have found in Chinese pages that are mishandled by chardet or by the pages' own charset declarations.
This commit is contained in:
parent
1a4d3697bc
commit
3a43a3fe7e
@ -2,20 +2,47 @@ import re
|
||||
import chardet
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of an undecoded HTML/XML page.

    Encodings declared in the page itself are tried first, in this order:
    ``<meta ... charset=...>``, ``<meta ... content=...charset=...>`` and the
    ``<?xml ... encoding=...>`` prolog. Each declared name is mapped through
    ``custom_decode`` and the first one that decodes the whole page without
    error is returned.

    If no declaration works, markup is stripped and the remaining text is
    accepted as UTF-8 when at most ~1% of its bytes are lost by a lenient
    UTF-8 round trip. Otherwise chardet is asked, and its answer is mapped
    through ``custom_decode``.

    :param page: the undecoded page; it must support ``.decode(encoding)``.
    :returns: an encoding name; ``'utf-8'`` when nothing can be determined.
    """
    # Regex for XML and HTML Meta charset declaration
    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

    declared_encodings = (charset_re.findall(page) +
                          pragma_re.findall(page) +
                          xml_re.findall(page))

    # Try any declared encodings
    for declared_encoding in declared_encodings:
        candidate = custom_decode(declared_encoding)
        try:
            page.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: the page is not valid in that encoding.
            # LookupError: the declared name is not a codec Python knows;
            # a bogus declaration must not abort detection.
            continue
        return candidate

    # Fallback to chardet if declared encodings fail
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # can't guess
    try:
        # Round-trip through UTF-8 dropping undecodable bytes; if almost
        # nothing was dropped the text is effectively UTF-8 already.
        diff = text.decode(enc, 'ignore').encode(enc)
        sizes = len(diff), len(text)
        if abs(len(text) - len(diff)) < max(sizes) * 0.01:  # 99% of utf-8
            return enc
    except UnicodeDecodeError:
        pass
    res = chardet.detect(text)
    # chardet may report no encoding at all; use the same default as the
    # "can't guess" case instead of crashing in custom_decode.
    enc = res['encoding'] or 'utf-8'
    if enc == 'MacCyrillic':
        enc = 'cp1251'
    return custom_decode(enc)
|
||||
|
||||
def custom_decode(encoding):
    """Return the encoding name to actually decode with.

    Overrides the encoding when the charset declaration or charset
    determination names a subset of a larger charset. Created because of
    issues with Chinese websites.

    The lookup is case-insensitive: the name is lower-cased before it is
    matched, so every key in the table must be lower case.

    :param encoding: encoding name as declared or detected.
    :returns: the replacement encoding if one is listed, otherwise the
        lower-cased input name.
    """
    encoding = encoding.lower()
    alternates = {
        'big5': 'big5hkscs',
        'gb2312': 'gb18030',
        'ascii': 'utf-8',
        # Key must be lower case to match the lower-cased input.
        'maccyrillic': 'cp1251',
    }
    return alternates.get(encoding, encoding)
|
Loading…
Reference in New Issue
Block a user