@ -2,20 +2,47 @@ import re
import chardet
def get_encoding ( page ) :
# Regex for XML and HTML Meta charset declaration
charset_re = re . compile ( r ' <meta.*?charset=[ " \' ]*(.+?)[ " \' >] ' , flags = re . I )
pragma_re = re . compile ( r ' <meta.*?content=[ " \' ]*;?charset=(.+?)[ " \' >] ' , flags = re . I )
xml_re = re . compile ( r ' ^< \ ?xml.*?encoding=[ " \' ]*(.+?)[ " \' >] ' )
declared_encodings = ( charset_re . findall ( page ) +
pragma_re . findall ( page ) +
xml_re . findall ( page ) )
# Try any declared encodings
if len ( declared_encodings ) > 0 :
for declared_encoding in declared_encodings :
try :
page . decode ( custom_decode ( declared_encoding ) )
return custom_decode ( declared_encoding )
except UnicodeDecodeError :
pass
# Fallback to chardet if declared encodings fail
text = re . sub ( ' </?[^>]*> \ s* ' , ' ' , page )
enc = ' utf-8 '
if not text . strip ( ) or len ( text ) < 10 :
return enc # can't guess
try :
diff = text . decode ( enc , ' ignore ' ) . encode ( enc )
sizes = len ( diff ) , len ( text )
if abs ( len ( text ) - len ( diff ) ) < max ( sizes ) * 0.01 : # 99% of utf-8
return enc
except UnicodeDecodeError :
pass
res = chardet . detect ( text )
enc = res [ ' encoding ' ]
#print '->', enc, "%.2f" % res['confidence']
if enc == ' MacCyrillic ' :
enc = ' cp1251 '
enc = custom_decode ( enc )
return enc
def custom_decode ( encoding ) :
""" Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset . Created because of issues with Chinese websites """
encoding = encoding . lower ( )
alternates = {
' big5 ' : ' big5hkscs ' ,
' gb2312 ' : ' gb18030 ' ,
' ascii ' : ' utf-8 ' ,
' MacCyrillic ' : ' cp1251 ' ,
}
if encoding in alternates :
return alternates [ encoding ]
else :
return encoding