@ -1,5 +1,6 @@
import re
import re
import chardet
import chardet
import sys
def get_encoding ( page ) :
def get_encoding ( page ) :
# Regex for XML and HTML Meta charset declaration
# Regex for XML and HTML Meta charset declaration
@ -12,13 +13,18 @@ def get_encoding(page):
xml_re . findall ( page ) )
xml_re . findall ( page ) )
# Try any declared encodings
# Try any declared encodings
if len ( declared_encodings ) > 0 :
for declared_encoding in declared_encodings :
for declared_encoding in declared_encodings :
try :
try :
if sys . version_info [ 0 ] == 3 :
page . decode ( custom_decode ( declared_encoding ) )
# declared_encoding will actually be bytes but .decode() only
return custom_decode ( declared_encoding )
# accepts `str` type. Decode blindly with ascii because no one should
except UnicodeDecodeError :
# ever use non-ascii characters in the name of an encoding.
pass
declared_encoding = declared_encoding . decode ( ' ascii ' , ' replace ' )
page . decode ( custom_decode ( declared_encoding ) )
return custom_decode ( declared_encoding )
except UnicodeDecodeError :
pass
# Fallback to chardet if declared encodings fail
# Fallback to chardet if declared encodings fail
text = re . sub ( b ' </?[^>]*> \ s* ' , b ' ' , page )
text = re . sub ( b ' </?[^>]*> \ s* ' , b ' ' , page )