fix: Decodes bytes if needed in get_body

pull/137/head
Raphael Cohen 4 years ago
parent 5800210e99
commit 15f3692e68

@@ -134,7 +134,9 @@ def get_body(doc):
elem.drop_tree()
# tostring() always return utf-8 encoded string
# FIXME: isn't better to use tounicode?
-raw_html = str_(tostring(doc.body or doc))
+raw_html = tostring(doc.body or doc)
+if isinstance(raw_html, bytes):
+raw_html = raw_html.decode()
cleaned = clean_attributes(raw_html)
try:
# BeautifulSoup(cleaned) #FIXME do we really need to try loading it?

Loading…
Cancel
Save