2011-05-03 04:34:29 +00:00
|
|
|
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
|
|
|
|
import re
|
|
|
|
from lxml.html.clean import Cleaner
|
|
|
|
|
2011-06-01 05:16:32 +00:00
|
|
|
# Attribute names to strip. Each entry is a regex fragment (not a glob) that
# is OR-ed into the attribute-name part of ``htmlstrip`` below.
# 'on[a-z]+' covers event handlers such as onclick/onload; the former 'on*'
# meant "o followed by any number of n" and never matched them.
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on[a-z]+']

# The three shapes an attribute value may take.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'

# Matches one whole tag that carries an undesirable attribute. Group 1 is the
# part of the tag before that attribute, group 2 the part after it, so
# substituting '<\1\2>' drops exactly that attribute (and its value).
htmlstrip = re.compile(
    (
        "<"  # open
        "([^>]+) "  # prefix
        "(?:%s) *"  # undesirable attributes
        "= *(?:%s|%s|%s)"  # value
        "([^>]*)"  # postfix
        ">"  # end
    )
    % ('|'.join(bad_attrs), non_space, single_quoted, double_quoted),
    re.I,
)
|
|
|
|
|
|
|
|
def clean_attributes(html):
    """Return *html* with every attribute matched by ``htmlstrip`` removed.

    A single substitution pass removes at most one attribute per tag, so
    passes are repeated until one makes no replacement.
    """
    while True:
        html, replaced = htmlstrip.subn('<\\1\\2>', html)
        if not replaced:
            return html
|
|
|
|
|
|
|
|
def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well. A falsy *s*
    (for example ``None`` or ``''``) yields the empty string.
    """
    if not s:
        return ''
    return ' '.join(s.split())
|
|
|
|
|
|
|
|
# Shared lxml Cleaner instance. Option meanings are those given in the
# lxml.html.clean.Cleaner documentation; the values are grouped below by
# whether the corresponding clean-up is switched on or off.
html_cleaner = Cleaner(
    # clean-ups that are enabled
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    processing_instructions=True,
    # clean-ups that are disabled
    meta=False,
    page_structure=False,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
    add_nofollow=False,
    # no extra tags are dropped
    remove_tags=None,
)
|