python-readability/readability/cleaners.py

# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

# Regex fragments naming the attributes to strip. Each entry is a pattern, not
# a literal name: '[-a-z]*color' covers e.g. bgcolor, 'background[-a-z]*'
# covers background and background-*, and 'on[a-z]*' covers event handlers
# such as onclick. (A bare 'on*' would mean "o followed by any number of n"
# and would never match a real handler name.)
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on[a-z]*']
# The three accepted spellings of an attribute value; each needs at least one
# character, so an empty value such as style="" is left alone.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
# Matches one whole tag that contains one unwanted attribute. Group 1 is the
# text before the attribute and group 2 the text after it, so substituting
# '<\1\2>' rebuilds the tag without that attribute. Matching is
# case-insensitive.
htmlstrip = re.compile("<" # open
    "([^>]+) " # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
    "([^>]*)"  # postfix
    ">"        # end
, re.I)

def clean_attributes(html):
    """Return ``html`` with every attribute matched by ``htmlstrip`` removed.

    The pattern consumes a whole tag per match, so a single pass drops at
    most one attribute from each tag; passes are repeated until one of them
    replaces nothing.
    """
    while True:
        html, replaced = htmlstrip.subn('<\\1\\2>', html)
        if not replaced:
            return html

def normalize_spaces(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Leading and trailing whitespace is dropped. Any falsy ``s`` (``None``
    or the empty string) yields ``''``.
    """
    if not s:
        return ''
    return ' '.join(s.split())

# Module-wide lxml Cleaner instance; options are grouped by the value passed.
html_cleaner = Cleaner(
    # Options set to True.
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    processing_instructions=True,
    # Options set to False.
    meta=False,
    add_nofollow=False,
    page_structure=False,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
    # No additional tags are listed for removal.
    remove_tags=None,
)
Moved to lxml (based on decruft version); better encoding recognition. 2011-05-03 04:34:29 +00:00			`# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds`
			`import re`
			`from lxml.html.clean import Cleaner`

Updated scoring algorithm to match readability.js v1.7.1 2011-06-01 05:16:32 +00:00			`bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']`
Moved to lxml (based on decruft version); better encoding recognition. 2011-05-03 04:34:29 +00:00			`single_quoted = "'[^']+'"`
			`double_quoted = '"[^"]+"'`
			`non_space = '[^ "\'>]+'`
			`htmlstrip = re.compile("<" # open`
			`"([^>]+) " # prefix`
			`"(?:%s) *" % ('\|'.join(bad_attrs),) + # undesirable attributes`
			`'= *(?:%s\|%s\|%s)' % (non_space, single_quoted, double_quoted) + # value`
			`"([^>]*)" # postfix`
			`">" # end`
			`, re.I)`

			`def clean_attributes(html):`
			`while htmlstrip.search(html):`
			`html = htmlstrip.sub('<\\1\\2>', html)`
			`return html`

			`def normalize_spaces(s):`
			`if not s: return ''`
			`"""replace any sequence of whitespace`
			`characters with a single space"""`
			`return ' '.join(s.split())`

			`html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,`
			`style=True, links=True, meta=False, add_nofollow=False,`
			`page_structure=False, processing_instructions=True, embedded=False,`
			`frames=False, forms=False, annoying_tags=False, remove_tags=None,`
			`remove_unknown_tags=False, safe_attrs_only=False)`