More pep8, almost there
parent
bbb60ed077
commit
b498df200b
@ -1,32 +1,38 @@
|
||||
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
|
||||
# strip out a set of nuisance html attributes that can mess up rendering in
|
||||
# RSS feeds
|
||||
import re
|
||||
from lxml.html.clean import Cleaner
|
||||
|
||||
# Regex fragments naming the attributes to strip: presentational ones plus
# inline event handlers. They are alternated inside ``htmlstrip`` below and
# matched case-insensitively.
# NOTE: the last entry used to be 'on*', which as a regex means "o followed
# by zero or more n" and therefore never matched onclick/onload/etc.
bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
             'background[-a-z]*', 'on[a-z]+']
# The three ways an attribute value may be written.
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
# Matches one opening tag that carries one unwanted attribute. Group 1 is
# everything in the tag before that attribute, group 2 everything after it,
# so substituting '<\1\2>' drops exactly that attribute.
htmlstrip = re.compile(
    "<"                 # open
    "([^>]+) "          # prefix (tag name and any preceding attributes)
    "(?:%s) *"          # undesirable attributes
    "= *(?:%s|%s|%s)"   # value
    "([^>]*)"           # postfix
    ">"                 # end
    % ('|'.join(bad_attrs), non_space, single_quoted, double_quoted),
    re.I)
|
||||
|
||||
|
||||
def clean_attributes(html):
    """Return *html* with every attribute matched by ``htmlstrip`` removed.

    ``htmlstrip`` consumes a whole tag per match, so a single substitution
    pass removes at most one attribute from each tag; passes are repeated
    until one of them replaces nothing.
    """
    while True:
        html, replaced = htmlstrip.subn('<\\1\\2>', html)
        if not replaced:
            return html
|
||||
|
||||
|
||||
def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well. A falsy *s*
    (``None`` or an empty string) yields ``''``.
    """
    if not s:
        return ''
    return ' '.join(s.split())
|
||||
|
||||
|
||||
# Shared lxml Cleaner instance. Only the scripts, javascript, comments,
# style, links and processing_instructions options are switched on; every
# other option is explicitly disabled. Each keyword is given exactly once,
# since a repeated keyword argument is a SyntaxError.
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                       style=True, links=True, meta=False, add_nofollow=False,
                       page_structure=False, processing_instructions=True,
                       embedded=False, frames=False, forms=False,
                       annoying_tags=False, remove_tags=None,
                       remove_unknown_tags=False, safe_attrs_only=False)
|
||||
|
@ -1,25 +1,32 @@
|
||||
uids = {}
|
||||
|
||||
|
||||
def save_to_file(text, filename):
    """Write *text* to *filename* as UTF-8, preceded by a charset meta tag.

    The meta tag lets a browser opening the dump directly decode it as
    UTF-8.

    :param text: text to save; it is encoded with ``text.encode('utf-8')``.
    :param filename: path of the file to create or overwrite.
    """
    header = ('<meta http-equiv="Content-Type" '
              'content="text/html; charset=UTF-8" />')
    # Binary mode: the payload is already encoded bytes, and writing bytes
    # to a text-mode handle fails on Python 3. The ``with`` block closes the
    # file even when a write raises.
    with open(filename, 'wb') as f:
        f.write(header.encode('utf-8'))
        f.write(text.encode('utf-8'))
|
||||
|
||||
# Sequence numbers handed out by describe(): maps a node to the number it
# received the first time it was described.
uids = {}


def describe(node, depth=2):
    """Return a short CSS-selector-like label for *node*, for debug output.

    The label is the tag name followed by ``#id`` and ``.class`` parts when
    those attributes are set; a leading ``div`` is dropped when such a part
    follows. Bare ``tr``/``td``/``div``/``p`` labels get a two-digit
    sequence number from ``uids`` appended so they can be told apart.

    :param node: element exposing ``tag``, ``get()`` and ``getparent()``;
        anything without a ``tag`` attribute is rendered as ``[<type>]``.
    :param depth: how many ancestors to append, separated by ``' - '``.
    """
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        # "div#main" reads just as well as "#main"
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        # First sighting assigns the next number; later ones reuse it.
        uid = uids.setdefault(node, len(uids) + 1)
        name += "%02d" % uid
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
|
||||
|
@ -1,21 +1,23 @@
|
||||
import re
|
||||
import chardet
|
||||
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of the markup in *page*.

    Tags are stripped first so that only text content drives the guess.
    UTF-8 is returned when there is too little text to judge, or when
    decoding as UTF-8 (ignoring errors) loses less than 1% of the data.
    Otherwise the guess is delegated to ``chardet``.

    :param page: markup to inspect; the ``.decode()`` call below assumes a
        byte string (Python 2 ``str``) -- TODO confirm against callers.
    :returns: an encoding name; ``'utf-8'`` when nothing better is known.
    """
    text = re.sub(r'</?[^>]*>\s*', ' ', page)
    enc = 'utf-8'
    if not text.strip() or len(text) < 10:
        return enc  # can't guess
    try:
        diff = text.decode(enc, 'ignore').encode(enc)
        sizes = len(diff), len(text)
        # 99% of utf-8
        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
            return enc
    except UnicodeDecodeError:
        pass
    res = chardet.detect(text)
    # chardet reports an encoding of None when it cannot decide; fall back
    # to the default instead of handing None to callers.
    enc = res['encoding'] or 'utf-8'
    if enc == 'MacCyrillic':
        # Detected MacCyrillic is deliberately treated as cp1251.
        enc = 'cp1251'
    return enc
|
||||
|
Loading…
Reference in New Issue