More pep8, almost there

Richard Harding 2012-04-17 14:14:02 -04:00
parent bbb60ed077
commit b498df200b
4 changed files with 69 additions and 39 deletions

View File

@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+             'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
     "([^>]*)"  # postfix
-    ">" # end
-    , re.I)
+    ">",  # end
+    re.I)
 
+
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
-    if not s: return ''
-    """replace any sequence of whitespace
-    characters with a single space"""
+    """replace any sequence of whitespace characters with a single space"""
+    if not s:
+        return ''
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
     style=True, links=True, meta=False, add_nofollow=False,
-    page_structure=False, processing_instructions=True, embedded=False,
-    frames=False, forms=False, annoying_tags=False, remove_tags=None,
+    page_structure=False, processing_instructions=True,
+    embedded=False, frames=False, forms=False,
+    annoying_tags=False, remove_tags=None,
     remove_unknown_tags=False, safe_attrs_only=False)
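
For a sense of what the reflowed helpers do, here is a small usage sketch; the sample markup is invented, and the module name cleaners is taken from the imports in the last file of this commit:

    from cleaners import clean_attributes, normalize_spaces

    # htmlstrip is reapplied until no nuisance attribute survives, so
    # width, bgcolor (matched by '[-a-z]*color') and style all vanish.
    dirty = '<td width="40" bgcolor="#ffffff" style="padding: 2px">x</td>'
    print(clean_attributes(dirty))  # -> '<td>x</td>'

    print(normalize_spaces(' a\t b \n c '))  # -> 'a b c'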

View File

@@ -1,25 +1,32 @@
+uids = {}
+
+
 def save_to_file(text, filename):
     f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write("""
+    <meta http-equiv="Content-Type"
+        content="text/html; charset=UTF-8"
+    />""")
     f.write(text.encode('utf-8'))
     f.close()
 
-uids = {}
+
 def describe(node, depth=2):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
         if not node in uids:
-            uid = uids[node] = len(uids)+1
+            uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)
         name += "%02d" % (uid)
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
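
describe() is a logging aid: it renders a node as tag#id.class, drops a leading 'div', numbers bare tr/td/div/p nodes through the module-level uids dict, and recurses up to depth ancestors. A minimal sketch, assuming the file is importable as debug (this view omits the filename):

    import lxml.html
    from debug import describe  # module name assumed

    root = lxml.html.fromstring(
        '<div id="page" class="news story"><p>hello</p></div>')
    print(describe(root.cssselect('p')[0]))
    # -> 'p01 - #page.news.story'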

View File

@ -1,21 +1,23 @@
import re
import chardet
def get_encoding(page):
text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return enc # can't guess
return enc # can't guess
try:
diff = text.decode(enc, 'ignore').encode(enc)
sizes = len(diff), len(text)
if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
# 99% of utf-8
if abs(len(text) - len(diff)) < max(sizes) * 0.01:
return enc
except UnicodeDecodeError:
pass
res = chardet.detect(text)
enc = res['encoding']
#print '->', enc, "%.2f" % res['confidence']
# print '->', enc, "%.2f" % res['confidence']
if enc == 'MacCyrillic':
enc = 'cp1251'
return enc
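
The heuristic here: strip the markup, round-trip the raw bytes through utf-8, and accept utf-8 when less than 1% of the text is lost; otherwise defer to chardet, correcting its frequent MacCyrillic misdetection to cp1251. Typical use against a Python 2 byte string (the filename page.html is just an example):

    from encoding import get_encoding

    raw = open('page.html', 'rb').read()  # str (bytes) under Python 2
    enc = get_encoding(raw)
    text = raw.decode(enc, 'replace')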

View File

@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
 import logging
-import lxml.html
 import re
-
-logging.getLogger().setLevel(logging.DEBUG)
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
+
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+utf8_parser = HTMLParser(encoding='utf-8')
 
+
 def build_doc(page):
     if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
     else:
         enc = get_encoding(page)
         page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
     return doc
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
+
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'&mdash;': '-',
         u'&ndash;': '-',
         u'\u00A0': ' ',
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title').text
     if not title:
         return '[no-title]'
     return norm_title(title)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 def shorten_title(doc):
     title = doc.find('.//title').text
     if not title:
         return ''
     title = orig = norm_title(title)
     candidates = set()
@@ -71,13 +82,14 @@ def shorten_title(doc):
         if e.text_content():
             add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
+        '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
@@ -103,13 +115,16 @@ def shorten_title(doc):
     return title
 
+
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (
+            raw_html,
+            cleaned))
         return raw_html
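
Taken together, build_doc() normalizes any input to a utf-8-parsed lxml document, and the title/body helpers work on top of it. A sketch of the intended flow; the module name htmls is a guess (this view omits filenames), and urllib2 matches the Python 2 idioms (unicode()) used above:

    import urllib2

    from htmls import build_doc, get_title, shorten_title, get_body

    raw = urllib2.urlopen('http://example.com/').read()
    doc = build_doc(raw)       # decodes via get_encoding, reparses as utf-8
    print(get_title(doc))      # normalized <title> text, or '[no-title]'
    print(shorten_title(doc))  # best heading-like candidate, else the title
    body = get_body(doc)       # scripts/links/styles dropped, attrs cleaned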