More pep8, almost there

Richard Harding 2012-04-17 14:14:02 -04:00
parent bbb60ed077
commit b498df200b
4 changed files with 69 additions and 39 deletions

View File

@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
 import re
 from lxml.html.clean import Cleaner

-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+    'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
 htmlstrip = re.compile("<"  # open
     "([^>]+) "  # prefix
     "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
     '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
     "([^>]*)"  # postfix
-    ">"  # end
-    , re.I)
+    ">",  # end
+    re.I)

 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html

 def normalize_spaces(s):
-    if not s: return ''
-    """replace any sequence of whitespace
-    characters with a single space"""
+    """replace any sequence of whitespace characters with a single space"""
+    if not s:
+        return ''
     return ' '.join(s.split())

 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
     style=True, links=True, meta=False, add_nofollow=False,
-    page_structure=False, processing_instructions=True, embedded=False,
-    frames=False, forms=False, annoying_tags=False, remove_tags=None,
+    page_structure=False, processing_instructions=True,
+    embedded=False, frames=False, forms=False,
+    annoying_tags=False, remove_tags=None,
     remove_unknown_tags=False, safe_attrs_only=False)
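
For reference, a minimal usage sketch of the two helpers touched above. The sample markup and expected output are illustrative only, and the import assumes the module is importable as cleaners, as it is elsewhere in this changeset:

    # Illustrative only: exercising clean_attributes / normalize_spaces.
    from cleaners import clean_attributes, normalize_spaces

    html = '<p style="color: red" width="300" class="lead">Hello   world</p>'

    # clean_attributes loops until htmlstrip finds no more of the nuisance
    # attributes listed in bad_attrs, keeping everything else in the tag.
    print(clean_attributes(html))
    # -> '<p class="lead">Hello   world</p>'

    # normalize_spaces collapses any run of whitespace to a single space.
    print(normalize_spaces('Hello \t\n  world'))
    # -> 'Hello world'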

View File

@@ -1,25 +1,32 @@
+uids = {}
+
 def save_to_file(text, filename):
     f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write("""
+        <meta http-equiv="Content-Type"
+            content="text/html; charset=UTF-8"
+        />""")
     f.write(text.encode('utf-8'))
     f.close()

-uids = {}

 def describe(node, depth=2):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
     if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
         if not node in uids:
-            uid = uids[node] = len(uids)+1
+            uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)
         name += "%02d" % (uid)
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
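
A small sketch of how describe() is meant to be used when debugging. The fragment is invented, lxml is assumed installed, and the module name debug is an assumption based on the helpers it contains:

    # Illustrative only: label an element the way the debug logging does.
    import lxml.html

    from debug import describe  # module name assumed, not shown in the diff

    frag = lxml.html.fragment_fromstring(
        '<div id="content" class="main"><p>hello</p></div>')
    p = frag.find('p')

    # describe() builds tag + #id + .classes, numbers repeated tr/td/div/p
    # nodes via the module-level uids dict, and walks up `depth` ancestors.
    print(describe(p))
    # e.g. 'p01 - #content.main'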

View File

@@ -1,21 +1,23 @@
 import re
 import chardet

 def get_encoding(page):
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
         return enc  # can't guess
     try:
         diff = text.decode(enc, 'ignore').encode(enc)
         sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
+        # 99% of utf-8
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
             return enc
     except UnicodeDecodeError:
         pass
     res = chardet.detect(text)
     enc = res['encoding']
-    #print '->', enc, "%.2f" % res['confidence']
+    # print '->', enc, "%.2f" % res['confidence']
     if enc == 'MacCyrillic':
         enc = 'cp1251'
     return enc
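
For reference, a rough sketch of get_encoding() on raw page bytes (Python 2). The byte strings are made up, chardet is assumed installed, and the module name encoding comes from the imports in the last file of this commit:

    # -*- coding: utf-8 -*-
    # Illustrative only: get_encoding() keeps utf-8 when the tag-stripped text
    # round-trips cleanly, and falls back to chardet otherwise.
    from encoding import get_encoding

    ascii_page = '<html><head><title>Hello readability world</title></head></html>'
    print(get_encoding(ascii_page))
    # -> 'utf-8' (round-trips cleanly, chardet never consulted)

    cp1251_page = u'<html><body>Пример текста на русском</body></html>'.encode('cp1251')
    print(get_encoding(cp1251_page))
    # -> whatever chardet guesses, typically 'windows-1251'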

View File

@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
 import logging
-import lxml.html
 import re
-logging.getLogger().setLevel(logging.DEBUG)
-
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
+
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
+
+utf8_parser = HTMLParser(encoding='utf-8')

 def build_doc(page):
     if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
     else:
         enc = get_encoding(page)
         page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
     return doc

 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))

 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'&mdash;': '-',
         u'&ndash;': '-',
         u'\u00A0': ' ',
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
     return cur_title

 def norm_title(title):
     return normalize_entities(normalize_spaces(title))

 def get_title(doc):
     title = doc.find('.//title').text
     if not title:
         return '[no-title]'
     return norm_title(title)

 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)

 def shorten_title(doc):
     title = doc.find('.//title').text
     if not title:
         return ''
     title = orig = norm_title(title)
     candidates = set()
@@ -71,13 +82,14 @@ def shorten_title(doc):
         if e.text_content():
             add_match(candidates, e.text_content(), orig)

-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
+            '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)

     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
@@ -103,13 +115,16 @@ def shorten_title(doc):
     return title

 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (
+            raw_html,
+            cleaned))
         return raw_html
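
Finally, a rough end-to-end sketch of the helpers in this last file (Python 2). The HTML string is invented, the module name htmls is an assumption, and shorten_title() additionally relies on lxml's CSS selector support:

    # Illustrative only: parse a page, then pull title variants and the body.
    from htmls import build_doc, get_title, shorten_title, get_body

    page = ('<html><head><title>Example Site &mdash; A fairly long article title'
            '</title></head><body><h1>A fairly long article title</h1>'
            '<p style="color: red" width="600">Body text.</p></body></html>')

    doc = build_doc(page)      # lxml document, decoded via get_encoding()
    print(get_title(doc))      # normalized <title>, &mdash; folded to '-'
    print(shorten_title(doc))  # shortened variant built from heading candidates
    print(get_body(doc))       # serialized body with style/width stripped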