More pep8, almost there
This commit is contained in:
parent bbb60ed077
commit b498df200b

cleaners.py
@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+             'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
-    "([^>]*)" # postfix
-    ">" # end
-, re.I)
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
+    "([^>]*)"  # postfix
+    ">",  # end
+    re.I)
 
+
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
-    if not s: return ''
-    """replace any sequence of whitespace
-    characters with a single space"""
+    """replace any sequence of whitespace characters with a single space"""
+    if not s:
+        return ''
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
     style=True, links=True, meta=False, add_nofollow=False,
-    page_structure=False, processing_instructions=True, embedded=False,
-    frames=False, forms=False, annoying_tags=False, remove_tags=None,
+    page_structure=False, processing_instructions=True,
+    embedded=False, frames=False, forms=False,
+    annoying_tags=False, remove_tags=None,
     remove_unknown_tags=False, safe_attrs_only=False)
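
As a quick reference, a sketch of how the two helpers behave after this change; the sample markup and strings are invented for illustration:

    from cleaners import clean_attributes, normalize_spaces

    sample = '<table width="100%" style="color: red"><tr><td>hi</td></tr></table>'
    print(clean_attributes(sample))
    # -> '<table><tr><td>hi</td></tr></table>': the while loop strips one
    #    blacklisted attribute per pass until none match
    print(normalize_spaces('one \t two\n\nthree'))
    # -> 'one two three'
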
debug.py
@@ -1,25 +1,32 @@
+uids = {}
+
+
 def save_to_file(text, filename):
     f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write("""
+    <meta http-equiv="Content-Type"
+        content="text/html; charset=UTF-8"
+    />""")
     f.write(text.encode('utf-8'))
     f.close()
 
-uids = {}
 
 def describe(node, depth=2):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
     if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
         if not node in uids:
-            uid = uids[node] = len(uids)+1
+            uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)
         name += "%02d" % (uid)
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
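
For orientation when reading debug output, a sketch of describe(); the exact result depends on how lxml roots the fragment and on the shared uids cache, so the value shown is approximate:

    import lxml.html
    from debug import describe

    frag = lxml.html.fromstring('<div id="main" class="post body"><p>text</p></div>')
    print(describe(frag.find('p')))
    # -> something like 'p01 - #main.post.body': div#/div. prefixes are
    #    shortened, and tr/td/div/p nodes get a numeric uid suffix
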
encoding.py
@@ -1,21 +1,23 @@
 import re
 import chardet
 
+
 def get_encoding(page):
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
-        return enc # can't guess
+        return enc  # can't guess
     try:
         diff = text.decode(enc, 'ignore').encode(enc)
         sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
+        # 99% of utf-8
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
             return enc
     except UnicodeDecodeError:
         pass
     res = chardet.detect(text)
     enc = res['encoding']
-    #print '->', enc, "%.2f" % res['confidence']
+    # print '->', enc, "%.2f" % res['confidence']
     if enc == 'MacCyrillic':
         enc = 'cp1251'
     return enc
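
The intended calling pattern for get_encoding(), under the Python 2 semantics the module assumes (page is a byte string); 'page.html' is a placeholder path:

    from encoding import get_encoding

    page = open('page.html', 'rb').read()
    enc = get_encoding(page)   # utf-8 fast path first, chardet as fallback
    text = page.decode(enc, 'replace')
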
htmls.py
@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
 import logging
-import lxml.html
 import re
 
-logging.getLogger().setLevel(logging.DEBUG)
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
+utf8_parser = HTMLParser(encoding='utf-8')
 
 
 def build_doc(page):
     if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
     else:
         enc = get_encoding(page)
         page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
     return doc
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
+
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'—': '-',
         u'–': '-',
         u'\u00A0': ' ',
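
The entities table is truncated by the hunk above; assuming the elided body of normalize_entities() simply replaces each key with its value, it behaves like:

    print(normalize_entities(u'Breaking\u00A0News \u2014 Example'))
    # -> u'Breaking News - Example': em/en dashes collapse to '-',
    #    the non-breaking space becomes a plain space
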
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
 
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title').text
     if not title:
         return '[no-title]'
 
     return norm_title(title)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 def shorten_title(doc):
     title = doc.find('.//title').text
     if not title:
         return ''
 
     title = orig = norm_title(title)
 
     candidates = set()
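
add_match() is the filter shorten_title() feeds heading candidates through; a sketch with invented strings:

    candidates = set()
    orig = 'Breaking News: Markets Rally Today | Example Site'
    add_match(candidates, 'Breaking News: Markets Rally Today', orig)
    # kept: two or more words, 15+ characters, contained in the full title
    add_match(candidates, 'Home', orig)
    # dropped: a single word
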
@@ -71,13 +82,14 @@ def shorten_title(doc):
         if e.text_content():
             add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
+                 '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
@@ -103,13 +115,16 @@ def shorten_title(doc):
 
     return title
 
+
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (
+            raw_html,
+            cleaned))
         return raw_html
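
An end-to-end sketch of this last module (apparently htmls.py, given the cleaners and encoding imports), run under Python 2 since build_doc() and get_body() rely on the unicode builtin; 'article.html' is a placeholder file:

    from htmls import build_doc, get_body, shorten_title

    raw = open('article.html', 'rb').read()
    doc = build_doc(raw)
    print(shorten_title(doc))
    print(get_body(doc)[:200])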