diff --git a/readability/cleaners.py b/readability/cleaners.py
index 9b158c5..1415df1 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+             'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
     "([^>]*)"  # postfix
-    ">" # end
-, re.I)
+    ">",  # end
+    re.I)
+
 
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
-    if not s: return ''
-    """replace any sequence of whitespace
-    characters with a single space"""
+    """replace any sequence of whitespace characters with a single space"""
+    if not s:
+        return ''
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
-                       page_structure=False, processing_instructions=True, embedded=False,
-                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
+                       page_structure=False, processing_instructions=True,
+                       embedded=False, frames=False, forms=False,
+                       annoying_tags=False, remove_tags=None,
                        remove_unknown_tags=False, safe_attrs_only=False)
diff --git a/readability/debug.py b/readability/debug.py
index a5e644d..fbf13c0 100644
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -1,25 +1,32 @@
+uids = {}
+
+
 def save_to_file(text, filename):
     f = open(filename, 'wt')
-    f.write('')
+    f.write("""
+    """)
     f.write(text.encode('utf-8'))
     f.close()
 
-uids = {}
+
 def describe(node, depth=2):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
         if not node in uids:
-            uid = uids[node] = len(uids)+1
+            uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)
         name += "%02d" % (uid)
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
diff --git a/readability/encoding.py b/readability/encoding.py
index d05b7f4..2207495 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,21 +1,23 @@
 import re
 import chardet
 
+
 def get_encoding(page):
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
-        return enc # can't guess
+        return enc  # can't guess
     try:
         diff = text.decode(enc, 'ignore').encode(enc)
         sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
+        # 99% of utf-8
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
             return enc
     except UnicodeDecodeError:
         pass
     res = chardet.detect(text)
     enc = res['encoding']
-    #print '->', enc, "%.2f" % res['confidence']
+    # print '->', enc, "%.2f" % res['confidence']
     if enc == 'MacCyrillic':
         enc = 'cp1251'
     return enc
diff --git a/readability/htmls.py b/readability/htmls.py
index 97aa55b..1f8b5e3 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
 import logging
-import lxml.html
 import re
 
-logging.getLogger().setLevel(logging.DEBUG)
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
+
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
+utf8_parser = HTMLParser(encoding='utf-8')
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
     if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
     else:
         enc = get_encoding(page)
         page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
     return doc
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
 
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'—': '-',
         u'–': '-',
         u'\u00A0': ' ',
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
 
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title').text
     if not title:
         return '[no-title]'
-    
+
     return norm_title(title)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 def shorten_title(doc):
     title = doc.find('.//title').text
     if not title:
         return ''
-    
+
     title = orig = norm_title(title)
 
     candidates = set()
@@ -71,13 +82,14 @@
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
+                 '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
-    
+
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
@@ -103,13 +115,16 @@
 
     return title
 
+
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (
+            raw_html,
+            cleaned))
         return raw_html
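
Usage sketch (illustrative only, not part of the patch): the snippet below shows what the two helpers touched in readability/cleaners.py do. The sample markup and the expected output comments are assumptions based on the regexes above, not output captured from this repository.

    # illustrative sketch -- assumes the readability package is importable
    from readability.cleaners import clean_attributes, normalize_spaces

    # clean_attributes() re-applies the htmlstrip regex until no nuisance
    # attribute (width/height/style/...) is left on the element.
    html = '<img src="logo.png" width="300" height="200" style="float:left">'
    print(clean_attributes(html))  # expected: <img src="logo.png">

    # normalize_spaces() collapses any run of whitespace into single spaces.
    print(normalize_spaces('one\t two\n  three'))  # expected: one two three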