More pep8, almost there

Richard Harding 2012-04-17 14:14:02 -04:00
parent bbb60ed077
commit b498df200b
4 changed files with 69 additions and 39 deletions

View File

@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+             'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
     "([^>]*)"  # postfix
-    ">" # end
-    , re.I)
+    ">",  # end
+    re.I)
 
+
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
-    if not s: return ''
-    """replace any sequence of whitespace
-    characters with a single space"""
+    """replace any sequence of whitespace characters with a single space"""
+    if not s:
+        return ''
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
     style=True, links=True, meta=False, add_nofollow=False,
-    page_structure=False, processing_instructions=True, embedded=False,
-    frames=False, forms=False, annoying_tags=False, remove_tags=None,
+    page_structure=False, processing_instructions=True,
+    embedded=False, frames=False, forms=False,
+    annoying_tags=False, remove_tags=None,
     remove_unknown_tags=False, safe_attrs_only=False)
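
For a sense of what the reflowed helpers do, here is a small usage sketch; the sample markup is invented, and the module name cleaners is taken from the imports in the last file of this commit:

    from cleaners import clean_attributes, normalize_spaces

    # htmlstrip is reapplied until no nuisance attribute survives, so
    # width, bgcolor (matched by '[-a-z]*color') and style all vanish.
    dirty = '<td width="40" bgcolor="#ffffff" style="padding: 2px">x</td>'
    print(clean_attributes(dirty))  # -> '<td>x</td>'

    print(normalize_spaces(' a\t b \n c '))  # -> 'a b c'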

View File

@@ -1,25 +1,32 @@
+uids = {}
+
+
 def save_to_file(text, filename):
     f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write("""
+    <meta http-equiv="Content-Type"
+        content="text/html; charset=UTF-8"
+    />""")
     f.write(text.encode('utf-8'))
     f.close()
 
-uids = {}
+
 def describe(node, depth=2):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
         if not node in uids:
-            uid = uids[node] = len(uids)+1
+            uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)
         name += "%02d" % (uid)
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
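
describe() is a logging aid: it renders a node as tag#id.class, drops a leading 'div', numbers bare tr/td/div/p nodes through the module-level uids dict, and recurses up to depth ancestors. A minimal sketch, assuming the file is importable as debug (this view omits the filename):

    import lxml.html
    from debug import describe  # module name assumed

    root = lxml.html.fromstring(
        '<div id="page" class="news story"><p>hello</p></div>')
    print(describe(root.cssselect('p')[0]))
    # -> 'p01 - #page.news.story'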

View File

@ -1,21 +1,23 @@
import re
import chardet
def get_encoding(page):
text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
return enc # can't guess
return enc # can't guess
try:
diff = text.decode(enc, 'ignore').encode(enc)
sizes = len(diff), len(text)
if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
# 99% of utf-8
if abs(len(text) - len(diff)) < max(sizes) * 0.01:
return enc
except UnicodeDecodeError:
pass
res = chardet.detect(text)
enc = res['encoding']
#print '->', enc, "%.2f" % res['confidence']
# print '->', enc, "%.2f" % res['confidence']
if enc == 'MacCyrillic':
enc = 'cp1251'
return enc
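
The heuristic here: strip the markup, round-trip the raw bytes through utf-8, and accept utf-8 when less than 1% of the text is lost; otherwise defer to chardet, correcting its frequent MacCyrillic misdetection to cp1251. Typical use against a Python 2 byte string (the filename page.html is just an example):

    from encoding import get_encoding

    raw = open('page.html', 'rb').read()  # str (bytes) under Python 2
    enc = get_encoding(raw)
    text = raw.decode(enc, 'replace')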

View File

@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
 import logging
-import lxml.html
 import re
-
-logging.getLogger().setLevel(logging.DEBUG)
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
+
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+utf8_parser = HTMLParser(encoding='utf-8')
 
+
 def build_doc(page):
     if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
     else:
         enc = get_encoding(page)
         page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
     return doc
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
+
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'&mdash;': '-',
         u'&ndash;': '-',
         u'\u00A0': ' ',
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title').text
     if not title:
         return '[no-title]'
     return norm_title(title)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 def shorten_title(doc):
     title = doc.find('.//title').text
     if not title:
         return ''
     title = orig = norm_title(title)
     candidates = set()
@@ -71,13 +82,14 @@ def shorten_title(doc):
         if e.text_content():
             add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
+        '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
@@ -103,13 +115,16 @@ def shorten_title(doc):
     return title
 
+
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (
+            raw_html,
+            cleaned))
         return raw_html
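
Taken together, build_doc() normalizes any input to a utf-8-parsed lxml document, and the title/body helpers work on top of it. A sketch of the intended flow; the module name htmls is a guess (this view omits filenames), and urllib2 matches the Python 2 idioms (unicode()) used above:

    import urllib2

    from htmls import build_doc, get_title, shorten_title, get_body

    raw = urllib2.urlopen('http://example.com/').read()
    doc = build_doc(raw)       # decodes via get_encoding, reparses as utf-8
    print(get_title(doc))      # normalized <title> text, or '[no-title]'
    print(shorten_title(doc))  # best heading-like candidate, else the title
    body = get_body(doc)       # scripts/links/styles dropped, attrs cleaned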