From b498df200b3208d285a6dd91f1ae4a92fa16f7ce Mon Sep 17 00:00:00 2001
From: Richard Harding
Date: Tue, 17 Apr 2012 14:14:02 -0400
Subject: [PATCH] More pep8, almost there

---
 readability/cleaners.py |   32 ++++++++++++++++------------
 readability/debug.py    |   21 ++++++++++++------
 readability/encoding.py |    8 ++++---
 readability/htmls.py    |   47 +++++++++++++++++++++++++++--------------
 4 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/readability/cleaners.py b/readability/cleaners.py
index 9b158c5..1415df1 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+             'background[-a-z]*', 'on*']
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
 non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
     "([^>]*)"  # postfix
-    ">"  # end
-, re.I)
+    ">",  # end
+    re.I)
+
 
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
-    if not s: return ''
-    """replace any sequence of whitespace
-    characters with a single space"""
+    """replace any sequence of whitespace characters with a single space"""
+    if not s:
+        return ''
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
-                       page_structure=False, processing_instructions=True, embedded=False,
-                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
+                       page_structure=False, processing_instructions=True,
+                       embedded=False, frames=False, forms=False,
+                       annoying_tags=False, remove_tags=None,
                        remove_unknown_tags=False, safe_attrs_only=False)
diff --git a/readability/debug.py b/readability/debug.py
index a5e644d..fbf13c0 100644
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -1,25 +1,32 @@
+uids = {}
+
+
 def save_to_file(text, filename):
     f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write("""<meta http-equiv="Content-Type"
+        content="text/html;
+        charset=UTF-8" />""")
     f.write(text.encode('utf-8'))
     f.close()
 
-uids = {}
+
 def describe(node, depth=2):
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + \
+            node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
         if not node in uids:
-            uid = uids[node] = len(uids)+1
+            uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)
         name += "%02d" % (uid)
     if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        return name + ' - ' + describe(node.getparent(), depth - 1)
     return name
diff --git a/readability/encoding.py b/readability/encoding.py
index d05b7f4..2207495 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,21 +1,23 @@
 import re
 import chardet
 
+
 def get_encoding(page):
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
-        return enc # can't guess
+        return enc  # can't guess
     try:
         diff = text.decode(enc, 'ignore').encode(enc)
         sizes = len(diff), len(text)
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
+        # 99% of utf-8
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
             return enc
     except UnicodeDecodeError:
         pass
     res = chardet.detect(text)
     enc = res['encoding']
-    #print '->', enc, "%.2f" % res['confidence']
+    # print '->', enc, "%.2f" % res['confidence']
     if enc == 'MacCyrillic':
         enc = 'cp1251'
     return enc
diff --git a/readability/htmls.py b/readability/htmls.py
index 97aa55b..1f8b5e3 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
 import logging
-import lxml.html
 import re
 
-logging.getLogger().setLevel(logging.DEBUG)
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
+
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
+utf8_parser = HTMLParser(encoding='utf-8')
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 def build_doc(page):
     if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
     else:
         enc = get_encoding(page)
         page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
     return doc
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
 
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'—': '-',
         u'–': '-',
         u'\u00A0': ' ',
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title').text
     if not title:
         return '[no-title]'
-    
+
     return norm_title(title)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 def shorten_title(doc):
     title = doc.find('.//title').text
     if not title:
         return ''
-    
+
     title = orig = norm_title(title)
 
     candidates = set()
@@ -71,13 +82,14 @@ def shorten_title(doc):
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
+            '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
-    
+
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
@@ -103,13 +115,16 @@ def shorten_title(doc):
     return title
 
+
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        logging.error("cleansing broke html content: %s\n---------\n%s" % (
+            raw_html,
+            cleaned))
         return raw_html
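
For reference, a minimal sketch of how the helpers touched by this patch are typically wired together (Python 2, matching the modules above; the sample HTML string, the outputs noted in the comments, and the assumption that the readability package is importable are illustrative only, not part of the patch):

    from readability.cleaners import clean_attributes, normalize_spaces
    from readability.htmls import build_doc, get_title, get_body

    # Made-up input page used only for this example.
    page = ('<html><head><title>Example article - Example Site</title></head>'
            '<body><h1>Example article</h1>'
            '<p style="color: red" width="80">Hello world.</p></body></html>')

    doc = build_doc(page)   # str or unicode in, lxml document out (utf-8 parser)
    print get_title(doc)    # normalized <title> text
    print get_body(doc)     # body serialized with script/link/style dropped

    # clean_attributes() keeps applying htmlstrip.sub() until no nuisance
    # attribute (width, height, style, colors, on* handlers) is left:
    print clean_attributes('<p style="color: red" width="80">hi</p>')  # <p>hi</p>
    print normalize_spaces('too   much\n whitespace')  # 'too much whitespace'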