diff --git a/readability/cleaners.py b/readability/cleaners.py
index 9b158c5..1415df1 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,32 +1,38 @@
-# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
+# strip out a set of nuisance html attributes that can mess up rendering in
+# RSS feeds
import re
from lxml.html.clean import Cleaner
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color',
+ 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
- "([^>]+) " # prefix
- "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
- '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
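+# match a whole tag carrying one of the bad attributes, capturing the
+# text before and after the attribute so the tag can be rebuilt without it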
+htmlstrip = re.compile("<" # open
+ "([^>]+) " # prefix
+ "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
+ '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
"([^>]*)" # postfix
- ">" # end
-, re.I)
+ ">", # end
+ re.I)
+
def clean_attributes(html):
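+    # each pass strips one offending attribute per tag, so loop until
+    # nothing matches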
while htmlstrip.search(html):
html = htmlstrip.sub('<\\1\\2>', html)
return html
+
def normalize_spaces(s):
- if not s: return ''
- """replace any sequence of whitespace
- characters with a single space"""
+ """replace any sequence of whitespace characters with a single space"""
+ if not s:
+ return ''
return ' '.join(s.split())
+
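+# shared lxml Cleaner: drops scripts, javascript, comments, styles and
+# <link> tags, but leaves meta tags and page structure intact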
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
- page_structure=False, processing_instructions=True, embedded=False,
- frames=False, forms=False, annoying_tags=False, remove_tags=None,
+ page_structure=False, processing_instructions=True,
+ embedded=False, frames=False, forms=False,
+ annoying_tags=False, remove_tags=None,
remove_unknown_tags=False, safe_attrs_only=False)
diff --git a/readability/debug.py b/readability/debug.py
index a5e644d..fbf13c0 100644
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -1,25 +1,32 @@
+uids = {}
+
+
def save_to_file(text, filename):
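+    # prepend a charset hint so browsers render the dump as UTF-8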
f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write("""<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    """)
f.write(text.encode('utf-8'))
f.close()
-uids = {}
+
def describe(node, depth=2):
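+    # label a node as tag#id.class, numbering bare tr/td/div/p tags and
+    # appending up to `depth` ancestors for context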
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
- if node.get('id', ''): name += '#'+node.get('id')
- if node.get('class', ''):
- name += '.' + node.get('class').replace(' ','.')
+ if node.get('id', ''):
+ name += '#' + node.get('id')
+ if node.get('class', ''):
+ name += '.' + node.get('class').replace(' ', '.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if name in ['tr', 'td', 'div', 'p']:
-        if not node in uids:
-            uid = uids[node] = len(uids)+1
+        if node not in uids:
+            uid = uids[node] = len(uids) + 1
else:
uid = uids.get(node)
name += "%02d" % (uid)
if depth and node.getparent() is not None:
- return name+' - '+describe(node.getparent(), depth-1)
+ return name + ' - ' + describe(node.getparent(), depth - 1)
return name
diff --git a/readability/encoding.py b/readability/encoding.py
index d05b7f4..2207495 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -1,21 +1,23 @@
import re
import chardet
+
def get_encoding(page):
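+    # strip tags first so markup noise does not skew the detector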
    text = re.sub('</?[^>]*>\s*', ' ', page)
enc = 'utf-8'
if not text.strip() or len(text) < 10:
- return enc # can't guess
+ return enc # can't guess
try:
diff = text.decode(enc, 'ignore').encode(enc)
sizes = len(diff), len(text)
- if abs(len(text) - len(diff)) < max(sizes) * 0.01: # 99% of utf-8
+ # 99% of utf-8
+ if abs(len(text) - len(diff)) < max(sizes) * 0.01:
return enc
except UnicodeDecodeError:
pass
res = chardet.detect(text)
enc = res['encoding']
- #print '->', enc, "%.2f" % res['confidence']
+ # print '->', enc, "%.2f" % res['confidence']
if enc == 'MacCyrillic':
enc = 'cp1251'
return enc
diff --git a/readability/htmls.py b/readability/htmls.py
index 97aa55b..1f8b5e3 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -1,13 +1,17 @@
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
-from lxml.html import tostring
import logging
-import lxml.html
import re
-logging.getLogger().setLevel(logging.DEBUG)
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+from lxml.html import tostring
+
+from cleaners import clean_attributes
+from cleaners import normalize_spaces
+from encoding import get_encoding
+
+logging.getLogger().setLevel(logging.DEBUG)
+utf8_parser = HTMLParser(encoding='utf-8')
+
def build_doc(page):
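+    # whatever the input, hand the utf-8 parser above a utf-8 byte string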
if isinstance(page, unicode):
@@ -15,17 +19,20 @@ def build_doc(page):
else:
enc = get_encoding(page)
page_unicode = page.decode(enc, 'replace')
- doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+ doc = document_fromstring(
+ page_unicode.encode('utf-8', 'replace'),
+ parser=utf8_parser)
return doc
+
def js_re(src, pattern, flags, repl):
-    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+    # sub() expects the replacement first, then the string to search
+    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
+
def normalize_entities(cur_title):
entities = {
- u'\u2014':'-',
- u'\u2013':'-',
+ u'\u2014': '-',
+ u'\u2013': '-',
-        u'—': '-',
-        u'–': '-',
u'\u00A0': ' ',
@@ -39,27 +46,31 @@ def normalize_entities(cur_title):
return cur_title
+
def norm_title(title):
return normalize_entities(normalize_spaces(title))
+
def get_title(doc):
-    title = doc.find('.//title').text
-    if not title:
+    title = doc.find('.//title')
+    if title is None or not title.text:
        return '[no-title]'
-
-    return norm_title(title)
+
+    return norm_title(title.text)
+
def add_match(collection, text, orig):
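+    # keep only multi-word candidates that actually occur in the title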
text = norm_title(text)
if len(text.split()) >= 2 and len(text) >= 15:
if text.replace('"', '') in orig.replace('"', ''):
collection.add(text)
+
def shorten_title(doc):
-    title = doc.find('.//title').text
-    if not title:
+    title = doc.find('.//title')
+    if title is None or not title.text:
        return ''
-
-    title = orig = norm_title(title)
+
+    title = orig = norm_title(title.text)
candidates = set()
@@ -71,13 +82,14 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)
- for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in ['#title', '#head', '#heading', '.pageTitle',
+                 '.news_title', '.title', '.head', '.heading',
+                 '.contentheading', '.small_header_red']:
for e in doc.cssselect(item):
if e.text:
add_match(candidates, e.text, orig)
if e.text_content():
add_match(candidates, e.text_content(), orig)
-
+
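+    # the longest candidate is most likely the complete headline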
if candidates:
title = sorted(candidates, key=len)[-1]
else:
@@ -103,13 +115,16 @@ def shorten_title(doc):
return title
+
def get_body(doc):
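+    # drop nodes that never contribute readable body text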
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    for elem in doc.xpath('.//script | .//link | .//style'):
+        elem.drop_tree()
raw_html = unicode(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
-        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+        # BeautifulSoup(cleaned)  # FIXME do we really need to try loading it?
return cleaned
- except Exception: #FIXME find the equivalent lxml error
- logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+ except Exception: # FIXME find the equivalent lxml error
+ logging.error("cleansing broke html content: %s\n---------\n%s" % (
+ raw_html,
+ cleaned))
return raw_html