diff --git a/readability/readability.py b/readability/readability.py
index ae760c5..0a40198 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -1,21 +1,32 @@
#!/usr/bin/env python
-from cleaners import html_cleaner, clean_attributes
-from collections import defaultdict
-from htmls import build_doc, get_body, get_title, shorten_title
-from lxml.etree import tostring, tounicode
-from lxml.html import fragment_fromstring, document_fromstring
import logging
import re
import sys
+from collections import defaultdict
+from lxml.etree import tostring
+from lxml.etree import tounicode
+from lxml.html import document_fromstring
+from lxml.html import fragment_fromstring
+
+from cleaners import clean_attributes
+from cleaners import html_cleaner
+from htmls import build_doc
+from htmls import get_body
+from htmls import get_title
+from htmls import shorten_title
+
+
logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
REGEXES = {
- 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',re.I),
- 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow',re.I),
- 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',re.I),
- 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',re.I),
- 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
+ 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
+ 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
+ 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
+ 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I),
+ 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
#'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile('^\s+|\s+$/'),
@@ -25,21 +36,29 @@ REGEXES = {
#skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
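
# For context, a minimal sketch of how these patterns are applied during
# cleaning: an element whose class/id string matches unlikelyCandidatesRe
# but not okMaybeItsACandidateRe is dropped before scoring. The helper
# below is illustrative, not code from this patch.
def remove_unlikely_candidates(html):
    for elem in html.iter():
        s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
        if (REGEXES['unlikelyCandidatesRe'].search(s)
                and not REGEXES['okMaybeItsACandidateRe'].search(s)
                and elem.tag != 'body'):
            elem.drop_tree()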
+
+class Unparseable(ValueError):
+ pass
+
+
def describe(node, depth=1):
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
- if node.get('id', ''): name += '#'+node.get('id')
+ if node.get('id', ''):
+ name += '#' + node.get('id')
if node.get('class', ''):
- name += '.' + node.get('class').replace(' ','.')
+ name += '.' + node.get('class').replace(' ', '.')
if name[:4] in ['div#', 'div.']:
name = name[3:]
if depth and node.getparent() is not None:
- return name+' - '+describe(node.getparent(), depth-1)
+ return name + ' - ' + describe(node.getparent(), depth - 1)
return name
+
def to_int(x):
- if not x: return None
+ if not x:
+ return None
x = x.strip()
if x.endswith('px'):
return int(x[:-2])
@@ -47,26 +66,37 @@ def to_int(x):
return int(x[:-2]) * 12
return int(x)
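
# Illustrative examples of to_int's contract as defined above, where an
# 'em' is treated as 12px:
#   to_int('12px') -> 12
#   to_int('2em')  -> 24
#   to_int(None)   -> None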
+
def clean(text):
text = re.sub('\s*\n\s*', '\n', text)
text = re.sub('[ \t]{2,}', ' ', text)
return text.strip()
+
def text_length(i):
return len(clean(i.text_content() or ""))
-class Unparseable(ValueError):
- pass
class Document:
+ """Class to build a etree document out of html."""
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(self, input, **options):
+ """Generate the document
+
+ :param input: string of the html content.
+
+ kwargs:
+ - attributes:
+ - debug: output debug messages
+ - min_text_length:
+ - retry_length:
+ - url: will allow adjusting links to be absolute
+
+ """
self.input = input
- self.options = defaultdict(lambda: None)
- for k, v in options.items():
- self.options[k] = v
+ self.options = options
self.html = None
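
    # A hedged usage sketch based on the docstring above; html_string and
    # the url value are placeholders rather than values from this patch:
    #
    #     doc = Document(html_string, url='http://example.com/page')
    #     print doc.short_title()
    #     print doc.summary(document_only=True)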
def _html(self, force=False):
@@ -77,7 +107,7 @@ class Document:
def _parse(self, input):
doc = build_doc(input)
doc = html_cleaner.clean_html(doc)
- base_href = self.options['url']
+ base_href = self.options.get('url', None)
if base_href:
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
@@ -94,6 +124,12 @@ class Document:
return shorten_title(self._html(True))
def summary(self, document_only=False):
+ """Generate the summary of the html docuemnt
+
+ :param document_only: return only the div of the document, don't wrap
+ in html and body tags.
+
+ """
try:
ruthless = True
while True:
@@ -114,32 +150,43 @@ class Document:
document_only=document_only)
else:
if ruthless:
- logging.debug("ruthless removal did not work. ")
+                    log.debug("ruthless removal did not work.")
ruthless = False
- self.debug("ended up stripping too much - going for a safer _parse")
+ self.debug(
+ ("ended up stripping too much - "
+ "going for a safer _parse"))
# try again
continue
else:
- logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
+ log.debug(
+ ("Ruthless and lenient parsing did not work. "
+ "Returning raw html"))
article = self.html.find('body')
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
- of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
+ article_length = len(cleaned_article or '')
+ retry_length = self.options.get(
+ 'retry_length',
+ self.RETRY_LENGTH)
+ of_acceptable_length = article_length >= retry_length
if ruthless and not of_acceptable_length:
ruthless = False
- continue # try again
+ # Loop through and try again.
+ continue
else:
return cleaned_article
except StandardError, e:
- #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
- logging.exception('error getting summary: ' )
+ log.exception('error getting summary: ')
raise Unparseable(str(e)), None, sys.exc_info()[2]
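
    # Caller-side handling implied by the raise above: summary() converts any
    # StandardError into Unparseable, so callers need only catch one type.
    # A sketch, with html_string as a placeholder:
    #
    #     try:
    #         text = Document(html_string).summary()
    #     except Unparseable:
    #         text = None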
def get_article(self, candidates, best_candidate, document_only=False):
- # Now that we have the top candidate, look through its siblings for content that might also be related.
+ # Now that we have the top candidate, look through its siblings for
+ # content that might also be related.
# Things like preambles, content split by ads that we removed, etc.
- sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
+ sibling_score_threshold = max([
+ 10,
+ best_candidate['content_score'] * 0.2])
# create a new html document with a html->body->div
if document_only:
            output = fragment_fromstring('<div/>')
        else:
            output = document_fromstring('<div/>')
- #FIXME: The current implementation ignores all descendants that are not direct children of elem
-            # This results in incorrect results in case there is an <img> buried within an <a> for example
- if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+ # transform