failsafe parsing and more logging

2010-04-30 22:33:22 +10:00 · 2010-04-30 22:33:22 +10:00 · 0eacd959a4
commit 0eacd959a4
parent 87ad057706
1 changed file with 32 additions and 27 deletions
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -28,17 +28,17 @@ class Document:
 	TEXT_LENGTH_THRESHOLD = 25
 	RETRY_LENGTH = 250
-	def __init__(self, input, **options):
+	def __init__(self, input, notify=None, **options):
-		self.input = input
+		self.input = input
 		self.options = defaultdict(lambda: None)
 		for k, v in options.items():
 			self.options[k] = v
 		self.notify = notify or logging.info
 		self.html = None
 	def _html(self, force=False):
 		if force or self.html is None:
-			notify = self.options['notify'] or (lambda x: None)
+			self.html = parse(self.input, self.options['url'], notify=self.notify)
 			self.html = parse(self.input, self.options['url'], notify=notify)
 		return self.html
 	def content(self):
@@ -48,32 +48,36 @@ class Document:
 		return get_title(self._html())
 	def summary(self):
-		ruthless = True
+		try:
-		while True:
+			ruthless = True
-			self._html(True)
+			while True:
-			[i.extract() for i in self.tags(self.html, 'script', 'style')]
+				self._html(True)
 				[i.extract() for i in self.tags(self.html, 'script', 'style')]
-			if ruthless: self.remove_unlikely_candidates()
+				if ruthless: self.remove_unlikely_candidates()
-			self.transform_misused_divs_into_paragraphs()
+				self.transform_misused_divs_into_paragraphs()
-			candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
+				candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
-			best_candidate = self.select_best_candidate(candidates)
+				best_candidate = self.select_best_candidate(candidates)
-			if best_candidate:
+				if best_candidate:
-				article = self.get_article(candidates, best_candidate)
+					article = self.get_article(candidates, best_candidate)
 			else:
 				if ruthless:
 					ruthless = False
 					# try again
 					continue
 				else:
-					article = self.html.find('body') or self.html
+					if ruthless:
 						ruthless = False
 						# try again
 						continue
 					else:
 						article = self.html.find('body') or self.html
-			cleaned_article = self.sanitize(article, candidates)
+				cleaned_article = self.sanitize(article, candidates)
-			of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
+				of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
-			if ruthless and not of_acceptable_length:
+				if ruthless and not of_acceptable_length:
-				ruthless = False
+					ruthless = False
-				continue # try again
+					continue # try again
-			else:
+				else:
-				return cleaned_article
+					return cleaned_article
 		except StandardError, e:
 			logging.exception('error getting summary:')
 			raise Unparseable(str(e))
 	def get_article(self, candidates, best_candidate):
 		# Now that we have the top candidate, look through its siblings for content that might also be related.
@@ -322,6 +326,7 @@ def main():
 	if not (len(args) == 1 or options.url):
 		parser.print_help()
 		sys.exit(1)
 	logging.basicConfig(level=logging.DEBUG)
 	file = None
 	if options.url: