clean up content method and debug

15 years ago · c952f421b7
parent c0ca60ee26
commit c952f421b7
1 changed files with 26 additions and 26 deletions
--- a/readability/readability.py
+++ b/readability/readability.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python
 from BeautifulSoup import NavigableString
 from page_parser import parse
 import logging
 import re
 REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
@ -32,28 +33,32 @@ class Document:
 		self.options = defaultdict(lambda: None)
 		for k, v in options.items():
 			self.options[k] = v
 		self.make_html()
 	def make_html(self):
 		self.html = parse(self.input, self.options['url'])
-	def content(self, remove_unlikely_candidates = True):
+	def content(self):
-		def remove(tag): [i.extract() for i in self.html.findAll(tag)]
+		ruthless = True
-		remove('script')
+		while True:
 		remove('style')
 		if remove_unlikely_candidates: self.remove_unlikely_candidates()
 		self.transform_misused_divs_into_paragraphs()
 		candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
 		best_candidate = self.select_best_candidate(candidates)
 		article = self.get_article(candidates, best_candidate)
 		cleaned_article = self.sanitize(article, candidates)
 		if remove_unlikely_candidates and len(cleaned_article or '') < (self.options['retry_length'] or self.RETRY_LENGTH):
 			self.make_html()
-			return self.content(False)
+			[i.extract() for i in self.tags(self.html, 'script', 'style')]
-		else:
+
-			return cleaned_article
+			if ruthless: self.remove_unlikely_candidates()
 			self.transform_misused_divs_into_paragraphs()
 			candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
 			best_candidate = self.select_best_candidate(candidates)
 			if ruthless and best_candidate is None:
 				ruthless = False
 				continue
 			article = self.get_article(candidates, best_candidate)
 			cleaned_article = self.sanitize(article, candidates)
 			of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
 			if ruthless and not of_acceptable_length:
 				ruthless = False
 				continue # try again
 			else:
 				return cleaned_article
 	def get_article(self, candidates, best_candidate):
 		# Now that we have the top candidate, look through its siblings for content that might also be related.
@ -87,18 +92,13 @@ class Document:
 	def select_best_candidate(self, candidates):
 		sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
 		self.debug("Top 5 canidates:")
 		for candidate in sorted_candidates[:5]:
 			elem = candidate['elem']
-			self.debug("Candidate %s with score %s" % (
+			self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
 				describe(elem), candidate['content_score']))
 		best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
-		elem = best_candidate['elem']
+		self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
 		self.debug("Best candidate %s#%s.%s with score %s" % (
 			elem.name, elem.get('id',''), elem.get('class',''), best_candidate['content_score']))
 		return best_candidate
 	def get_link_density(self, elem):
@ -173,9 +173,9 @@ class Document:
 			content_score -= 5
 		return { 'content_score': content_score, 'elem': elem }
-	def debug(self, str):
+	def debug(self, *a):
 		if self.options['debug']:
-			print(str)
+			logging.debug(*a)
 	def remove_unlikely_candidates(self):
 		for elem in self.html.findAll():