failsafe parsing and more logging

This commit is contained in:
gfxmonk 2010-04-30 22:33:22 +10:00
parent 87ad057706
commit 0eacd959a4

View File

@ -28,17 +28,17 @@ class Document:
TEXT_LENGTH_THRESHOLD = 25 TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250 RETRY_LENGTH = 250
def __init__(self, input, **options): def __init__(self, input, notify=None, **options):
self.input = inpuunicodear self.input = input
self.options = defaultdict(lambda: None) self.options = defaultdict(lambda: None)
for k, v in options.items(): for k, v in options.items():
self.options[k] = v self.options[k] = v
self.notify = notify or logging.info
self.html = None self.html = None
def _html(self, force=False): def _html(self, force=False):
if force or self.html is None: if force or self.html is None:
notify = self.options['notify'] or (lambda x: None) self.html = parse(self.input, self.options['url'], notify=self.notify)
self.html = parse(self.input, self.options['url'], notify=notify)
return self.html return self.html
def content(self): def content(self):
@ -48,32 +48,36 @@ class Document:
return get_title(self._html()) return get_title(self._html())
def summary(self): def summary(self):
ruthless = True try:
while True: ruthless = True
self._html(True) while True:
[i.extract() for i in self.tags(self.html, 'script', 'style')] self._html(True)
[i.extract() for i in self.tags(self.html, 'script', 'style')]
if ruthless: self.remove_unlikely_candidates() if ruthless: self.remove_unlikely_candidates()
self.transform_misused_divs_into_paragraphs() self.transform_misused_divs_into_paragraphs()
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
best_candidate = self.select_best_candidate(candidates) best_candidate = self.select_best_candidate(candidates)
if best_candidate: if best_candidate:
article = self.get_article(candidates, best_candidate) article = self.get_article(candidates, best_candidate)
else:
if ruthless:
ruthless = False
# try again
continue
else: else:
article = self.html.find('body') or self.html if ruthless:
ruthless = False
# try again
continue
else:
article = self.html.find('body') or self.html
cleaned_article = self.sanitize(article, candidates) cleaned_article = self.sanitize(article, candidates)
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH) of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
if ruthless and not of_acceptable_length: if ruthless and not of_acceptable_length:
ruthless = False ruthless = False
continue # try again continue # try again
else: else:
return cleaned_article return cleaned_article
except StandardError, e:
logging.exception('error getting summary:')
raise Unparseable(str(e))
def get_article(self, candidates, best_candidate): def get_article(self, candidates, best_candidate):
# Now that we have the top candidate, look through its siblings for content that might also be related. # Now that we have the top candidate, look through its siblings for content that might also be related.
@ -322,6 +326,7 @@ def main():
if not (len(args) == 1 or options.url): if not (len(args) == 1 or options.url):
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
logging.basicConfig(level=logging.DEBUG)
file = None file = None
if options.url: if options.url: