failsafe parsing and more logging

pull/1/head
gfxmonk 15 years ago
parent 87ad057706
commit 0eacd959a4

@ -28,17 +28,17 @@ class Document:
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
def __init__(self, input, **options):
self.input = input
def __init__(self, input, notify=None, **options):
self.input = input
self.options = defaultdict(lambda: None)
for k, v in options.items():
self.options[k] = v
self.notify = notify or logging.info
self.html = None
def _html(self, force=False):
if force or self.html is None:
notify = self.options['notify'] or (lambda x: None)
self.html = parse(self.input, self.options['url'], notify=notify)
self.html = parse(self.input, self.options['url'], notify=self.notify)
return self.html
def content(self):
@ -48,6 +48,7 @@ class Document:
return get_title(self._html())
def summary(self):
try:
ruthless = True
while True:
self._html(True)
@ -74,6 +75,9 @@ class Document:
continue # try again
else:
return cleaned_article
except StandardError, e:
logging.exception('error getting summary:')
raise Unparseable(str(e))
def get_article(self, candidates, best_candidate):
# Now that we have the top candidate, look through its siblings for content that might also be related.
@ -322,6 +326,7 @@ def main():
if not (len(args) == 1 or options.url):
parser.print_help()
sys.exit(1)
logging.basicConfig(level=logging.DEBUG)
file = None
if options.url:

Loading…
Cancel
Save