failsafe parsing and more logging
This commit is contained in:
parent
87ad057706
commit
0eacd959a4
@ -28,17 +28,17 @@ class Document:
|
|||||||
TEXT_LENGTH_THRESHOLD = 25
|
TEXT_LENGTH_THRESHOLD = 25
|
||||||
RETRY_LENGTH = 250
|
RETRY_LENGTH = 250
|
||||||
|
|
||||||
def __init__(self, input, **options):
|
def __init__(self, input, notify=None, **options):
|
||||||
self.input = input
|
self.input = input
|
||||||
self.options = defaultdict(lambda: None)
|
self.options = defaultdict(lambda: None)
|
||||||
for k, v in options.items():
|
for k, v in options.items():
|
||||||
self.options[k] = v
|
self.options[k] = v
|
||||||
|
self.notify = notify or logging.info
|
||||||
self.html = None
|
self.html = None
|
||||||
|
|
||||||
def _html(self, force=False):
|
def _html(self, force=False):
|
||||||
if force or self.html is None:
|
if force or self.html is None:
|
||||||
notify = self.options['notify'] or (lambda x: None)
|
self.html = parse(self.input, self.options['url'], notify=self.notify)
|
||||||
self.html = parse(self.input, self.options['url'], notify=notify)
|
|
||||||
return self.html
|
return self.html
|
||||||
|
|
||||||
def content(self):
|
def content(self):
|
||||||
@ -48,32 +48,36 @@ class Document:
|
|||||||
return get_title(self._html())
|
return get_title(self._html())
|
||||||
|
|
||||||
def summary(self):
|
def summary(self):
|
||||||
ruthless = True
|
try:
|
||||||
while True:
|
ruthless = True
|
||||||
self._html(True)
|
while True:
|
||||||
[i.extract() for i in self.tags(self.html, 'script', 'style')]
|
self._html(True)
|
||||||
|
[i.extract() for i in self.tags(self.html, 'script', 'style')]
|
||||||
|
|
||||||
if ruthless: self.remove_unlikely_candidates()
|
if ruthless: self.remove_unlikely_candidates()
|
||||||
self.transform_misused_divs_into_paragraphs()
|
self.transform_misused_divs_into_paragraphs()
|
||||||
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
|
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
|
||||||
best_candidate = self.select_best_candidate(candidates)
|
best_candidate = self.select_best_candidate(candidates)
|
||||||
if best_candidate:
|
if best_candidate:
|
||||||
article = self.get_article(candidates, best_candidate)
|
article = self.get_article(candidates, best_candidate)
|
||||||
else:
|
|
||||||
if ruthless:
|
|
||||||
ruthless = False
|
|
||||||
# try again
|
|
||||||
continue
|
|
||||||
else:
|
else:
|
||||||
article = self.html.find('body') or self.html
|
if ruthless:
|
||||||
|
ruthless = False
|
||||||
|
# try again
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
article = self.html.find('body') or self.html
|
||||||
|
|
||||||
cleaned_article = self.sanitize(article, candidates)
|
cleaned_article = self.sanitize(article, candidates)
|
||||||
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
|
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
|
||||||
if ruthless and not of_acceptable_length:
|
if ruthless and not of_acceptable_length:
|
||||||
ruthless = False
|
ruthless = False
|
||||||
continue # try again
|
continue # try again
|
||||||
else:
|
else:
|
||||||
return cleaned_article
|
return cleaned_article
|
||||||
|
except StandardError, e:
|
||||||
|
logging.exception('error getting summary:')
|
||||||
|
raise Unparseable(str(e))
|
||||||
|
|
||||||
def get_article(self, candidates, best_candidate):
|
def get_article(self, candidates, best_candidate):
|
||||||
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
||||||
@ -322,6 +326,7 @@ def main():
|
|||||||
if not (len(args) == 1 or options.url):
|
if not (len(args) == 1 or options.url):
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
file = None
|
file = None
|
||||||
if options.url:
|
if options.url:
|
||||||
|
Loading…
Reference in New Issue
Block a user