From 95852d5c1828823e2706cff792229f8b112f2047 Mon Sep 17 00:00:00 2001 From: Andrey Popp <8mayday@gmail.com> Date: Thu, 7 Jun 2012 14:08:09 +0400 Subject: [PATCH] readability.htmls: some docs do not have title elem --- readability/htmls.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/readability/htmls.py b/readability/htmls.py index 97aa55b..cb4ada7 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -43,11 +43,11 @@ def norm_title(title): return normalize_entities(normalize_spaces(title)) def get_title(doc): - title = doc.find('.//title').text - if not title: + title = doc.find('.//title') + if not title or not title.text: return '[no-title]' - - return norm_title(title) + + return norm_title(title.text) def add_match(collection, text, orig): text = norm_title(text) @@ -56,11 +56,11 @@ def add_match(collection, text, orig): collection.add(text) def shorten_title(doc): - title = doc.find('.//title').text - if not title: + title = doc.find('.//title') + if not title or not title.text: return '' - - title = orig = norm_title(title) + + title = orig = norm_title(title.text) candidates = set() @@ -77,7 +77,7 @@ def shorten_title(doc): add_match(candidates, e.text, orig) if e.text_content(): add_match(candidates, e.text_content(), orig) - + if candidates: title = sorted(candidates, key=len)[-1] else: