readability.htmls: some docs do not have title elem
This commit is contained in:
parent
274b60cdb1
commit
95852d5c18
@ -43,11 +43,11 @@ def norm_title(title):
|
||||
return normalize_entities(normalize_spaces(title))
|
||||
|
||||
def get_title(doc):
    """Return the normalized text of the document's ``<title>`` element.

    Searches ``doc`` for the first ``title`` element (``.//title``) and
    passes its text through ``norm_title``.

    Returns the placeholder string ``'[no-title]'`` when the document has
    no ``title`` element, or when that element has no text.
    """
    title = doc.find('.//title')
    # Test for a missing element with ``is None`` rather than ``not title``:
    # lxml/ElementTree elements evaluate as false when they have no child
    # elements, so a truthiness test would reject almost every real <title>.
    if title is None or not title.text:
        return '[no-title]'

    return norm_title(title.text)
|
||||
|
||||
def add_match(collection, text, orig):
|
||||
text = norm_title(text)
|
||||
@ -56,11 +56,11 @@ def add_match(collection, text, orig):
|
||||
collection.add(text)
|
||||
|
||||
def shorten_title(doc):
|
||||
title = doc.find('.//title').text
|
||||
if not title:
|
||||
title = doc.find('.//title')
|
||||
if not title or not title.text:
|
||||
return ''
|
||||
|
||||
title = orig = norm_title(title)
|
||||
|
||||
title = orig = norm_title(title.text)
|
||||
|
||||
candidates = set()
|
||||
|
||||
@ -77,7 +77,7 @@ def shorten_title(doc):
|
||||
add_match(candidates, e.text, orig)
|
||||
if e.text_content():
|
||||
add_match(candidates, e.text_content(), orig)
|
||||
|
||||
|
||||
if candidates:
|
||||
title = sorted(candidates, key=len)[-1]
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user