# readability/htmls.py -- title extraction and HTML serialization helpers

import logging
import re
from lxml.html import document_fromstring
from lxml.html import HTMLParser
from lxml.html import tostring
from cleaners import clean_attributes
from cleaners import normalize_spaces
from encoding import get_encoding
logging.getLogger().setLevel(logging.DEBUG)
utf8_parser = HTMLParser(encoding='utf-8')
def build_doc(page):
    """Parse *page* (unicode or byte-string HTML) into an lxml document.

    Byte strings are decoded using the charset detected by
    ``get_encoding``; the text is then re-encoded as UTF-8 so it matches
    the module-level ``utf8_parser``.
    """
    if isinstance(page, unicode):
        text = page
    else:
        detected = get_encoding(page)
        text = page.decode(detected, 'replace')
    # Round-trip through UTF-8 bytes because the shared parser is
    # configured with encoding='utf-8'.
    return document_fromstring(text.encode('utf-8', 'replace'),
                               parser=utf8_parser)
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex substitution to *src*.

    :param src: subject string to search and replace in
    :param pattern: regular-expression pattern source
    :param flags: ``re`` module flags (e.g. ``re.I``)
    :param repl: replacement template; JS-style ``$1`` group references
        are converted to Python's ``\\1`` syntax
    :returns: *src* with every match of *pattern* replaced
    """
    # Bug fix: the original called .sub(src, repl...), i.e. it used *src*
    # as the replacement template and substituted into *repl*.
    # Pattern.sub() takes (repl, string) in that order.
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
def normalize_entities(cur_title):
    """Replace typographic punctuation in *cur_title* with ASCII equivalents.

    Em/en dashes become hyphens, non-breaking spaces become plain spaces,
    and guillemets become straight double quotes.
    """
    entities = {
        u'\u2014': '-',   # em dash
        u'\u2013': '-',   # en dash
        u'\u00A0': ' ',   # non-breaking space
        u'\u00AB': '"',   # left-pointing guillemet
        u'\u00BB': '"',   # right-pointing guillemet
    }
    # .items() instead of the Python-2-only .iteritems(): identical
    # behaviour on Python 2, and keeps the module importable on Python 3.
    # The original dict also repeated the dash keys as literal characters
    # (u'\u2014' == u'—') and mapped '"' to itself -- both redundant.
    for char, replacement in entities.items():
        if char in cur_title:
            cur_title = cur_title.replace(char, replacement)
    return cur_title
def norm_title(title):
    """Collapse runs of whitespace in *title*, then ASCII-fy punctuation."""
    collapsed = normalize_spaces(title)
    return normalize_entities(collapsed)
def get_title(doc):
    """Return the normalized text of the document's ``<title>`` element.

    :param doc: an lxml document (anything supporting ``.find()``)
    :returns: the normalized title, or the placeholder ``'[no-title]'``
        when the document has no ``<title>`` element or its text is empty
    """
    # Bug fix: find() returns None when the element is absent; the
    # original dereferenced .text unconditionally and raised
    # AttributeError on title-less documents.
    title_elem = doc.find('.//title')
    if title_elem is None or not title_elem.text:
        return '[no-title]'
    return norm_title(title_elem.text)
def add_match(collection, text, orig):
    """Add *text* to *collection* when it is a plausible title candidate.

    A candidate must be at least 15 characters, contain at least two
    words, and occur inside *orig* once double quotes are ignored on
    both sides.
    """
    text = norm_title(text)
    substantial = len(text) >= 15 and len(text.split()) >= 2
    if substantial and text.replace('"', '') in orig.replace('"', ''):
        collection.add(text)
def shorten_title(doc):
    """Return a shortened, cleaned-up version of the document's title.

    Prefers the longest heading (``h1``-``h3`` or common title-ish CSS
    selectors) whose text repeats the ``<title>``; otherwise strips a
    site-name segment around common delimiters (``|``, ``-``, ``::``,
    ``/``, ``:``).  Returns ``''`` when there is no title at all, and
    falls back to the full title when the shortened form is implausibly
    short (<=15 chars) or long (>=150 chars).
    """
    # Bug fix: find() returns None for documents without a <title>; the
    # original dereferenced .text unconditionally and raised
    # AttributeError instead of returning ''.
    title_elem = doc.find('.//title')
    if title_elem is None or not title_elem.text:
        return ''

    title = orig = norm_title(title_elem.text)

    # Collect heading texts that plausibly repeat the page title.
    candidates = set()
    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
                 '.title', '.head', '.heading', '.contentheading',
                 '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # Longest matching heading wins.
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                # Keep whichever side of the delimiter still reads like a
                # real title (4+ words); the other side is the site name.
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # for-else: only reached when no delimiter caused a break.
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig
    return title
def get_body(doc):
    """Return the document body serialized as a cleaned HTML string.

    Drops ``<script>``, ``<link>`` and ``<style>`` subtrees, serializes
    ``doc.body`` (or the whole document when there is no body), and
    strips unwanted attributes via ``clean_attributes``.  Falls back to
    the raw serialization when attribute cleaning raises.
    """
    # Plain loop instead of a list comprehension used only for its
    # side effects.
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    try:
        # Bug fix: clean_attributes() now runs inside the try so the
        # handler below is reachable; the original computed it before
        # the try, leaving the except clause dead code (and its log
        # format referenced a 'cleaned' value that could not exist on
        # failure).
        return clean_attributes(raw_html)
    except Exception:  # FIXME find the equivalent lxml error
        logging.error("cleansing broke html content: %s" % raw_html)
        return raw_html