Martin Thurau aa4132f57a Adds Python 3.4 support.
Code now supports Python 2.6, 2.7 and 3.4. Python 3.3 isn't supported
because of some issues with the parser and the difference between old and
new `raise` syntax.
2015-04-29 16:18:21 +02:00

119 lines
3.5 KiB

from lxml.html import tostring
import logging
import lxml.html
import re, sys
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
# Module-level HTML parser configured for UTF-8 input; build_doc() passes it
# to lxml.html.document_fromstring().
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
if sys.version_info[0] == 2:
    # On Python 2, rebind the module-level name ``str`` to ``unicode`` so the
    # isinstance(page, str) check and str(...) calls in this module refer to
    # the text type.
    str = unicode
def build_doc(page):
    """Parse *page* into an lxml HTML document.

    page: the HTML source, either text (an instance of the module-level
        ``str``) or an encoded byte string.

    Returns a ``(doc, enc)`` tuple: the parsed document and the encoding
    that was used to decode *page*, or ``None`` when *page* was already text.
    """
    if isinstance(page, str):
        # Already text: there is no source encoding to report.
        enc = None
        page_unicode = page
    else:
        # Byte input: use whatever get_encoding() reports, falling back to
        # UTF-8; undecodable bytes are replaced rather than raising.
        enc = get_encoding(page) or 'utf-8'
        page_unicode = page.decode(enc, 'replace')
    # utf8_parser is configured with encoding='utf-8', so it is always fed
    # UTF-8 bytes regardless of the original encoding.
    doc = lxml.html.document_fromstring(
        page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
    return doc, enc
def js_re(src, pattern, flags, repl):
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
def normalize_entities(cur_title):
    """Replace typographic dashes, quotes and spaces in *cur_title* with ASCII.

    cur_title: the title text to normalize.

    Returns the text with every occurrence of each key in the table below
    replaced by its mapped value.
    """
    # Keys are written as escapes so the source stays pure ASCII (Python 2
    # rejects non-ASCII literals in a file without a coding declaration).
    entities = {
        u'\u2014': '-',   # em dash
        u'\u2013': '-',   # en dash
        u'\u00A0': ' ',   # no-break space
        u'\u00AB': '"',   # left-pointing double angle quotation mark
        u'\u00BB': '"',   # right-pointing double angle quotation mark
        # NOTE(review): this entry maps the double quote to itself, so it is
        # a no-op as written; possibly the key was an HTML entity that got
        # decoded somewhere along the way -- confirm against the repository.
        u'"': '"',
    }
    # str.replace is already a no-op when the key is absent, so no
    # membership pre-check is needed.
    for char, replacement in entities.items():
        cur_title = cur_title.replace(char, replacement)
    return cur_title
def norm_title(title):
    """Return *title* passed through normalize_spaces, then normalize_entities."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)
def get_title(doc):
    """Return the normalized text of the document's ``<title>`` element.

    doc: object whose ``find('.//title')`` returns the title element or None.

    Returns the placeholder ``'[no-title]'`` when there is no title element
    or its text is missing or empty; otherwise the text run through
    norm_title.
    """
    node = doc.find('.//title')
    raw_text = None if node is None else node.text
    if not raw_text:
        return '[no-title]'
    return norm_title(raw_text)
def add_match(collection, text, orig):
    """Add *text* to *collection* if it looks like part of the title *orig*.

    collection: container with an ``add`` method (shorten_title passes a
        set); it is mutated in place.
    text: candidate text, normalized with norm_title before checking.
    orig: the normalized full title to match against.

    The candidate is kept only when it has at least two whitespace-separated
    words, is at least 15 characters long, and occurs inside *orig* once
    double quotes are ignored on both sides.
    """
    text = norm_title(text)
    if len(text.split()) < 2 or len(text) < 15:
        return
    if text.replace('"', '') in orig.replace('"', ''):
        collection.add(text)
def shorten_title(doc):
    """Return a shortened version of the document's ``<title>`` text.

    doc: document object providing ``find``, ``iterfind`` and ``cssselect``.

    Returns ``''`` when there is no title element or its text is missing or
    empty. Otherwise the normalized title is shortened as follows:

    1. Collect heading-like texts (h1-h3 and a fixed list of id/class
       selectors) that add_match accepts as part of the title; if any were
       collected, use the longest one.
    2. Otherwise split the title on the first delimiter that yields a first
       or last part of at least four words and use that part.
    3. If no delimiter produced a result and the title contains ``': '``,
       use the last ``': '``-separated part when it has at least four words,
       else everything after the first ``': '``.

    If the result is not strictly between 15 and 150 characters long, the
    full normalized title is returned instead.
    """
    title = doc.find('.//title')
    if title is None or not title.text:
        return ''

    title = orig = norm_title(title.text)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in doc.iterfind(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        # Longest matching heading text wins.
        title = sorted(candidates, key=len)[-1]
    else:
        # The delimiter heuristics apply only when no heading matched.
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # Reached only when no delimiter produced a usable part.
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    # Reject results that are implausibly short or long.
    if not 15 < len(title) < 150:
        return orig

    return title
def get_body(doc):
    """Return the document body serialized to HTML text with attributes cleaned.

    doc: parsed lxml HTML document. It is modified in place: every script,
        link and style element is dropped before serialization.

    Returns the serialized markup after clean_attributes has been applied.
    """
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    # tostring() returns a byte string by default. Decode it explicitly:
    # calling str() on bytes under Python 3 would produce the "b'...'" repr
    # instead of the markup.
    raw_html = tostring(doc.body or doc)
    if isinstance(raw_html, bytes):
        raw_html = raw_html.decode()
    cleaned = clean_attributes(raw_html)
    try:
        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        return cleaned
    except Exception: #FIXME find the equivalent lxml error
        # NOTE(review): with the validation call above commented out, nothing
        # in the try body can raise, so this fallback is currently unreachable.
        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html