|
|
@ -1,13 +1,17 @@
|
|
|
|
from cleaners import normalize_spaces, clean_attributes
|
|
|
|
|
|
|
|
from encoding import get_encoding
|
|
|
|
|
|
|
|
from lxml.html import tostring
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import logging
|
|
|
|
import lxml.html
|
|
|
|
|
|
|
|
import re
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from lxml.html import document_fromstring
|
|
|
|
|
|
|
|
from lxml.html import HTMLParser
|
|
|
|
|
|
|
|
from lxml.html import tostring
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from cleaners import clean_attributes
|
|
|
|
|
|
|
|
from cleaners import normalize_spaces
|
|
|
|
|
|
|
|
from encoding import get_encoding
|
|
|
|
|
|
|
|
|
|
|
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
|
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
|
|
|
|
|
|
utf8_parser = HTMLParser(encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_doc(page):
|
|
|
|
def build_doc(page):
|
|
|
|
if isinstance(page, unicode):
|
|
|
|
if isinstance(page, unicode):
|
|
|
@ -15,9 +19,12 @@ def build_doc(page):
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
enc = get_encoding(page)
|
|
|
|
enc = get_encoding(page)
|
|
|
|
page_unicode = page.decode(enc, 'replace')
|
|
|
|
page_unicode = page.decode(enc, 'replace')
|
|
|
|
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
|
|
|
|
doc = document_fromstring(
|
|
|
|
|
|
|
|
page_unicode.encode('utf-8', 'replace'),
|
|
|
|
|
|
|
|
parser=utf8_parser)
|
|
|
|
return doc
|
|
|
|
return doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to *src*.

    :param src: the string to perform the substitution on
    :param pattern: regex pattern (string)
    :param flags: ``re`` module flags (e.g. ``re.I``)
    :param repl: replacement string using JS-style ``$1`` group refs,
                 which are translated to Python's ``\\1`` form
    :returns: *src* with all matches of *pattern* replaced

    Bug fix: the original called ``.sub(src, repl...)``, passing the
    source string as the replacement and substituting into the
    replacement string instead of into *src*. ``Pattern.sub`` takes
    ``(repl, string)`` in that order.
    """
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
|
|
|
|
|
|
|
|
|
|
|
@ -39,9 +46,11 @@ def normalize_entities(cur_title):
|
|
|
|
|
|
|
|
|
|
|
|
return cur_title
|
|
|
|
return cur_title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def norm_title(title):
    """Return *title* with whitespace collapsed and HTML entities normalized."""
    collapsed = normalize_spaces(title)
    return normalize_entities(collapsed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_title(doc):
|
|
|
|
def get_title(doc):
|
|
|
|
title = doc.find('.//title').text
|
|
|
|
title = doc.find('.//title').text
|
|
|
|
if not title:
|
|
|
|
if not title:
|
|
|
@ -49,12 +58,14 @@ def get_title(doc):
|
|
|
|
|
|
|
|
|
|
|
|
return norm_title(title)
|
|
|
|
return norm_title(title)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_match(collection, text, orig):
    """Add the normalized *text* to *collection* when it looks like a
    plausible title fragment of *orig*.

    A candidate qualifies only if it has at least two words, is at least
    15 characters long, and appears (ignoring double quotes) as a
    substring of the original title.
    """
    candidate = norm_title(text)
    # Reject trivially short candidates: need >= 15 chars and >= 2 words.
    long_enough = len(candidate) >= 15 and len(candidate.split()) >= 2
    if long_enough:
        # Compare with quotes stripped so quoting differences don't block a match.
        if candidate.replace('"', '') in orig.replace('"', ''):
            collection.add(candidate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def shorten_title(doc):
|
|
|
|
def shorten_title(doc):
|
|
|
|
title = doc.find('.//title').text
|
|
|
|
title = doc.find('.//title').text
|
|
|
|
if not title:
|
|
|
|
if not title:
|
|
|
@ -71,7 +82,8 @@ def shorten_title(doc):
|
|
|
|
if e.text_content():
|
|
|
|
if e.text_content():
|
|
|
|
add_match(candidates, e.text_content(), orig)
|
|
|
|
add_match(candidates, e.text_content(), orig)
|
|
|
|
|
|
|
|
|
|
|
|
for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
|
|
|
|
for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
|
|
|
|
|
|
|
|
'.title', '.head', '.heading', '.contentheading', '.small_header_red']:
|
|
|
|
for e in doc.cssselect(item):
|
|
|
|
for e in doc.cssselect(item):
|
|
|
|
if e.text:
|
|
|
|
if e.text:
|
|
|
|
add_match(candidates, e.text, orig)
|
|
|
|
add_match(candidates, e.text, orig)
|
|
|
@ -103,6 +115,7 @@ def shorten_title(doc):
|
|
|
|
|
|
|
|
|
|
|
|
return title
|
|
|
|
return title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_body(doc):
|
|
|
|
def get_body(doc):
|
|
|
|
[elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
|
|
|
|
[elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
|
|
|
|
raw_html = unicode(tostring(doc.body or doc))
|
|
|
|
raw_html = unicode(tostring(doc.body or doc))
|
|
|
@ -111,5 +124,7 @@ def get_body(doc):
|
|
|
|
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
|
|
|
|
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
|
|
|
|
return cleaned
|
|
|
|
return cleaned
|
|
|
|
except Exception: # FIXME find the equivalent lxml error
|
|
|
|
except Exception: # FIXME find the equivalent lxml error
|
|
|
|
logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
|
|
|
|
logging.error("cleansing broke html content: %s\n---------\n%s" % (
|
|
|
|
|
|
|
|
raw_html,
|
|
|
|
|
|
|
|
cleaned))
|
|
|
|
return raw_html
|
|
|
|
return raw_html
|
|
|
|