|
|
@ -76,13 +76,23 @@ def clean(text):
|
|
|
|
def text_length(i):
    """Return the length of the element's text after it is run through clean().

    ``i`` must provide ``text_content()``; a falsy result (for example
    ``None``) is treated as the empty string.
    """
    raw_text = i.text_content() or ""
    cleaned = clean(raw_text)
    return len(cleaned)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Concrete type of a compiled regular expression, obtained by compiling a
# throwaway pattern so that isinstance() checks work on any Python version.
regexp_type = type(re.compile('hello, world'))

try:
    _string_types = basestring  # Python 2: covers both str and unicode
except NameError:
    _string_types = str  # Python 3 has no basestring


def compile_pattern(elements):
    """Build a regular expression that matches any of the given keywords.

    :param elements: one of
        - a falsy value (``None``, ``""``, ``[]``): no pattern is built;
        - an already compiled regular expression: returned unchanged;
        - a comma-separated string of keywords, e.g. ``"news-item, block"``;
        - an iterable of keyword strings.
    :returns: a compiled pattern, or ``None`` when there is no usable keyword.

    Keywords are matched literally (they are escaped) and case-insensitively.
    """
    if not elements:
        return None
    if isinstance(elements, regexp_type):
        return elements
    if isinstance(elements, _string_types):
        elements = elements.split(',')

    # Trim surrounding whitespace so "a, b" means the keywords "a" and "b".
    keywords = [x.strip() for x in elements]
    # Drop blanks: an empty alternative (from "a,,b" or a trailing comma)
    # would make the pattern match every string.
    escaped = [re.escape(x.lower()) for x in keywords if x]
    if not escaped:
        return None

    # Keywords are lower-cased above, so the search must ignore case or a
    # keyword such as "NewsItem" could never match the class "NewsItem".
    return re.compile(u'|'.join(escaped), re.U | re.I)
|
|
|
|
|
|
|
|
|
|
|
|
class Document:
|
|
|
|
class Document:
|
|
|
|
"""Class to build a etree document out of html."""
|
|
|
|
"""Class to build a etree document out of html."""
|
|
|
|
TEXT_LENGTH_THRESHOLD = 25
|
|
|
|
TEXT_LENGTH_THRESHOLD = 25
|
|
|
|
RETRY_LENGTH = 250
|
|
|
|
RETRY_LENGTH = 250
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, input, **options):
|
|
|
|
def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
|
|
|
|
"""Generate the document
|
|
|
|
"""Generate the document
|
|
|
|
|
|
|
|
|
|
|
|
:param input: string of the html content.
|
|
|
|
:param input: string of the html content.
|
|
|
@ -93,11 +103,16 @@ class Document:
|
|
|
|
- min_text_length:
|
|
|
|
- min_text_length:
|
|
|
|
- retry_length:
|
|
|
|
- retry_length:
|
|
|
|
- url: will allow adjusting links to be absolute
|
|
|
|
- url: will allow adjusting links to be absolute
|
|
|
|
|
|
|
|
- positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
|
|
|
|
|
|
|
|
- negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
|
|
|
|
|
|
|
|
Also positive_keywords and negative_keywords could be a regexp.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
self.input = input
|
|
|
|
self.input = input
|
|
|
|
self.options = options
|
|
|
|
self.options = options
|
|
|
|
self.html = None
|
|
|
|
self.html = None
|
|
|
|
|
|
|
|
self.encoding = None
|
|
|
|
|
|
|
|
self.positive_keywords = compile_pattern(positive_keywords)
|
|
|
|
|
|
|
|
self.negative_keywords = compile_pattern(negative_keywords)
|
|
|
|
|
|
|
|
|
|
|
|
def _html(self, force=False):
|
|
|
|
def _html(self, force=False):
|
|
|
|
if force or self.html is None:
|
|
|
|
if force or self.html is None:
|
|
|
@ -105,7 +120,7 @@ class Document:
|
|
|
|
return self.html
|
|
|
|
return self.html
|
|
|
|
|
|
|
|
|
|
|
|
def _parse(self, input):
|
|
|
|
def _parse(self, input):
|
|
|
|
doc = build_doc(input)
|
|
|
|
doc, self.encoding = build_doc(input)
|
|
|
|
doc = html_cleaner.clean_html(doc)
|
|
|
|
doc = html_cleaner.clean_html(doc)
|
|
|
|
base_href = self.options.get('url', None)
|
|
|
|
base_href = self.options.get('url', None)
|
|
|
|
if base_href:
|
|
|
|
if base_href:
|
|
|
@ -311,19 +326,25 @@ class Document:
|
|
|
|
|
|
|
|
|
|
|
|
def class_weight(self, e):
    """Score an element from its ``class`` and ``id`` attributes and its tag.

    :param e: element exposing ``get(name, default)`` and ``tag``
        (presumably an lxml element; confirm against callers).
    :returns: an integer weight; each positive hit adds 25, each negative
        hit subtracts 25.

    Both attributes are checked against the built-in ``REGEXES`` patterns
    and against the optional per-document ``positive_keywords`` /
    ``negative_keywords`` patterns.
    """
    weight = 0
    for feature in (e.get('class', None), e.get('id', None)):
        if not feature:
            continue

        if REGEXES['negativeRe'].search(feature):
            weight -= 25

        if REGEXES['positiveRe'].search(feature):
            weight += 25

        if self.positive_keywords and self.positive_keywords.search(feature):
            weight += 25

        if self.negative_keywords and self.negative_keywords.search(feature):
            weight -= 25

    # User keywords may also target the tag itself: the pattern is matched
    # from the start of "tag-<name>", e.g. a keyword "tag-div".
    tag_feature = 'tag-' + e.tag

    if self.positive_keywords and self.positive_keywords.match(tag_feature):
        weight += 25

    if self.negative_keywords and self.negative_keywords.match(tag_feature):
        weight -= 25

    return weight
|
|
|
|
|
|
|
|
|
|
|
@ -569,6 +590,8 @@ def main():
|
|
|
|
parser = OptionParser(usage="%prog: [options] [file]")
|
|
|
|
parser = OptionParser(usage="%prog: [options] [file]")
|
|
|
|
parser.add_option('-v', '--verbose', action='store_true')
|
|
|
|
parser.add_option('-v', '--verbose', action='store_true')
|
|
|
|
parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
|
|
|
|
parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
|
|
|
|
|
|
|
|
parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
|
|
|
|
|
|
|
|
parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
|
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
if not (len(args) == 1 or options.url):
|
|
|
|
if not (len(args) == 1 or options.url):
|
|
|
@ -581,11 +604,14 @@ def main():
|
|
|
|
file = urllib.urlopen(options.url)
|
|
|
|
file = urllib.urlopen(options.url)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
file = open(args[0], 'rt')
|
|
|
|
file = open(args[0], 'rt')
|
|
|
|
enc = sys.__stdout__.encoding or 'utf-8'
|
|
|
|
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
print Document(file.read(),
|
|
|
|
print Document(file.read(),
|
|
|
|
debug=options.verbose,
|
|
|
|
debug=options.verbose,
|
|
|
|
url=options.url).summary().encode(enc, 'replace')
|
|
|
|
url=options.url,
|
|
|
|
|
|
|
|
positive_keywords = options.positive_keywords,
|
|
|
|
|
|
|
|
negative_keywords = options.negative_keywords,
|
|
|
|
|
|
|
|
).summary().encode(enc, 'replace')
|
|
|
|
finally:
|
|
|
|
finally:
|
|
|
|
file.close()
|
|
|
|
file.close()
|
|
|
|
|
|
|
|
|
|
|
|