|
|
@ -1,6 +1,6 @@
|
|
|
|
#!/usr/bin/env python
|
|
|
|
#!/usr/bin/env python
|
|
|
|
from BeautifulSoup import NavigableString
|
|
|
|
from BeautifulSoup import NavigableString
|
|
|
|
from page_parser import parse
|
|
|
|
from page_parser import parse, get_title, get_body
|
|
|
|
import logging
|
|
|
|
import logging
|
|
|
|
import re
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
@ -33,24 +33,38 @@ class Document:
|
|
|
|
self.options = defaultdict(lambda: None)
|
|
|
|
self.options = defaultdict(lambda: None)
|
|
|
|
for k, v in options.items():
|
|
|
|
for k, v in options.items():
|
|
|
|
self.options[k] = v
|
|
|
|
self.options[k] = v
|
|
|
|
|
|
|
|
self.html = None
|
|
|
|
|
|
|
|
|
|
|
|
def make_html(self):
|
|
|
|
def _html(self, force=False):
|
|
|
|
|
|
|
|
if force or self.html is None:
|
|
|
|
self.html = parse(self.input, self.options['url'])
|
|
|
|
self.html = parse(self.input, self.options['url'])
|
|
|
|
|
|
|
|
return self.html
|
|
|
|
|
|
|
|
|
|
|
|
def content(self):
|
|
|
|
def content(self):
|
|
|
|
|
|
|
|
return get_body(self._html())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def title(self):
|
|
|
|
|
|
|
|
return get_title(self._html())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def summary(self):
|
|
|
|
ruthless = True
|
|
|
|
ruthless = True
|
|
|
|
while True:
|
|
|
|
while True:
|
|
|
|
self.make_html()
|
|
|
|
self._html(True)
|
|
|
|
[i.extract() for i in self.tags(self.html, 'script', 'style')]
|
|
|
|
[i.extract() for i in self.tags(self.html, 'script', 'style')]
|
|
|
|
|
|
|
|
|
|
|
|
if ruthless: self.remove_unlikely_candidates()
|
|
|
|
if ruthless: self.remove_unlikely_candidates()
|
|
|
|
self.transform_misused_divs_into_paragraphs()
|
|
|
|
self.transform_misused_divs_into_paragraphs()
|
|
|
|
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
|
|
|
|
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
|
|
|
|
best_candidate = self.select_best_candidate(candidates)
|
|
|
|
best_candidate = self.select_best_candidate(candidates)
|
|
|
|
if ruthless and best_candidate is None:
|
|
|
|
if best_candidate:
|
|
|
|
|
|
|
|
article = self.get_article(candidates, best_candidate)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
if ruthless:
|
|
|
|
ruthless = False
|
|
|
|
ruthless = False
|
|
|
|
|
|
|
|
# try again
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
article = self.get_article(candidates, best_candidate)
|
|
|
|
else:
|
|
|
|
|
|
|
|
article = self.html.find('body') or self.html
|
|
|
|
|
|
|
|
|
|
|
|
cleaned_article = self.sanitize(article, candidates)
|
|
|
|
cleaned_article = self.sanitize(article, candidates)
|
|
|
|
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
|
|
|
|
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
|
|
|
@ -88,16 +102,19 @@ class Document:
|
|
|
|
if append:
|
|
|
|
if append:
|
|
|
|
output.append(sibling)
|
|
|
|
output.append(sibling)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not output: output.append(best_candidate)
|
|
|
|
return output
|
|
|
|
return output
|
|
|
|
|
|
|
|
|
|
|
|
def select_best_candidate(self, candidates):
|
|
|
|
def select_best_candidate(self, candidates):
|
|
|
|
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
|
|
|
|
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
|
|
|
|
self.debug("Top 5 canidates:")
|
|
|
|
self.debug("Top 5 candidates:")
|
|
|
|
for candidate in sorted_candidates[:5]:
|
|
|
|
for candidate in sorted_candidates[:5]:
|
|
|
|
elem = candidate['elem']
|
|
|
|
elem = candidate['elem']
|
|
|
|
self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
|
|
|
|
self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
|
|
|
|
|
|
|
|
|
|
|
|
best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
|
|
|
|
if len(sorted_candidates) == 0:
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
best_candidate = sorted_candidates[0]
|
|
|
|
self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
|
|
|
|
self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
|
|
|
|
return best_candidate
|
|
|
|
return best_candidate
|
|
|
|
|
|
|
|
|
|
|
@ -108,7 +125,7 @@ class Document:
|
|
|
|
|
|
|
|
|
|
|
|
def score_paragraphs(self, min_text_length):
|
|
|
|
def score_paragraphs(self, min_text_length):
|
|
|
|
candidates = {}
|
|
|
|
candidates = {}
|
|
|
|
elems = self.html.findAll("p") + self.html.findAll("td")
|
|
|
|
elems = self.tags(self.html, "p","td")
|
|
|
|
|
|
|
|
|
|
|
|
for elem in elems:
|
|
|
|
for elem in elems:
|
|
|
|
parent_node = elem.parent
|
|
|
|
parent_node = elem.parent
|
|
|
@ -201,7 +218,7 @@ class Document:
|
|
|
|
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
|
|
|
|
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
|
|
|
|
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()
|
|
|
|
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()
|
|
|
|
|
|
|
|
|
|
|
|
for elem in self.tags(node, "form", "object", "iframe", "embed"):
|
|
|
|
for elem in self.tags(node, "form", "iframe"):
|
|
|
|
elem.extract()
|
|
|
|
elem.extract()
|
|
|
|
|
|
|
|
|
|
|
|
# remove empty <p> tags
|
|
|
|
# remove empty <p> tags
|
|
|
@ -265,7 +282,7 @@ class Document:
|
|
|
|
if not (self.options['attributes']):
|
|
|
|
if not (self.options['attributes']):
|
|
|
|
el.attrMap = {}
|
|
|
|
el.attrMap = {}
|
|
|
|
|
|
|
|
|
|
|
|
return str(node)
|
|
|
|
return unicode(node)
|
|
|
|
|
|
|
|
|
|
|
|
class HashableElement():
|
|
|
|
class HashableElement():
|
|
|
|
def __init__(self, node):
|
|
|
|
def __init__(self, node):
|
|
|
@ -312,7 +329,7 @@ def main():
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
file = open(args[0])
|
|
|
|
file = open(args[0])
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
print Document(file.read(), debug=options.verbose).content()
|
|
|
|
print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
|
|
|
|
finally:
|
|
|
|
finally:
|
|
|
|
file.close()
|
|
|
|
file.close()
|
|
|
|
|
|
|
|
|
|
|
|