|
|
@ -1,6 +1,7 @@
|
|
|
|
#!/usr/bin/env python
|
|
|
|
#!/usr/bin/env python
|
|
|
|
from BeautifulSoup import NavigableString
|
|
|
|
from BeautifulSoup import NavigableString
|
|
|
|
from page_parser import parse
|
|
|
|
from page_parser import parse
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import re
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
|
|
|
|
REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
|
|
|
@ -32,28 +33,32 @@ class Document:
|
|
|
|
self.options = defaultdict(lambda: None)
|
|
|
|
self.options = defaultdict(lambda: None)
|
|
|
|
for k, v in options.items():
|
|
|
|
for k, v in options.items():
|
|
|
|
self.options[k] = v
|
|
|
|
self.options[k] = v
|
|
|
|
self.make_html()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_html(self):
|
|
|
|
def make_html(self):
|
|
|
|
self.html = parse(self.input, self.options['url'])
|
|
|
|
self.html = parse(self.input, self.options['url'])
|
|
|
|
|
|
|
|
|
|
|
|
def content(self, remove_unlikely_candidates = True):
|
|
|
|
def content(self):
|
|
|
|
def remove(tag): [i.extract() for i in self.html.findAll(tag)]
|
|
|
|
ruthless = True
|
|
|
|
remove('script')
|
|
|
|
while True:
|
|
|
|
remove('style')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if remove_unlikely_candidates: self.remove_unlikely_candidates()
|
|
|
|
|
|
|
|
self.transform_misused_divs_into_paragraphs()
|
|
|
|
|
|
|
|
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
|
|
|
|
|
|
|
|
best_candidate = self.select_best_candidate(candidates)
|
|
|
|
|
|
|
|
article = self.get_article(candidates, best_candidate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cleaned_article = self.sanitize(article, candidates)
|
|
|
|
|
|
|
|
if remove_unlikely_candidates and len(cleaned_article or '') < (self.options['retry_length'] or self.RETRY_LENGTH):
|
|
|
|
|
|
|
|
self.make_html()
|
|
|
|
self.make_html()
|
|
|
|
return self.content(False)
|
|
|
|
[i.extract() for i in self.tags(self.html, 'script', 'style')]
|
|
|
|
else:
|
|
|
|
|
|
|
|
return cleaned_article
|
|
|
|
if ruthless: self.remove_unlikely_candidates()
|
|
|
|
|
|
|
|
self.transform_misused_divs_into_paragraphs()
|
|
|
|
|
|
|
|
candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
|
|
|
|
|
|
|
|
best_candidate = self.select_best_candidate(candidates)
|
|
|
|
|
|
|
|
if ruthless and best_candidate is None:
|
|
|
|
|
|
|
|
ruthless = False
|
|
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
article = self.get_article(candidates, best_candidate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cleaned_article = self.sanitize(article, candidates)
|
|
|
|
|
|
|
|
of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
|
|
|
|
|
|
|
|
if ruthless and not of_acceptable_length:
|
|
|
|
|
|
|
|
ruthless = False
|
|
|
|
|
|
|
|
continue # try again
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
return cleaned_article
|
|
|
|
|
|
|
|
|
|
|
|
def get_article(self, candidates, best_candidate):
|
|
|
|
def get_article(self, candidates, best_candidate):
|
|
|
|
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
|
|
|
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
|
|
@ -87,18 +92,13 @@ class Document:
|
|
|
|
|
|
|
|
|
|
|
|
def select_best_candidate(self, candidates):
|
|
|
|
def select_best_candidate(self, candidates):
|
|
|
|
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
|
|
|
|
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
|
|
|
|
|
|
|
|
|
|
|
|
self.debug("Top 5 canidates:")
|
|
|
|
self.debug("Top 5 canidates:")
|
|
|
|
for candidate in sorted_candidates[:5]:
|
|
|
|
for candidate in sorted_candidates[:5]:
|
|
|
|
elem = candidate['elem']
|
|
|
|
elem = candidate['elem']
|
|
|
|
self.debug("Candidate %s with score %s" % (
|
|
|
|
self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
|
|
|
|
describe(elem), candidate['content_score']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
|
|
|
|
best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
|
|
|
|
elem = best_candidate['elem']
|
|
|
|
self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
|
|
|
|
self.debug("Best candidate %s#%s.%s with score %s" % (
|
|
|
|
|
|
|
|
elem.name, elem.get('id',''), elem.get('class',''), best_candidate['content_score']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return best_candidate
|
|
|
|
return best_candidate
|
|
|
|
|
|
|
|
|
|
|
|
def get_link_density(self, elem):
|
|
|
|
def get_link_density(self, elem):
|
|
|
@ -173,9 +173,9 @@ class Document:
|
|
|
|
content_score -= 5
|
|
|
|
content_score -= 5
|
|
|
|
return { 'content_score': content_score, 'elem': elem }
|
|
|
|
return { 'content_score': content_score, 'elem': elem }
|
|
|
|
|
|
|
|
|
|
|
|
def debug(self, str):
|
|
|
|
def debug(self, *a):
|
|
|
|
if self.options['debug']:
|
|
|
|
if self.options['debug']:
|
|
|
|
print(str)
|
|
|
|
logging.debug(*a)
|
|
|
|
|
|
|
|
|
|
|
|
def remove_unlikely_candidates(self):
|
|
|
|
def remove_unlikely_candidates(self):
|
|
|
|
for elem in self.html.findAll():
|
|
|
|
for elem in self.html.findAll():
|
|
|
|