split out into content and summary methods

pull/1/head
gfxmonk 15 years ago
parent c952f421b7
commit f73b5f05c4

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 from BeautifulSoup import NavigableString
-from page_parser import parse
+from page_parser import parse, get_title, get_body
 import logging
 import re
@@ -33,24 +33,38 @@ class Document:
         self.options = defaultdict(lambda: None)
         for k, v in options.items():
             self.options[k] = v
+        self.html = None
 
-    def make_html(self):
-        self.html = parse(self.input, self.options['url'])
+    def _html(self, force=False):
+        if force or self.html is None:
+            self.html = parse(self.input, self.options['url'])
+        return self.html
 
     def content(self):
+        return get_body(self._html())
+
+    def title(self):
+        return get_title(self._html())
+
+    def summary(self):
         ruthless = True
         while True:
-            self.make_html()
+            self._html(True)
             [i.extract() for i in self.tags(self.html, 'script', 'style')]
             if ruthless: self.remove_unlikely_candidates()
             self.transform_misused_divs_into_paragraphs()
             candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
 
             best_candidate = self.select_best_candidate(candidates)
-            if ruthless and best_candidate is None:
-                ruthless = False
-                continue
-            article = self.get_article(candidates, best_candidate)
+            if best_candidate:
+                article = self.get_article(candidates, best_candidate)
+            else:
+                if ruthless:
+                    ruthless = False
+                    # try again
+                    continue
+                else:
+                    article = self.html.find('body') or self.html
 
             cleaned_article = self.sanitize(article, candidates)
             of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
@@ -88,16 +102,19 @@ class Document:
                 if append:
                     output.append(sibling)
 
+        if not output: output.append(best_candidate)
         return output
 
     def select_best_candidate(self, candidates):
         sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
-        self.debug("Top 5 canidates:")
+        self.debug("Top 5 candidates:")
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
             self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
 
-        best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 }
+        if len(sorted_candidates) == 0:
+            return None
+        best_candidate = sorted_candidates[0]
         self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
         return best_candidate
 
@@ -108,7 +125,7 @@ class Document:
 
     def score_paragraphs(self, min_text_length):
         candidates = {}
-        elems = self.html.findAll("p") + self.html.findAll("td")
+        elems = self.tags(self.html, "p","td")
 
        for elem in elems:
             parent_node = elem.parent
@@ -201,7 +218,7 @@ class Document:
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()
 
-        for elem in self.tags(node, "form", "object", "iframe", "embed"):
+        for elem in self.tags(node, "form", "iframe"):
            elem.extract()
 
         # remove empty <p> tags
@@ -265,7 +282,7 @@ class Document:
         if not (self.options['attributes']):
             el.attrMap = {}
 
-        return str(node)
+        return unicode(node)
 
 class HashableElement():
     def __init__(self, node):
@@ -312,7 +329,7 @@ def main():
     else:
         file = open(args[0])
     try:
-        print Document(file.read(), debug=options.verbose).content()
+        print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
     finally:
         file.close()
 
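
For context, a rough usage sketch of the API after this split. The module path, input file, and the url keyword are assumptions for illustration; only Document, content(), title(), summary(), and the debug option appear in the diff above.

# Sketch only -- "readability" as an import path and the file/url below are assumed, not shown in this commit.
from readability import Document

html = open("article.html").read()                              # hypothetical input document
doc = Document(html, url="http://example.com/a", debug=False)   # keyword args land in self.options

print doc.title()    # page title, via page_parser.get_title()
print doc.content()  # full document body, via page_parser.get_body()
print doc.summary()  # scored and sanitized article text (unicode); falls back to <body> when no candidate scores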
