Add support for sibling detection, need to figure out how to test it well still

pull/4/merge
Richard Harding 12 years ago
parent 4455ec226d
commit 19a38a2cea

@ -58,7 +58,7 @@ def main():
# print m.html.encode(enc, 'replace')
# else:
# print doc.summary().encode(enc, 'replace')
print unicode(doc)
print doc
finally:
target.close()

@ -69,6 +69,18 @@ def build_base_document(html):
return found_body
def get_link_density(node):
"""Generate a value for the number of links in the node.
:param node: pared elementree node
:returns float:
"""
link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@ -96,16 +108,58 @@ def transform_misused_divs_into_paragraphs(doc):
return doc
def get_link_density(node):
"""Generate a value for the number of links in the node.
def check_siblings(candidate_node, candidate_list):
"""Look through siblings for content that might also be related.
:param node: pared elementree node
:returns float:
Things like preambles, content split by ads that we removed, etc.
"""
link_length = len("".join([a.text or "" for a in node.findall(".//a")]))
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
candidate_css = candidate_node.node.get('class')
potential_target = candidate_node.content_score * 0.2
sibling_target_score = potential_target if potential_target > 10 else 10
parent = candidate_node.node.getparent()
siblings = parent.getchildren() if parent is not None else []
for sibling in siblings:
append = False
content_bonus = 0;
if sibling is candidate_node.node:
append = True
# Give a bonus if sibling nodes and top candidates have the example
# same classname
if candidate_css and sibling.get('class') == candidate_css:
content_bonus += candidate_node.content_score * 0.2;
if sibling in candidate_list:
adjusted_score = candidate_list[sibling].content_score + \
content_bonus
if adjusted_score >= sibling_target_score:
append = True
if sibling.tag == 'p':
link_density = get_link_density(sibling)
content = sibling.text_content()
content_length = len(content)
if content_length > 80 and link_density < 0.25:
append = true;
elif content_length < 80 and link_density == 0:
if ". " in content:
append = True
if append:
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
# filtered out later by accident.
sibling.tag = 'div'
candidate_node.node.append(sibling)
return candidate_node
###### SCORING
@ -277,7 +331,11 @@ class Article(object):
by_score = sorted([c for c in candidates.values()],
key=attrgetter('content_score'), reverse=True)
doc = build_base_document(by_score[0].node)
# since we have several candidates, check the winner's siblings
# for extra content
winner = by_score[0]
updated_winner = check_siblings(winner, candidates)
doc = build_base_document(updated_winner.node)
else:
doc = build_base_document(doc)

@ -36,7 +36,6 @@ class TestReadableDocument(TestCase):
No sense processing anything other than the body content.
"""
print "MIN DOCUMENT"
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable.tag, 'div')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')

Loading…
Cancel
Save