Added scored nodes into candidates

pull/21/head
Mišo Belica 11 years ago
parent f858f0dbb0
commit df5cb8c8f6

@ -192,6 +192,10 @@ def score_candidates(nodes):
logger.debug("Giving parent bonus points: %f", candidates[parent].content_score)
# the grand node gets half
candidates[grand].content_score += content_score / 2.0
if node not in candidates:
candidates[node] = ScoredNode(node)
candidates[node].content_score += content_score
logger.debug("Giving grand bonus points: %f", candidates[grand].content_score)
for candidate in candidates.values():

@ -216,20 +216,16 @@ class TestScoringNodes(unittest.TestCase):
def test_we_get_candidates(self):
"""Processing candidates should get us a list of nodes to try out."""
# we'll start out using our first real test document
test_nodes = []
doc = document_fromstring(load_article('ars.001.html'))
for node in doc.iter('p', 'td', 'pre'):
test_nodes.append(node)
doc = document_fromstring(load_article("ars.001.html"))
test_nodes = tuple(doc.iter("p", "td", "pre"))
candidates = score_candidates(test_nodes)
# this might change as we tweak our algorithm, but if it does change,
# this might change as we tweak our algorithm, but if it does,
# it signifies we need to look at what we changed.
self.assertEqual(len(candidates.keys()), 6)
self.assertEqual(len(candidates.keys()), 37)
# one of these should have a decent score
scores = sorted([c.content_score for c in candidates.values()])
scores = sorted(c.content_score for c in candidates.values())
self.assertTrue(scores[-1] > 100)
def test_bonus_score_per_100_chars_in_p(self):

@ -248,7 +248,7 @@ class TestScoreCandidates(unittest.TestCase):
def test_simple_candidate_set(self):
"""Tests a simple case of two candidate nodes"""
doc = """
html = """
<html>
<body>
<div class="content">
@ -262,18 +262,16 @@ class TestScoreCandidates(unittest.TestCase):
</body>
</html>
"""
d_elem = document_fromstring(doc)
divs = d_elem.findall(".//div")
f_elem = divs[0]
s_elem = divs[1]
res = score_candidates([f_elem, s_elem])
ordered = sorted([c for c in res.values()],
key=attrgetter('content_score'),
reverse=True)
# the body element should have a higher score
self.assertTrue(ordered[0].node.tag == 'body')
# the html element is the outermost one, so it should come in second
self.assertTrue(ordered[1].node.tag == 'html')
dom = document_fromstring(html)
div_nodes = dom.findall(".//div")
candidates = score_candidates(div_nodes)
ordered = sorted((c for c in candidates.values()), reverse=True,
key=attrgetter("content_score"))
self.assertEqual(ordered[0].node.tag, "div")
self.assertEqual(ordered[0].node.attrib["class"], "content")
self.assertEqual(ordered[1].node.tag, "body")
self.assertEqual(ordered[2].node.tag, "html")
self.assertEqual(ordered[3].node.tag, "div")
self.assertEqual(ordered[3].node.attrib["class"], "footer")

Loading…
Cancel
Save