Complete upstream merge

pull/11/head
Richard Harding 12 years ago
commit 7c220535df

1
.gitignore vendored

@ -1,4 +1,5 @@
*.pyc
*.prof
.installed.cfg
bin

@ -128,7 +128,7 @@ def score_candidates(nodes):
# For every 100 characters in this paragraph, add another point. Up to
# 3 points.
length_points = len(innertext) % 100 if innertext else 0
length_points = len(innertext) / 100 if innertext else 0
if length_points > 3:
content_score += 3
else:

@ -146,7 +146,6 @@ class TestCandidateNodes(TestCase):
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, -5)
def test_article_enables_candidate_access(self):
"""Candidates are accessible after document processing."""
doc = Article(load_article('ars/ars.001.html'))
@ -199,6 +198,35 @@ class TestScoringNodes(TestCase):
scores = sorted([c.content_score for c in candidates.values()])
self.assertTrue(scores[-1] > 100)
def test_bonus_score_per_100_chars_in_p(self):
    """Nodes get 1pt per 100 characters up to 3 max points.

    Three documents that differ only in paragraph length (50, 100 and
    400 characters) are scored. The 50-character paragraph serves as the
    baseline: the test expects 100 characters to score exactly one point
    more, and 400 characters exactly three points more (the bonus is
    capped rather than growing to four).
    """
    def build_doc(length):
        """Return the <p> nodes of a document with one `length`-char paragraph."""
        div = '<div id="content" class=""><p>{0}</p></div>'
        document_str = '<html><body>{0}</body></html>'
        test_div = div.format('c' * length)
        doc = document_fromstring(document_str.format(test_div))
        # iter('p') replaces the deprecated getiterator() call plus the
        # manual tag comparison; it yields the same <p> elements in
        # document order.
        # NOTE(review): assumes document_fromstring returns an
        # lxml/ElementTree-style element -- confirm against the imports.
        return list(doc.iter('p'))

    def best_score(length):
        """Return the highest content_score among the scored candidates."""
        candidates = score_candidates(build_doc(length))
        return max(c.content_score for c in candidates.values())

    pscore_400 = best_score(400)
    pscore_100 = best_score(100)
    pscore_50 = best_score(50)

    self.assertEqual(pscore_100, pscore_50 + 1)
    self.assertEqual(pscore_400, pscore_50 + 3)
class TestLinkDensityScoring(TestCase):
"""Link density will adjust our candidate scores."""

Loading…
Cancel
Save