Bonus per 100 chars logic was incorrect

The number of characters was being taken modulo 100 instead of divided by 100,
so a paragraph with a character length of 103 would have
incorrectly gotten 3 bonus points added to the content score instead of 1.

Add Greg to credits
pull/11/head
Greg Jastrab 12 years ago committed by Richard Harding
parent ae9208374b
commit c8c53b304b

1
.gitignore vendored

@ -1,4 +1,5 @@
*.pyc
*.prof
.installed.cfg
bin

@ -123,7 +123,7 @@ def score_candidates(nodes):
# For every 100 characters in this paragraph, add another point. Up to
# 3 points.
length_points = len(innertext) % 100 if innertext else 0
length_points = len(innertext) / 100 if innertext else 0
if length_points > 3:
content_score += 3
else:

@ -57,7 +57,7 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc._readable.tag, 'div')
self.assertEqual(doc._readable.get('id'), 'readabilityBody')
@ -119,6 +119,7 @@ class TestCleaning(TestCase):
u'<html><body><p>simple<a href="">link</a></p></body></html>'
)
class TestCandidateNodes(TestCase):
"""Candidate nodes are scoring containers we use."""
@ -192,6 +193,35 @@ class TestScoringNodes(TestCase):
scores = sorted([c.content_score for c in candidates.values()])
self.assertTrue(scores[-1] > 100)
def test_bonus_score_per_100_chars_in_p(self):
    """Nodes get 1pt per 100 characters, up to 3 max points."""

    def build_doc(length):
        """Return the <p> nodes of a document whose single paragraph
        contains `length` characters of text."""
        div = '<div id="content" class=""><p>{0}</p></div>'
        test_div = div.format('c' * length)
        doc = document_fromstring(
            '<html><body>' + test_div + '</body></html>')
        # iter('p') replaces the deprecated getiterator() call plus the
        # manual tag filter; it yields the same <p> elements in order.
        return list(doc.iter('p'))

    def top_score(length):
        """Return the highest content score among the candidates produced
        for a paragraph of `length` characters."""
        candidates = score_candidates(build_doc(length))
        return max(c.content_score for c in candidates.values())

    pscore_400 = top_score(400)
    pscore_100 = top_score(100)
    pscore_50 = top_score(50)

    # 100 characters earn exactly one bonus point more than a paragraph
    # shorter than 100 characters.
    self.assertEqual(pscore_100, pscore_50 + 1)
    # 400 characters would be 4 points uncapped; the bonus tops out at 3.
    self.assertEqual(pscore_400, pscore_50 + 3)
class TestLinkDensityScoring(TestCase):
"""Link density will adjust out candidate scoresself."""

Loading…
Cancel
Save