Adding tests for the id/css weights and link density

pull/4/merge
Richard Harding 12 years ago
parent 90a02569ca
commit e57f8f02ce

@ -276,7 +276,7 @@ def prep_article(doc):
content_score = 0
if (weight + content_score < 0):
LNODE(node, 2, 'Dropping conditional node')
LNODE.log(node, 2, 'Dropping conditional node')
return True
if node.text_content().count(',') < 10:

@ -3,7 +3,11 @@ import re
from lxml.html import fragment_fromstring
from unittest import TestCase
from breadability.readable import Article
from breadability.scoring import check_node_attr
from breadability.scoring import get_class_weight
from breadability.readable import get_link_density
from breadability.tests import load_snippet
class TestCheckNodeAttr(TestCase):
@ -42,3 +46,78 @@ class TestCheckNodeAttr(TestCase):
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test4')
self.assertFalse(check_node_attr(test_node, 'id', test_re))
class TestLinkDensity(TestCase):
"""Verify we calc our link density correctly."""
def test_empty_node(self):
"""An empty node doesn't have much of a link density"""
empty_div = u"<div></div>"
doc = Article(empty_div)
assert 0 == get_link_density(doc._readable), "Link density is nadda"
def test_small_doc_no_links(self):
doc = Article(load_snippet('document_min.html'))
assert 0 == get_link_density(doc._readable), "Still no link density"
def test_several_links(self):
"""This doc has a 3 links with the majority of content."""
doc = Article(load_snippet('document_absolute_url.html'))
self.assertAlmostEqual(
get_link_density(doc._readable), 0.349,
places=3)
class TestClassWeight(TestCase):
"""Verify we score nodes correctly based on their class/id attributes."""
def test_no_matches_zero(self):
"""If you don't have the attribute then you get a weight of 0"""
empty_div = u"<div></div>"
node = fragment_fromstring(empty_div)
self.assertEqual(get_class_weight(node), 0)
def test_id_hits(self):
"""If the id is in the list then it gets a weight"""
test_div = '<div id="post">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
test_div = '<div id="comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), -25)
def test_class_hits(self):
"""If the class is in the list then it gets a weight"""
test_div = '<div class="something post">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
test_div = '<div class="something comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), -25)
def test_scores_collide(self):
"""We might hit both positive and negative scores.
Positive and negative scoring is done independently so it's possible
to hit both positive and negative scores and cancel each other out.
"""
test_div = '<div id="post" class="something comment">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 0)
test_div = '<div id="post" class="post comment">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
def test_scores_only_once(self):
"""Scoring is not cumulative within a class hit."""
test_div = '<div class="post main">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)

Loading…
Cancel
Save