Add some ScoredNode tests as well

2024-11-04 12:00:19 +00:00 · 2012-05-12 13:56:23 -04:00 · 2012-05-12 13:56:23 -04:00 · ae9208374b
commit ae9208374b
parent e57f8f02ce
2 changed files with 86 additions and 1 deletions
--- a/src/breadability/scoring.py
+++ b/src/breadability/scoring.py
@ -167,7 +167,7 @@ class ScoredNode(object):
        """Given node, set an initial score and weigh based on css and id"""
        self.node = node
        content_score = 0
-        if node.tag == 'div':
+        if node.tag in ['div', 'article']:
            content_score = 5

        if node.tag in ['pre', 'td', 'blockquote']:
@ -178,6 +178,7 @@ class ScoredNode(object):
            content_score = -3
        if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
            content_score = -5
+
        content_score += get_class_weight(node)
        self.content_score = content_score

--- a/src/breadability/tests/test_scoring.py
+++ b/src/breadability/tests/test_scoring.py
@ -6,7 +6,9 @@ from unittest import TestCase
 from breadability.readable import Article
 from breadability.scoring import check_node_attr
 from breadability.scoring import get_class_weight
+from breadability.scoring import ScoredNode
 from breadability.readable import get_link_density
+from breadability.readable import is_unlikely_node
 from breadability.tests import load_snippet


@ -121,3 +123,85 @@ class TestClassWeight(TestCase):
        self.assertEqual(get_class_weight(node), 25)


+class TestUnlikelyNode(TestCase):
+    """is_unlikely_node flags nodes whose class/id suggest non-content."""
+
+    def test_body_is_always_likely(self):
+        """The body tag is never unlikely, even with class="comment"."""
+        test_div = '<body class="comment"><div>Content</div></body>'
+        node = fragment_fromstring(test_div)
+        self.assertFalse(is_unlikely_node(node))
+
+    def test_is_unlikely(self):
+        """Unlikely keywords in the class or id mark the node as unlikely."""
+        test_div = '<div class="something comments">Content</div>'
+        node = fragment_fromstring(test_div)
+        self.assertTrue(is_unlikely_node(node))
+
+        test_div = '<div id="comments">Content</div>'
+        node = fragment_fromstring(test_div)
+        self.assertTrue(is_unlikely_node(node))
+
+    def test_not_unlikely(self):
+        """An id or class of "post" does not mark the node as unlikely."""
+        test_div = '<div id="post">Content</div>'
+        node = fragment_fromstring(test_div)
+        self.assertFalse(is_unlikely_node(node))
+
+        test_div = '<div class="something post">Content</div>'
+        node = fragment_fromstring(test_div)
+        self.assertFalse(is_unlikely_node(node))
+
+    def test_maybe_hits(self):
+        """A "maybe" keyword (class="article") overrules an unlikely id."""
+        test_div = '<div id="comments" class="article">Content</div>'
+        node = fragment_fromstring(test_div)
+        self.assertFalse(is_unlikely_node(node))
+
+
+class TestScoredNode(TestCase):
+    """A freshly constructed ScoredNode has a hash_id and content_score."""
+
+    def test_hash_id(self):
+        """ScoredNodes have a hash_id based on their content.
+
+        Since this is based on the html there are chances for collisions, but
+        it helps us follow and identify nodes through the scoring process. Two
+        identical nodes would score the same, so a collision is harmless.
+
+        """
+        test_div = '<div id="comments" class="article">Content</div>'
+        node = fragment_fromstring(test_div)
+        snode = ScoredNode(node)
+        self.assertEqual(snode.hash_id, 'ffa4c519')
+
+    def test_div_content_score(self):
+        """A div starts at a score of 5, adjusted by its class/id weight."""
+        test_div = '<div id="" class="">Content</div>'
+        node = fragment_fromstring(test_div)
+        snode = ScoredNode(node)
+        self.assertEqual(snode.content_score, 5)
+
+        test_div = '<div id="article" class="">Content</div>'
+        node = fragment_fromstring(test_div)
+        snode = ScoredNode(node)
+        self.assertEqual(snode.content_score, 30)
+
+        test_div = '<div id="comments" class="">Content</div>'
+        node = fragment_fromstring(test_div)
+        snode = ScoredNode(node)
+        self.assertEqual(snode.content_score, -20)
+
+    def test_headings_score(self):
+        """Heading tags aren't likely candidates, hurt their scores."""
+        test_div = '<h2>Heading</h2>'
+        node = fragment_fromstring(test_div)
+        snode = ScoredNode(node)
+        self.assertEqual(snode.content_score, -5)
+
+    def test_list_items(self):
+        """List items aren't likely candidates, hurt their scores."""
+        test_div = '<li>list item</li>'
+        node = fragment_fromstring(test_div)
+        snode = ScoredNode(node)
+        self.assertEqual(snode.content_score, -3)