mirror of
https://github.com/bookieio/breadability
synced 2024-11-04 12:00:19 +00:00
Add some ScoredNode tests as well
This commit is contained in:
parent
e57f8f02ce
commit
ae9208374b
@ -167,7 +167,7 @@ class ScoredNode(object):
|
||||
"""Given node, set an initial score and weigh based on css and id"""
|
||||
self.node = node
|
||||
content_score = 0
|
||||
if node.tag == 'div':
|
||||
if node.tag in ['div', 'article']:
|
||||
content_score = 5
|
||||
|
||||
if node.tag in ['pre', 'td', 'blockquote']:
|
||||
@ -178,6 +178,7 @@ class ScoredNode(object):
|
||||
content_score = -3
|
||||
if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
|
||||
content_score = -5
|
||||
|
||||
content_score += get_class_weight(node)
|
||||
self.content_score = content_score
|
||||
|
||||
|
@ -6,7 +6,9 @@ from unittest import TestCase
|
||||
from breadability.readable import Article
|
||||
from breadability.scoring import check_node_attr
|
||||
from breadability.scoring import get_class_weight
|
||||
from breadability.scoring import ScoredNode
|
||||
from breadability.readable import get_link_density
|
||||
from breadability.readable import is_unlikely_node
|
||||
from breadability.tests import load_snippet
|
||||
|
||||
|
||||
@ -121,3 +123,85 @@ class TestClassWeight(TestCase):
|
||||
self.assertEqual(get_class_weight(node), 25)
|
||||
|
||||
|
||||
class TestUnlikelyNode(TestCase):
    """Exercise is_unlikely_node against likely and unlikely markup."""

    def test_body_is_always_likely(self):
        """A body element is never flagged, even with a 'comment' class."""
        body = fragment_fromstring(
            '<body class="comment"><div>Content</div></body>')
        self.assertFalse(is_unlikely_node(body))

    def test_is_unlikely(self):
        """A 'comments' keyword in the class or the id flags the node."""
        flagged_markup = (
            '<div class="something comments">Content</div>',
            '<div id="comments">Content</div>',
        )
        for markup in flagged_markup:
            self.assertTrue(is_unlikely_node(fragment_fromstring(markup)))

    def test_not_unlikely(self):
        """A 'post' keyword in the id or the class is not flagged."""
        accepted_markup = (
            '<div id="post">Content</div>',
            '<div class="something post">Content</div>',
        )
        for markup in accepted_markup:
            self.assertFalse(is_unlikely_node(fragment_fromstring(markup)))

    def test_maybe_hits(self):
        """An 'article' class overrules an otherwise unlikely 'comments' id."""
        mixed = fragment_fromstring(
            '<div id="comments" class="article">Content</div>')
        self.assertFalse(is_unlikely_node(mixed))
|
||||
|
||||
|
||||
class TestScoredNode(TestCase):
    """Check the initial state of freshly constructed ScoredNodes."""

    def test_hash_id(self):
        """A ScoredNode exposes a hash_id for the node it wraps.

        The id is derived from the node's html, so identical markup can
        collide. That is acceptable: identical nodes would score the same,
        and the id only exists to follow a node through the scoring process.

        """
        markup = '<div id="comments" class="article">Content</div>'
        scored = ScoredNode(fragment_fromstring(markup))
        self.assertEqual(scored.hash_id, 'ffa4c519')

    def test_div_content_score(self):
        """A div begins at 5 and the id/class weighting shifts it from there."""
        expectations = (
            ('<div id="" class="">Content</div>', 5),
            ('<div id="article" class="">Content</div>', 30),
            ('<div id="comments" class="">Content</div>', -20),
        )
        for markup, expected_score in expectations:
            scored = ScoredNode(fragment_fromstring(markup))
            self.assertEqual(scored.content_score, expected_score)

    def test_headings_score(self):
        """Heading tags are poor candidates, so they start out at -5."""
        scored = ScoredNode(fragment_fromstring('<h2>Heading</h2>'))
        self.assertEqual(scored.content_score, -5)

    def test_list_items(self):
        """List items are poor candidates, so they start out at -3."""
        scored = ScoredNode(fragment_fromstring('<li>list item</li>'))
        self.assertEqual(scored.content_score, -3)
|
||||
|
Loading…
Reference in New Issue
Block a user