Started to do some testing, but really not happy with it

pull/11/head
Richard Harding 12 years ago
parent 7c220535df
commit 811921775c

@ -1,10 +1,10 @@
import re
from operator import attrgetter
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from operator import attrgetter
from pprint import PrettyPrinter
from breadability.document import OriginalDocument

@ -94,6 +94,8 @@ def score_candidates(nodes):
LNODE.log(node, 1, "Scoring Node")
content_score = 0
# if the node has no parent it knows of, then it ends up creating a
# body and html tag to parent the html fragment.
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text_content()

@ -1,12 +1,14 @@
import re
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from operator import attrgetter
from unittest import TestCase
from breadability.readable import Article
from breadability.scoring import check_node_attr
from breadability.scoring import get_class_weight
from breadability.scoring import ScoredNode
from breadability.scoring import score_candidates
from breadability.readable import get_link_density
from breadability.readable import is_unlikely_node
from breadability.tests import load_snippet
@ -210,7 +212,7 @@ class TestScoredNode(TestCase):
class TestScoreCandidates(TestCase):
"""The grand daddy of tests to make sure our scoring works
Now scoring details will change over time, so the most imporant thing is
Now scoring details will change over time, so the most important thing is
to make sure candidates come out in the right order, not necessarily how
they scored. Make sure to keep this in mind while getting tests going.
@ -218,4 +220,32 @@ class TestScoreCandidates(TestCase):
def test_simple_candidate_set(self):
    """Tests a simple case of two candidate nodes

    Two sibling divs from one parsed document are handed to
    score_candidates. The scored results are then ordered by
    content_score, highest first, and only the ordering is checked (not
    the score values): the body element is expected first and the html
    element second.

    """
    # NOTE(review): the second <p> in each div is never closed, so the
    # HTML parser is relied on to close it -- confirm that is intentional.
    doc = """
<html>
<body>
<div class="content">
<p>This is a great amount of info</p>
<p>And more content <a href="/index">Home</a>
</div>
<div class="footer">
<p>This is a footer</p>
<p>And more content <a href="/index">Home</a>
</div>
</body>
</html>
"""
    d_elem = document_fromstring(doc)
    divs = d_elem.findall(".//div")
    f_elem = divs[0]
    s_elem = divs[1]

    res = score_candidates([f_elem, s_elem])
    ordered = sorted(res.values(),
                     key=attrgetter('content_score'),
                     reverse=True)

    # assertEqual (rather than assertTrue on an ==) reports the tag that
    # was actually found when the ordering is wrong.
    # the body element should have the highest score
    self.assertEqual(ordered[0].node.tag, 'body')
    # the html element is the outermost one and should come in second
    self.assertEqual(ordered[1].node.tag, 'html')

Loading…
Cancel
Save