Tests migrated into pytest style

pull/35/head
Mišo Belica 6 years ago
parent 48acf389b1
commit aa83825334

@ -35,41 +35,41 @@ TEST_PATH = join(
TEST_TEMPLATE = '''# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
%(source_url)s
"""
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
from __future__ import absolute_import, division, print_function, unicode_literals
import os
class TestArticle(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
%(source_url)s
"""
import pytest
def setUp(self):
from breadability.readable import Article
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
article_path = os.path.join(os.path.dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "%(source_url)s")
return Article(file.read(), "%(source_url)s")
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
assert 'id="readabilityBody"' in article.readable
def test_content_exists(self):
def test_content_exists(article):
"""Verify that some content exists."""
self.assertIn("#&@#&@#&@", self.document.readable)
assert "#&@#&@#&@" in article.readable
def test_content_does_not_exist(self):
def test_content_does_not_exist(article):
"""Verify we cleaned out some content that shouldn't exist."""
self.assertNotIn("", self.document.readable)
assert "" not in article.readable
'''

@ -32,9 +32,6 @@ tests_require = [
]
if sys.version_info < (2, 7):
install_requires.append("unittest2")
console_script_targets = [
"breadability = breadability.scripts.client:main",
"breadability-{0} = breadability.scripts.client:main",

@ -2,8 +2,3 @@
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
try:
import unittest2 as unittest
except ImportError:
import unittest

@ -10,38 +10,36 @@ from __future__ import (
from lxml.html import fragment_fromstring, document_fromstring
from breadability.readable import Article
from breadability.annotated_text import AnnotatedTextHandler
from .compat import unittest
from .utils import load_snippet, load_article
class TestAnnotatedText(unittest.TestCase):
def test_simple_document(self):
def test_simple_document():
dom = fragment_fromstring("<p>This is\n\tsimple\ttext.</p>")
annotated_text = AnnotatedTextHandler.parse(dom)
expected = [
assert annotated_text == [
(
("This is\nsimple text.", None),
),
]
self.assertEqual(annotated_text, expected)
def test_empty_paragraph(self):
def test_empty_paragraph():
dom = fragment_fromstring("<div><p>Paragraph <p>\t \n</div>")
annotated_text = AnnotatedTextHandler.parse(dom)
expected = [
assert annotated_text == [
(
("Paragraph", None),
),
]
self.assertEqual(annotated_text, expected)
def test_multiple_paragraphs(self):
def test_multiple_paragraphs():
dom = fragment_fromstring("<div><p> 1 first<p> 2\tsecond <p>3\rthird </div>")
annotated_text = AnnotatedTextHandler.parse(dom)
expected = [
assert annotated_text == [
(
("1 first", None),
),
@ -52,13 +50,13 @@ class TestAnnotatedText(unittest.TestCase):
("3\nthird", None),
),
]
self.assertEqual(annotated_text, expected)
def test_single_annotation(self):
def test_single_annotation():
dom = fragment_fromstring("<div><p> text <em>emphasis</em> <p> last</div>")
annotated_text = AnnotatedTextHandler.parse(dom)
expected = [
assert annotated_text == [
(
("text", None),
("emphasis", ("em",)),
@ -67,13 +65,13 @@ class TestAnnotatedText(unittest.TestCase):
("last", None),
),
]
self.assertEqual(annotated_text, expected)
def test_recursive_annotation(self):
def test_recursive_annotation():
dom = fragment_fromstring("<div><p> text <em><i><em>emphasis</em></i></em> <p> last</div>")
annotated_text = AnnotatedTextHandler.parse(dom)
expected = [
assert annotated_text == [
(
("text", None),
("emphasis", ("em", "i")),
@ -82,22 +80,22 @@ class TestAnnotatedText(unittest.TestCase):
("last", None),
),
]
self.assertEqual(annotated_text, expected)
def test_annotations_without_explicit_paragraph(self):
def test_annotations_without_explicit_paragraph():
dom = fragment_fromstring("<div>text <strong>emphasis</strong>\t<b>hmm</b> </div>")
annotated_text = AnnotatedTextHandler.parse(dom)
expected = [
assert annotated_text == [
(
("text", None),
("emphasis", ("strong",)),
("hmm", ("b",)),
),
]
self.assertEqual(annotated_text, expected)
def test_process_paragraph_with_chunked_text(self):
def test_process_paragraph_with_chunked_text():
handler = AnnotatedTextHandler()
paragraph = handler._process_paragraph([
(" 1", ("b", "del")),
@ -108,18 +106,18 @@ class TestAnnotatedText(unittest.TestCase):
(" 6", ("em",)),
])
expected = (
assert paragraph == (
("1 2", ("b", "del")),
("3 4 5", None),
("6", ("em",)),
)
self.assertEqual(paragraph, expected)
def test_include_heading(self):
def test_include_heading():
dom = document_fromstring(load_snippet("h1_and_2_paragraphs.html"))
annotated_text = AnnotatedTextHandler.parse(dom.find("body"))
expected = [
assert annotated_text == [
(
('Nadpis H1, ktorý chce byť prvý s textom ale predbehol ho "title"', ("h1",)),
("Toto je prvý odstavec a to je fajn.", None),
@ -128,13 +126,13 @@ class TestAnnotatedText(unittest.TestCase):
("Tento text je tu aby vyplnil prázdne miesto v srdci súboru.\nAj súbory majú predsa city.", None),
),
]
self.assertSequenceEqual(annotated_text, expected)
def test_real_article(self):
def test_real_article():
article = Article(load_article("zdrojak_automaticke_zabezpeceni.html"))
annotated_text = article.main_text
expected = [
assert annotated_text == [
(
("Automatické zabezpečení", ("h1",)),
("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None),
@ -170,4 +168,3 @@ class TestAnnotatedText(unittest.TestCase):
(".", None),
),
]
self.assertSequenceEqual(annotated_text, expected)

@ -1,42 +1,45 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""Test the scoring and parsing of the Blog Post"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
from breadability.readable import Article
from ...compat import unittest
import pytest
from breadability.readable import Article
class TestAntipopeBlog(unittest.TestCase):
"""Test the scoring and parsing of the Blog Post"""
def setUp(self):
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
with open(article_path) as file:
return file.read()
def tearDown(self):
"""Drop the article"""
self.article = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
doc = Article(article)
assert 'id="readabilityBody"' in doc.readable
def test_comments_cleaned(self):
def test_comments_cleaned(article):
"""The div with the comments should be removed."""
doc = Article(self.article)
self.assertTrue('class="comments"' not in doc.readable)
doc = Article(article)
assert 'class="comments"' not in doc.readable
def test_beta_removed(self):
def test_beta_removed(article):
"""The id=beta element should be removed
It's link heavy and causing a lot of garbage content. This should be
removed.
"""
doc = Article(self.article)
self.assertTrue('id="beta"' not in doc.readable)
doc = Article(article)
assert 'id="beta"' not in doc.readable

@ -1,33 +1,34 @@
import os
try:
# Python < 2.7
import unittest2 as unittest
except ImportError:
import unittest
# -*- coding: utf8 -*-
from breadability.readable import Article
"""Test the scoring and parsing of the Blog Post"""
from __future__ import absolute_import, division, print_function, unicode_literals
class TestBusinessInsiderArticle(unittest.TestCase):
"""Test the scoring and parsing of the Blog Post"""
import os
def setUp(self):
import pytest
from breadability.readable import Article
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
with open(article_path) as file:
return file.read()
def tearDown(self):
"""Drop the article"""
self.article = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
doc = Article(article)
assert 'id="readabilityBody"' in doc.readable
def test_images_preserved(self):
def test_images_preserved(article):
"""The div with the comments should be removed."""
doc = Article(self.article)
self.assertTrue('bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable)
self.assertTrue('bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable)
doc = Article(article)
assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable
assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable

@ -1,39 +1,33 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
"""
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from breadability.readable import Article
class TestArticle(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
"""
def setUp(self):
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")
return Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
assert 'id="readabilityBody"' in article.readable
def test_images_preserved(self):
"""The div with the comments should be removed."""
images = [
'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg',
'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg',
]
for image in images:
self.assertIn(image, self.document.readable, image)
def test_images_preserved(article):
"""The div with the comments should be removed."""
assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in article.readable
assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in article.readable

@ -1,44 +1,44 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
http://www.zdrojak.cz/clanky/jeste-k-testovani/
"""
from os.path import join, dirname
from breadability.readable import Article
from breadability._compat import unicode
from ...compat import unittest
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
from breadability._compat import unicode
from breadability.readable import Article
class TestArticle(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
http://www.zdrojak.cz/clanky/jeste-k-testovani/
"""
def setUp(self):
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://www.zdrojak.cz/clanky/jeste-k-testovani/")
return Article(file.read(), "http://www.zdrojak.cz/clanky/jeste-k-testovani/")
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
assert 'id="readabilityBody"' in article.readable
def test_content_exists(self):
def test_content_exists(article):
"""Verify that some content exists."""
self.assertIsInstance(self.document.readable, unicode)
assert isinstance(article.readable, unicode)
text = "S automatizovaným testováním kódu (a ve zbytku článku budu mít na mysli právě to) jsem se setkal v několika firmách."
self.assertIn(text, self.document.readable)
assert text in article.readable
text = "Ke čtení naleznete mnoho různých materiálů, od teoretických po praktické ukázky."
self.assertIn(text, self.document.readable)
assert text in article.readable
def test_content_does_not_exist(self):
def test_content_does_not_exist(article):
"""Verify we cleaned out some content that shouldn't exist."""
self.assertNotIn("Pokud vás problematika zajímá, využijte možnosti navštívit školení", self.document.readable)
assert "Pokud vás problematika zajímá, využijte možnosti navštívit školení" not in article.readable

@ -1,72 +1,62 @@
# -*- coding: utf8 -*-
from __future__ import (
absolute_import,
division,
print_function,
unicode_literals
)
"""Test the scoring and parsing of the Article"""
import os
from __future__ import absolute_import, division, print_function, unicode_literals
import os
from operator import attrgetter
from breadability.readable import Article
from breadability.readable import check_siblings
from breadability.readable import prep_article
from ...compat import unittest
import pytest
from breadability.readable import Article, check_siblings, prep_article
class TestArticle(unittest.TestCase):
"""Test the scoring and parsing of the Article"""
def setUp(self):
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
with open(article_path) as file:
return Article(file.read())
def tearDown(self):
"""Drop the article"""
self.article = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
assert 'id="readabilityBody"' in article.readable
def test_content_exists(self):
def test_content_exists(article):
"""Verify that some content exists."""
doc = Article(self.article)
self.assertTrue('Amazon and Google' in doc.readable)
self.assertFalse('Linkblog updated' in doc.readable)
self.assertFalse(
'#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable)
@unittest.skip("Test fails because of some weird hash.")
def test_candidates(self):
assert 'Amazon and Google' in article.readable
assert not 'Linkblog updated' in article.readable
assert not '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in article.readable
@pytest.mark.skip("Test fails because of some weird hash.")
def test_candidates(article):
"""Verify we have candidates."""
doc = Article(self.article)
# from lxml.etree import tounicode
found = False
wanted_hash = '04e46055'
for node in doc.candidates.values():
for node in article.candidates.values():
if node.hash_id == wanted_hash:
found = node
self.assertTrue(found)
assert found
# we have the right node, it must be deleted for some reason if it's
# not still there when we need it to be.
# Make sure it's not in our to drop list.
for node in doc._should_drop:
self.assertFalse(node == found.node)
for node in article._should_drop:
assert node != found.node
by_score = sorted(
[c for c in doc.candidates.values()],
[c for c in article.candidates.values()],
key=attrgetter('content_score'), reverse=True)
self.assertTrue(by_score[0].node == found.node)
assert by_score[0].node == found.node
updated_winner = check_siblings(by_score[0], doc.candidates)
updated_winner = check_siblings(by_score[0], article.candidates)
updated_winner.node = prep_article(updated_winner.node)
# This article hits up against the img > p conditional filtering

@ -1,33 +1,32 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pytest
class TestSweetsharkBlog(unittest.TestCase):
"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""
from breadability.readable import Article
def setUp(self):
@pytest.fixture(scope="module")
def article():
"""Load up the article for us"""
article_path = join(dirname(__file__), "article.html")
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html")
return Article(file.read(), "http://sweetshark.livejournal.com/11564.html")
def tearDown(self):
"""Drop the article"""
self.document = None
def test_parses(self):
def test_parses(article):
"""Verify we can parse the document."""
self.assertIn('id="readabilityBody"', self.document.readable)
assert 'id="readabilityBody"' in article.readable
def test_content_after_video(self):
def test_content_after_video(article):
"""The div with the comments should be removed."""
self.assertIn('Stay hungry, Stay foolish', self.document.readable)
assert 'Stay hungry, Stay foolish' in article.readable

@ -1,5 +1,7 @@
# -*- coding: utf8 -*-
"""Verify we can process html into a document to work off of."""
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
@ -15,47 +17,44 @@ from breadability.document import (
decode_html,
OriginalDocument,
)
from .compat import unittest
from .utils import load_snippet
class TestOriginalDocument(unittest.TestCase):
"""Verify we can process html into a document to work off of."""
def test_convert_br_tags_to_paragraphs(self):
def test_convert_br_tags_to_paragraphs():
returned = convert_breaks_to_paragraphs(
("<div>HI<br><br>How are you?<br><br> \t \n <br>"
"Fine\n I guess</div>"))
self.assertEqual(
returned,
"<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")
assert returned == "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>"
def test_convert_hr_tags_to_paragraphs(self):
def test_convert_hr_tags_to_paragraphs():
returned = convert_breaks_to_paragraphs(
"<div>HI<br><br>How are you?<hr/> \t \n <br>Fine\n I guess</div>")
self.assertEqual(
returned,
"<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")
assert returned == "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>"
def test_readin_min_document(self):
def test_readin_min_document():
"""Verify we can read in a min html document"""
doc = OriginalDocument(load_snippet('document_min.html'))
self.assertTrue(to_unicode(doc).startswith('<html>'))
self.assertEqual(doc.title, 'Min Document Title')
def test_readin_with_base_url(self):
assert to_unicode(doc).startswith('<html>')
assert doc.title == 'Min Document Title'
def test_readin_with_base_url():
"""Passing a url should update links to be absolute links"""
doc = OriginalDocument(
load_snippet('document_absolute_url.html'),
url="http://blog.mitechie.com/test.html")
self.assertTrue(to_unicode(doc).startswith('<html>'))
assert to_unicode(doc).startswith('<html>')
# find the links on the page and make sure each one starts with our
# base url we told it to use.
links = doc.links
self.assertEqual(len(links), 3)
assert len(links) == 3
# we should have two links that start with our blog url
# and one link that starts with amazon
link_counts = defaultdict(int)
@ -65,38 +64,50 @@ class TestOriginalDocument(unittest.TestCase):
else:
link_counts['other'] += 1
self.assertEqual(link_counts['blog'], 2)
self.assertEqual(link_counts['other'], 1)
assert link_counts['blog'] == 2
assert link_counts['other'] == 1
def test_no_br_allowed(self):
def test_no_br_allowed():
"""We convert all <br/> tags to <p> tags"""
doc = OriginalDocument(load_snippet('document_min.html'))
self.assertIsNone(doc.dom.find('.//br'))
def test_empty_title(self):
assert doc.dom.find('.//br') is None
def test_empty_title():
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument(
"<html><head><title></title></head><body></body></html>")
self.assertEqual(document.title, "")
def test_title_only_with_tags(self):
assert document.title == ""
def test_title_only_with_tags():
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument(
"<html><head><title><em></em></title></head><body></body></html>")
self.assertEqual(document.title, "")
def test_no_title(self):
assert document.title == ""
def test_no_title():
"""We convert all <br/> tags to <p> tags"""
document = OriginalDocument("<html><head></head><body></body></html>")
self.assertEqual(document.title, "")
def test_encoding(self):
assert document.title == ""
def test_encoding():
text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
html = decode_html(text)
self.assertEqual(type(html), unicode)
def test_encoding_short(self):
assert type(html) is unicode
def test_encoding_short():
text = to_bytes("ľščťžýáíé")
html = decode_html(text)
self.assertEqual(type(html), unicode)
self.assertEqual(html, "ľščťžýáíé")
assert type(html) is unicode
assert html == "ľščťžýáíé"

@ -1,92 +1,95 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from __future__ import absolute_import, division, print_function, unicode_literals
import pytest
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from lxml.html import document_fromstring, fragment_fromstring
from breadability._compat import to_unicode
from breadability.readable import (
Article,
get_class_weight,
get_link_density,
is_bad_link,
leaf_div_elements_into_paragraphs,
score_candidates,
)
from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link,
leaf_div_elements_into_paragraphs, score_candidates, )
from breadability.scoring import ScoredNode
from .compat import unittest
from .utils import load_snippet, load_article
from .utils import load_article, load_snippet
# TestReadableDocument
"""Verify we can process html into a document to work off of."""
class TestReadableDocument(unittest.TestCase):
"""Verify we can process html into a document to work off of."""
def test_load_doc(self):
def test_load_doc():
"""We get back an element tree from our original doc"""
doc = Article(load_snippet('document_min.html'))
# We get back the document as a div tag currently by default.
self.assertEqual(doc.readable_dom.tag, 'div')
def test_title_loads(self):
assert doc.readable_dom.tag == 'div'
def test_title_loads():
"""Verify we can fetch the title of the parsed article"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(
doc._original_document.title,
'Min Document Title'
)
def test_doc_no_scripts_styles(self):
assert doc._original_document.title == 'Min Document Title'
def test_doc_no_scripts_styles():
"""Step #1 remove all scripts from the document"""
doc = Article(load_snippet('document_scripts.html'))
readable = doc.readable_dom
self.assertEqual(readable.findall(".//script"), [])
self.assertEqual(readable.findall(".//style"), [])
self.assertEqual(readable.findall(".//link"), [])
def test_find_body_exists(self):
assert readable.findall(".//script") == []
assert readable.findall(".//style") == []
assert readable.findall(".//link") == []
def test_find_body_exists():
"""If the document has a body, we store that as the readable html
No sense processing anything other than the body content.
"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_body_doesnt_exist(self):
assert doc.readable_dom.tag == 'div'
assert doc.readable_dom.get('id') == 'readabilityBody'
def test_body_doesnt_exist():
"""If we can't find a body, then we create one.
We build our doc around the rest of the html we parsed.
"""
doc = Article(load_snippet('document_no_body.html'))
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_bare_content(self):
assert doc.readable_dom.tag == 'div'
assert doc.readable_dom.get('id') == 'readabilityBody'
def test_bare_content():
"""If the document is just pure content, no html tags we should be ok
We build our doc around the rest of the html we parsed.
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
def test_no_content(self):
assert doc.readable_dom.tag == 'div'
assert doc.readable_dom.get('id') == 'readabilityBody'
def test_no_content():
"""Without content we supply an empty unparsed doc."""
doc = Article('')
self.assertEqual(doc.readable_dom.tag, 'div')
self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
self.assertEqual(doc.readable_dom.get('class'), 'parsing-error')
assert doc.readable_dom.tag == 'div'
assert doc.readable_dom.get('id') == 'readabilityBody'
assert doc.readable_dom.get('class') == 'parsing-error'
class TestCleaning(unittest.TestCase):
"""Test out our cleaning processing we do."""
def test_unlikely_hits(self):
# Test out our cleaning processing we do.
def test_unlikely_hits():
"""Verify we wipe out things from our unlikely list."""
doc = Article(load_snippet('test_readable_unlikely.html'))
readable = doc.readable_dom
@ -109,7 +112,7 @@ class TestCleaning(unittest.TestCase):
for cls in test.get('class').split():
if cls in want_to_appear:
found = True
self.assertTrue(found)
assert found
by_ids = readable.get_element_by_id(i, False)
if by_ids is not False:
@ -117,9 +120,10 @@ class TestCleaning(unittest.TestCase):
for ids in test.get('id').split():
if ids in want_to_appear:
found = True
self.assertTrue(found)
assert found
def test_misused_divs_transform(self):
def test_misused_divs_transform():
"""Verify we replace leaf node divs with p's
They should have the same content, just be a p vs a div
@ -127,39 +131,32 @@ class TestCleaning(unittest.TestCase):
"""
test_html = "<html><body><div>simple</div></body></html>"
test_doc = document_fromstring(test_html)
self.assertEqual(
tounicode(
leaf_div_elements_into_paragraphs(test_doc)),
to_unicode("<html><body><p>simple</p></body></html>")
assert tounicode(leaf_div_elements_into_paragraphs(test_doc)) == to_unicode(
"<html><body><p>simple</p></body></html>"
)
test_html2 = ('<html><body><div>simple<a href="">link</a>'
'</div></body></html>')
test_doc2 = document_fromstring(test_html2)
self.assertEqual(
tounicode(
leaf_div_elements_into_paragraphs(test_doc2)),
to_unicode(
'<html><body><p>simple<a href="">link</a></p></body></html>')
assert tounicode(leaf_div_elements_into_paragraphs(test_doc2)) == to_unicode(
'<html><body><p>simple<a href="">link</a></p></body></html>'
)
def test_dont_transform_div_with_div(self):
def test_dont_transform_div_with_div():
"""Verify that only child <div> element is replaced by <p>."""
dom = document_fromstring(
"<html><body><div>text<div>child</div>"
"aftertext</div></body></html>"
)
self.assertEqual(
tounicode(
leaf_div_elements_into_paragraphs(dom)),
to_unicode(
assert tounicode(leaf_div_elements_into_paragraphs(dom)) == to_unicode(
"<html><body><div>text<p>child</p>"
"aftertext</div></body></html>"
)
)
def test_bad_links(self):
def test_bad_links():
"""Some links should just not belong."""
bad_links = [
'<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
@ -169,13 +166,13 @@ class TestCleaning(unittest.TestCase):
for l in bad_links:
link = fragment_fromstring(l)
self.assertTrue(is_bad_link(link))
assert is_bad_link(link)
# Candidate nodes are scoring containers we use.
class TestCandidateNodes(unittest.TestCase):
"""Candidate nodes are scoring containers we use."""
def test_candidate_scores(self):
def test_candidate_scores():
"""We should be getting back objects with some scores."""
fives = ['<div/>']
threes = ['<pre/>', '<td/>', '<blockquote/>']
@ -184,54 +181,59 @@ class TestCandidateNodes(unittest.TestCase):
for n in fives:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, 5)
assert ScoredNode(doc).content_score == 5
for n in threes:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, 3)
assert ScoredNode(doc).content_score == 3
for n in neg_threes:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, -3)
assert ScoredNode(doc).content_score == -3
for n in neg_fives:
doc = fragment_fromstring(n)
self.assertEqual(ScoredNode(doc).content_score, -5)
assert ScoredNode(doc).content_score == -5
def test_article_enables_candidate_access(self):
def test_article_enables_candidate_access():
"""Candidates are accessible after document processing."""
doc = Article(load_article('ars.001.html'))
self.assertTrue(hasattr(doc, 'candidates'))
assert hasattr(doc, 'candidates')
class TestClassWeights(unittest.TestCase):
"""Certain ids and classes get us bonus points."""
# Certain ids and classes get us bonus points.
def test_positive_class(self):
def test_positive_class():
"""Some classes get us bonus points."""
node = fragment_fromstring('<p class="article">')
self.assertEqual(get_class_weight(node), 25)
assert get_class_weight(node) == 25
def test_positive_ids(self):
def test_positive_ids():
"""Some ids get us bonus points."""
node = fragment_fromstring('<p id="content">')
self.assertEqual(get_class_weight(node), 25)
assert get_class_weight(node) == 25
def test_negative_class(self):
def test_negative_class():
"""Some classes get us negative points."""
node = fragment_fromstring('<p class="comment">')
self.assertEqual(get_class_weight(node), -25)
assert get_class_weight(node) == -25
def test_negative_ids(self):
def test_negative_ids():
"""Some ids get us negative points."""
node = fragment_fromstring('<p id="media">')
self.assertEqual(get_class_weight(node), -25)
assert get_class_weight(node) == -25
class TestScoringNodes(unittest.TestCase):
"""We take out list of potential nodes and score them up."""
# We take our list of potential nodes and score them up.
def test_we_get_candidates(self):
def test_we_get_candidates():
"""Processing candidates should get us a list of nodes to try out."""
doc = document_fromstring(load_article("ars.001.html"))
test_nodes = tuple(doc.iter("p", "td", "pre"))
@ -239,13 +241,14 @@ class TestScoringNodes(unittest.TestCase):
# this might change as we tweak our algorithm, but if it does,
# it signifies we need to look at what we changed.
self.assertEqual(len(candidates.keys()), 37)
assert len(candidates.keys()) == 37
# one of these should have a decent score
scores = sorted(c.content_score for c in candidates.values())
self.assertTrue(scores[-1] > 100)
assert scores[-1] > 100
def test_bonus_score_per_100_chars_in_p(self):
def test_bonus_score_per_100_chars_in_p():
"""Nodes get 1 point per 100 characters up to max. 3 points."""
def build_candidates(length):
html = "<p>%s</p>" % ("c" * length)
@ -269,66 +272,69 @@ class TestScoringNodes(unittest.TestCase):
candidates = score_candidates(test_nodes)
pscore_400 = max(c.content_score for c in candidates.values())
self.assertAlmostEqual(pscore_50 + 0.5, pscore_100)
self.assertAlmostEqual(pscore_100 + 2.0, pscore_300)
self.assertAlmostEqual(pscore_300, pscore_400)
assert pscore_50 + 0.5 == pscore_100
assert pscore_100 + 2.0 == pscore_300
assert pscore_300 == pscore_400
# Link density will adjust our candidate scores.
class TestLinkDensityScoring(unittest.TestCase):
"""Link density will adjust our candidate scores."""
def test_link_density(self):
def test_link_density():
"""Test that we get a link density"""
doc = document_fromstring(load_article('ars.001.html'))
for node in doc.iter('p', 'td', 'pre'):
density = get_link_density(node)
# the density must be between 0, 1
self.assertTrue(density >= 0.0 and density <= 1.0)
assert density >= 0.0 and density <= 1.0
class TestSiblings(unittest.TestCase):
"""Siblings will be included if their content is related."""
# Siblings will be included if their content is related.
@unittest.skip("Not implemented yet.")
def test_bad_siblings_not_counted(self):
@pytest.mark.skip("Not implemented yet.")
def test_bad_siblings_not_counted():
raise NotImplementedError()
@unittest.skip("Not implemented yet.")
def test_good_siblings_counted(self):
@pytest.mark.skip("Not implemented yet.")
def test_good_siblings_counted():
raise NotImplementedError()
class TestMainText(unittest.TestCase):
def test_empty(self):
# TestMainText
def test_empty():
article = Article("")
annotated_text = article.main_text
self.assertEqual(annotated_text, [])
assert annotated_text == []
def test_no_annotations(self):
def test_no_annotations():
article = Article("<div><p>This is text with no annotations</p></div>")
annotated_text = article.main_text
self.assertEqual(annotated_text,
[(("This is text with no annotations", None),)])
assert annotated_text == [(("This is text with no annotations", None),)]
def test_one_annotation(self):
def test_one_annotation():
article = Article("<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
annotated_text = article.main_text
expected = [(
assert annotated_text == [(
("This is text\nwith", None),
("no", ("del",)),
("annotations", None),
)]
self.assertEqual(annotated_text, expected)
def test_simple_snippet(self):
def test_simple_snippet():
snippet = Article(load_snippet("annotated_1.html"))
annotated_text = snippet.main_text
expected = [
assert annotated_text == [
(
("Paragraph is more", None),
("better", ("em",)),
@ -344,4 +350,3 @@ class TestMainText(unittest.TestCase):
("me :)", None),
)
]
self.assertEqual(annotated_text, expected)

@ -1,143 +1,146 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from __future__ import absolute_import, division, print_function, unicode_literals
import re
from operator import attrgetter
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from breadability.readable import Article
from breadability.scoring import (
check_node_attributes,
generate_hash_id,
get_class_weight,
score_candidates,
ScoredNode,
)
from breadability.readable import (
get_link_density,
is_unlikely_node,
)
from .compat import unittest
from lxml.html import document_fromstring, fragment_fromstring
from breadability.readable import Article, get_link_density, is_unlikely_node
from breadability.scoring import (ScoredNode, check_node_attributes, generate_hash_id, get_class_weight,
score_candidates)
from .utils import load_snippet
class TestHashId(unittest.TestCase):
def test_generate_hash(self):
def test_generate_hash():
dom = fragment_fromstring("<div>ľščťžýáí</div>")
generate_hash_id(dom)
def test_hash_from_id_on_exception(self):
def test_hash_from_id_on_exception():
generate_hash_id(None)
def test_different_hashes(self):
def test_different_hashes():
dom = fragment_fromstring("<div>ľščťžýáí</div>")
hash_dom = generate_hash_id(dom)
hash_none = generate_hash_id(None)
self.assertNotEqual(hash_dom, hash_none)
assert hash_dom != hash_none
def test_equal_hashes(self):
def test_equal_hashes():
dom1 = fragment_fromstring("<div>ľščťžýáí</div>")
dom2 = fragment_fromstring("<div>ľščťžýáí</div>")
hash_dom1 = generate_hash_id(dom1)
hash_dom2 = generate_hash_id(dom2)
self.assertEqual(hash_dom1, hash_dom2)
assert hash_dom1 == hash_dom2
hash_none1 = generate_hash_id(None)
hash_none2 = generate_hash_id(None)
self.assertEqual(hash_none1, hash_none2)
assert hash_none1 == hash_none2
class TestCheckNodeAttr(unittest.TestCase):
"""Verify a node has a class/id in the given set.
# Verify a node has a class/id in the given set.
# The idea is that we have sets of known good/bad ids and classes and need
# to verify the given node does/doesn't have those classes/ids.
The idea is that we have sets of known good/bad ids and classes and need
to verify the given node does/doesn't have those classes/ids.
"""
def test_has_class(self):
def test_has_class():
"""Verify that a node has a class in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('class', 'test2 comment')
self.assertTrue(
check_node_attributes(test_pattern, test_node, 'class'))
assert check_node_attributes(test_pattern, test_node, 'class')
def test_has_id(self):
def test_has_id():
"""Verify that a node has an id in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test2')
self.assertTrue(check_node_attributes(test_pattern, test_node, 'id'))
assert check_node_attributes(test_pattern, test_node, 'id')
def test_lacks_class(self):
def test_lacks_class():
"""Verify that a node does not have a class in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('class', 'test4 comment')
self.assertFalse(
check_node_attributes(test_pattern, test_node, 'class'))
def test_lacks_id(self):
assert not check_node_attributes(test_pattern, test_node, 'class')
def test_lacks_id():
"""Verify that a node does not have an id in our set."""
test_pattern = re.compile('test1|test2', re.I)
test_node = fragment_fromstring('<div/>')
test_node.set('id', 'test4')
self.assertFalse(check_node_attributes(test_pattern, test_node, 'id'))
assert not check_node_attributes(test_pattern, test_node, 'id')
# Verify we calc our link density correctly.
class TestLinkDensity(unittest.TestCase):
"""Verify we calc our link density correctly."""
def test_empty_node(self):
def test_empty_node():
"""An empty node doesn't have much of a link density"""
doc = Article("<div></div>")
self.assertEqual(get_link_density(doc.readable_dom), 0.0)
def test_small_doc_no_links(self):
assert get_link_density(doc.readable_dom) == 0.0
def test_small_doc_no_links():
doc = Article(load_snippet('document_min.html'))
self.assertEqual(get_link_density(doc.readable_dom), 0.0)
def test_several_links(self):
assert get_link_density(doc.readable_dom) == 0.0
def test_several_links():
"""This doc has 3 links with the majority of content."""
doc = Article(load_snippet('document_absolute_url.html'))
self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/37)
assert get_link_density(doc.readable_dom) == 22/37
class TestClassWeight(unittest.TestCase):
"""Verify we score nodes correctly based on their class/id attributes."""
# Verify we score nodes correctly based on their class/id attributes.
def test_no_matches_zero(self):
def test_no_matches_zero():
"""If you don't have the attribute then you get a weight of 0"""
node = fragment_fromstring("<div></div>")
self.assertEqual(get_class_weight(node), 0)
def test_id_hits(self):
assert get_class_weight(node) == 0
def test_id_hits():
"""If the id is in the list then it gets a weight"""
test_div = '<div id="post">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
assert get_class_weight(node) == 25
test_div = '<div id="comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), -25)
def test_class_hits(self):
assert get_class_weight(node) == -25
def test_class_hits():
"""If the class is in the list then it gets a weight"""
test_div = '<div class="something post">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
assert get_class_weight(node) == 25
test_div = '<div class="something comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), -25)
assert get_class_weight(node) == -25
def test_scores_collide(self):
def test_scores_collide():
"""We might hit both positive and negative scores.
Positive and negative scoring is done independently so it's possible
@ -146,59 +149,65 @@ class TestClassWeight(unittest.TestCase):
"""
test_div = '<div id="post" class="something comment">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 0)
assert get_class_weight(node) == 0
test_div = '<div id="post" class="post comment">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
assert get_class_weight(node) == 25
def test_scores_only_once(self):
def test_scores_only_once():
"""Scoring is not cumulative within a class hit."""
test_div = '<div class="post main">Content</div>'
node = fragment_fromstring(test_div)
self.assertEqual(get_class_weight(node), 25)
assert get_class_weight(node) == 25
# is_unlikely_node should help verify our node is good/bad.
class TestUnlikelyNode(unittest.TestCase):
"""is_unlikely_node should help verify our node is good/bad."""
def test_body_is_always_likely(self):
def test_body_is_always_likely():
"""The body tag is always a likely node."""
test_div = '<body class="comment"><div>Content</div></body>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
def test_is_unlikely(self):
"Keywords in the class/id will make us believe this is unlikely."
assert not is_unlikely_node(node)
def test_is_unlikely():
"""Keywords in the class/id will make us believe this is unlikely."""
test_div = '<div class="something comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertTrue(is_unlikely_node(node))
assert is_unlikely_node(node)
test_div = '<div id="comments">Content</div>'
node = fragment_fromstring(test_div)
self.assertTrue(is_unlikely_node(node))
assert is_unlikely_node(node)
def test_not_unlikely(self):
def test_not_unlikely():
"""Suck it double negatives."""
test_div = '<div id="post">Content</div>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
assert not is_unlikely_node(node)
test_div = '<div class="something post">Content</div>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
assert not is_unlikely_node(node)
def test_maybe_hits(self):
def test_maybe_hits():
"""We've got some maybes that will overrule an unlikely node."""
test_div = '<div id="comments" class="article">Content</div>'
node = fragment_fromstring(test_div)
self.assertFalse(is_unlikely_node(node))
assert not is_unlikely_node(node)
# ScoredNodes constructed have initial content_scores, etc.
class TestScoredNode(unittest.TestCase):
"""ScoredNodes constructed have initial content_scores, etc."""
def test_hash_id(self):
def test_hash_id():
"""ScoredNodes have a hash_id based on their content
Since this is based on the html there are chances for collisions, but
@ -209,50 +218,52 @@ class TestScoredNode(unittest.TestCase):
test_div = '<div id="comments" class="article">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.hash_id, 'ffa4c519')
def test_div_content_score(self):
assert snode.hash_id == 'ffa4c519'
def test_div_content_score():
"""A div starts out with a score of 5 and modifies from there"""
test_div = '<div id="" class="">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, 5)
assert snode.content_score == 5
test_div = '<div id="article" class="">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, 30)
assert snode.content_score == 30
test_div = '<div id="comments" class="">Content</div>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, -20)
assert snode.content_score == -20
def test_headings_score(self):
def test_headings_score():
"""Heading tags aren't likely candidates, hurt their scores."""
test_div = '<h2>Heading</h2>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, -5)
def test_list_items(self):
assert snode.content_score == -5
def test_list_items():
"""List items aren't likely candidates, hurt their scores."""
test_div = '<li>list item</li>'
node = fragment_fromstring(test_div)
snode = ScoredNode(node)
self.assertEqual(snode.content_score, -3)
assert snode.content_score == -3
class TestScoreCandidates(unittest.TestCase):
"""The grand daddy of tests to make sure our scoring works
# The grand daddy of tests to make sure our scoring works
# Now scoring details will change over time, so the most important thing is
# to make sure candidates come out in the right order, not necessarily how
# they scored. Make sure to keep this in mind while getting tests going.
Now scoring details will change over time, so the most important thing is
to make sure candidates come out in the right order, not necessarily how
they scored. Make sure to keep this in mind while getting tests going.
"""
def test_simple_candidate_set(self):
def test_simple_candidate_set():
"""Tests a simple case of two candidate nodes"""
html = """
<html>
@ -276,9 +287,9 @@ class TestScoreCandidates(unittest.TestCase):
(c for c in candidates.values()), reverse=True,
key=attrgetter("content_score"))
self.assertEqual(ordered[0].node.tag, "div")
self.assertEqual(ordered[0].node.attrib["class"], "content")
self.assertEqual(ordered[1].node.tag, "body")
self.assertEqual(ordered[2].node.tag, "html")
self.assertEqual(ordered[3].node.tag, "div")
self.assertEqual(ordered[3].node.attrib["class"], "footer")
assert ordered[0].node.tag == "div"
assert ordered[0].node.attrib["class"] == "content"
assert ordered[1].node.tag == "body"
assert ordered[2].node.tag == "html"
assert ordered[3].node.tag == "div"
assert ordered[3].node.attrib["class"] == "footer"

Loading…
Cancel
Save