diff --git a/breadability/scripts/test_helper.py b/breadability/scripts/test_helper.py index b8266bf..7d79d37 100644 --- a/breadability/scripts/test_helper.py +++ b/breadability/scripts/test_helper.py @@ -35,41 +35,41 @@ TEST_PATH = join( TEST_TEMPLATE = '''# -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +""" +Test the scoring and parsing of the article from URL below: +%(source_url)s +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os + +import pytest -from os.path import join, dirname from breadability.readable import Article -from ...compat import unittest -class TestArticle(unittest.TestCase): - """ - Test the scoring and parsing of the article from URL below: - %(source_url)s - """ +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), "article.html") + with open(article_path, "rb") as file: + return Article(file.read(), "%(source_url)s") + - def setUp(self): - """Load up the article for us""" - article_path = join(dirname(__file__), "article.html") - with open(article_path, "rb") as file: - self.document = Article(file.read(), "%(source_url)s") +def test_parses(article): + """Verify we can parse the document.""" + assert 'id="readabilityBody"' in article.readable - def tearDown(self): - """Drop the article""" - self.document = None - def test_parses(self): - """Verify we can parse the document.""" - self.assertIn('id="readabilityBody"', self.document.readable) +def test_content_exists(article): + """Verify that some content exists.""" + assert "#&@#&@#&@" in article.readable - def test_content_exists(self): - """Verify that some content exists.""" - self.assertIn("#&@#&@#&@", self.document.readable) - def test_content_does_not_exist(self): - """Verify we cleaned out some content that shouldn't exist.""" - self.assertNotIn("", self.document.readable) +def test_content_does_not_exist(article): + """Verify we cleaned out some content that shouldn't exist.""" + assert "" not in article.readable ''' diff --git a/setup.py b/setup.py index 7ec4f89..0239192 100644 --- a/setup.py +++ b/setup.py @@ -32,9 +32,6 @@ tests_require = [ ] -if sys.version_info < (2, 7): - install_requires.append("unittest2") - console_script_targets = [ "breadability = breadability.scripts.client:main", "breadability-{0} = breadability.scripts.client:main", diff --git a/tests/compat.py b/tests/compat.py index 0c6f910..6b2f5e0 100644 --- a/tests/compat.py +++ b/tests/compat.py @@ -2,8 +2,3 @@ from __future__ import absolute_import from __future__ import division, print_function, unicode_literals - -try: - import unittest2 as unittest -except ImportError: - import unittest diff --git a/tests/test_annotated_text.py b/tests/test_annotated_text.py index 1bef627..a417c94 100644 --- a/tests/test_annotated_text.py +++ b/tests/test_annotated_text.py @@ -10,164 +10,161 @@ from __future__ import ( from lxml.html import fragment_fromstring, document_fromstring from breadability.readable import Article from breadability.annotated_text import AnnotatedTextHandler -from .compat import unittest from .utils import load_snippet, load_article -class TestAnnotatedText(unittest.TestCase): - def test_simple_document(self): - dom = fragment_fromstring("

This is\n\tsimple\ttext.

") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("This is\nsimple text.", None), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_empty_paragraph(self): - dom = fragment_fromstring("

Paragraph

\t \n

") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("Paragraph", None), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_multiple_paragraphs(self): - dom = fragment_fromstring("

1 first

2\tsecond

3\rthird

") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("1 first", None), - ), - ( - ("2 second", None), - ), - ( - ("3\nthird", None), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_single_annotation(self): - dom = fragment_fromstring("

text emphasis

last

") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("text", None), - ("emphasis", ("em",)), - ), - ( - ("last", None), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_recursive_annotation(self): - dom = fragment_fromstring("

text emphasis

last

") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("text", None), - ("emphasis", ("em", "i")), - ), - ( - ("last", None), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_annotations_without_explicit_paragraph(self): - dom = fragment_fromstring("
text emphasis\thmm
") - annotated_text = AnnotatedTextHandler.parse(dom) - - expected = [ - ( - ("text", None), - ("emphasis", ("strong",)), - ("hmm", ("b",)), - ), - ] - self.assertEqual(annotated_text, expected) - - def test_process_paragraph_with_chunked_text(self): - handler = AnnotatedTextHandler() - paragraph = handler._process_paragraph([ - (" 1", ("b", "del")), - (" 2", ("b", "del")), - (" 3", None), - (" 4", None), - (" 5", None), - (" 6", ("em",)), - ]) - - expected = ( - ("1 2", ("b", "del")), - ("3 4 5", None), - ("6", ("em",)), - ) - self.assertEqual(paragraph, expected) - - def test_include_heading(self): - dom = document_fromstring(load_snippet("h1_and_2_paragraphs.html")) - annotated_text = AnnotatedTextHandler.parse(dom.find("body")) - - expected = [ - ( - ('Nadpis H1, ktorý chce byť prvý s textom ale predbehol ho "title"', ("h1",)), - ("Toto je prvý odstavec a to je fajn.", None), - ), - ( - ("Tento text je tu aby vyplnil prázdne miesto v srdci súboru.\nAj súbory majú predsa city.", None), - ), - ] - self.assertSequenceEqual(annotated_text, expected) - - def test_real_article(self): - article = Article(load_article("zdrojak_automaticke_zabezpeceni.html")) - annotated_text = article.main_text - - expected = [ - ( - ("Automatické zabezpečení", ("h1",)), - ("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None), - ), - ( - ("Aplikace zabezpečená není, neošetřuje uživatelské vstupy ani své výstupy.", ("li", "ol")), - ("Aplikace se o zabezpečení snaží, ale takovým způsobem, že na ně lze zapomenout.", ("li", "ol")), - ("Aplikace se o zabezpečení stará sama, prakticky se nedá udělat chyba.", ("li", "ol")), - ), - ( - ("Jak se tyto úrovně projevují v jednotlivých oblastech?", None), - ), - ( - ("XSS", ("a", "h2")), - ("Druhou úroveň představuje ruční ošetřování pomocí", None), - ("htmlspecialchars", ("a", "kbd")), - (". Třetí úroveň zdánlivě reprezentuje automatické ošetřování v šablonách, např. v", None), - ("Nette Latte", ("a", "strong")), - (". Proč píšu zdánlivě? Problém je v tom, že ošetření se dá obvykle snadno zakázat, např. v Latte pomocí", None), - ("{!$var}", ("code",)), - (". Viděl jsem šablony plné vykřičníků i na místech, kde být neměly. Autor to vysvětlil tak, že psaní", None), - ("{$var}", ("code",)), - ("někde způsobovalo problémy, které po přidání vykřičníku zmizely, tak je začal psát všude.", None), - ), - ( - ("process($content_texy);\n$content = Html::el()->setHtml($safeHtml);\n// v šabloně pak můžeme použít {$content}\n?>", ("pre", )), - ), - ( - ("Ideální by bylo, když by už samotná metoda", None), - ("process()", ("code",)), - ("vracela instanci", None), - ("Html", ("code",)), - (".", None), - ), - ] - self.assertSequenceEqual(annotated_text, expected) +def test_simple_document(): + dom = fragment_fromstring("

This is\n\tsimple\ttext.

") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("This is\nsimple text.", None), + ), + ] + + +def test_empty_paragraph(): + dom = fragment_fromstring("

Paragraph

\t \n

") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("Paragraph", None), + ), + ] + + +def test_multiple_paragraphs(): + dom = fragment_fromstring("

1 first

2\tsecond

3\rthird

") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("1 first", None), + ), + ( + ("2 second", None), + ), + ( + ("3\nthird", None), + ), + ] + + +def test_single_annotation(): + dom = fragment_fromstring("

text emphasis

last

") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("text", None), + ("emphasis", ("em",)), + ), + ( + ("last", None), + ), + ] + + +def test_recursive_annotation(): + dom = fragment_fromstring("

text emphasis

last

") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("text", None), + ("emphasis", ("em", "i")), + ), + ( + ("last", None), + ), + ] + + +def test_annotations_without_explicit_paragraph(): + dom = fragment_fromstring("
text emphasis\thmm
") + annotated_text = AnnotatedTextHandler.parse(dom) + + assert annotated_text == [ + ( + ("text", None), + ("emphasis", ("strong",)), + ("hmm", ("b",)), + ), + ] + + +def test_process_paragraph_with_chunked_text(): + handler = AnnotatedTextHandler() + paragraph = handler._process_paragraph([ + (" 1", ("b", "del")), + (" 2", ("b", "del")), + (" 3", None), + (" 4", None), + (" 5", None), + (" 6", ("em",)), + ]) + + assert paragraph == ( + ("1 2", ("b", "del")), + ("3 4 5", None), + ("6", ("em",)), + ) + + +def test_include_heading(): + dom = document_fromstring(load_snippet("h1_and_2_paragraphs.html")) + annotated_text = AnnotatedTextHandler.parse(dom.find("body")) + + assert annotated_text == [ + ( + ('Nadpis H1, ktorý chce byť prvý s textom ale predbehol ho "title"', ("h1",)), + ("Toto je prvý odstavec a to je fajn.", None), + ), + ( + ("Tento text je tu aby vyplnil prázdne miesto v srdci súboru.\nAj súbory majú predsa city.", None), + ), + ] + + +def test_real_article(): + article = Article(load_article("zdrojak_automaticke_zabezpeceni.html")) + annotated_text = article.main_text + + assert annotated_text == [ + ( + ("Automatické zabezpečení", ("h1",)), + ("Úroveň zabezpečení aplikace bych rozdělil do tří úrovní:", None), + ), + ( + ("Aplikace zabezpečená není, neošetřuje uživatelské vstupy ani své výstupy.", ("li", "ol")), + ("Aplikace se o zabezpečení snaží, ale takovým způsobem, že na ně lze zapomenout.", ("li", "ol")), + ("Aplikace se o zabezpečení stará sama, prakticky se nedá udělat chyba.", ("li", "ol")), + ), + ( + ("Jak se tyto úrovně projevují v jednotlivých oblastech?", None), + ), + ( + ("XSS", ("a", "h2")), + ("Druhou úroveň představuje ruční ošetřování pomocí", None), + ("htmlspecialchars", ("a", "kbd")), + (". Třetí úroveň zdánlivě reprezentuje automatické ošetřování v šablonách, např. v", None), + ("Nette Latte", ("a", "strong")), + (". Proč píšu zdánlivě? Problém je v tom, že ošetření se dá obvykle snadno zakázat, např. v Latte pomocí", None), + ("{!$var}", ("code",)), + (". Viděl jsem šablony plné vykřičníků i na místech, kde být neměly. Autor to vysvětlil tak, že psaní", None), + ("{$var}", ("code",)), + ("někde způsobovalo problémy, které po přidání vykřičníku zmizely, tak je začal psát všude.", None), + ), + ( + ("process($content_texy);\n$content = Html::el()->setHtml($safeHtml);\n// v šabloně pak můžeme použít {$content}\n?>", ("pre", )), + ), + ( + ("Ideální by bylo, když by už samotná metoda", None), + ("process()", ("code",)), + ("vracela instanci", None), + ("Html", ("code",)), + (".", None), + ), + ] diff --git a/tests/test_articles/test_antipope_org/test_article.py b/tests/test_articles/test_antipope_org/test_article.py index 10db633..e79bba7 100644 --- a/tests/test_articles/test_antipope_org/test_article.py +++ b/tests/test_articles/test_antipope_org/test_article.py @@ -1,42 +1,45 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +"""Test the scoring and parsing of the Blog Post""" + +from __future__ import absolute_import, division, print_function, unicode_literals import os +import pytest + from breadability.readable import Article -from ...compat import unittest -class TestAntipopeBlog(unittest.TestCase): - """Test the scoring and parsing of the Blog Post""" +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + with open(article_path) as file: + return file.read() + + +def test_parses(article): + """Verify we can parse the document.""" + doc = Article(article) + + assert 'id="readabilityBody"' in doc.readable + - def setUp(self): - """Load up the article for us""" - article_path = os.path.join(os.path.dirname(__file__), 'article.html') - self.article = open(article_path).read() +def test_comments_cleaned(article): + """The div with the comments should be removed.""" + doc = Article(article) - def tearDown(self): - """Drop the article""" - self.article = None + assert 'class="comments"' not in doc.readable - def test_parses(self): - """Verify we can parse the document.""" - doc = Article(self.article) - self.assertTrue('id="readabilityBody"' in doc.readable) - def test_comments_cleaned(self): - """The div with the comments should be removed.""" - doc = Article(self.article) - self.assertTrue('class="comments"' not in doc.readable) +def test_beta_removed(article): + """The id=beta element should be removed - def test_beta_removed(self): - """The id=beta element should be removed + It's link heavy and causing a lot of garbage content. This should be + removed. - It's link heavy and causing a lot of garbage content. This should be - removed. + """ + doc = Article(article) - """ - doc = Article(self.article) - self.assertTrue('id="beta"' not in doc.readable) + assert 'id="beta"' not in doc.readable diff --git a/tests/test_articles/test_businessinsider-com/test_article.py b/tests/test_articles/test_businessinsider-com/test_article.py index 54d5570..d122a51 100644 --- a/tests/test_articles/test_businessinsider-com/test_article.py +++ b/tests/test_articles/test_businessinsider-com/test_article.py @@ -1,33 +1,34 @@ +# -*- coding: utf8 -*- + +"""Test the scoring and parsing of the Blog Post""" + +from __future__ import absolute_import, division, print_function, unicode_literals + import os -try: - # Python < 2.7 - import unittest2 as unittest -except ImportError: - import unittest + +import pytest from breadability.readable import Article -class TestBusinessInsiderArticle(unittest.TestCase): - """Test the scoring and parsing of the Blog Post""" +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + with open(article_path) as file: + return file.read() + - def setUp(self): +def test_parses(article): + """Verify we can parse the document.""" + doc = Article(article) - """Load up the article for us""" - article_path = os.path.join(os.path.dirname(__file__), 'article.html') - self.article = open(article_path).read() + assert 'id="readabilityBody"' in doc.readable - def tearDown(self): - """Drop the article""" - self.article = None - def test_parses(self): - """Verify we can parse the document.""" - doc = Article(self.article) - self.assertTrue('id="readabilityBody"' in doc.readable) +def test_images_preserved(article): + """The div with the comments should be removed.""" + doc = Article(article) - def test_images_preserved(self): - """The div with the comments should be removed.""" - doc = Article(self.article) - self.assertTrue('bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable) - self.assertTrue('bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable) + assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable + assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable diff --git a/tests/test_articles/test_businessinsider_com/test_article.py b/tests/test_articles/test_businessinsider_com/test_article.py index 97d19c6..57ea8cf 100644 --- a/tests/test_articles/test_businessinsider_com/test_article.py +++ b/tests/test_articles/test_businessinsider_com/test_article.py @@ -1,39 +1,33 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +""" +Test the scoring and parsing of the article from URL below: +http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8 +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os + +import pytest -from os.path import join, dirname from breadability.readable import Article -from ...compat import unittest - - -class TestArticle(unittest.TestCase): - """ - Test the scoring and parsing of the article from URL below: - http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8 - """ - - def setUp(self): - """Load up the article for us""" - article_path = join(dirname(__file__), "article.html") - with open(article_path, "rb") as file: - self.document = Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8") - - def tearDown(self): - """Drop the article""" - self.document = None - - def test_parses(self): - """Verify we can parse the document.""" - self.assertIn('id="readabilityBody"', self.document.readable) - - def test_images_preserved(self): - """The div with the comments should be removed.""" - images = [ - 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg', - 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg', - ] - - for image in images: - self.assertIn(image, self.document.readable, image) + + +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + with open(article_path, "rb") as file: + return Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8") + + +def test_parses(article): + """Verify we can parse the document.""" + assert 'id="readabilityBody"' in article.readable + + +def test_images_preserved(article): + """The div with the comments should be removed.""" + assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in article.readable + assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in article.readable diff --git a/tests/test_articles/test_cz_zdrojak_tests/test_article.py b/tests/test_articles/test_cz_zdrojak_tests/test_article.py index 853ce15..e0e68c8 100644 --- a/tests/test_articles/test_cz_zdrojak_tests/test_article.py +++ b/tests/test_articles/test_cz_zdrojak_tests/test_article.py @@ -1,44 +1,44 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +""" +Test the scoring and parsing of the article from URL below: +http://www.zdrojak.cz/clanky/jeste-k-testovani/ +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os + +import pytest -from os.path import join, dirname -from breadability.readable import Article from breadability._compat import unicode -from ...compat import unittest +from breadability.readable import Article + +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + with open(article_path, "rb") as file: + return Article(file.read(), "http://www.zdrojak.cz/clanky/jeste-k-testovani/") -class TestArticle(unittest.TestCase): - """ - Test the scoring and parsing of the article from URL below: - http://www.zdrojak.cz/clanky/jeste-k-testovani/ - """ - def setUp(self): - """Load up the article for us""" - article_path = join(dirname(__file__), "article.html") - with open(article_path, "rb") as file: - self.document = Article(file.read(), "http://www.zdrojak.cz/clanky/jeste-k-testovani/") +def test_parses(article): + """Verify we can parse the document.""" + assert 'id="readabilityBody"' in article.readable - def tearDown(self): - """Drop the article""" - self.document = None - def test_parses(self): - """Verify we can parse the document.""" - self.assertIn('id="readabilityBody"', self.document.readable) +def test_content_exists(article): + """Verify that some content exists.""" + assert isinstance(article.readable, unicode) - def test_content_exists(self): - """Verify that some content exists.""" - self.assertIsInstance(self.document.readable, unicode) + text = "S automatizovaným testováním kódu (a ve zbytku článku budu mít na mysli právě to) jsem se setkal v několika firmách." + assert text in article.readable - text = "S automatizovaným testováním kódu (a ve zbytku článku budu mít na mysli právě to) jsem se setkal v několika firmách." - self.assertIn(text, self.document.readable) + text = "Ke čtení naleznete mnoho různých materiálů, od teoretických po praktické ukázky." + assert text in article.readable - text = "Ke čtení naleznete mnoho různých materiálů, od teoretických po praktické ukázky." - self.assertIn(text, self.document.readable) - def test_content_does_not_exist(self): - """Verify we cleaned out some content that shouldn't exist.""" - self.assertNotIn("Pokud vás problematika zajímá, využijte možnosti navštívit školení", self.document.readable) +def test_content_does_not_exist(article): + """Verify we cleaned out some content that shouldn't exist.""" + assert "Pokud vás problematika zajímá, využijte možnosti navštívit školení" not in article.readable diff --git a/tests/test_articles/test_scripting_com/test_article.py b/tests/test_articles/test_scripting_com/test_article.py index 6b74d52..02f1143 100644 --- a/tests/test_articles/test_scripting_com/test_article.py +++ b/tests/test_articles/test_scripting_com/test_article.py @@ -1,74 +1,64 @@ # -*- coding: utf8 -*- -from __future__ import ( - absolute_import, - division, - print_function, - unicode_literals -) +"""Test the scoring and parsing of the Article""" -import os +from __future__ import absolute_import, division, print_function, unicode_literals +import os from operator import attrgetter -from breadability.readable import Article -from breadability.readable import check_siblings -from breadability.readable import prep_article -from ...compat import unittest - - -class TestArticle(unittest.TestCase): - """Test the scoring and parsing of the Article""" - - def setUp(self): - """Load up the article for us""" - article_path = os.path.join(os.path.dirname(__file__), 'article.html') - self.article = open(article_path).read() - - def tearDown(self): - """Drop the article""" - self.article = None - - def test_parses(self): - """Verify we can parse the document.""" - doc = Article(self.article) - self.assertTrue('id="readabilityBody"' in doc.readable) - - def test_content_exists(self): - """Verify that some content exists.""" - doc = Article(self.article) - self.assertTrue('Amazon and Google' in doc.readable) - self.assertFalse('Linkblog updated' in doc.readable) - self.assertFalse( - '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable) - - @unittest.skip("Test fails because of some weird hash.") - def test_candidates(self): - """Verify we have candidates.""" - doc = Article(self.article) - # from lxml.etree import tounicode - found = False - wanted_hash = '04e46055' - - for node in doc.candidates.values(): - if node.hash_id == wanted_hash: - found = node - - self.assertTrue(found) - - # we have the right node, it must be deleted for some reason if it's - # not still there when we need it to be. - # Make sure it's not in our to drop list. - for node in doc._should_drop: - self.assertFalse(node == found.node) - - by_score = sorted( - [c for c in doc.candidates.values()], - key=attrgetter('content_score'), reverse=True) - self.assertTrue(by_score[0].node == found.node) - - updated_winner = check_siblings(by_score[0], doc.candidates) - updated_winner.node = prep_article(updated_winner.node) - - # This article hits up against the img > p conditional filtering - # because of the many .gif images in the content. We've removed that - # rule. + +import pytest + +from breadability.readable import Article, check_siblings, prep_article + + +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + with open(article_path) as file: + return Article(file.read()) + + +def test_parses(article): + """Verify we can parse the document.""" + assert 'id="readabilityBody"' in article.readable + + +def test_content_exists(article): + """Verify that some content exists.""" + assert 'Amazon and Google' in article.readable + assert not 'Linkblog updated' in article.readable + assert not '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in article.readable + + +@pytest.mark.skip("Test fails because of some weird hash.") +def test_candidates(article): + """Verify we have candidates.""" + # from lxml.etree import tounicode + found = False + wanted_hash = '04e46055' + + for node in article.candidates.values(): + if node.hash_id == wanted_hash: + found = node + + assert found + + # we have the right node, it must be deleted for some reason if it's + # not still there when we need it to be. + # Make sure it's not in our to drop list. + for node in article._should_drop: + assert node != found.node + + by_score = sorted( + [c for c in article.candidates.values()], + key=attrgetter('content_score'), reverse=True) + assert by_score[0].node == found.node + + updated_winner = check_siblings(by_score[0], article.candidates) + updated_winner.node = prep_article(updated_winner.node) + + # This article hits up against the img > p conditional filtering + # because of the many .gif images in the content. We've removed that + # rule. diff --git a/tests/test_articles/test_sweetshark/test_article.py b/tests/test_articles/test_sweetshark/test_article.py index 46eef84..ae3d695 100644 --- a/tests/test_articles/test_sweetshark/test_article.py +++ b/tests/test_articles/test_sweetshark/test_article.py @@ -1,33 +1,32 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +""" +Test the scoring and parsing of the article from URL below: +http://sweetshark.livejournal.com/11564.html +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os + +import pytest -from os.path import join, dirname from breadability.readable import Article -from ...compat import unittest -class TestSweetsharkBlog(unittest.TestCase): - """ - Test the scoring and parsing of the article from URL below: - http://sweetshark.livejournal.com/11564.html - """ +@pytest.fixture(scope="module") +def article(): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + with open(article_path, "rb") as file: + return Article(file.read(), "http://sweetshark.livejournal.com/11564.html") - def setUp(self): - """Load up the article for us""" - article_path = join(dirname(__file__), "article.html") - with open(article_path, "rb") as file: - self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html") - def tearDown(self): - """Drop the article""" - self.document = None +def test_parses(article): + """Verify we can parse the document.""" + assert 'id="readabilityBody"' in article.readable - def test_parses(self): - """Verify we can parse the document.""" - self.assertIn('id="readabilityBody"', self.document.readable) - def test_content_after_video(self): - """The div with the comments should be removed.""" - self.assertIn('Stay hungry, Stay foolish', self.document.readable) +def test_content_after_video(article): + """The div with the comments should be removed.""" + assert 'Stay hungry, Stay foolish' in article.readable diff --git a/tests/test_orig_document.py b/tests/test_orig_document.py index cff46aa..cdc2847 100644 --- a/tests/test_orig_document.py +++ b/tests/test_orig_document.py @@ -1,5 +1,7 @@ # -*- coding: utf8 -*- +"""Verify we can process html into a document to work off of.""" + from __future__ import absolute_import from __future__ import division, print_function, unicode_literals @@ -15,88 +17,97 @@ from breadability.document import ( decode_html, OriginalDocument, ) -from .compat import unittest from .utils import load_snippet -class TestOriginalDocument(unittest.TestCase): - """Verify we can process html into a document to work off of.""" - - def test_convert_br_tags_to_paragraphs(self): - returned = convert_breaks_to_paragraphs( - ("
HI

How are you?

\t \n
" - "Fine\n I guess
")) - - self.assertEqual( - returned, - "
HI

How are you?

Fine\n I guess

") - - def test_convert_hr_tags_to_paragraphs(self): - returned = convert_breaks_to_paragraphs( - "
HI

How are you?
\t \n
Fine\n I guess
") - - self.assertEqual( - returned, - "
HI

How are you?

Fine\n I guess

") - - def test_readin_min_document(self): - """Verify we can read in a min html document""" - doc = OriginalDocument(load_snippet('document_min.html')) - self.assertTrue(to_unicode(doc).startswith('')) - self.assertEqual(doc.title, 'Min Document Title') - - def test_readin_with_base_url(self): - """Passing a url should update links to be absolute links""" - doc = OriginalDocument( - load_snippet('document_absolute_url.html'), - url="http://blog.mitechie.com/test.html") - self.assertTrue(to_unicode(doc).startswith('')) - - # find the links on the page and make sure each one starts with out - # base url we told it to use. - links = doc.links - self.assertEqual(len(links), 3) - # we should have two links that start with our blog url - # and one link that starts with amazon - link_counts = defaultdict(int) - for link in links: - if link.get('href').startswith('http://blog.mitechie.com'): - link_counts['blog'] += 1 - else: - link_counts['other'] += 1 - - self.assertEqual(link_counts['blog'], 2) - self.assertEqual(link_counts['other'], 1) - - def test_no_br_allowed(self): - """We convert all
tags to

tags""" - doc = OriginalDocument(load_snippet('document_min.html')) - self.assertIsNone(doc.dom.find('.//br')) - - def test_empty_title(self): - """We convert all
tags to

tags""" - document = OriginalDocument( - "") - self.assertEqual(document.title, "") - - def test_title_only_with_tags(self): - """We convert all
tags to

tags""" - document = OriginalDocument( - "<em></em>") - self.assertEqual(document.title, "") - - def test_no_title(self): - """We convert all
tags to

tags""" - document = OriginalDocument("") - self.assertEqual(document.title, "") - - def test_encoding(self): - text = "ľščťžýáíéäúňôůě".encode("iso-8859-2") - html = decode_html(text) - self.assertEqual(type(html), unicode) - - def test_encoding_short(self): - text = to_bytes("ľščťžýáíé") - html = decode_html(text) - self.assertEqual(type(html), unicode) - self.assertEqual(html, "ľščťžýáíé") +def test_convert_br_tags_to_paragraphs(): + returned = convert_breaks_to_paragraphs( + ("

HI

How are you?

\t \n
" + "Fine\n I guess
")) + + assert returned == "
HI

How are you?

Fine\n I guess

" + + +def test_convert_hr_tags_to_paragraphs(): + returned = convert_breaks_to_paragraphs( + "
HI

How are you?
\t \n
Fine\n I guess
") + + assert returned == "
HI

How are you?

Fine\n I guess

" + + +def test_readin_min_document(): + """Verify we can read in a min html document""" + doc = OriginalDocument(load_snippet('document_min.html')) + + assert to_unicode(doc).startswith('') + assert doc.title == 'Min Document Title' + + +def test_readin_with_base_url(): + """Passing a url should update links to be absolute links""" + doc = OriginalDocument( + load_snippet('document_absolute_url.html'), + url="http://blog.mitechie.com/test.html") + + assert to_unicode(doc).startswith('') + + # find the links on the page and make sure each one starts with out + # base url we told it to use. + links = doc.links + assert len(links) == 3 + # we should have two links that start with our blog url + # and one link that starts with amazon + link_counts = defaultdict(int) + for link in links: + if link.get('href').startswith('http://blog.mitechie.com'): + link_counts['blog'] += 1 + else: + link_counts['other'] += 1 + + assert link_counts['blog'] == 2 + assert link_counts['other'] == 1 + + +def test_no_br_allowed(): + """We convert all
tags to

tags""" + doc = OriginalDocument(load_snippet('document_min.html')) + + assert doc.dom.find('.//br') is None + + +def test_empty_title(): + """We convert all
tags to

tags""" + document = OriginalDocument( + "") + + assert document.title == "" + + +def test_title_only_with_tags(): + """We convert all
tags to

tags""" + document = OriginalDocument( + "<em></em>") + + assert document.title == "" + + +def test_no_title(): + """We convert all
tags to

tags""" + document = OriginalDocument("") + + assert document.title == "" + + +def test_encoding(): + text = "ľščťžýáíéäúňôůě".encode("iso-8859-2") + html = decode_html(text) + + assert type(html) is unicode + + +def test_encoding_short(): + text = to_bytes("ľščťžýáíé") + html = decode_html(text) + + assert type(html) is unicode + assert html == "ľščťžýáíé" diff --git a/tests/test_readable.py b/tests/test_readable.py index 483bb6f..bba0ffb 100644 --- a/tests/test_readable.py +++ b/tests/test_readable.py @@ -1,347 +1,352 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals +import pytest from lxml.etree import tounicode -from lxml.html import document_fromstring -from lxml.html import fragment_fromstring +from lxml.html import document_fromstring, fragment_fromstring + from breadability._compat import to_unicode -from breadability.readable import ( - Article, - get_class_weight, - get_link_density, - is_bad_link, - leaf_div_elements_into_paragraphs, - score_candidates, -) +from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link, + leaf_div_elements_into_paragraphs, score_candidates, ) from breadability.scoring import ScoredNode -from .compat import unittest -from .utils import load_snippet, load_article +from .utils import load_article, load_snippet +# TestReadableDocument +"""Verify we can process html into a document to work off of.""" -class TestReadableDocument(unittest.TestCase): - """Verify we can process html into a document to work off of.""" - def test_load_doc(self): - """We get back an element tree from our original doc""" - doc = Article(load_snippet('document_min.html')) - # We get back the document as a div tag currently by default. - self.assertEqual(doc.readable_dom.tag, 'div') +def test_load_doc(): + """We get back an element tree from our original doc""" + doc = Article(load_snippet('document_min.html')) + # We get back the document as a div tag currently by default. - def test_title_loads(self): - """Verify we can fetch the title of the parsed article""" - doc = Article(load_snippet('document_min.html')) - self.assertEqual( - doc._original_document.title, - 'Min Document Title' - ) + assert doc.readable_dom.tag == 'div' - def test_doc_no_scripts_styles(self): - """Step #1 remove all scripts from the document""" - doc = Article(load_snippet('document_scripts.html')) - readable = doc.readable_dom - self.assertEqual(readable.findall(".//script"), []) - self.assertEqual(readable.findall(".//style"), []) - self.assertEqual(readable.findall(".//link"), []) - - def test_find_body_exists(self): - """If the document has a body, we store that as the readable html - - No sense processing anything other than the body content. - - """ - doc = Article(load_snippet('document_min.html')) - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - - def test_body_doesnt_exist(self): - """If we can't find a body, then we create one. - - We build our doc around the rest of the html we parsed. - - """ - doc = Article(load_snippet('document_no_body.html')) - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - - def test_bare_content(self): - """If the document is just pure content, no html tags we should be ok - - We build our doc around the rest of the html we parsed. - - """ - doc = Article(load_snippet('document_only_content.html')) - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - - def test_no_content(self): - """Without content we supply an empty unparsed doc.""" - doc = Article('') - self.assertEqual(doc.readable_dom.tag, 'div') - self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody') - self.assertEqual(doc.readable_dom.get('class'), 'parsing-error') - - -class TestCleaning(unittest.TestCase): - """Test out our cleaning processing we do.""" - - def test_unlikely_hits(self): - """Verify we wipe out things from our unlikely list.""" - doc = Article(load_snippet('test_readable_unlikely.html')) - readable = doc.readable_dom - must_not_appear = [ - 'comment', 'community', 'disqus', 'extra', 'foot', - 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', - 'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager', - 'popup', 'tweet', 'twitter', 'imgBlogpostPermalink'] - - want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow'] - - for i in must_not_appear: - # we cannot find any class or id with this value - by_class = readable.find_class(i) - - for test in by_class: - # if it's here it cannot have the must not class without the - # want to appear class - found = False - for cls in test.get('class').split(): - if cls in want_to_appear: - found = True - self.assertTrue(found) - - by_ids = readable.get_element_by_id(i, False) - if by_ids is not False: - found = False - for ids in test.get('id').split(): - if ids in want_to_appear: - found = True - self.assertTrue(found) - - def test_misused_divs_transform(self): - """Verify we replace leaf node divs with p's - - They should have the same content, just be a p vs a div - - """ - test_html = "

simple
" - test_doc = document_fromstring(test_html) - self.assertEqual( - tounicode( - leaf_div_elements_into_paragraphs(test_doc)), - to_unicode("

simple

") - ) - test_html2 = ('
simplelink' - '
') - test_doc2 = document_fromstring(test_html2) - self.assertEqual( - tounicode( - leaf_div_elements_into_paragraphs(test_doc2)), - to_unicode( - '

simplelink

') - ) +def test_title_loads(): + """Verify we can fetch the title of the parsed article""" + doc = Article(load_snippet('document_min.html')) - def test_dont_transform_div_with_div(self): - """Verify that only child
element is replaced by

.""" - dom = document_fromstring( - "

text
child
" - "aftertext
" - ) + assert doc._original_document.title == 'Min Document Title' - self.assertEqual( - tounicode( - leaf_div_elements_into_paragraphs(dom)), - to_unicode( - "
text

child

" - "aftertext
" - ) - ) - def test_bad_links(self): - """Some links should just not belong.""" - bad_links = [ - ' ', - 'permalink', - 'permalink' - ] +def test_doc_no_scripts_styles(): + """Step #1 remove all scripts from the document""" + doc = Article(load_snippet('document_scripts.html')) + readable = doc.readable_dom + + assert readable.findall(".//script") == [] + assert readable.findall(".//style") == [] + assert readable.findall(".//link") == [] + + +def test_find_body_exists(): + """If the document has a body, we store that as the readable html + + No sense processing anything other than the body content. + + """ + doc = Article(load_snippet('document_min.html')) + + assert doc.readable_dom.tag == 'div' + assert doc.readable_dom.get('id') == 'readabilityBody' + + +def test_body_doesnt_exist(): + """If we can't find a body, then we create one. + + We build our doc around the rest of the html we parsed. + + """ + doc = Article(load_snippet('document_no_body.html')) + + assert doc.readable_dom.tag == 'div' + assert doc.readable_dom.get('id') == 'readabilityBody' + + +def test_bare_content(): + """If the document is just pure content, no html tags we should be ok + + We build our doc around the rest of the html we parsed. + + """ + doc = Article(load_snippet('document_only_content.html')) + + assert doc.readable_dom.tag == 'div' + assert doc.readable_dom.get('id') == 'readabilityBody' + + +def test_no_content(): + """Without content we supply an empty unparsed doc.""" + doc = Article('') + + assert doc.readable_dom.tag == 'div' + assert doc.readable_dom.get('id') == 'readabilityBody' + assert doc.readable_dom.get('class') == 'parsing-error' + + +# Test out our cleaning processing we do. + + +def test_unlikely_hits(): + """Verify we wipe out things from our unlikely list.""" + doc = Article(load_snippet('test_readable_unlikely.html')) + readable = doc.readable_dom + must_not_appear = [ + 'comment', 'community', 'disqus', 'extra', 'foot', + 'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar', + 'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager', + 'popup', 'tweet', 'twitter', 'imgBlogpostPermalink'] + + want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow'] + + for i in must_not_appear: + # we cannot find any class or id with this value + by_class = readable.find_class(i) + + for test in by_class: + # if it's here it cannot have the must not class without the + # want to appear class + found = False + for cls in test.get('class').split(): + if cls in want_to_appear: + found = True + assert found + + by_ids = readable.get_element_by_id(i, False) + if by_ids is not False: + found = False + for ids in test.get('id').split(): + if ids in want_to_appear: + found = True + assert found + + +def test_misused_divs_transform(): + """Verify we replace leaf node divs with p's + + They should have the same content, just be a p vs a div + + """ + test_html = "
simple
" + test_doc = document_fromstring(test_html) + assert tounicode(leaf_div_elements_into_paragraphs(test_doc)) == to_unicode( + "

simple

" + ) + + test_html2 = ('
simplelink' + '
') + test_doc2 = document_fromstring(test_html2) + assert tounicode(leaf_div_elements_into_paragraphs(test_doc2)) == to_unicode( + '

simplelink

' + ) + + +def test_dont_transform_div_with_div(): + """Verify that only child
element is replaced by

.""" + dom = document_fromstring( + "

text
child
" + "aftertext
" + ) + + assert tounicode(leaf_div_elements_into_paragraphs(dom)) == to_unicode( + "
text

child

" + "aftertext
" + ) + - for l in bad_links: - link = fragment_fromstring(l) - self.assertTrue(is_bad_link(link)) +def test_bad_links(): + """Some links should just not belong.""" + bad_links = [ + ' ', + 'permalink', + 'permalink' + ] + for l in bad_links: + link = fragment_fromstring(l) + assert is_bad_link(link) -class TestCandidateNodes(unittest.TestCase): - """Candidate nodes are scoring containers we use.""" - def test_candidate_scores(self): - """We should be getting back objects with some scores.""" - fives = ['
'] - threes = ['
', '', '
'] - neg_threes = ['
', '
    '] - neg_fives = ['

    ', '

    ', '

    ', '

    '] +# Candidate nodes are scoring containers we use. - for n in fives: - doc = fragment_fromstring(n) - self.assertEqual(ScoredNode(doc).content_score, 5) - for n in threes: - doc = fragment_fromstring(n) - self.assertEqual(ScoredNode(doc).content_score, 3) +def test_candidate_scores(): + """We should be getting back objects with some scores.""" + fives = ['
    '] + threes = ['
    ', '', '
    '] + neg_threes = ['
    ', '
      '] + neg_fives = ['

      ', '

      ', '

      ', '

      '] - for n in neg_threes: - doc = fragment_fromstring(n) - self.assertEqual(ScoredNode(doc).content_score, -3) + for n in fives: + doc = fragment_fromstring(n) + assert ScoredNode(doc).content_score == 5 - for n in neg_fives: - doc = fragment_fromstring(n) - self.assertEqual(ScoredNode(doc).content_score, -5) + for n in threes: + doc = fragment_fromstring(n) + assert ScoredNode(doc).content_score == 3 - def test_article_enables_candidate_access(self): - """Candidates are accessible after document processing.""" - doc = Article(load_article('ars.001.html')) - self.assertTrue(hasattr(doc, 'candidates')) + for n in neg_threes: + doc = fragment_fromstring(n) + assert ScoredNode(doc).content_score == -3 + for n in neg_fives: + doc = fragment_fromstring(n) + assert ScoredNode(doc).content_score == -5 -class TestClassWeights(unittest.TestCase): - """Certain ids and classes get us bonus points.""" - def test_positive_class(self): - """Some classes get us bonus points.""" - node = fragment_fromstring('

      ') - self.assertEqual(get_class_weight(node), 25) +def test_article_enables_candidate_access(): + """Candidates are accessible after document processing.""" + doc = Article(load_article('ars.001.html')) - def test_positive_ids(self): - """Some ids get us bonus points.""" - node = fragment_fromstring('

      ') - self.assertEqual(get_class_weight(node), 25) + assert hasattr(doc, 'candidates') - def test_negative_class(self): - """Some classes get us negative points.""" - node = fragment_fromstring('

      ') - self.assertEqual(get_class_weight(node), -25) - def test_negative_ids(self): - """Some ids get us negative points.""" - node = fragment_fromstring('

      ') - self.assertEqual(get_class_weight(node), -25) +# Certain ids and classes get us bonus points. -class TestScoringNodes(unittest.TestCase): - """We take out list of potential nodes and score them up.""" +def test_positive_class(): + """Some classes get us bonus points.""" + node = fragment_fromstring('

      ') + assert get_class_weight(node) == 25 - def test_we_get_candidates(self): - """Processing candidates should get us a list of nodes to try out.""" - doc = document_fromstring(load_article("ars.001.html")) - test_nodes = tuple(doc.iter("p", "td", "pre")) - candidates = score_candidates(test_nodes) - # this might change as we tweak our algorithm, but if it does, - # it signifies we need to look at what we changed. - self.assertEqual(len(candidates.keys()), 37) +def test_positive_ids(): + """Some ids get us bonus points.""" + node = fragment_fromstring('

      ') + assert get_class_weight(node) == 25 - # one of these should have a decent score - scores = sorted(c.content_score for c in candidates.values()) - self.assertTrue(scores[-1] > 100) - - def test_bonus_score_per_100_chars_in_p(self): - """Nodes get 1 point per 100 characters up to max. 3 points.""" - def build_candidates(length): - html = "

      %s

      " % ("c" * length) - node = fragment_fromstring(html) - - return [node] - - test_nodes = build_candidates(50) - candidates = score_candidates(test_nodes) - pscore_50 = max(c.content_score for c in candidates.values()) - - test_nodes = build_candidates(100) - candidates = score_candidates(test_nodes) - pscore_100 = max(c.content_score for c in candidates.values()) - - test_nodes = build_candidates(300) - candidates = score_candidates(test_nodes) - pscore_300 = max(c.content_score for c in candidates.values()) - - test_nodes = build_candidates(400) - candidates = score_candidates(test_nodes) - pscore_400 = max(c.content_score for c in candidates.values()) - - self.assertAlmostEqual(pscore_50 + 0.5, pscore_100) - self.assertAlmostEqual(pscore_100 + 2.0, pscore_300) - self.assertAlmostEqual(pscore_300, pscore_400) - - -class TestLinkDensityScoring(unittest.TestCase): - """Link density will adjust out candidate scoresself.""" - - def test_link_density(self): - """Test that we get a link density""" - doc = document_fromstring(load_article('ars.001.html')) - for node in doc.iter('p', 'td', 'pre'): - density = get_link_density(node) - - # the density must be between 0, 1 - self.assertTrue(density >= 0.0 and density <= 1.0) - - -class TestSiblings(unittest.TestCase): - """Siblings will be included if their content is related.""" - - @unittest.skip("Not implemented yet.") - def test_bad_siblings_not_counted(self): - raise NotImplementedError() - - @unittest.skip("Not implemented yet.") - def test_good_siblings_counted(self): - raise NotImplementedError() - - -class TestMainText(unittest.TestCase): - def test_empty(self): - article = Article("") - annotated_text = article.main_text - - self.assertEqual(annotated_text, []) - - def test_no_annotations(self): - article = Article("

      This is text with no annotations

      ") - annotated_text = article.main_text - - self.assertEqual(annotated_text, - [(("This is text with no annotations", None),)]) - - def test_one_annotation(self): - article = Article("

      This is text\r\twith no annotations

      ") - annotated_text = article.main_text - - expected = [( - ("This is text\nwith", None), - ("no", ("del",)), - ("annotations", None), - )] - self.assertEqual(annotated_text, expected) - - def test_simple_snippet(self): - snippet = Article(load_snippet("annotated_1.html")) - annotated_text = snippet.main_text - - expected = [ - ( - ("Paragraph is more", None), - ("better", ("em",)), - (".\nThis text is very", None), - ("pretty", ("strong",)), - ("'cause she's girl.", None), - ), - ( - ("This is not", None), - ("crap", ("big",)), - ("so", None), - ("readability", ("dfn",)), - ("me :)", None), - ) - ] - self.assertEqual(annotated_text, expected) + +def test_negative_class(): + """Some classes get us negative points.""" + node = fragment_fromstring('

      ') + assert get_class_weight(node) == -25 + + +def test_negative_ids(): + """Some ids get us negative points.""" + node = fragment_fromstring('

      ') + assert get_class_weight(node) == -25 + + +# We take out list of potential nodes and score them up. + + +def test_we_get_candidates(): + """Processing candidates should get us a list of nodes to try out.""" + doc = document_fromstring(load_article("ars.001.html")) + test_nodes = tuple(doc.iter("p", "td", "pre")) + candidates = score_candidates(test_nodes) + + # this might change as we tweak our algorithm, but if it does, + # it signifies we need to look at what we changed. + assert len(candidates.keys()) == 37 + + # one of these should have a decent score + scores = sorted(c.content_score for c in candidates.values()) + assert scores[-1] > 100 + + +def test_bonus_score_per_100_chars_in_p(): + """Nodes get 1 point per 100 characters up to max. 3 points.""" + def build_candidates(length): + html = "

      %s

      " % ("c" * length) + node = fragment_fromstring(html) + + return [node] + + test_nodes = build_candidates(50) + candidates = score_candidates(test_nodes) + pscore_50 = max(c.content_score for c in candidates.values()) + + test_nodes = build_candidates(100) + candidates = score_candidates(test_nodes) + pscore_100 = max(c.content_score for c in candidates.values()) + + test_nodes = build_candidates(300) + candidates = score_candidates(test_nodes) + pscore_300 = max(c.content_score for c in candidates.values()) + + test_nodes = build_candidates(400) + candidates = score_candidates(test_nodes) + pscore_400 = max(c.content_score for c in candidates.values()) + + assert pscore_50 + 0.5 == pscore_100 + assert pscore_100 + 2.0 == pscore_300 + assert pscore_300 == pscore_400 + + +# Link density will adjust out candidate scoresself. + + +def test_link_density(): + """Test that we get a link density""" + doc = document_fromstring(load_article('ars.001.html')) + for node in doc.iter('p', 'td', 'pre'): + density = get_link_density(node) + + # the density must be between 0, 1 + assert density >= 0.0 and density <= 1.0 + + +# Siblings will be included if their content is related. + + +@pytest.mark.skip("Not implemented yet.") +def test_bad_siblings_not_counted(): + raise NotImplementedError() + + +@pytest.mark.skip("Not implemented yet.") +def test_good_siblings_counted(): + raise NotImplementedError() + + +# TestMainText + +def test_empty(): + article = Article("") + annotated_text = article.main_text + + assert annotated_text == [] + + +def test_no_annotations(): + article = Article("

      This is text with no annotations

      ") + annotated_text = article.main_text + + assert annotated_text == [(("This is text with no annotations", None),)] + + +def test_one_annotation(): + article = Article("

      This is text\r\twith no annotations

      ") + annotated_text = article.main_text + + assert annotated_text == [( + ("This is text\nwith", None), + ("no", ("del",)), + ("annotations", None), + )] + + +def test_simple_snippet(): + snippet = Article(load_snippet("annotated_1.html")) + annotated_text = snippet.main_text + + assert annotated_text == [ + ( + ("Paragraph is more", None), + ("better", ("em",)), + (".\nThis text is very", None), + ("pretty", ("strong",)), + ("'cause she's girl.", None), + ), + ( + ("This is not", None), + ("crap", ("big",)), + ("so", None), + ("readability", ("dfn",)), + ("me :)", None), + ) + ] diff --git a/tests/test_scoring.py b/tests/test_scoring.py index 80d3462..891468b 100644 --- a/tests/test_scoring.py +++ b/tests/test_scoring.py @@ -1,284 +1,295 @@ # -*- coding: utf8 -*- -from __future__ import absolute_import -from __future__ import division, print_function, unicode_literals +from __future__ import absolute_import, division, print_function, unicode_literals import re - from operator import attrgetter -from lxml.html import document_fromstring -from lxml.html import fragment_fromstring -from breadability.readable import Article -from breadability.scoring import ( - check_node_attributes, - generate_hash_id, - get_class_weight, - score_candidates, - ScoredNode, -) -from breadability.readable import ( - get_link_density, - is_unlikely_node, -) -from .compat import unittest + +from lxml.html import document_fromstring, fragment_fromstring + +from breadability.readable import Article, get_link_density, is_unlikely_node +from breadability.scoring import (ScoredNode, check_node_attributes, generate_hash_id, get_class_weight, + score_candidates) from .utils import load_snippet -class TestHashId(unittest.TestCase): - def test_generate_hash(self): - dom = fragment_fromstring("
      ľščťžýáí
      ") - generate_hash_id(dom) +def test_generate_hash(): + dom = fragment_fromstring("
      ľščťžýáí
      ") + generate_hash_id(dom) - def test_hash_from_id_on_exception(self): - generate_hash_id(None) - def test_different_hashes(self): - dom = fragment_fromstring("
      ľščťžýáí
      ") - hash_dom = generate_hash_id(dom) - hash_none = generate_hash_id(None) +def test_hash_from_id_on_exception(): + generate_hash_id(None) - self.assertNotEqual(hash_dom, hash_none) - def test_equal_hashes(self): - dom1 = fragment_fromstring("
      ľščťžýáí
      ") - dom2 = fragment_fromstring("
      ľščťžýáí
      ") - hash_dom1 = generate_hash_id(dom1) - hash_dom2 = generate_hash_id(dom2) - self.assertEqual(hash_dom1, hash_dom2) +def test_different_hashes(): + dom = fragment_fromstring("
      ľščťžýáí
      ") + hash_dom = generate_hash_id(dom) + hash_none = generate_hash_id(None) - hash_none1 = generate_hash_id(None) - hash_none2 = generate_hash_id(None) - self.assertEqual(hash_none1, hash_none2) + assert hash_dom != hash_none -class TestCheckNodeAttr(unittest.TestCase): - """Verify a node has a class/id in the given set. +def test_equal_hashes(): + dom1 = fragment_fromstring("
      ľščťžýáí
      ") + dom2 = fragment_fromstring("
      ľščťžýáí
      ") + hash_dom1 = generate_hash_id(dom1) + hash_dom2 = generate_hash_id(dom2) + assert hash_dom1 == hash_dom2 - The idea is that we have sets of known good/bad ids and classes and need - to verify the given node does/doesn't have those classes/ids. + hash_none1 = generate_hash_id(None) + hash_none2 = generate_hash_id(None) + assert hash_none1 == hash_none2 + + +# Verify a node has a class/id in the given set. +# The idea is that we have sets of known good/bad ids and classes and need +# to verify the given node does/doesn't have those classes/ids. + + +def test_has_class(): + """Verify that a node has a class in our set.""" + test_pattern = re.compile('test1|test2', re.I) + test_node = fragment_fromstring('
      ') + test_node.set('class', 'test2 comment') + + assert check_node_attributes(test_pattern, test_node, 'class') + + +def test_has_id(): + """Verify that a node has an id in our set.""" + test_pattern = re.compile('test1|test2', re.I) + test_node = fragment_fromstring('
      ') + test_node.set('id', 'test2') + + assert check_node_attributes(test_pattern, test_node, 'id') + + +def test_lacks_class(): + """Verify that a node does not have a class in our set.""" + test_pattern = re.compile('test1|test2', re.I) + test_node = fragment_fromstring('
      ') + test_node.set('class', 'test4 comment') + + assert not check_node_attributes(test_pattern, test_node, 'class') + + +def test_lacks_id(): + """Verify that a node does not have an id in our set.""" + test_pattern = re.compile('test1|test2', re.I) + test_node = fragment_fromstring('
      ') + test_node.set('id', 'test4') + + assert not check_node_attributes(test_pattern, test_node, 'id') + + +# Verify we calc our link density correctly. + + +def test_empty_node(): + """An empty node doesn't have much of a link density""" + doc = Article("
      ") + + assert get_link_density(doc.readable_dom) == 0.0 + + +def test_small_doc_no_links(): + doc = Article(load_snippet('document_min.html')) + + assert get_link_density(doc.readable_dom) == 0.0 + + +def test_several_links(): + """This doc has a 3 links with the majority of content.""" + doc = Article(load_snippet('document_absolute_url.html')) + + assert get_link_density(doc.readable_dom) == 22/37 + + +# Verify we score nodes correctly based on their class/id attributes. + + +def test_no_matches_zero(): + """If you don't have the attribute then you get a weight of 0""" + node = fragment_fromstring("
      ") + + assert get_class_weight(node) == 0 - """ - def test_has_class(self): - """Verify that a node has a class in our set.""" - test_pattern = re.compile('test1|test2', re.I) - test_node = fragment_fromstring('
      ') - test_node.set('class', 'test2 comment') - - self.assertTrue( - check_node_attributes(test_pattern, test_node, 'class')) - - def test_has_id(self): - """Verify that a node has an id in our set.""" - test_pattern = re.compile('test1|test2', re.I) - test_node = fragment_fromstring('
      ') - test_node.set('id', 'test2') - - self.assertTrue(check_node_attributes(test_pattern, test_node, 'id')) - - def test_lacks_class(self): - """Verify that a node does not have a class in our set.""" - test_pattern = re.compile('test1|test2', re.I) - test_node = fragment_fromstring('
      ') - test_node.set('class', 'test4 comment') - self.assertFalse( - check_node_attributes(test_pattern, test_node, 'class')) - - def test_lacks_id(self): - """Verify that a node does not have an id in our set.""" - test_pattern = re.compile('test1|test2', re.I) - test_node = fragment_fromstring('
      ') - test_node.set('id', 'test4') - self.assertFalse(check_node_attributes(test_pattern, test_node, 'id')) - - -class TestLinkDensity(unittest.TestCase): - """Verify we calc our link density correctly.""" - - def test_empty_node(self): - """An empty node doesn't have much of a link density""" - doc = Article("
      ") - self.assertEqual(get_link_density(doc.readable_dom), 0.0) - - def test_small_doc_no_links(self): - doc = Article(load_snippet('document_min.html')) - self.assertEqual(get_link_density(doc.readable_dom), 0.0) - - def test_several_links(self): - """This doc has a 3 links with the majority of content.""" - doc = Article(load_snippet('document_absolute_url.html')) - self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/37) - - -class TestClassWeight(unittest.TestCase): - """Verify we score nodes correctly based on their class/id attributes.""" - - def test_no_matches_zero(self): - """If you don't have the attribute then you get a weight of 0""" - node = fragment_fromstring("
      ") - self.assertEqual(get_class_weight(node), 0) - - def test_id_hits(self): - """If the id is in the list then it gets a weight""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), 25) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), -25) - - def test_class_hits(self): - """If the class is in the list then it gets a weight""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), 25) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), -25) - - def test_scores_collide(self): - """We might hit both positive and negative scores. - - Positive and negative scoring is done independently so it's possible - to hit both positive and negative scores and cancel each other out. - - """ - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), 0) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), 25) - - def test_scores_only_once(self): - """Scoring is not cumulative within a class hit.""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertEqual(get_class_weight(node), 25) - - -class TestUnlikelyNode(unittest.TestCase): - """is_unlikely_node should help verify our node is good/bad.""" - - def test_body_is_always_likely(self): - """The body tag is always a likely node.""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertFalse(is_unlikely_node(node)) - - def test_is_unlikely(self): - "Keywords in the class/id will make us believe this is unlikely." - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertTrue(is_unlikely_node(node)) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertTrue(is_unlikely_node(node)) - - def test_not_unlikely(self): - """Suck it double negatives.""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertFalse(is_unlikely_node(node)) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertFalse(is_unlikely_node(node)) - - def test_maybe_hits(self): - """We've got some maybes that will overrule an unlikely node.""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - self.assertFalse(is_unlikely_node(node)) - - -class TestScoredNode(unittest.TestCase): - """ScoredNodes constructed have initial content_scores, etc.""" - - def test_hash_id(self): - """ScoredNodes have a hash_id based on their content - - Since this is based on the html there are chances for collisions, but - it helps us follow and identify nodes through the scoring process. Two - identical nodes would score the same, so meh all good. - - """ - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - snode = ScoredNode(node) - self.assertEqual(snode.hash_id, 'ffa4c519') - - def test_div_content_score(self): - """A div starts out with a score of 5 and modifies from there""" - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - snode = ScoredNode(node) - self.assertEqual(snode.content_score, 5) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - snode = ScoredNode(node) - self.assertEqual(snode.content_score, 30) - - test_div = '
      Content
      ' - node = fragment_fromstring(test_div) - snode = ScoredNode(node) - self.assertEqual(snode.content_score, -20) - - def test_headings_score(self): - """Heading tags aren't likely candidates, hurt their scores.""" - test_div = '

      Heading

      ' - node = fragment_fromstring(test_div) - snode = ScoredNode(node) - self.assertEqual(snode.content_score, -5) - - def test_list_items(self): - """Heading tags aren't likely candidates, hurt their scores.""" - test_div = '
    1. list item
    2. ' - node = fragment_fromstring(test_div) - snode = ScoredNode(node) - self.assertEqual(snode.content_score, -3) - - -class TestScoreCandidates(unittest.TestCase): - """The grand daddy of tests to make sure our scoring works - - Now scoring details will change over time, so the most important thing is - to make sure candidates come out in the right order, not necessarily how - they scored. Make sure to keep this in mind while getting tests going. + +def test_id_hits(): + """If the id is in the list then it gets a weight""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + + assert get_class_weight(node) == 25 + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + + assert get_class_weight(node) == -25 + + +def test_class_hits(): + """If the class is in the list then it gets a weight""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert get_class_weight(node) == 25 + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert get_class_weight(node) == -25 + + +def test_scores_collide(): + """We might hit both positive and negative scores. + + Positive and negative scoring is done independently so it's possible + to hit both positive and negative scores and cancel each other out. """ + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert get_class_weight(node) == 0 + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert get_class_weight(node) == 25 + + +def test_scores_only_once(): + """Scoring is not cumulative within a class hit.""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + + assert get_class_weight(node) == 25 + + +# is_unlikely_node should help verify our node is good/bad. + - def test_simple_candidate_set(self): - """Tests a simple case of two candidate nodes""" - html = """ - - -
      -

      This is a great amount of info

      -

      And more content Home -

      - - - - """ - dom = document_fromstring(html) - div_nodes = dom.findall(".//div") - - candidates = score_candidates(div_nodes) - ordered = sorted( - (c for c in candidates.values()), reverse=True, - key=attrgetter("content_score")) - - self.assertEqual(ordered[0].node.tag, "div") - self.assertEqual(ordered[0].node.attrib["class"], "content") - self.assertEqual(ordered[1].node.tag, "body") - self.assertEqual(ordered[2].node.tag, "html") - self.assertEqual(ordered[3].node.tag, "div") - self.assertEqual(ordered[3].node.attrib["class"], "footer") +def test_body_is_always_likely(): + """The body tag is always a likely node.""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + + assert not is_unlikely_node(node) + + +def test_is_unlikely(): + """Keywords in the class/id will make us believe this is unlikely.""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert is_unlikely_node(node) + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert is_unlikely_node(node) + + +def test_not_unlikely(): + """Suck it double negatives.""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert not is_unlikely_node(node) + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert not is_unlikely_node(node) + + +def test_maybe_hits(): + """We've got some maybes that will overrule an unlikely node.""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + assert not is_unlikely_node(node) + + +# ScoredNodes constructed have initial content_scores, etc. + + +def test_hash_id(): + """ScoredNodes have a hash_id based on their content + + Since this is based on the html there are chances for collisions, but + it helps us follow and identify nodes through the scoring process. Two + identical nodes would score the same, so meh all good. + + """ + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + snode = ScoredNode(node) + + assert snode.hash_id == 'ffa4c519' + + +def test_div_content_score(): + """A div starts out with a score of 5 and modifies from there""" + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + snode = ScoredNode(node) + assert snode.content_score == 5 + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + snode = ScoredNode(node) + assert snode.content_score == 30 + + test_div = '
      Content
      ' + node = fragment_fromstring(test_div) + snode = ScoredNode(node) + assert snode.content_score == -20 + + +def test_headings_score(): + """Heading tags aren't likely candidates, hurt their scores.""" + test_div = '

      Heading

      ' + node = fragment_fromstring(test_div) + snode = ScoredNode(node) + + assert snode.content_score == -5 + + +def test_list_items(): + """Heading tags aren't likely candidates, hurt their scores.""" + test_div = '
    3. list item
    4. ' + node = fragment_fromstring(test_div) + snode = ScoredNode(node) + assert snode.content_score == -3 + + +# The grand daddy of tests to make sure our scoring works +# Now scoring details will change over time, so the most important thing is +# to make sure candidates come out in the right order, not necessarily how +# they scored. Make sure to keep this in mind while getting tests going. + + +def test_simple_candidate_set(): + """Tests a simple case of two candidate nodes""" + html = """ + + +
      +

      This is a great amount of info

      +

      And more content Home +

      + + + + """ + dom = document_fromstring(html) + div_nodes = dom.findall(".//div") + + candidates = score_candidates(div_nodes) + ordered = sorted( + (c for c in candidates.values()), reverse=True, + key=attrgetter("content_score")) + + assert ordered[0].node.tag == "div" + assert ordered[0].node.attrib["class"] == "content" + assert ordered[1].node.tag == "body" + assert ordered[2].node.tag == "html" + assert ordered[3].node.tag == "div" + assert ordered[3].node.attrib["class"] == "footer"