Tests migrated into pytest style
parent 48acf389b1
commit aa83825334
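Every test module below follows the same migration pattern: the unittest.TestCase class with its setUp/tearDown is replaced by a module-scoped pytest fixture that loads the article once, test methods become plain functions that take the fixture as an argument, and self.assertTrue/self.assertEqual/self.assertIn calls become bare assert statements. A minimal before/after sketch of that pattern (the class name and layout here are illustrative only, not taken from any single file in the diff):

# Before: unittest style
import os
import unittest

from breadability.readable import Article


class TestSomeArticle(unittest.TestCase):  # illustrative name, not from the diff
    def setUp(self):
        path = os.path.join(os.path.dirname(__file__), 'article.html')
        self.article = open(path).read()

    def test_parses(self):
        doc = Article(self.article)
        self.assertTrue('id="readabilityBody"' in doc.readable)


# After: pytest style
import pytest


@pytest.fixture(scope="module")
def article():
    # load the fixture file once per module instead of once per test
    path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(path) as file:
        return file.read()


def test_parses(article):
    doc = Article(article)
    assert 'id="readabilityBody"' in doc.readable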
@@ -1,42 +1,45 @@
# -*- coding: utf8 -*-

"""Test the scoring and parsing of the Blog Post"""

from __future__ import absolute_import, division, print_function, unicode_literals

import os

import pytest

from breadability.readable import Article


@pytest.fixture(scope="module")
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path) as file:
        return file.read()


def test_parses(article):
    """Verify we can parse the document."""
    doc = Article(article)

    assert 'id="readabilityBody"' in doc.readable


def test_comments_cleaned(article):
    """The div with the comments should be removed."""
    doc = Article(article)

    assert 'class="comments"' not in doc.readable


def test_beta_removed(article):
    """The id=beta element should be removed

    It's link heavy and causing a lot of garbage content. This should be
    removed.
    """
    doc = Article(article)

    assert 'id="beta"' not in doc.readable
@@ -1,33 +1,34 @@
# -*- coding: utf8 -*-

"""Test the scoring and parsing of the Blog Post"""

from __future__ import absolute_import, division, print_function, unicode_literals

import os

import pytest

from breadability.readable import Article


@pytest.fixture(scope="module")
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path) as file:
        return file.read()


def test_parses(article):
    """Verify we can parse the document."""
    doc = Article(article)

    assert 'id="readabilityBody"' in doc.readable


def test_images_preserved(article):
    """The div with the comments should be removed."""
    doc = Article(article)

    assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in doc.readable
    assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in doc.readable
@@ -1,39 +1,33 @@
# -*- coding: utf8 -*-

"""
Test the scoring and parsing of the article from URL below:
http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import os

import pytest

from breadability.readable import Article


@pytest.fixture(scope="module")
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path, "rb") as file:
        return Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")


def test_parses(article):
    """Verify we can parse the document."""
    assert 'id="readabilityBody"' in article.readable


def test_images_preserved(article):
    """The div with the comments should be removed."""
    assert 'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg' in article.readable
    assert 'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg' in article.readable
@@ -1,74 +1,64 @@
# -*- coding: utf8 -*-

"""Test the scoring and parsing of the Article"""

from __future__ import absolute_import, division, print_function, unicode_literals

import os
from operator import attrgetter

import pytest

from breadability.readable import Article, check_siblings, prep_article


@pytest.fixture(scope="module")
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path) as file:
        return Article(file.read())


def test_parses(article):
    """Verify we can parse the document."""
    assert 'id="readabilityBody"' in article.readable


def test_content_exists(article):
    """Verify that some content exists."""
    assert 'Amazon and Google' in article.readable
    assert not 'Linkblog updated' in article.readable
    assert not '#anExampleGoogleDoesntIntendToShareBlogAndItWill' in article.readable


@pytest.mark.skip("Test fails because of some weird hash.")
def test_candidates(article):
    """Verify we have candidates."""
    # from lxml.etree import tounicode
    found = False
    wanted_hash = '04e46055'

    for node in article.candidates.values():
        if node.hash_id == wanted_hash:
            found = node

    assert found

    # we have the right node, it must be deleted for some reason if it's
    # not still there when we need it to be.
    # Make sure it's not in our to drop list.
    for node in article._should_drop:
        assert node != found.node

    by_score = sorted(
        [c for c in article.candidates.values()],
        key=attrgetter('content_score'), reverse=True)
    assert by_score[0].node == found.node

    updated_winner = check_siblings(by_score[0], article.candidates)
    updated_winner.node = prep_article(updated_winner.node)

    # This article hits up against the img > p conditional filtering
    # because of the many .gif images in the content. We've removed that
    # rule.
@@ -1,33 +1,32 @@
# -*- coding: utf8 -*-

"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import os

import pytest

from breadability.readable import Article


@pytest.fixture(scope="module")
def article():
    """Load up the article for us"""
    article_path = os.path.join(os.path.dirname(__file__), 'article.html')
    with open(article_path, "rb") as file:
        return Article(file.read(), "http://sweetshark.livejournal.com/11564.html")


def test_parses(article):
    """Verify we can parse the document."""
    assert 'id="readabilityBody"' in article.readable


def test_content_after_video(article):
    """The div with the comments should be removed."""
    assert 'Stay hungry, Stay foolish' in article.readable
@@ -1,347 +1,352 @@
# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

import pytest
from lxml.etree import tounicode
from lxml.html import document_fromstring, fragment_fromstring

from breadability._compat import to_unicode
from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link,
                                   leaf_div_elements_into_paragraphs, score_candidates, )
from breadability.scoring import ScoredNode
from .utils import load_article, load_snippet


# TestReadableDocument
"""Verify we can process html into a document to work off of."""


def test_load_doc():
    """We get back an element tree from our original doc"""
    doc = Article(load_snippet('document_min.html'))

    # We get back the document as a div tag currently by default.
    assert doc.readable_dom.tag == 'div'


def test_title_loads():
    """Verify we can fetch the title of the parsed article"""
    doc = Article(load_snippet('document_min.html'))

    assert doc._original_document.title == 'Min Document Title'


def test_doc_no_scripts_styles():
    """Step #1 remove all scripts from the document"""
    doc = Article(load_snippet('document_scripts.html'))
    readable = doc.readable_dom

    assert readable.findall(".//script") == []
    assert readable.findall(".//style") == []
    assert readable.findall(".//link") == []


def test_find_body_exists():
    """If the document has a body, we store that as the readable html

    No sense processing anything other than the body content.

    """
    doc = Article(load_snippet('document_min.html'))

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_body_doesnt_exist():
    """If we can't find a body, then we create one.

    We build our doc around the rest of the html we parsed.

    """
    doc = Article(load_snippet('document_no_body.html'))

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_bare_content():
    """If the document is just pure content, no html tags we should be ok

    We build our doc around the rest of the html we parsed.

    """
    doc = Article(load_snippet('document_only_content.html'))

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_no_content():
    """Without content we supply an empty unparsed doc."""
    doc = Article('')

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'
    assert doc.readable_dom.get('class') == 'parsing-error'


# Test out our cleaning processing we do.


def test_unlikely_hits():
    """Verify we wipe out things from our unlikely list."""
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc.readable_dom
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot',
        'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
        'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
        'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']

    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for i in must_not_appear:
        # we cannot find any class or id with this value
        by_class = readable.find_class(i)

        for test in by_class:
            # if it's here it cannot have the must not class without the
            # want to appear class
            found = False
            for cls in test.get('class').split():
                if cls in want_to_appear:
                    found = True
            assert found

        by_ids = readable.get_element_by_id(i, False)
        if by_ids is not False:
            found = False
            for ids in test.get('id').split():
                if ids in want_to_appear:
                    found = True
            assert found


def test_misused_divs_transform():
    """Verify we replace leaf node divs with p's

    They should have the same content, just be a p vs a div

    """
    test_html = "<html><body><div>simple</div></body></html>"
    test_doc = document_fromstring(test_html)
    assert tounicode(leaf_div_elements_into_paragraphs(test_doc)) == to_unicode(
        "<html><body><p>simple</p></body></html>"
    )

    test_html2 = ('<html><body><div>simple<a href="">link</a>'
                  '</div></body></html>')
    test_doc2 = document_fromstring(test_html2)
    assert tounicode(leaf_div_elements_into_paragraphs(test_doc2)) == to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>'
    )


def test_dont_transform_div_with_div():
    """Verify that only child <div> element is replaced by <p>."""
    dom = document_fromstring(
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )

    assert tounicode(leaf_div_elements_into_paragraphs(dom)) == to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )


def test_bad_links():
    """Some links should just not belong."""
    bad_links = [
        '<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"> </a>',
        '<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
        '<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
    ]

    for l in bad_links:
        link = fragment_fromstring(l)
        assert is_bad_link(link)


# Candidate nodes are scoring containers we use.


def test_candidate_scores():
    """We should be getting back objects with some scores."""
    fives = ['<div/>']
    threes = ['<pre/>', '<td/>', '<blockquote/>']
    neg_threes = ['<address/>', '<ol/>']
    neg_fives = ['<h1/>', '<h2/>', '<h3/>', '<h4/>']

    for n in fives:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == 5

    for n in threes:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == 3

    for n in neg_threes:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == -3

    for n in neg_fives:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == -5


def test_article_enables_candidate_access():
    """Candidates are accessible after document processing."""
    doc = Article(load_article('ars.001.html'))

    assert hasattr(doc, 'candidates')


# Certain ids and classes get us bonus points.


def test_positive_class():
    """Some classes get us bonus points."""
    node = fragment_fromstring('<p class="article">')
    assert get_class_weight(node) == 25


def test_positive_ids():
    """Some ids get us bonus points."""
    node = fragment_fromstring('<p id="content">')
    assert get_class_weight(node) == 25


def test_negative_class():
    """Some classes get us negative points."""
    node = fragment_fromstring('<p class="comment">')
    assert get_class_weight(node) == -25


def test_negative_ids():
    """Some ids get us negative points."""
    node = fragment_fromstring('<p id="media">')
    assert get_class_weight(node) == -25


# We take out list of potential nodes and score them up.


def test_we_get_candidates():
    """Processing candidates should get us a list of nodes to try out."""
    doc = document_fromstring(load_article("ars.001.html"))
    test_nodes = tuple(doc.iter("p", "td", "pre"))
    candidates = score_candidates(test_nodes)

    # this might change as we tweak our algorithm, but if it does,
    # it signifies we need to look at what we changed.
    assert len(candidates.keys()) == 37

    # one of these should have a decent score
    scores = sorted(c.content_score for c in candidates.values())
    assert scores[-1] > 100


def test_bonus_score_per_100_chars_in_p():
    """Nodes get 1 point per 100 characters up to max. 3 points."""
    def build_candidates(length):
        html = "<p>%s</p>" % ("c" * length)
        node = fragment_fromstring(html)

        return [node]

    test_nodes = build_candidates(50)
    candidates = score_candidates(test_nodes)
    pscore_50 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(100)
    candidates = score_candidates(test_nodes)
    pscore_100 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(300)
    candidates = score_candidates(test_nodes)
    pscore_300 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(400)
    candidates = score_candidates(test_nodes)
    pscore_400 = max(c.content_score for c in candidates.values())

    assert pscore_50 + 0.5 == pscore_100
    assert pscore_100 + 2.0 == pscore_300
    assert pscore_300 == pscore_400


# Link density will adjust out candidate scoresself.


def test_link_density():
    """Test that we get a link density"""
    doc = document_fromstring(load_article('ars.001.html'))
    for node in doc.iter('p', 'td', 'pre'):
        density = get_link_density(node)

        # the density must be between 0, 1
        assert density >= 0.0 and density <= 1.0


# Siblings will be included if their content is related.


@pytest.mark.skip("Not implemented yet.")
def test_bad_siblings_not_counted():
    raise NotImplementedError()


@pytest.mark.skip("Not implemented yet.")
def test_good_siblings_counted():
    raise NotImplementedError()


# TestMainText


def test_empty():
    article = Article("")
    annotated_text = article.main_text

    assert annotated_text == []


def test_no_annotations():
    article = Article("<div><p>This is text with no annotations</p></div>")
    annotated_text = article.main_text

    assert annotated_text == [(("This is text with no annotations", None),)]


def test_one_annotation():
    article = Article("<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
    annotated_text = article.main_text

    assert annotated_text == [(
        ("This is text\nwith", None),
        ("no", ("del",)),
        ("annotations", None),
    )]


def test_simple_snippet():
    snippet = Article(load_snippet("annotated_1.html"))
    annotated_text = snippet.main_text

    assert annotated_text == [
        (
            ("Paragraph is more", None),
            ("better", ("em",)),
            (".\nThis text is very", None),
            ("pretty", ("strong",)),
            ("'cause she's girl.", None),
        ),
        (
            ("This is not", None),
            ("crap", ("big",)),
            ("so", None),
            ("readability", ("dfn",)),
            ("me :)", None),
        )
    ]
@@ -1,284 +1,295 @@
# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

import re

from operator import attrgetter
from lxml.html import document_fromstring, fragment_fromstring

from breadability.readable import Article, get_link_density, is_unlikely_node
from breadability.scoring import (ScoredNode, check_node_attributes, generate_hash_id, get_class_weight,
                                  score_candidates)
from .utils import load_snippet


def test_generate_hash():
    dom = fragment_fromstring("<div>ľščťžýáí</div>")
    generate_hash_id(dom)


def test_hash_from_id_on_exception():
    generate_hash_id(None)


def test_different_hashes():
    dom = fragment_fromstring("<div>ľščťžýáí</div>")
    hash_dom = generate_hash_id(dom)
    hash_none = generate_hash_id(None)

    assert hash_dom != hash_none


def test_equal_hashes():
    dom1 = fragment_fromstring("<div>ľščťžýáí</div>")
    dom2 = fragment_fromstring("<div>ľščťžýáí</div>")
    hash_dom1 = generate_hash_id(dom1)
    hash_dom2 = generate_hash_id(dom2)
    assert hash_dom1 == hash_dom2

    hash_none1 = generate_hash_id(None)
    hash_none2 = generate_hash_id(None)
    assert hash_none1 == hash_none2


# Verify a node has a class/id in the given set.
# The idea is that we have sets of known good/bad ids and classes and need
# to verify the given node does/doesn't have those classes/ids.


def test_has_class():
    """Verify that a node has a class in our set."""
    test_pattern = re.compile('test1|test2', re.I)
    test_node = fragment_fromstring('<div/>')
    test_node.set('class', 'test2 comment')

    assert check_node_attributes(test_pattern, test_node, 'class')


def test_has_id():
    """Verify that a node has an id in our set."""
    test_pattern = re.compile('test1|test2', re.I)
    test_node = fragment_fromstring('<div/>')
    test_node.set('id', 'test2')

    assert check_node_attributes(test_pattern, test_node, 'id')


def test_lacks_class():
    """Verify that a node does not have a class in our set."""
    test_pattern = re.compile('test1|test2', re.I)
    test_node = fragment_fromstring('<div/>')
    test_node.set('class', 'test4 comment')

    assert not check_node_attributes(test_pattern, test_node, 'class')


def test_lacks_id():
    """Verify that a node does not have an id in our set."""
    test_pattern = re.compile('test1|test2', re.I)
    test_node = fragment_fromstring('<div/>')
    test_node.set('id', 'test4')

    assert not check_node_attributes(test_pattern, test_node, 'id')


# Verify we calc our link density correctly.


def test_empty_node():
    """An empty node doesn't have much of a link density"""
    doc = Article("<div></div>")

    assert get_link_density(doc.readable_dom) == 0.0


def test_small_doc_no_links():
    doc = Article(load_snippet('document_min.html'))

    assert get_link_density(doc.readable_dom) == 0.0


def test_several_links():
    """This doc has a 3 links with the majority of content."""
    doc = Article(load_snippet('document_absolute_url.html'))

    assert get_link_density(doc.readable_dom) == 22/37


# Verify we score nodes correctly based on their class/id attributes.


def test_no_matches_zero():
    """If you don't have the attribute then you get a weight of 0"""
    node = fragment_fromstring("<div></div>")

    assert get_class_weight(node) == 0


def test_id_hits():
    """If the id is in the list then it gets a weight"""
    test_div = '<div id="post">Content</div>'
    node = fragment_fromstring(test_div)

    assert get_class_weight(node) == 25

    test_div = '<div id="comments">Content</div>'
    node = fragment_fromstring(test_div)

    assert get_class_weight(node) == -25


def test_class_hits():
    """If the class is in the list then it gets a weight"""
    test_div = '<div class="something post">Content</div>'
    node = fragment_fromstring(test_div)
    assert get_class_weight(node) == 25

    test_div = '<div class="something comments">Content</div>'
    node = fragment_fromstring(test_div)
    assert get_class_weight(node) == -25


def test_scores_collide():
    """We might hit both positive and negative scores.

    Positive and negative scoring is done independently so it's possible
    to hit both positive and negative scores and cancel each other out.

    """
    test_div = '<div id="post" class="something comment">Content</div>'
    node = fragment_fromstring(test_div)
    assert get_class_weight(node) == 0

    test_div = '<div id="post" class="post comment">Content</div>'
    node = fragment_fromstring(test_div)
    assert get_class_weight(node) == 25


def test_scores_only_once():
    """Scoring is not cumulative within a class hit."""
    test_div = '<div class="post main">Content</div>'
    node = fragment_fromstring(test_div)

    assert get_class_weight(node) == 25


# is_unlikely_node should help verify our node is good/bad.


def test_body_is_always_likely():
    """The body tag is always a likely node."""
    test_div = '<body class="comment"><div>Content</div></body>'
    node = fragment_fromstring(test_div)

    assert not is_unlikely_node(node)


def test_is_unlikely():
    """Keywords in the class/id will make us believe this is unlikely."""
    test_div = '<div class="something comments">Content</div>'
    node = fragment_fromstring(test_div)
    assert is_unlikely_node(node)

    test_div = '<div id="comments">Content</div>'
    node = fragment_fromstring(test_div)
    assert is_unlikely_node(node)


def test_not_unlikely():
    """Suck it double negatives."""
    test_div = '<div id="post">Content</div>'
    node = fragment_fromstring(test_div)
    assert not is_unlikely_node(node)

    test_div = '<div class="something post">Content</div>'
    node = fragment_fromstring(test_div)
    assert not is_unlikely_node(node)


def test_maybe_hits():
    """We've got some maybes that will overrule an unlikely node."""
    test_div = '<div id="comments" class="article">Content</div>'
    node = fragment_fromstring(test_div)
    assert not is_unlikely_node(node)


# ScoredNodes constructed have initial content_scores, etc.


def test_hash_id():
    """ScoredNodes have a hash_id based on their content

    Since this is based on the html there are chances for collisions, but
    it helps us follow and identify nodes through the scoring process. Two
    identical nodes would score the same, so meh all good.

    """
    test_div = '<div id="comments" class="article">Content</div>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)

    assert snode.hash_id == 'ffa4c519'


def test_div_content_score():
    """A div starts out with a score of 5 and modifies from there"""
    test_div = '<div id="" class="">Content</div>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)
    assert snode.content_score == 5

    test_div = '<div id="article" class="">Content</div>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)
    assert snode.content_score == 30

    test_div = '<div id="comments" class="">Content</div>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)
    assert snode.content_score == -20


def test_headings_score():
    """Heading tags aren't likely candidates, hurt their scores."""
    test_div = '<h2>Heading</h2>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)

    assert snode.content_score == -5


def test_list_items():
    """Heading tags aren't likely candidates, hurt their scores."""
    test_div = '<li>list item</li>'
    node = fragment_fromstring(test_div)
    snode = ScoredNode(node)
    assert snode.content_score == -3


# The grand daddy of tests to make sure our scoring works
# Now scoring details will change over time, so the most important thing is
# to make sure candidates come out in the right order, not necessarily how
# they scored. Make sure to keep this in mind while getting tests going.


def test_simple_candidate_set():
    """Tests a simple case of two candidate nodes"""
    html = """
        <html>
        <body>
            <div class="content">
                <p>This is a great amount of info</p>
                <p>And more content <a href="/index">Home</a>
            </div>
            <div class="footer">
                <p>This is a footer</p>
                <p>And more content <a href="/index">Home</a>
            </div>
        </body>
        </html>
    """
    dom = document_fromstring(html)
    div_nodes = dom.findall(".//div")

    candidates = score_candidates(div_nodes)
    ordered = sorted(
        (c for c in candidates.values()), reverse=True,
        key=attrgetter("content_score"))

    assert ordered[0].node.tag == "div"
    assert ordered[0].node.attrib["class"] == "content"
    assert ordered[1].node.tag == "body"
    assert ordered[2].node.tag == "html"
    assert ordered[3].node.tag == "div"
    assert ordered[3].node.attrib["class"] == "footer"