# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

import pytest

from lxml.etree import tounicode
from lxml.html import document_fromstring, fragment_fromstring

from breadability._compat import to_unicode
from breadability.readable import (
    Article,
    get_class_weight,
    get_link_density,
    is_bad_link,
    leaf_div_elements_into_paragraphs,
    score_candidates,
)
from breadability.scoring import ScoredNode

from .utils import load_article, load_snippet


# TestReadableDocument:
# Verify we can process html into a document to work off of.


def test_load_doc():
    """We get back an element tree from our original doc."""
    doc = Article(load_snippet('document_min.html'))
    # We get back the document as a div tag currently by default.
    assert doc.readable_dom.tag == 'div'


def test_title_loads():
    """Verify we can fetch the title of the parsed article."""
    doc = Article(load_snippet('document_min.html'))
    assert doc._original_document.title == 'Min Document Title'


def test_doc_no_scripts_styles():
    """Step #1 remove all scripts from the document."""
    doc = Article(load_snippet('document_scripts.html'))
    readable = doc.readable_dom
    assert readable.findall(".//script") == []
    assert readable.findall(".//style") == []
    assert readable.findall(".//link") == []


def test_find_body_exists():
    """If the document has a body, we store that as the readable html.

    No sense processing anything other than the body content.
    """
    doc = Article(load_snippet('document_min.html'))
    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_body_doesnt_exist():
    """If we can't find a body, then we create one.

    We build our doc around the rest of the html we parsed.
    """
    doc = Article(load_snippet('document_no_body.html'))
    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_bare_content():
    """If the document is just pure content, no html tags, we should be ok.

    We build our doc around the rest of the html we parsed.
    """
    doc = Article(load_snippet('document_only_content.html'))
    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_no_content():
    """Without content we supply an empty unparsed doc."""
    doc = Article('')
    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'
    assert doc.readable_dom.get('class') == 'parsing-error'


# Test out our cleaning processing we do.
def test_unlikely_hits():
    """Verify we wipe out things from our unlikely list."""
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc.readable_dom
    # FIX: the original list contained `'pagination' ''` — accidental implicit
    # string concatenation; the intended entry is just 'pagination'.
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot',
        'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
        'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
        'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']

    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for unlikely in must_not_appear:
        # we cannot find any class or id with this value
        by_class = readable.find_class(unlikely)

        for element in by_class:
            # if it's here it cannot have the must-not class without the
            # want-to-appear class
            found = False
            for cls in element.get('class').split():
                if cls in want_to_appear:
                    found = True
            assert found

        by_ids = readable.get_element_by_id(unlikely, False)
        if by_ids is not False:
            # FIX: the original iterated `test.get('id')`, a stale variable
            # left over from the by_class loop (NameError when that loop never
            # ran). The element located by id is `by_ids`, so check its id.
            found = False
            for ids in by_ids.get('id').split():
                if ids in want_to_appear:
                    found = True
            assert found


def test_misused_divs_transform():
    """Verify we replace leaf node divs with p's.

    They should have the same content, just be a p vs a div.
    """
    # NOTE(review): the html fixture literals in this test were stripped from
    # this copy of the file; reconstructed from the surviving text fragments
    # ('simple', 'link') — confirm against version control.
    test_html = "<html><body><div>simple</div></body></html>"
    test_doc = document_fromstring(test_html)
    assert tounicode(leaf_div_elements_into_paragraphs(test_doc)) == \
        to_unicode("<html><body><p>simple</p></body></html>")

    test_html2 = (
        '<html><body><div>simple<a href="">link</a></div></body></html>')
    test_doc2 = document_fromstring(test_html2)
    assert tounicode(leaf_div_elements_into_paragraphs(test_doc2)) == \
        to_unicode(
            '<html><body><p>simple<a href="">link</a></p></body></html>')


def test_dont_transform_div_with_div():
    """Verify that a div containing a div child is not transformed.

    Only leaf divs (no block-level children) become paragraphs.
    """
    # NOTE(review): fixture markup reconstructed — the surviving fragments
    # were 'child' and "aftertext'"; confirm against version control.
    dom = document_fromstring(
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )
    assert tounicode(leaf_div_elements_into_paragraphs(dom)) == to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )


# Class/id weighting: certain class and id values score a node up or down.
# NOTE(review): the fragment literals in the four tests below were stripped
# from this copy; reconstructed to match breadability's CLS_WEIGHT_POSITIVE /
# CLS_WEIGHT_NEGATIVE patterns and the surviving asserts — confirm in VCS.


def test_positive_class():
    """Some classes get us bonus points."""
    node = fragment_fromstring('<div class="article">Content</div>')
    assert get_class_weight(node) == 25


def test_positive_ids():
    """Some ids get us bonus points."""
    node = fragment_fromstring('<div id="article">Content</div>')
    assert get_class_weight(node) == 25


def test_negative_class():
    """Some classes get us negative points."""
    node = fragment_fromstring('<div class="comment">Content</div>')
    assert get_class_weight(node) == -25


def test_negative_ids():
    """Some ids get us negative points."""
    node = fragment_fromstring('<div id="comment">Content</div>')
    assert get_class_weight(node) == -25


# We take our list of potential nodes and score them up.
def test_we_get_candidates():
    """Processing candidates should get us a list of nodes to try out."""
    doc = document_fromstring(load_article("ars.001.html"))
    test_nodes = tuple(doc.iter("p", "td", "pre"))
    candidates = score_candidates(test_nodes)

    # this might change as we tweak our algorithm, but if it does,
    # it signifies we need to look at what we changed.
    assert len(candidates.keys()) == 37

    # one of these should have a decent score
    scores = sorted(c.content_score for c in candidates.values())
    assert scores[-1] > 100


def test_bonus_score_per_100_chars_in_p():
    """Nodes get 1 point per 100 characters up to max. 3 points."""
    def build_candidates(length):
        # Build a single <p> node whose text is `length` characters long.
        # NOTE(review): the '<p>%s</p>' wrapper was stripped from this copy;
        # reconstructed — only '%s' survived. Confirm against VCS.
        html = "<p>%s</p>" % ("c" * length)
        node = fragment_fromstring(html)
        return [node]

    test_nodes = build_candidates(50)
    candidates = score_candidates(test_nodes)
    pscore_50 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(100)
    candidates = score_candidates(test_nodes)
    pscore_100 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(300)
    candidates = score_candidates(test_nodes)
    pscore_300 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(400)
    candidates = score_candidates(test_nodes)
    pscore_400 = max(c.content_score for c in candidates.values())

    # +0.5 for crossing 100 chars, +2.0 more by 300, capped after 300.
    assert pscore_50 + 0.5 == pscore_100
    assert pscore_100 + 2.0 == pscore_300
    assert pscore_300 == pscore_400


# Link density will adjust our candidate scores.
def test_link_density():
    """Test that we get a link density."""
    doc = document_fromstring(load_article('ars.001.html'))
    for node in doc.iter('p', 'td', 'pre'):
        density = get_link_density(node)
        # the density must be between 0 and 1
        assert density >= 0.0 and density <= 1.0


# Siblings will be included if their content is related.
@pytest.mark.skip("Not implemented yet.")
def test_bad_siblings_not_counted():
    raise NotImplementedError()


@pytest.mark.skip("Not implemented yet.")
def test_good_siblings_counted():
    raise NotImplementedError()


# TestMainText


def test_empty():
    article = Article("")
    annotated_text = article.main_text
    assert annotated_text == []


def test_no_annotations():
    # NOTE(review): this test was truncated in this copy of the file — the
    # fixture markup and the final assertion were lost. Reconstructed from the
    # surviving text ('This is text\r\twith no annotations'); the \r\t inside
    # a paragraph is expected to normalize to a newline in the annotated
    # output. TODO: confirm fixture and expected value against VCS.
    article = Article(
        "<div><p>This is text\r\twith no annotations</p></div>")
    annotated_text = article.main_text
    assert annotated_text == [(("This is text\nwith no annotations", None),)]