breadability/tests/test_readable.py

# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

import pytest
from lxml.etree import tounicode
from lxml.html import document_fromstring, fragment_fromstring

from breadability._compat import to_unicode
from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link,
                                   leaf_div_elements_into_paragraphs, score_candidates, )
from breadability.scoring import ScoredNode
from .utils import load_article, load_snippet

# TestReadableDocument
"""Verify we can process html into a document to work off of."""


def test_load_doc():
    """We get back an element tree from our original doc"""
    doc = Article(load_snippet('document_min.html'))
    # We get back the document as a div tag currently by default.

    assert doc.readable_dom.tag == 'div'


def test_title_loads():
    """Verify we can fetch the title of the parsed article"""
    doc = Article(load_snippet('document_min.html'))

    assert doc._original_document.title == 'Min Document Title'


def test_doc_no_scripts_styles():
    """Step #1 remove all scripts from the document"""
    doc = Article(load_snippet('document_scripts.html'))
    readable = doc.readable_dom

    assert readable.findall(".//script") == []
    assert readable.findall(".//style") == []
    assert readable.findall(".//link") == []


def test_find_body_exists():
    """If the document has a body, we store that as the readable html

    No sense processing anything other than the body content.

    """
    doc = Article(load_snippet('document_min.html'))

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_body_doesnt_exist():
    """If we can't find a body, then we create one.

    We build our doc around the rest of the html we parsed.

    """
    doc = Article(load_snippet('document_no_body.html'))

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_bare_content():
    """If the document is just pure content, no html tags we should be ok

    We build our doc around the rest of the html we parsed.

    """
    doc = Article(load_snippet('document_only_content.html'))

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'


def test_no_content():
    """Without content we supply an empty unparsed doc."""
    doc = Article('')

    assert doc.readable_dom.tag == 'div'
    assert doc.readable_dom.get('id') == 'readabilityBody'
    assert doc.readable_dom.get('class') == 'parsing-error'


# Test out our cleaning processing we do.


def test_unlikely_hits():
    """Verify we wipe out things from our unlikely list."""
    doc = Article(load_snippet('test_readable_unlikely.html'))
    readable = doc.readable_dom
    must_not_appear = [
        'comment', 'community', 'disqus', 'extra', 'foot',
        'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
        'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
        'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']

    want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']

    for i in must_not_appear:
        # we cannot find any class or id with this value
        by_class = readable.find_class(i)

        for test in by_class:
            # if it's here it cannot have the must not class without the
            # want to appear class
            found = False
            for cls in test.get('class').split():
                if cls in want_to_appear:
                    found = True
            assert found

        by_ids = readable.get_element_by_id(i, False)
        if by_ids is not False:
            found = False
            for ids in test.get('id').split():
                if ids in want_to_appear:
                    found = True
            assert found


def test_misused_divs_transform():
    """Verify we replace leaf node divs with p's

    They should have the same content, just be a p vs a div

    """
    test_html = "<html><body><div>simple</div></body></html>"
    test_doc = document_fromstring(test_html)
    assert tounicode(leaf_div_elements_into_paragraphs(test_doc)) == to_unicode(
        "<html><body><p>simple</p></body></html>"
    )

    test_html2 = ('<html><body><div>simple<a href="">link</a>'
                  '</div></body></html>')
    test_doc2 = document_fromstring(test_html2)
    assert tounicode(leaf_div_elements_into_paragraphs(test_doc2)) == to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>'
    )


def test_dont_transform_div_with_div():
    """Verify that only child <div> element is replaced by <p>."""
    dom = document_fromstring(
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )

    assert tounicode(leaf_div_elements_into_paragraphs(dom)) == to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )


def test_bad_links():
    """Some links should just not belong."""
    bad_links = [
        '<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
        '<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
        '<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
    ]

    for l in bad_links:
        link = fragment_fromstring(l)
        assert is_bad_link(link)


# Candidate nodes are scoring containers we use.


def test_candidate_scores():
    """We should be getting back objects with some scores."""
    fives = ['<div/>']
    threes = ['<pre/>', '<td/>', '<blockquote/>']
    neg_threes = ['<address/>', '<ol/>']
    neg_fives = ['<h1/>', '<h2/>', '<h3/>', '<h4/>']

    for n in fives:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == 5

    for n in threes:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == 3

    for n in neg_threes:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == -3

    for n in neg_fives:
        doc = fragment_fromstring(n)
        assert ScoredNode(doc).content_score == -5


def test_article_enables_candidate_access():
    """Candidates are accessible after document processing."""
    doc = Article(load_article('ars.001.html'))

    assert hasattr(doc, 'candidates')


# Certain ids and classes get us bonus points.


def test_positive_class():
    """Some classes get us bonus points."""
    node = fragment_fromstring('<p class="article">')
    assert get_class_weight(node) == 25


def test_positive_ids():
    """Some ids get us bonus points."""
    node = fragment_fromstring('<p id="content">')
    assert get_class_weight(node) == 25


def test_negative_class():
    """Some classes get us negative points."""
    node = fragment_fromstring('<p class="comment">')
    assert get_class_weight(node) == -25


def test_negative_ids():
    """Some ids get us negative points."""
    node = fragment_fromstring('<p id="media">')
    assert get_class_weight(node) == -25


# We take out list of potential nodes and score them up.


def test_we_get_candidates():
    """Processing candidates should get us a list of nodes to try out."""
    doc = document_fromstring(load_article("ars.001.html"))
    test_nodes = tuple(doc.iter("p", "td", "pre"))
    candidates = score_candidates(test_nodes)

    # this might change as we tweak our algorithm, but if it does,
    # it signifies we need to look at what we changed.
    assert len(candidates.keys()) == 37

    # one of these should have a decent score
    scores = sorted(c.content_score for c in candidates.values())
    assert scores[-1] > 100


def test_bonus_score_per_100_chars_in_p():
    """Nodes get 1 point per 100 characters up to max. 3 points."""
    def build_candidates(length):
        html = "<p>%s</p>" % ("c" * length)
        node = fragment_fromstring(html)

        return [node]

    test_nodes = build_candidates(50)
    candidates = score_candidates(test_nodes)
    pscore_50 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(100)
    candidates = score_candidates(test_nodes)
    pscore_100 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(300)
    candidates = score_candidates(test_nodes)
    pscore_300 = max(c.content_score for c in candidates.values())

    test_nodes = build_candidates(400)
    candidates = score_candidates(test_nodes)
    pscore_400 = max(c.content_score for c in candidates.values())

    assert pscore_50 + 0.5 == pscore_100
    assert pscore_100 + 2.0 == pscore_300
    assert pscore_300 == pscore_400


# Link density will adjust out candidate scoresself.


def test_link_density():
    """Test that we get a link density"""
    doc = document_fromstring(load_article('ars.001.html'))
    for node in doc.iter('p', 'td', 'pre'):
        density = get_link_density(node)

        # the density must be between 0, 1
        assert density >= 0.0 and density <= 1.0


# Siblings will be included if their content is related.


@pytest.mark.skip("Not implemented yet.")
def test_bad_siblings_not_counted():
    raise NotImplementedError()


@pytest.mark.skip("Not implemented yet.")
def test_good_siblings_counted():
    raise NotImplementedError()


# TestMainText

def test_empty():
    article = Article("")
    annotated_text = article.main_text

    assert annotated_text == []


def test_no_annotations():
    article = Article("<div><p>This is text with no annotations</p></div>")
    annotated_text = article.main_text

    assert annotated_text == [(("This is text with no annotations", None),)]


def test_one_annotation():
    article = Article("<div><p>This is text\r\twith <del>no</del> annotations</p></div>")
    annotated_text = article.main_text

    assert annotated_text == [(
        ("This is text\nwith", None),
        ("no", ("del",)),
        ("annotations", None),
    )]


def test_simple_snippet():
    snippet = Article(load_snippet("annotated_1.html"))
    annotated_text = snippet.main_text

    assert annotated_text == [
        (
            ("Paragraph is more", None),
            ("better", ("em",)),
            (".\nThis text is very", None),
            ("pretty", ("strong",)),
            ("'cause she's girl.", None),
        ),
        (
            ("This is not", None),
            ("crap", ("big",)),
            ("so", None),
            ("readability", ("dfn",)),
            ("me :)", None),
        )
    ]
Added compatibility file with unittest2 import 12 years ago			`# -- coding: utf8 --`
Make package from tests 12 years ago
Tests migrated into pytest style 7 years ago			`from __future__ import absolute_import, division, print_function, unicode_literals`
Make package from tests 12 years ago
Tests migrated into pytest style 7 years ago			`import pytest`
Add processing of content per the algorithm with some base tests 13 years ago			`from lxml.etree import tounicode`
Tests migrated into pytest style 7 years ago			`from lxml.html import document_fromstring, fragment_fromstring`

Rename it back. Respect others 11 years ago			`from breadability._compat import to_unicode`
Tests migrated into pytest style 7 years ago			`from breadability.readable import (Article, get_class_weight, get_link_density, is_bad_link,`
			`leaf_div_elements_into_paragraphs, score_candidates, )`
Rename it back. Respect others 11 years ago			`from breadability.scoring import ScoredNode`
Tests migrated into pytest style 7 years ago			`from .utils import load_article, load_snippet`
Start to add some processing for the readable contnet - Add removal of style, script, etc bits in the content 13 years ago
Tests migrated into pytest style 7 years ago			`# TestReadableDocument`
			`"""Verify we can process html into a document to work off of."""`
Start to add some processing for the readable contnet - Add removal of style, script, etc bits in the content 13 years ago

Tests migrated into pytest style 7 years ago			`def test_load_doc():`
			`"""We get back an element tree from our original doc"""`
			`doc = Article(load_snippet('document_min.html'))`
			`# We get back the document as a div tag currently by default.`
Start to add some processing for the readable contnet - Add removal of style, script, etc bits in the content 13 years ago
Tests migrated into pytest style 7 years ago			`assert doc.readable_dom.tag == 'div'`
Add test for title 11 years ago
Add processing of content per the algorithm with some base tests 13 years ago
Tests migrated into pytest style 7 years ago			`def test_title_loads():`
			`"""Verify we can fetch the title of the parsed article"""`
			`doc = Article(load_snippet('document_min.html'))`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`assert doc._original_document.title == 'Min Document Title'`
Fixed transformation of leaf <div> into <p> 12 years ago

Tests migrated into pytest style 7 years ago			`def test_doc_no_scripts_styles():`
			`"""Step #1 remove all scripts from the document"""`
			`doc = Article(load_snippet('document_scripts.html'))`
			`readable = doc.readable_dom`

			`assert readable.findall(".//script") == []`
			`assert readable.findall(".//style") == []`
			`assert readable.findall(".//link") == []`


			`def test_find_body_exists():`
			`"""If the document has a body, we store that as the readable html`

			`No sense processing anything other than the body content.`

			`"""`
			`doc = Article(load_snippet('document_min.html'))`

			`assert doc.readable_dom.tag == 'div'`
			`assert doc.readable_dom.get('id') == 'readabilityBody'`


			`def test_body_doesnt_exist():`
			`"""If we can't find a body, then we create one.`

			`We build our doc around the rest of the html we parsed.`

			`"""`
			`doc = Article(load_snippet('document_no_body.html'))`

			`assert doc.readable_dom.tag == 'div'`
			`assert doc.readable_dom.get('id') == 'readabilityBody'`


			`def test_bare_content():`
			`"""If the document is just pure content, no html tags we should be ok`

			`We build our doc around the rest of the html we parsed.`

			`"""`
			`doc = Article(load_snippet('document_only_content.html'))`

			`assert doc.readable_dom.tag == 'div'`
			`assert doc.readable_dom.get('id') == 'readabilityBody'`


			`def test_no_content():`
			`"""Without content we supply an empty unparsed doc."""`
			`doc = Article('')`

			`assert doc.readable_dom.tag == 'div'`
			`assert doc.readable_dom.get('id') == 'readabilityBody'`
			`assert doc.readable_dom.get('class') == 'parsing-error'`


			`# Test out our cleaning processing we do.`


			`def test_unlikely_hits():`
			`"""Verify we wipe out things from our unlikely list."""`
			`doc = Article(load_snippet('test_readable_unlikely.html'))`
			`readable = doc.readable_dom`
			`must_not_appear = [`
			`'comment', 'community', 'disqus', 'extra', 'foot',`
			`'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',`
			`'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',`
			`'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']`

			`want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']`

			`for i in must_not_appear:`
			`# we cannot find any class or id with this value`
			`by_class = readable.find_class(i)`

			`for test in by_class:`
			`# if it's here it cannot have the must not class without the`
			`# want to appear class`
			`found = False`
			`for cls in test.get('class').split():`
			`if cls in want_to_appear:`
			`found = True`
			`assert found`

			`by_ids = readable.get_element_by_id(i, False)`
			`if by_ids is not False:`
			`found = False`
			`for ids in test.get('id').split():`
			`if ids in want_to_appear:`
			`found = True`
			`assert found`


			`def test_misused_divs_transform():`
			`"""Verify we replace leaf node divs with p's`

			`They should have the same content, just be a p vs a div`

			`"""`
			`test_html = "<html><body><div>simple</div></body></html>"`
			`test_doc = document_fromstring(test_html)`
			`assert tounicode(leaf_div_elements_into_paragraphs(test_doc)) == to_unicode(`
			`"<html><body><p>simple</p></body></html>"`
			`)`

			`test_html2 = ('<html><body><div>simple<a href="">link</a>'`
			`'</div></body></html>')`
			`test_doc2 = document_fromstring(test_html2)`
			`assert tounicode(leaf_div_elements_into_paragraphs(test_doc2)) == to_unicode(`
			`'<html><body><p>simple<a href="">link</a></p></body></html>'`
			`)`


			`def test_dont_transform_div_with_div():`
			`"""Verify that only child <div> element is replaced by <p>."""`
			`dom = document_fromstring(`
			`"<html><body><div>text<div>child</div>"`
			`"aftertext</div></body></html>"`
			`)`

			`assert tounicode(leaf_div_elements_into_paragraphs(dom)) == to_unicode(`
			`"<html><body><div>text<p>child</p>"`
			`"aftertext</div></body></html>"`
			`)`

Do some link filtring to drop stupid permalinks from the content. 12 years ago
Tests migrated into pytest style 7 years ago			`def test_bad_links():`
			`"""Some links should just not belong."""`
			`bad_links = [`
			`'<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"> </a>',`
			`'<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',`
			'<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
			`]`
Do some link filtring to drop stupid permalinks from the content. 12 years ago
Tests migrated into pytest style 7 years ago			`for l in bad_links:`
			`link = fragment_fromstring(l)`
			`assert is_bad_link(link)`
Garden 13 years ago
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`# Candidate nodes are scoring containers we use.`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago

Tests migrated into pytest style 7 years ago			`def test_candidate_scores():`
			`"""We should be getting back objects with some scores."""`
			`fives = ['<div/>']`
			`threes = ['<pre/>', '<td/>', '<blockquote/>']`
			`neg_threes = ['<address/>', '<ol/>']`
			`neg_fives = ['<h1/>', '<h2/>', '<h3/>', '<h4/>']`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`for n in fives:`
			`doc = fragment_fromstring(n)`
			`assert ScoredNode(doc).content_score == 5`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`for n in threes:`
			`doc = fragment_fromstring(n)`
			`assert ScoredNode(doc).content_score == 3`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`for n in neg_threes:`
			`doc = fragment_fromstring(n)`
			`assert ScoredNode(doc).content_score == -3`
Add doc and candidates properties to the article 13 years ago
Tests migrated into pytest style 7 years ago			`for n in neg_fives:`
			`doc = fragment_fromstring(n)`
			`assert ScoredNode(doc).content_score == -5`
Add doc and candidates properties to the article 13 years ago
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`def test_article_enables_candidate_access():`
			`"""Candidates are accessible after document processing."""`
			`doc = Article(load_article('ars.001.html'))`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago			`assert hasattr(doc, 'candidates')`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago

Tests migrated into pytest style 7 years ago			`# Certain ids and classes get us bonus points.`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago

Tests migrated into pytest style 7 years ago			`def test_positive_class():`
			`"""Some classes get us bonus points."""`
			`node = fragment_fromstring('<p class="article">')`
			`assert get_class_weight(node) == 25`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago

Tests migrated into pytest style 7 years ago			`def test_positive_ids():`
			`"""Some ids get us bonus points."""`
			`node = fragment_fromstring('<p id="content">')`
			`assert get_class_weight(node) == 25`
Update tests for scoring, returning div/html doc depending on the found content 13 years ago
Tests migrated into pytest style 7 years ago
			`def test_negative_class():`
			`"""Some classes get us negative points."""`
			`node = fragment_fromstring('<p class="comment">')`
			`assert get_class_weight(node) == -25`


			`def test_negative_ids():`
			`"""Some ids get us negative points."""`
			`node = fragment_fromstring('<p id="media">')`
			`assert get_class_weight(node) == -25`


			`# We take out list of potential nodes and score them up.`


			`def test_we_get_candidates():`
			`"""Processing candidates should get us a list of nodes to try out."""`
			`doc = document_fromstring(load_article("ars.001.html"))`
			`test_nodes = tuple(doc.iter("p", "td", "pre"))`
			`candidates = score_candidates(test_nodes)`

			`# this might change as we tweak our algorithm, but if it does,`
			`# it signifies we need to look at what we changed.`
			`assert len(candidates.keys()) == 37`

			`# one of these should have a decent score`
			`scores = sorted(c.content_score for c in candidates.values())`
			`assert scores[-1] > 100`


			`def test_bonus_score_per_100_chars_in_p():`
			`"""Nodes get 1 point per 100 characters up to max. 3 points."""`
			`def build_candidates(length):`
			`html = "<p>%s</p>" % ("c" * length)`
			`node = fragment_fromstring(html)`

			`return [node]`

			`test_nodes = build_candidates(50)`
			`candidates = score_candidates(test_nodes)`
			`pscore_50 = max(c.content_score for c in candidates.values())`

			`test_nodes = build_candidates(100)`
			`candidates = score_candidates(test_nodes)`
			`pscore_100 = max(c.content_score for c in candidates.values())`

			`test_nodes = build_candidates(300)`
			`candidates = score_candidates(test_nodes)`
			`pscore_300 = max(c.content_score for c in candidates.values())`

			`test_nodes = build_candidates(400)`
			`candidates = score_candidates(test_nodes)`
			`pscore_400 = max(c.content_score for c in candidates.values())`

			`assert pscore_50 + 0.5 == pscore_100`
			`assert pscore_100 + 2.0 == pscore_300`
			`assert pscore_300 == pscore_400`


			`# Link density will adjust out candidate scoresself.`


			`def test_link_density():`
			`"""Test that we get a link density"""`
			`doc = document_fromstring(load_article('ars.001.html'))`
			`for node in doc.iter('p', 'td', 'pre'):`
			`density = get_link_density(node)`

			`# the density must be between 0, 1`
			`assert density >= 0.0 and density <= 1.0`


			`# Siblings will be included if their content is related.`


			`@pytest.mark.skip("Not implemented yet.")`
			`def test_bad_siblings_not_counted():`
			`raise NotImplementedError()`


			`@pytest.mark.skip("Not implemented yet.")`
			`def test_good_siblings_counted():`
			`raise NotImplementedError()`


			`# TestMainText`

			`def test_empty():`
			`article = Article("")`
			`annotated_text = article.main_text`

			`assert annotated_text == []`


			`def test_no_annotations():`
			`article = Article("<div><p>This is text with no annotations</p></div>")`
			`annotated_text = article.main_text`

			`assert annotated_text == [(("This is text with no annotations", None),)]`


			`def test_one_annotation():`
			`article = Article("<div><p>This is text\r\twith <del>no</del> annotations</p></div>")`
			`annotated_text = article.main_text`

			`assert annotated_text == [(`
			`("This is text\nwith", None),`
			`("no", ("del",)),`
			`("annotations", None),`
			`)]`


			`def test_simple_snippet():`
			`snippet = Article(load_snippet("annotated_1.html"))`
			`annotated_text = snippet.main_text`

			`assert annotated_text == [`
			`(`
			`("Paragraph is more", None),`
			`("better", ("em",)),`
			`(".\nThis text is very", None),`
			`("pretty", ("strong",)),`
			`("'cause she's girl.", None),`
			`),`
			`(`
			`("This is not", None),`
			`("crap", ("big",)),`
			`("so", None),`
			`("readability", ("dfn",)),`
			`("me :)", None),`
			`)`
			`]`