breadability/tests/test_orig_document.py

# -*- coding: utf8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from collections import defaultdict
from readability._py3k import to_unicode, to_bytes
from readability.document import OriginalDocument, determine_encoding
from .compat import unittest
from .utils import load_snippet


class TestOriginalDocument(unittest.TestCase):
    """Verify we can process html into a document to work off of."""

    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertTrue(to_unicode(doc).startswith('<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        doc = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
        self.assertTrue(to_unicode(doc).startswith('<html>'))

        # find the links on the page and make sure each one starts with out
        # base url we told it to use.
        links = doc.links
        self.assertEqual(len(links), 3)
        # we should have two links that start with our blog url
        # and one link that starts with amazon
        link_counts = defaultdict(int)
        for link in links:
            if link.get('href').startswith('http://blog.mitechie.com'):
                link_counts['blog'] += 1
            else:
                link_counts['other'] += 1

        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)

    def test_no_br_allowed(self):
        """We convert all <br/> tags to <p> tags"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertIsNone(doc.html.find('.//br'))

    def test_empty_title(self):
        """We convert all <br/> tags to <p> tags"""
        document = OriginalDocument("<html><head><title></title></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_title_only_with_tags(self):
        """We convert all <br/> tags to <p> tags"""
        document = OriginalDocument("<html><head><title><em></em></title></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_no_title(self):
        """We convert all <br/> tags to <p> tags"""
        document = OriginalDocument("<html><head></head><body></body></html>")
        self.assertEqual(document.title, "")

    def test_encoding(self):
        text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
        encoding = determine_encoding(text)

    def test_encoding_short(self):
        text = "ľščťžýáíé".encode("iso-8859-2")
        encoding = determine_encoding(text)
        self.assertEqual(encoding, "utf8")

        text = to_bytes("ľščťžýáíé")
        encoding = determine_encoding(text)
        self.assertEqual(encoding, "utf8")
Use 'charade' for detecting encoding 2013-03-07 14:42:18 +00:00			`# -*- coding: utf8 -*-`
Update the unittest import to grab unittest2 for 2.6 2012-12-13 01:37:24 +00:00
Make package from tests 2013-03-08 22:05:14 +00:00			`from __future__ import absolute_import`
Added compatibility file with unittest2 import 2013-03-18 21:01:11 +00:00			`from __future__ import division, print_function, unicode_literals`
Start to add some basic tests and layout to use for breaking down documents. 2012-05-03 03:57:49 +00:00
Use 'charade' for detecting encoding 2013-03-07 14:42:18 +00:00			`from collections import defaultdict`
Renamed to readability 2013-03-18 20:25:09 +00:00			`from readability._py3k import to_unicode, to_bytes`
			`from readability.document import OriginalDocument, determine_encoding`
Added compatibility file with unittest2 import 2013-03-18 21:01:11 +00:00			`from .compat import unittest`
Make package from tests 2013-03-08 22:05:14 +00:00			`from .utils import load_snippet`
Start to add some basic tests and layout to use for breaking down documents. 2012-05-03 03:57:49 +00:00

Update the unittest import to grab unittest2 for 2.6 2012-12-13 01:37:24 +00:00			`class TestOriginalDocument(unittest.TestCase):`
Start to add some basic tests and layout to use for breaking down documents. 2012-05-03 03:57:49 +00:00			`"""Verify we can process html into a document to work off of."""`

			`def test_readin_min_document(self):`
			`"""Verify we can read in a min html document"""`
			`doc = OriginalDocument(load_snippet('document_min.html'))`
Use unicode literals from future, not 'to_string' 2013-03-19 22:49:07 +00:00			`self.assertTrue(to_unicode(doc).startswith('<html>'))`
Start to add some basic tests and layout to use for breaking down documents. 2012-05-03 03:57:49 +00:00			`self.assertEqual(doc.title, 'Min Document Title')`

			`def test_readin_with_base_url(self):`
			`"""Passing a url should update links to be absolute links"""`
Add support for links, absoluting links - Add a test that we absolute correctly - Add a links cached attribute to get all links in the doc 2012-05-04 01:36:48 +00:00			`doc = OriginalDocument(`
			`load_snippet('document_absolute_url.html'),`
Start to add some basic tests and layout to use for breaking down documents. 2012-05-03 03:57:49 +00:00			`url="http://blog.mitechie.com/test.html")`
Use unicode literals from future, not 'to_string' 2013-03-19 22:49:07 +00:00			`self.assertTrue(to_unicode(doc).startswith('<html>'))`
Start to add some basic tests and layout to use for breaking down documents. 2012-05-03 03:57:49 +00:00
			`# find the links on the page and make sure each one starts with our`
			`# base url we told it to use.`
			`links = doc.links`
			`self.assertEqual(len(links), 3)`
Add support for links, absoluting links - Add a test that we absolute correctly - Add a links cached attribute to get all links in the doc 2012-05-04 01:36:48 +00:00			`# we should have two links that start with our blog url`
			`# and one link that starts with amazon`
			`link_counts = defaultdict(int)`
			`for link in links:`
			`if link.get('href').startswith('http://blog.mitechie.com'):`
			`link_counts['blog'] += 1`
			`else:`
			`link_counts['other'] += 1`

			`self.assertEqual(link_counts['blog'], 2)`
			`self.assertEqual(link_counts['other'], 1)`
Add processing of content per the algorithm with some base tests 2012-05-04 20:07:52 +00:00
			`def test_no_br_allowed(self):`
			`"""We convert all <br/> tags to <p> tags"""`
			`doc = OriginalDocument(load_snippet('document_min.html'))`
			`self.assertIsNone(doc.html.find('.//br'))`
Use 'charade' for detecting encoding 2013-03-07 14:42:18 +00:00
Refactored file 'document.py' Removed non-intuitive parts and dead code not covered by tests. Better names for objects. Better coverage by tests. 2013-03-11 21:10:26 +00:00			`def test_empty_title(self):`
			`"""We convert all <br/> tags to <p> tags"""`
			`document = OriginalDocument("<html><head><title></title></head><body></body></html>")`
			`self.assertEqual(document.title, "")`

			`def test_title_only_with_tags(self):`
			`"""We convert all <br/> tags to <p> tags"""`
			`document = OriginalDocument("<html><head><title><em></em></title></head><body></body></html>")`
			`self.assertEqual(document.title, "")`

			`def test_no_title(self):`
			`"""We convert all <br/> tags to <p> tags"""`
			`document = OriginalDocument("<html><head></head><body></body></html>")`
			`self.assertEqual(document.title, "")`

Use 'charade' for detecting encoding 2013-03-07 14:42:18 +00:00			`def test_encoding(self):`
Use unicode literals from future, not 'to_string' 2013-03-19 22:49:07 +00:00			`text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")`
Refactored file 'document.py' Removed non-intuitive parts and dead code not covered by tests. Better names for objects. Better coverage by tests. 2013-03-11 21:10:26 +00:00			`encoding = determine_encoding(text)`

			`def test_encoding_short(self):`
Use unicode literals from future, not 'to_string' 2013-03-19 22:49:07 +00:00			`text = "ľščťžýáíé".encode("iso-8859-2")`
Refactored file 'document.py' Removed non-intuitive parts and dead code not covered by tests. Better names for objects. Better coverage by tests. 2013-03-11 21:10:26 +00:00			`encoding = determine_encoding(text)`
			`self.assertEqual(encoding, "utf8")`

			`text = to_bytes("ľščťžýáíé")`
			`encoding = determine_encoding(text)`
			`self.assertEqual(encoding, "utf8")`