2013-03-07 14:42:18 +00:00
|
|
|
# -*- coding: utf8 -*-
|
2012-12-13 01:37:24 +00:00
|
|
|
|
2013-03-08 22:05:14 +00:00
|
|
|
from __future__ import absolute_import
|
2013-03-18 21:01:11 +00:00
|
|
|
from __future__ import division, print_function, unicode_literals
|
2012-05-03 03:57:49 +00:00
|
|
|
|
2013-03-07 14:42:18 +00:00
|
|
|
from collections import defaultdict
|
2013-03-18 20:25:09 +00:00
|
|
|
from readability._py3k import to_unicode, to_bytes
|
|
|
|
from readability.document import OriginalDocument, determine_encoding
|
2013-03-18 21:01:11 +00:00
|
|
|
from .compat import unittest
|
2013-03-08 22:05:14 +00:00
|
|
|
from .utils import load_snippet
|
2012-05-03 03:57:49 +00:00
|
|
|
|
|
|
|
|
2012-12-13 01:37:24 +00:00
|
|
|
class TestOriginalDocument(unittest.TestCase):
    """Verify we can process html into a document to work off of."""

    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))

        self.assertTrue(to_unicode(doc).startswith('<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        doc = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
        self.assertTrue(to_unicode(doc).startswith('<html>'))

        # find the links on the page and make sure each one starts with our
        # base url we told it to use.
        links = doc.links
        self.assertEqual(len(links), 3)

        # we should have two links that start with our blog url
        # and one link that starts with amazon
        link_counts = defaultdict(int)
        for link in links:
            if link.get('href').startswith('http://blog.mitechie.com'):
                link_counts['blog'] += 1
            else:
                link_counts['other'] += 1

        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)

    def test_no_br_allowed(self):
        """We convert all <br/> tags to <p> tags"""
        doc = OriginalDocument(load_snippet('document_min.html'))

        # find() returns None when no matching element exists in the tree.
        self.assertIsNone(doc.html.find('.//br'))

    def test_empty_title(self):
        """An empty <title> element gives an empty document title"""
        document = OriginalDocument(
            "<html><head><title></title></head><body></body></html>")

        self.assertEqual(document.title, "")

    def test_title_only_with_tags(self):
        """A <title> holding only markup and no text gives an empty title"""
        document = OriginalDocument(
            "<html><head><title><em></em></title></head><body></body></html>")

        self.assertEqual(document.title, "")

    def test_no_title(self):
        """A document without any <title> element gives an empty title"""
        document = OriginalDocument("<html><head></head><body></body></html>")

        self.assertEqual(document.title, "")

    def test_encoding(self):
        """Smoke test: encoding detection accepts ISO-8859-2 encoded bytes"""
        text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")

        # NOTE(review): nothing is asserted about the detected encoding; this
        # only checks that the call completes without raising. The expected
        # value is not visible from this file -- add an assertion once known.
        determine_encoding(text)

    def test_encoding_short(self):
        """Short byte strings are reported as "utf8" by encoding detection"""
        # Bytes that were actually encoded as ISO-8859-2 ...
        text = "ľščťžýáíé".encode("iso-8859-2")
        encoding = determine_encoding(text)
        self.assertEqual(encoding, "utf8")

        # ... and the same characters passed through to_bytes()
        # (presumably UTF-8 encoded -- confirm against readability._py3k).
        text = to_bytes("ľščťžýáíé")
        encoding = determine_encoding(text)
        self.assertEqual(encoding, "utf8")
|