Add support for links and for making links absolute

- Add a test that links are made absolute correctly - Add a cached `links` attribute to get all links in the doc
12 years ago · ac053979a9
parent 590a94345f
commit ac053979a9
3 changed files with 40 additions and 8 deletions
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -1,4 +1,5 @@
 import re
+from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import HTMLParser
@ -46,7 +47,7 @@ def build_doc(page):
    return doc


-class OriginalDocument(unicode):
+class OriginalDocument(object):
    """The original document to process"""
    _base_href = None

@ -54,9 +55,13 @@ class OriginalDocument(unicode):
        self.orig_html = html
        self.url = url

+    def __str__(self):
+        """Render out our document as a string"""
+        return tostring(self.html)
+
    def __unicode__(self):
        """Render out our document as a string"""
-        tounicode(self.html)
+        return tounicode(self.html)

    def _parse(self, html):
        """Generate an lxml document from our html."""
@ -72,9 +77,13 @@ class OriginalDocument(unicode):
    @cached_property(ttl=600)
    def html(self):
        """The parsed html document from the input"""
-        print 'PARSED'
        return self._parse(self.orig_html)

+    @cached_property(ttl=600)
+    def links(self):
+        """Links within the document"""
+        return self.html.findall(".//a")
+
    @cached_property(ttl=600)
    def title(self):
        """Pull the title attribute out of the parsed document"""
--- a/src/breadability/tests/test_orig_document.py
+++ b/src/breadability/tests/test_orig_document.py
@ -1,3 +1,4 @@
+from collections import defaultdict
 from os import path
 from unittest import TestCase

@ -18,20 +19,31 @@ class TestOriginalDocuemtn(TestCase):
    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))
-        self.assertTrue(doc.startswith(u'<html>'))
+        self.assertTrue(str(doc).startswith(u'<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
-        doc = OriginalDocument(load_snippet('document_absolute_url.html'),
+        doc = OriginalDocument(
+            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
-        self.assertTrue(doc.startswith(u'<html>'))
+        self.assertTrue(str(doc).startswith(u'<html>'))

        # find the links on the page and make sure each one starts with out
        # base url we told it to use.
        links = doc.links
        self.assertEqual(len(links), 3)
-        for l in links:
-            self.assertEqual(l.startswith('http://blog.mitechie.com/'))
+        # we should have two links that start with our blog url
+        # and one link that starts with amazon
+        link_counts = defaultdict(int)
+        for link in links:
+            print link.get('href')
+            if link.get('href').startswith('http://blog.mitechie.com'):
+                link_counts['blog'] += 1
+            else:
+                link_counts['other'] += 1
+
+        self.assertEqual(link_counts['blog'], 2)
+        self.assertEqual(link_counts['other'], 1)


--- a/src/breadability/tests/test_snippets/document_absolute_url.html
+++ b/src/breadability/tests/test_snippets/document_absolute_url.html
@ -0,0 +1,11 @@
+<html>
+    <head>
+        <title>Min Document Title</title>
+    </head>
+    <body>
+        <h1>Min Document</h1>
+        <a href="/about.hml">About Us</a>
+        <a href="http://blog.mitechie.com/test.hml">About Us</a>
+        <a href="http://amazon.com/test.hml">Amazon</a>
+    </body>
+</html>