Add support for links and for making links absolute

- Add a test that links are made absolute correctly
- Add a cached `links` attribute that returns all links in the doc
pull/4/merge
Richard Harding 12 years ago
parent 590a94345f
commit ac053979a9

@ -1,4 +1,5 @@
import re
from lxml.etree import tostring
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import HTMLParser
@ -46,7 +47,7 @@ def build_doc(page):
return doc
class OriginalDocument(unicode):
class OriginalDocument(object):
"""The original document to process"""
_base_href = None
@ -54,9 +55,13 @@ class OriginalDocument(unicode):
self.orig_html = html
self.url = url
def __str__(self):
"""Render out our document as a string"""
return tostring(self.html)
def __unicode__(self):
"""Render out our document as a string"""
tounicode(self.html)
return tounicode(self.html)
def _parse(self, html):
"""Generate an lxml document from our html."""
@ -72,9 +77,13 @@ class OriginalDocument(unicode):
@cached_property(ttl=600)
def html(self):
"""The parsed html document from the input"""
print 'PARSED'
return self._parse(self.orig_html)
@cached_property(ttl=600)
def links(self):
"""Links within the document"""
return self.html.findall(".//a")
@cached_property(ttl=600)
def title(self):
"""Pull the title attribute out of the parsed document"""

@ -1,3 +1,4 @@
from collections import defaultdict
from os import path
from unittest import TestCase
@ -18,20 +19,31 @@ class TestOriginalDocuemtn(TestCase):
def test_readin_min_document(self):
"""Verify we can read in a min html document"""
doc = OriginalDocument(load_snippet('document_min.html'))
self.assertTrue(doc.startswith(u'<html>'))
self.assertTrue(str(doc).startswith(u'<html>'))
self.assertEqual(doc.title, 'Min Document Title')
def test_readin_with_base_url(self):
"""Passing a url should update links to be absolute links"""
doc = OriginalDocument(load_snippet('document_absolute_url.html'),
doc = OriginalDocument(
load_snippet('document_absolute_url.html'),
url="http://blog.mitechie.com/test.html")
self.assertTrue(doc.startswith(u'<html>'))
self.assertTrue(str(doc).startswith(u'<html>'))
# find the links on the page and make sure each one starts with our
# base url we told it to use.
links = doc.links
self.assertEqual(len(links), 3)
for l in links:
self.assertEqual(l.startswith('http://blog.mitechie.com/'))
# we should have two links that start with our blog url
# and one link that starts with amazon
link_counts = defaultdict(int)
for link in links:
print link.get('href')
if link.get('href').startswith('http://blog.mitechie.com'):
link_counts['blog'] += 1
else:
link_counts['other'] += 1
self.assertEqual(link_counts['blog'], 2)
self.assertEqual(link_counts['other'], 1)

@ -0,0 +1,11 @@
<html>
<head>
<title>Min Document Title</title>
</head>
<body>
<h1>Min Document</h1>
<a href="/about.hml">About Us</a>
<a href="http://blog.mitechie.com/test.hml">About Us</a>
<a href="http://amazon.com/test.hml">Amazon</a>
</body>
</html>
Loading…
Cancel
Save