Start to add some basic tests and layout to use for breaking down documents.

pull/4/merge
Richard Harding 12 years ago
parent 5e95f531bc
commit 590a94345f

@ -0,0 +1,99 @@
import re
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import HTMLParser
from breadability.utils import cached_property
# Shared lxml HTML parser configured with encoding='utf-8'. build_doc()
# re-encodes every page to UTF-8 bytes before handing it to this parser.
utf8_parser = HTMLParser(encoding='utf-8')
def get_encoding(page):
    """Guess the character encoding of an undecoded HTML page.

    Markup is stripped first so that only the text content is judged. When
    the remaining text is too short to say anything, or when it survives a
    UTF-8 round trip with less than 1% of its bytes dropped, ``'utf-8'`` is
    returned. Otherwise the guess is delegated to ``chardet``.

    :param page: the raw page content as a byte string.
    :returns: the name of a codec usable with ``page.decode()``; never None.
    """
    default = 'utf-8'
    # Byte-string raw pattern: identical to the old pattern on Python 2 and
    # keeps ``\s`` from being read as a string escape.
    text = re.sub(br'</?[^>]*>\s*', b' ', page)
    if not text.strip() or len(text) < 10:
        return default  # can't guess from this little text

    try:
        # 'ignore' drops every byte that is not valid UTF-8, so the length
        # difference measures how much of the text was NOT UTF-8.
        roundtrip = text.decode(default, 'ignore').encode(default)
        sizes = len(roundtrip), len(text)
        # 99% of the text is valid utf-8: good enough.
        if abs(len(text) - len(roundtrip)) < max(sizes) * 0.01:
            return default
    except UnicodeDecodeError:
        pass

    # chardet is only needed on this fallback path; it was referenced here
    # without ever being imported, which raised NameError.
    import chardet
    enc = chardet.detect(text)['encoding']
    if enc is None:
        # chardet could not make a guess; fall back rather than hand None
        # to ``page.decode()``.
        return default
    if enc == 'MacCyrillic':
        # NOTE(review): presumably cp1251 pages get misdetected as
        # MacCyrillic -- confirm before changing this mapping.
        enc = 'cp1251'
    return enc
def build_doc(page):
    """Parse page content into an lxml HTML document.

    Requires that the `page` not be None.

    :param page: the page content, either unicode or a byte string. Byte
        strings are decoded with the encoding guessed by ``get_encoding``,
        replacing undecodable bytes.
    :returns: the document produced by ``document_fromstring``, or ``''``
        when `page` is None (an error is logged in that case).
    """
    if page is None:
        # The module never defined ``LOG``; use a module logger obtained on
        # demand so this guard logs instead of raising NameError.
        import logging
        logging.getLogger(__name__).error(
            "Page content is None, can't build_doc")
        return ''

    if isinstance(page, unicode):
        page_unicode = page
    else:
        enc = get_encoding(page)
        page_unicode = page.decode(enc, 'replace')

    # The parser is configured for UTF-8, so always feed it UTF-8 bytes.
    doc = document_fromstring(
        page_unicode.encode('utf-8', 'replace'),
        parser=utf8_parser)
    return doc
class OriginalDocument(unicode):
    """The original document to process.

    The instance itself is the unicode text of the html it was built from;
    the lxml tree parsed from that html is available as ``html``.

    :param html: the html content of the page.
    :param url: optional url of the page. When given, links in the parsed
        document are made absolute against it.
    """
    _base_href = None

    def __new__(cls, html, url=None):
        """Create the unicode value from `html` alone.

        ``unicode.__new__`` does not understand `url` (as a keyword it is
        rejected, positionally it would be taken for an encoding), so it
        must be kept away from the base constructor.
        """
        # NOTE(review): a byte string `html` is decoded by unicode() with
        # the default codec -- confirm whether non-ASCII bytes should be
        # decoded explicitly here.
        return super(OriginalDocument, cls).__new__(cls, html)

    def __init__(self, html, url=None):
        self.orig_html = html
        self.url = url

    def __unicode__(self):
        """Render out our document as a string"""
        return tounicode(self.html)

    def _parse(self, html):
        """Generate an lxml document from our html.

        :param html: the html content to parse.
        :returns: the parsed document with its links resolved against
            ``self.url`` when one was given, else against the document's
            own base href.
        """
        doc = build_doc(html)

        # doc = html_cleaner.clean_html(doc)
        base_href = self.url
        if base_href:
            doc.make_links_absolute(base_href, resolve_base_href=True)
        else:
            doc.resolve_base_href()
        return doc

    @cached_property(ttl=600)
    def html(self):
        """The parsed html document from the input"""
        return self._parse(self.orig_html)

    @cached_property(ttl=600)
    def title(self):
        """Pull the title attribute out of the parsed document.

        :returns: the text of the first ``<title>`` element, or ``''`` when
            the element is missing or has no text.
        """
        title_elem = self.html.find('.//title')
        if title_elem is None:
            return ''
        # ``text`` is None for an empty element.
        return title_elem.text or ''
        # return norm_title(title)
class Article(object):
    """Placeholder for the readable article parsed out of a document."""

    def __init__(self):
        """Create an empty article; no state is kept yet."""

@ -0,0 +1,37 @@
from os import path
from unittest import TestCase
from breadability.readable import OriginalDocument
# Directory holding this test module; load_snippet() looks for its fixture
# files in the 'test_snippets' directory underneath it.
TEST_DIR = path.dirname(__file__)
def load_snippet(filename):
    """Helper to fetch in the content of a test snippet.

    :param filename: name of a file inside the ``test_snippets`` directory
        located under ``TEST_DIR``.
    :returns: the complete content of that file.
    """
    snippet_path = path.join(TEST_DIR, 'test_snippets', filename)
    # Context manager so the handle is closed as soon as the content is
    # read instead of whenever the garbage collector gets to it.
    with open(snippet_path) as snippet:
        return snippet.read()
class TestOriginalDocuemtn(TestCase):
    """Verify we can process html into a document to work off of."""

    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertTrue(doc.startswith(u'<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        doc = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
        self.assertTrue(doc.startswith(u'<html>'))

        # find the links on the page and make sure each one starts with our
        # base url we told it to use.
        links = doc.links
        self.assertEqual(len(links), 3)
        for link in links:
            # assertTrue, not assertEqual: there is a single boolean to
            # check, and assertEqual with one argument raises TypeError.
            self.assertTrue(link.startswith('http://blog.mitechie.com/'))

@ -0,0 +1,8 @@
<html>
<head>
<title>Min Document Title</title>
</head>
<body>
<h1>Min Document</h1>
</body>
</html>

@ -0,0 +1,62 @@
import time
#
# (c) 2011 Christopher Arndt, MIT License
#
class cached_property(object):
    '''Decorator for read-only properties evaluated only once within TTL period.

    It can be used to create a cached property like this::

        import random

        # the class containing the property must be a new-style class
        class MyClass(object):
            # create property whose value is cached for ten minutes
            @cached_property(ttl=600)
            def randint(self):
                # will only be evaluated every 10 min. at maximum.
                return random.randint(0, 100)

    The value is cached in the '_cache' attribute of the object instance that
    has the property getter method wrapped by this decorator. The '_cache'
    attribute value is a dictionary which has a key for every property of the
    object which is wrapped by this decorator. Each entry in the cache is
    created only when the property is accessed for the first time and is a
    two-element tuple with the last computed property value and the last time
    it was updated in seconds since the epoch.

    The default time-to-live (TTL) is 300 seconds (5 minutes). Set the TTL to
    zero for the cached value to never expire.

    To expire a cached property value manually just do::

        del instance._cache[<property name>]

    Accessing the property on the class itself (rather than on an instance)
    returns the descriptor object.

    '''

    def __init__(self, ttl=300):
        self.ttl = ttl

    def __call__(self, fget, doc=None):
        """Wrap the getter `fget`; this is the actual decoration step.

        :param fget: the method computing the property value.
        :param doc: optional docstring overriding the one of `fget`.
        :returns: this descriptor, which takes the place of `fget`.
        """
        self.fget = fget
        self.__doc__ = doc or fget.__doc__
        self.__name__ = fget.__name__
        self.__module__ = fget.__module__
        return self

    def __get__(self, inst, owner):
        """Return the cached value, recomputing it when missing or expired."""
        if inst is None:
            # Class-level access: there is no instance to compute or cache
            # a value for, so hand back the descriptor itself.
            return self

        now = time.time()
        try:
            value, last_update = inst._cache[self.__name__]
            if self.ttl > 0 and now - last_update > self.ttl:
                # Expired: take the same path as a cache miss.
                raise AttributeError
        except (KeyError, AttributeError):
            # No '_cache' yet, no entry for this property, or entry expired.
            value = self.fget(inst)
            try:
                cache = inst._cache
            except AttributeError:
                cache = inst._cache = {}
            cache[self.__name__] = (value, now)
        return value
Loading…
Cancel
Save