Start to add some basic tests and layout to use for breaking down documents.
parent
5e95f531bc
commit
590a94345f
@ -0,0 +1,99 @@
|
||||
import re
|
||||
from lxml.etree import tounicode
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html import HTMLParser
|
||||
|
||||
from breadability.utils import cached_property
|
||||
|
||||
|
||||
# Module-level lxml HTML parser configured for utf-8 input; build_doc() passes
# it utf-8 encoded bytes so the parser's encoding always matches the payload.
utf8_parser = HTMLParser(encoding='utf-8')
|
||||
|
||||
|
||||
def get_encoding(page):
    """Guess the character encoding of a raw (byte string) HTML page.

    Markup is stripped first so that only the text content is judged.
    ``utf-8`` is returned when there is too little text to guess from, or
    when a utf-8 decode/encode round trip loses less than 1% of the text.
    Otherwise the decision is delegated to ``chardet``.

    :param page: the undecoded page content as a byte string.
    :returns: the name of the encoding to decode ``page`` with; never None.
    """
    default_enc = 'utf-8'

    # Byte-string patterns: on Python 2 a b'' literal is a plain str, so this
    # matches the previous behaviour while also accepting real bytes objects.
    text = re.sub(br'</?[^>]*>\s*', b' ', page)
    if not text.strip() or len(text) < 10:
        return default_enc  # can't guess

    try:
        diff = text.decode(default_enc, 'ignore').encode(default_enc)
    except UnicodeDecodeError:
        pass
    else:
        # 99% of utf-8
        if abs(len(text) - len(diff)) < max(len(diff), len(text)) * 0.01:
            return default_enc

    # chardet was referenced without ever being imported (NameError). It is
    # imported lazily so the common utf-8 paths above do not need it.
    import chardet
    enc = chardet.detect(text)['encoding']
    if enc is None:
        # chardet could not decide; callers pass the result straight to
        # str.decode(), which cannot take None, so fall back to the default.
        return default_enc
    if enc == 'MacCyrillic':
        enc = 'cp1251'
    return enc
|
||||
|
||||
|
||||
def build_doc(page):
    """Parse ``page`` into an lxml HTML document.

    Byte strings are decoded with the encoding guessed by ``get_encoding``
    (undecodable bytes are replaced); unicode input is used as is. The text
    is then re-encoded as utf-8 and handed to ``document_fromstring`` with
    the module's utf-8 parser.

    :param page: page content, as unicode or as an undecoded byte string.
    :returns: the parsed lxml document, or ``''`` when ``page`` is None
        (an error is logged in that case).
    """
    if page is None:
        # The module never defined LOG, so this branch used to die with a
        # NameError instead of logging and returning the empty result.
        import logging
        logging.getLogger(__name__).error(
            "Page content is None, can't build_doc")
        return ''

    if isinstance(page, unicode):
        page_unicode = page
    else:
        page_unicode = page.decode(get_encoding(page), 'replace')

    return document_fromstring(
        page_unicode.encode('utf-8', 'replace'),
        parser=utf8_parser)
|
||||
|
||||
|
||||
class OriginalDocument(unicode):
    """The original document to process.

    The instance is itself a unicode string holding the html it was built
    from; the parsed lxml tree and the title are computed on first access
    and cached by ``cached_property``.
    """
    _base_href = None

    def __new__(cls, html, url=None):
        """Build the underlying unicode value from ``html`` only.

        unicode is immutable, so its value is fixed in ``__new__``. Without
        this override the ``url`` argument was forwarded to the unicode
        constructor, which rejects it.
        """
        return super(OriginalDocument, cls).__new__(cls, html)

    def __init__(self, html, url=None):
        """Remember the raw html and the optional url used as link base."""
        self.orig_html = html
        self.url = url

    def __unicode__(self):
        """Render out our document as a string"""
        # The serialized tree was computed but never returned.
        return tounicode(self.html)

    def _parse(self, html):
        """Generate an lxml document from our html.

        When a url was supplied, links are made absolute against it;
        otherwise only the document's own base href is resolved.
        """
        doc = build_doc(html)
        # doc = html_cleaner.clean_html(doc)
        base_href = self.url
        if base_href:
            doc.make_links_absolute(base_href, resolve_base_href=True)
        else:
            doc.resolve_base_href()
        return doc

    @cached_property(ttl=600)
    def html(self):
        """The parsed html document from the input"""
        return self._parse(self.orig_html)

    @cached_property(ttl=600)
    def title(self):
        """Pull the title attribute out of the parsed document

        Returns '' when there is no <title> element or it has no text.
        """
        title_elem = self.html.find('.//title')
        if title_elem is None or title_elem.text is None:
            return ''
        return title_elem.text
        # return norm_title(title)
|
||||
|
||||
|
||||
|
||||
class Article(object):
    """Readable article produced from a parsed document (placeholder)."""

    def __init__(self):
        """Create an empty article; there is no state to set up yet."""
|
||||
|
||||
|
@ -0,0 +1,37 @@
|
||||
from os import path
|
||||
from unittest import TestCase
|
||||
|
||||
from breadability.readable import OriginalDocument
|
||||
|
||||
|
||||
# Directory holding this test module; snippets live in a sub-directory of it.
TEST_DIR = path.dirname(__file__)


def load_snippet(filename):
    """Helper to fetch in the content of a test snippet

    :param filename: name of a file inside ``TEST_DIR/test_snippets``.
    :returns: the full content of that file.
    """
    snippet_path = path.join(TEST_DIR, 'test_snippets', filename)
    # Use a context manager so the handle is closed right away instead of
    # being left for the garbage collector.
    with open(snippet_path) as snippet:
        return snippet.read()
|
||||
|
||||
|
||||
class TestOriginalDocuemtn(TestCase):
    """Verify we can process html into a document to work off of."""

    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertTrue(doc.startswith(u'<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
        """Passing a url should update links to be absolute links"""
        doc = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
        self.assertTrue(doc.startswith(u'<html>'))

        # find the links on the page and make sure each one starts with our
        # base url we told it to use.
        # NOTE(review): this expects OriginalDocument to expose a `links`
        # attribute yielding url strings -- confirm it exists on the class.
        links = doc.links
        self.assertEqual(len(links), 3)
        for link in links:
            # assertEqual needs two operands; a boolean check is assertTrue.
            self.assertTrue(link.startswith('http://blog.mitechie.com/'))
|
||||
|
||||
|
@ -0,0 +1,8 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>Min Document Title</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Min Document</h1>
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,62 @@
|
||||
import time
|
||||
|
||||
|
||||
|
||||
#
|
||||
# (c) 2011 Christopher Arndt, MIT License
|
||||
#
|
||||
class cached_property(object):
    '''Decorator for read-only properties evaluated only once within TTL period.

    It can be used to create a cached property like this::

        import random

        # the class containing the property must be a new-style class
        class MyClass(object):
            # create property whose value is cached for ten minutes
            @cached_property(ttl=600)
            def randint(self):
                # will only be evaluated every 10 min. at maximum.
                return random.randint(0, 100)

    The value is cached in the '_cache' attribute of the object instance that
    has the property getter method wrapped by this decorator. The '_cache'
    attribute value is a dictionary which has a key for every property of the
    object which is wrapped by this decorator. Each entry in the cache is
    created only when the property is accessed for the first time and is a
    two-element tuple with the last computed property value and the last time
    it was updated in seconds since the epoch.

    The default time-to-live (TTL) is 300 seconds (5 minutes). Set the TTL to
    zero for the cached value to never expire.

    To expire a cached property value manually just do::

        del instance._cache[<property name>]

    Accessing the property on the class itself (rather than on an instance)
    returns the decorator object.

    '''
    def __init__(self, ttl=300):
        # Lifetime of a cached value in seconds; values <= 0 never expire.
        self.ttl = ttl

    def __call__(self, fget, doc=None):
        """Wrap the getter ``fget`` and take over its name, doc and module."""
        self.fget = fget
        self.__doc__ = doc or fget.__doc__
        self.__name__ = fget.__name__
        self.__module__ = fget.__module__
        return self

    def __get__(self, inst, owner):
        """Return the cached value for ``inst``, recomputing it if needed."""
        if inst is None:
            # Class-level access: there is no instance to cache on, and
            # calling the getter with None would fail inside user code.
            return self

        now = time.time()
        try:
            value, last_update = inst._cache[self.__name__]
        except (KeyError, AttributeError):
            # No cache dict on the instance yet, or no entry for this name.
            fresh = False
        else:
            fresh = not (self.ttl > 0 and now - last_update > self.ttl)

        if not fresh:
            value = self.fget(inst)
            # Look the cache up only after the getter ran, creating it on
            # first use.
            try:
                cache = inst._cache
            except AttributeError:
                cache = inst._cache = {}
            cache[self.__name__] = (value, now)
        return value
|
Loading…
Reference in New Issue