Start to add some basic tests and layout to use for breaking down documents.

12 years ago · 590a94345f
parent 5e95f531bc
commit 590a94345f
4 changed files with 206 additions and 0 deletions
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -0,0 +1,99 @@
+import re
+from lxml.etree import tounicode
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+
+from breadability.utils import cached_property
+
+
+utf8_parser = HTMLParser(encoding='utf-8')
+
+
+def get_encoding(page):
+    """Guess the character encoding of an undecoded html page.
+
+    The markup is stripped first so only the text content is inspected. When
+    there is too little text to judge, or when (nearly) all of the text
+    survives a utf-8 decode/encode round trip, 'utf-8' is returned. Otherwise
+    `chardet` is asked to detect the encoding.
+
+    :param page: the html content to inspect; it is decoded here, so it is
+        expected to still be an encoded string
+    :returns: the name of the encoding to decode `page` with
+
+    """
+    # NOTE(review): `chardet` is used below but this module does not import
+    # it as shown -- the import has to be added for the fallback to run.
+    text = re.sub(r'</?[^>]*>\s*', ' ', page)
+    enc = 'utf-8'
+    if not text.strip() or len(text) < 10:
+        return enc  # can't guess
+    try:
+        diff = text.decode(enc, 'ignore').encode(enc)
+        sizes = len(diff), len(text)
+        # Fewer than 1% of the bytes were dropped as invalid utf-8, so call
+        # it utf-8.
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
+            return enc
+    except UnicodeDecodeError:
+        pass
+    res = chardet.detect(text)
+    # The detected encoding can be None when chardet cannot decide; fall back
+    # to utf-8 so the caller always gets a usable codec name.
+    detected = res['encoding'] or enc
+    if detected == 'MacCyrillic':
+        # A MacCyrillic guess is treated as cp1251.
+        detected = 'cp1251'
+    return detected
+
+
+def build_doc(page):
+    """Parse `page` into an lxml html document.
+
+    Requires that the `page` not be None. Unicode input is used as is; any
+    other input is decoded with the encoding guessed by `get_encoding`
+    (undecodable bytes are replaced). The text is then handed to lxml as
+    utf-8 so it matches the module level `utf8_parser`.
+
+    :param page: the html content, unicode or an encoded string
+    :returns: the parsed lxml document, or '' if `page` is None
+
+    """
+    if page is None:
+        # No module level logger is defined here, so fetch one on demand.
+        import logging
+        logging.getLogger(__name__).error(
+            "Page content is None, can't build_doc")
+        return ''
+    if isinstance(page, unicode):
+        page_unicode = page
+    else:
+        page_unicode = page.decode(get_encoding(page), 'replace')
+    return document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),
+        parser=utf8_parser)
+
+
+class OriginalDocument(unicode):
+    """The original document to process
+
+    The instance itself is the unicode text of the html it was created with.
+    The parsed lxml document is available through the `html` property and is
+    built on first access.
+
+    """
+    _base_href = None
+
+    def __new__(cls, html, url=None):
+        """Create the unicode value from `html` alone.
+
+        Without this, `url` would be passed on to `unicode()`, which would
+        take a positional value as the encoding and reject the keyword.
+
+        """
+        return super(OriginalDocument, cls).__new__(cls, html)
+
+    def __init__(self, html, url=None):
+        """Remember the source html and the url it was fetched from.
+
+        :param html: the html content of the document
+        :param url: optional url used to make the document links absolute
+
+        """
+        self.orig_html = html
+        self.url = url
+
+    def __unicode__(self):
+        """Render out our document as a string"""
+        return tounicode(self.html)
+
+    def _parse(self, html):
+        """Generate an lxml document from our html."""
+        doc = build_doc(html)
+        # doc = html_cleaner.clean_html(doc)
+        base_href = self.url
+        if base_href:
+            # Resolve every link against the url we were given.
+            doc.make_links_absolute(base_href, resolve_base_href=True)
+        else:
+            doc.resolve_base_href()
+        return doc
+
+    @cached_property(ttl=600)
+    def html(self):
+        """The parsed html document from the input"""
+        return self._parse(self.orig_html)
+
+    @cached_property(ttl=600)
+    def title(self):
+        """Pull the title attribute out of the parsed document
+
+        Returns '' when there is no <title> element or it has no text.
+
+        """
+        title_elem = self.html.find('.//title')
+        if title_elem is None or title_elem.text is None:
+            return ''
+        return title_elem.text
+        # return norm_title(title)
+
+
+
+class Article(object):
+    """Parsed readable object"""
+
+    def __init__(self):
+        """Nothing is set up yet; instances carry no state."""
+        return None
+
+
--- a/src/breadability/tests/test_orig_document.py
+++ b/src/breadability/tests/test_orig_document.py
@ -0,0 +1,37 @@
+from os import path
+from unittest import TestCase
+
+from breadability.readable import OriginalDocument
+
+
+TEST_DIR = path.dirname(__file__)
+
+
+def load_snippet(filename):
+    """Helper to fetch in the content of a test snippet
+
+    :param filename: name of a file in the `test_snippets` directory that
+        sits next to this test module
+    :returns: the full content of that file
+
+    """
+    snippet_path = path.join(TEST_DIR, 'test_snippets', filename)
+    # Use a context manager so the file handle is closed once read.
+    with open(snippet_path) as snippet:
+        return snippet.read()
+
+
+class TestOriginalDocuemtn(TestCase):
+    """Verify we can process html into a document to work off of."""
+
+    def test_readin_min_document(self):
+        """Verify we can read in a min html document"""
+        doc = OriginalDocument(load_snippet('document_min.html'))
+        self.assertTrue(doc.startswith(u'<html>'))
+        self.assertEqual(doc.title, 'Min Document Title')
+
+    def test_readin_with_base_url(self):
+        """Passing a url should update links to be absolute links"""
+        doc = OriginalDocument(load_snippet('document_absolute_url.html'),
+            url="http://blog.mitechie.com/test.html")
+        self.assertTrue(doc.startswith(u'<html>'))
+
+        # find the links on the page and make sure each one starts with our
+        # base url we told it to use.
+        # NOTE(review): assumes OriginalDocument exposes a `links` attribute
+        # holding the link urls as strings -- confirm against readable.py.
+        links = doc.links
+        self.assertEqual(len(links), 3)
+        for link in links:
+            # This is a truth check; assertEqual needs two values to compare.
+            self.assertTrue(link.startswith('http://blog.mitechie.com/'))
+
+
--- a/src/breadability/tests/test_snippets/document_min.html
+++ b/src/breadability/tests/test_snippets/document_min.html
@ -0,0 +1,8 @@
+<html>
+    <head>
+        <title>Min Document Title</title>
+    </head>
+    <body>
+        <h1>Min Document</h1>
+    </body>
+</html>
--- a/src/breadability/utils.py
+++ b/src/breadability/utils.py
@ -0,0 +1,62 @@
+import time
+
+
+
+#
+# (c) 2011 Christopher Arndt, MIT License
+#
+class cached_property(object):
+    '''Decorator for read-only properties evaluated only once within TTL period.
+
+    It can be used to create a cached property like this::
+
+        import random
+
+        # the class containing the property must be a new-style class
+        class MyClass(object):
+            # create property whose value is cached for ten minutes
+            @cached_property(ttl=600)
+            def randint(self):
+                # will only be evaluated every 10 min. at maximum.
+                return random.randint(0, 100)
+
+    The value is cached in the '_cache' attribute of the object instance that
+    has the property getter method wrapped by this decorator. The '_cache'
+    attribute value is a dictionary which has a key for every property of the
+    object which is wrapped by this decorator. Each entry in the cache is
+    created only when the property is accessed for the first time and is a
+    two-element tuple with the last computed property value and the last time
+    it was updated in seconds since the epoch.
+
+    The default time-to-live (TTL) is 300 seconds (5 minutes). Set the TTL to
+    zero for the cached value to never expire.
+
+    To expire a cached property value manually just do::
+
+        del instance._cache[<property name>]
+
+    Accessing the property through the class instead of an instance returns
+    the descriptor object itself.
+
+    '''
+    def __init__(self, ttl=300):
+        """Store the time-to-live in seconds; zero or less never expires."""
+        self.ttl = ttl
+
+    def __call__(self, fget, doc=None):
+        """Wrap the getter `fget` and take over its name, module and doc."""
+        self.fget = fget
+        self.__doc__ = doc or fget.__doc__
+        self.__name__ = fget.__name__
+        self.__module__ = fget.__module__
+        return self
+
+    def __get__(self, inst, owner):
+        """Return the cached value for `inst`, computing it when needed.
+
+        The getter is called when nothing is cached yet for this property or
+        when the cached entry is older than the TTL.
+
+        """
+        if inst is None:
+            # Looked up on the class: there is no instance to cache on.
+            return self
+
+        name = self.__name__
+        now = time.time()
+        try:
+            value, last_update = inst._cache[name]
+        except (KeyError, AttributeError):
+            # No cache on the instance yet, or no entry for this property.
+            pass
+        else:
+            expired = self.ttl > 0 and now - last_update > self.ttl
+            if not expired:
+                return value
+
+        value = self.fget(inst)
+        try:
+            cache = inst._cache
+        except AttributeError:
+            cache = inst._cache = {}
+        cache[name] = (value, now)
+        return value