Rework document into its own file

12 years ago · 2e7fb0aa89
parent ac053979a9
commit 2e7fb0aa89
3 changed files with 96 additions and 101 deletions
--- a/src/breadability/document.py
+++ b/src/breadability/document.py
@@ -0,0 +1,94 @@
+import re
+from lxml.etree import tostring
+from lxml.etree import tounicode
+from lxml.html import document_fromstring
+from lxml.html import HTMLParser
+
+from breadability.utils import cached_property
+
+
+utf8_parser = HTMLParser(encoding='utf-8')
+
+
+def get_encoding(page):
+    text = re.sub('</?[^>]*>\s*', ' ', page)  # swap each tag (plus trailing whitespace) for a space
+    enc = 'utf-8'
+    if not text.strip() or len(text) < 10:
+        return enc  # can't guess: too little text left, fall back to utf-8
+    try:
+        diff = text.decode(enc, 'ignore').encode(enc)  # round-trip drops undecodable bytes
+        sizes = len(diff), len(text)
+        # 99% of utf-8: accept when the round-trip lost less than 1% of the length
+        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
+            return enc
+    except UnicodeDecodeError:
+        pass
+    res = chardet.detect(text)  # NOTE(review): chardet is never imported in this file
+    enc = res['encoding']
+    # print '->', enc, "%.2f" % res['confidence']
+    if enc == 'MacCyrillic':
+        enc = 'cp1251'  # a 'MacCyrillic' guess is replaced with cp1251
+    return enc
+
+
+def build_doc(page):
+    """Requires that the `page` not be None"""
+    if page is None:
+        LOG.error("Page content is None, can't build_doc")  # NOTE(review): LOG is never defined in this file
+        return ''
+    if isinstance(page, unicode):
+        page_unicode = page
+    else:
+        enc = get_encoding(page)  # byte input: guess the encoding before decoding
+        page_unicode = page.decode(enc, 'replace')
+    doc = document_fromstring(
+        page_unicode.encode('utf-8', 'replace'),  # re-encoded to match utf8_parser (encoding='utf-8')
+        parser=utf8_parser)
+    return doc
+
+
+class OriginalDocument(object):
+    """The original document to process"""
+    _base_href = None  # NOTE(review): not read by any method of this class as shown
+
+    def __init__(self, html, url=None):
+        self.orig_html = html  # raw input; parsed on access by the `html` property
+        self.url = url  # optional base url that _parse uses to make links absolute
+
+    def __str__(self):
+        """Render out our document as a string"""
+        return tostring(self.html)  # lxml.etree.tostring of the parsed tree
+
+    def __unicode__(self):
+        """Render out our document as a string"""
+        return tounicode(self.html)  # lxml.etree.tounicode of the parsed tree
+
+    def _parse(self, html):
+        """Generate an lxml document from our html."""
+        doc = build_doc(html)
+        # doc = html_cleaner.clean_html(doc)
+        base_href = self.url
+        if base_href:
+            doc.make_links_absolute(base_href, resolve_base_href=True)  # url given: make links absolute against it
+        else:
+            doc.resolve_base_href()  # no url: only apply the document's own <base href>, if any
+        return doc
+
+    @cached_property(ttl=600)  # NOTE(review): ttl presumably in seconds; confirm in breadability.utils
+    def html(self):
+        """The parsed html document from the input"""
+        return self._parse(self.orig_html)
+
+    @cached_property(ttl=600)
+    def links(self):
+        """Links within the document"""
+        return self.html.findall(".//a")  # every <a> element in the parsed tree
+
+    @cached_property(ttl=600)
+    def title(self):
+        """Pull the title attribute out of the parsed document"""
+        titleElem = self.html.find('.//title')  # first <title> element, or None
+        if titleElem is None or titleElem.text is None:
+            return ''  # missing or empty <title> yields an empty string
+        else:
+            return titleElem.text
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@@ -1,102 +1,4 @@
-import re
-from lxml.etree import tostring
-from lxml.etree import tounicode
-from lxml.html import document_fromstring
-from lxml.html import HTMLParser
-
-from breadability.utils import cached_property
-
-
-utf8_parser = HTMLParser(encoding='utf-8')
-
-
-def get_encoding(page):
-    text = re.sub('</?[^>]*>\s*', ' ', page)
-    enc = 'utf-8'
-    if not text.strip() or len(text) < 10:
-        return enc  # can't guess
-    try:
-        diff = text.decode(enc, 'ignore').encode(enc)
-        sizes = len(diff), len(text)
-        # 99% of utf-8
-        if abs(len(text) - len(diff)) < max(sizes) * 0.01:
-            return enc
-    except UnicodeDecodeError:
-        pass
-    res = chardet.detect(text)
-    enc = res['encoding']
-    # print '->', enc, "%.2f" % res['confidence']
-    if enc == 'MacCyrillic':
-        enc = 'cp1251'
-    return enc
-
-
-def build_doc(page):
-    """Requires that the `page` not be None"""
-    if page is None:
-        LOG.error("Page content is None, can't build_doc")
-        return ''
-    if isinstance(page, unicode):
-        page_unicode = page
-    else:
-        enc = get_encoding(page)
-        page_unicode = page.decode(enc, 'replace')
-    doc = document_fromstring(
-        page_unicode.encode('utf-8', 'replace'),
-        parser=utf8_parser)
-    return doc
-
-
-class OriginalDocument(object):
-    """The original document to process"""
-    _base_href = None
-
-    def __init__(self, html, url=None):
-        self.orig_html = html
-        self.url = url
-
-    def __str__(self):
-        """Render out our document as a string"""
-        return tostring(self.html)
-
-    def __unicode__(self):
-        """Render out our document as a string"""
-        return tounicode(self.html)
-
-    def _parse(self, html):
-        """Generate an lxml document from our html."""
-        doc = build_doc(html)
-        # doc = html_cleaner.clean_html(doc)
-        base_href = self.url
-        if base_href:
-            doc.make_links_absolute(base_href, resolve_base_href=True)
-        else:
-            doc.resolve_base_href()
-        return doc
-
-    @cached_property(ttl=600)
-    def html(self):
-        """The parsed html document from the input"""
-        return self._parse(self.orig_html)
-
-    @cached_property(ttl=600)
-    def links(self):
-        """Links within the document"""
-        return self.html.findall(".//a")
-
-    @cached_property(ttl=600)
-    def title(self):
-        """Pull the title attribute out of the parsed document"""
-        titleElem = self.html.find('.//title')
-        if titleElem is None:
-            return ''
-
-        title = titleElem.text
-        if title is None:
-            return ''
-        return title
-        # return norm_title(title)
-
+from breadability.document import OriginalDocument


 class Article(object):
--- a/src/breadability/tests/test_orig_document.py
+++ b/src/breadability/tests/test_orig_document.py
@@ -2,7 +2,7 @@ from collections import defaultdict
 from os import path
 from unittest import TestCase

-from breadability.readable import OriginalDocument
+from breadability.document import OriginalDocument


 TEST_DIR = path.dirname(__file__)
@@ -37,7 +37,6 @@ class TestOriginalDocuemtn(TestCase):
        # and one link that starts with amazon
        link_counts = defaultdict(int)
        for link in links:
-            print link.get('href')
            if link.get('href').startswith('http://blog.mitechie.com'):
                link_counts['blog'] += 1
            else: