Add -f and -b flags to client

- Added a -f flag that overrides the default of getting only a <div> fragment back and returns a fully constructed document instead.
- Added a -b flag to not just parse, but also write the result to a temp file and open it in a browser; great for testing.
- Updated the Article to support fragment=False so that you can get back a fully wrapped <html> document with a header (especially with the utf-8 content type set, yay).
12 years ago · 6b92dd2f83
parent 8b77675ab2
commit 6b92dd2f83
4 changed files with 95 additions and 41 deletions
--- a/src/breadability/client.py
+++ b/src/breadability/client.py
@ -1,5 +1,11 @@
 import argparse
+import codecs
+import os
 import sys
+import urllib
+import webbrowser
+
+from tempfile import mkstemp

 from breadability import VERSION
 from breadability.logconfig import LOG
@ -20,10 +26,20 @@ def parse_args():
        default=False,
        help='Increase logging verbosity to DEBUG.')

-    parser.add_argument('-m', '--metadata',
+    parser.add_argument('-f', '--fragment',
+        action='store_false',
+        default=True,
+        help='Output html fragment by default.')
+
+#     parser.add_argument('-m', '--metadata',
+#         action='store_true',
+#         default=False,
+#         help='print all metadata as well as content for the content')
+
+    parser.add_argument('-b', '--browser',
        action='store_true',
        default=False,
-        help='print all metadata as well as content for the content')
+        help='open the parsed content in your web browser')

    parser.add_argument('path', metavar='P', type=str, nargs=1,
        help="The url or file path to process in readable form.")
@ -49,26 +65,22 @@ def main():
        url = None

    if is_url:
-        import urllib
-        target = urllib.urlopen(target)
+        req = urllib.urlopen(target)
+        ucontent = req.read().encode('utf-8')
    else:
-        target = open(target, 'rt')
+        ucontent = codecs.open(target, "r", "utf-8").read()

    enc = sys.__stdout__.encoding or 'utf-8'

-    try:
-        doc = Article(target.read(), url=url)
-        # if args.metadata:
-        #     m = doc.summary_with_metadata()
-        #     print m.title()
-        #     print m.short_title()
-        #     print m.confidence
-        #     print m.html.encode(enc, 'replace')
-        # else:
-        #     print doc.summary().encode(enc, 'replace')
-        print doc
-    finally:
-        target.close()
+    doc = Article(ucontent, url=url, fragment=args.fragment)
+    if args.browser:
+        fg, pathname = mkstemp(suffix='.html')
+        out = codecs.open(pathname, 'w', 'utf-8')
+        out.write(doc.readable)
+        out.close()
+        webbrowser.open(pathname)
+    else:
+        sys.stdout(doc.readable.encode(enc, 'replace'))


 if __name__ == '__main__':
--- a/src/breadability/document.py
+++ b/src/breadability/document.py
@ -1,7 +1,6 @@
 """Generate a clean nice starting html document to process for an article."""

 import chardet
-import logging
 import re
 from lxml.etree import tostring
 from lxml.etree import tounicode
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -3,6 +3,7 @@ from operator import attrgetter
 from lxml.etree import tounicode
 from lxml.etree import tostring
 from lxml.html.clean import Cleaner
+from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 from lxml.html import fromstring
 from pprint import PrettyPrinter
@ -24,6 +25,17 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                  remove_unknown_tags=False, safe_attrs_only=False)


+BASE_DOC = """
+<html>
+    <head>
+        <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+    </head>
+    <body>
+    </body>
+</html>
+"""
+
+
 def drop_tag(doc, *tags):
    """Helper to just remove any nodes that match this html tag passed in

@ -38,20 +50,23 @@ def drop_tag(doc, *tags):
    return doc


+
 def ok_embedded_video(node):
    """Check if this embed/video is an ok one to count."""
    keep_keywords = ['youtube', 'blip.tv', 'vimeo']
    node_str = tounicode(n)
    for key in keep_keywords:
-        if not allow and key in node_str:
+        if key in node_str:
            return True
    return False


-def build_base_document(html):
+def build_base_document(html, fragment=True):
    """Return a base document with the body as root.

    :param html: Parsed Element object
+    :param fragment: Should we return a <div> doc fragment or a full <html>
+    doc.

    """
    if html.tag == 'body':
@ -61,15 +76,29 @@ def build_base_document(html):
        found_body = html.find('.//body')

    if found_body is None:
-        fragment = fragment_fromstring('<div/>')
-        fragment.set('id', 'readabilityBody')
-        fragment.append(html)
-        return fragment
+        frag = fragment_fromstring('<div/>')
+        frag.set('id', 'readabilityBody')
+        frag.append(html)
+
+        if not fragment:
+            output = fromstring(BASE_DOC)
+            insert_point = output.find('.//body')
+            insert_point.append(frag)
+        else:
+            output = frag
    else:
+
        found_body.tag = 'div'
        found_body.set('id', 'readabilityBody')

-    return found_body
+        if not fragment:
+            output = fromstring(BASE_DOC)
+            insert_point = output.find('.//body')
+            insert_point.append(found_body)
+        else:
+            output = found_body
+
+    return output


 def transform_misused_divs_into_paragraphs(doc):
@ -136,7 +165,7 @@ def check_siblings(candidate_node, candidate_list):
            content_length = len(content)

            if content_length > 80 and link_density < 0.25:
-                append = true
+                append = True
            elif content_length < 80 and link_density == 0:
                if ". " in content:
                    append = True
@ -328,18 +357,31 @@ def find_candidates(doc):
 class Article(object):
    """Parsed readable object"""

-    def __init__(self, html, url=None):
+    def __init__(self, html, url=None, fragment=True):
+        """Create the Article we're going to use.
+
+        :param html: The string of html we're going to parse.
+        :param url: The url so we can adjust the links to still work.
+        :param fragment: Should we return a <div> fragment or a full <html>
+        doc.
+
+        """
        LOG.debug('Url: ' + str(url))
        self.orig = OriginalDocument(html, url=url)
+        self.fragment = fragment

    def __str__(self):
-        return tostring(self.readable)
+        return tostring(self._readable)

    def __unicode__(self):
-        return tounicode(self.readable)
+        return tounicode(self._readable)

    @cached_property(ttl=600)
    def readable(self):
+        return tounicode(self._readable)
+
+    @cached_property(ttl=600)
+    def _readable(self):
        """The readable parsed article"""
        doc = self.orig.html
        # cleaning doesn't return, just wipes in place
@ -364,7 +406,7 @@ class Article(object):
            updated_winner = check_siblings(winner, candidates)
            LOG.debug('Begin final prep of article')
            updated_winner.node = prep_article(updated_winner.node)
-            doc = build_base_document(updated_winner.node)
+            doc = build_base_document(updated_winner.node, self.fragment)
        else:
            LOG.warning('No candidates found: using document.')
            LOG.debug('Begin final prep of article')
@ -372,6 +414,6 @@ class Article(object):
            # cleanup by removing the should_drop we spotted.
            [n.drop_tree() for n in should_drop]
            doc = prep_article(doc)
-            doc = build_base_document(doc)
+            doc = build_base_document(doc, self.fragment)

        return doc
--- a/src/breadability/tests/test_readable.py
+++ b/src/breadability/tests/test_readable.py
@ -20,12 +20,12 @@ class TestReadableDocument(TestCase):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
        # We get back the document as a div tag currently by default.
-        self.assertEqual(doc.readable.tag, 'div')
+        self.assertEqual(doc._readable.tag, 'div')

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
        doc = Article(load_snippet('document_scripts.html'))
-        readable = doc.readable
+        readable = doc._readable
        self.assertEqual(readable.findall(".//script"), [])
        self.assertEqual(readable.findall(".//style"), [])
        self.assertEqual(readable.findall(".//link"), [])
@ -37,8 +37,8 @@ class TestReadableDocument(TestCase):

        """
        doc = Article(load_snippet('document_min.html'))
-        self.assertEqual(doc.readable.tag, 'div')
-        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
+        self.assertEqual(doc._readable.tag, 'div')
+        self.assertEqual(doc._readable.get('id'), 'readabilityBody')

    def test_body_doesnt_exist(self):
        """If we can't find a body, then we create one.
@ -47,8 +47,8 @@ class TestReadableDocument(TestCase):

        """
        doc = Article(load_snippet('document_no_body.html'))
-        self.assertEqual(doc.readable.tag, 'div')
-        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
+        self.assertEqual(doc._readable.tag, 'div')
+        self.assertEqual(doc._readable.get('id'), 'readabilityBody')

    def test_bare_content(self):
        """If the document is just pure content, no html tags we should be ok
@ -57,8 +57,9 @@ class TestReadableDocument(TestCase):

        """
        doc = Article(load_snippet('document_only_content.html'))
-        self.assertEqual(doc.readable.tag, 'div')
-        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
+        
+        self.assertEqual(doc._readable.tag, 'div')
+        self.assertEqual(doc._readable.get('id'), 'readabilityBody')


 class TestCleaning(TestCase):
@ -67,7 +68,7 @@ class TestCleaning(TestCase):
    def test_unlikely_hits(self):
        """Verify we wipe out things from our unlikely list."""
        doc = Article(load_snippet('test_readable_unlikely.html'))
-        readable = doc.readable
+        readable = doc._readable
        must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
                'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
                'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',