Update cmd line client/interface, update doc builders

- For now we're always getting a div back from the parser
- Update the client code; not all flags are enabled yet, but basic passing of a URL
works
pull/4/merge
Richard Harding 12 years ago
parent 5b3ef916ef
commit 5c1765a6ef

@@ -1,5 +1,5 @@
from setuptools import setup, find_packages
import sys, os
import os
here = os.path.abspath(os.path.dirname(__file__))
README = open(os.path.join(here, 'README.rst')).read()
@@ -7,7 +7,6 @@ NEWS = open(os.path.join(here, 'NEWS.txt')).read()
version = '0.1.0'
install_requires = [
# List your project dependencies here.
# For more details, see:
@@ -15,7 +14,6 @@ install_requires = [
'chardet',
'lxml',
]
tests_require = [
'coverage',
'nose',
@@ -47,6 +45,7 @@ setup(name='breadability',
},
entry_points={
'console_scripts':
['breadability=breadability:main']
['breadability=breadability:client.main']
}
)

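For reference, the "name = module:attrs" form of a console_scripts entry point makes setuptools import the breadability package and call the dotted attribute path client.main. The installed wrapper script behaves roughly like the sketch below (a simplification, not the literal generated file):

# Rough sketch of what the installed 'breadability' console script does;
# the real generated script also pins the exact distribution version.
import sys
from pkg_resources import load_entry_point

if __name__ == '__main__':
    sys.exit(load_entry_point('breadability', 'console_scripts', 'breadability')())
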
@@ -1,4 +1,3 @@
# Example package with a console entry point
VERSION = '0.1.0'
def main():
print "Hello World"
import client

@@ -0,0 +1,67 @@
import argparse
import sys

from breadability import VERSION
from breadability.readable import Article


def parse_args():
    desc = "A fast python port of arc90's readability tool"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--version',
        action='version', version=VERSION)
    parser.add_argument('-v', '--verbose',
        action='store_true',
        default=False,
        help='Increase logging verbosity to DEBUG.')
    parser.add_argument('-m', '--metadata',
        action='store_true',
        default=False,
        help='print all metadata as well as the content')
    parser.add_argument('path', metavar='P', type=str, nargs=1,
        help="The url or file path to process in readable form.")
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    target = args.path[0]

    if target.startswith('http') or target.startswith('www'):
        is_url = True
        url = target
    else:
        is_url = False
        url = None

    if is_url:
        import urllib
        target = urllib.urlopen(target)
    else:
        target = open(target, 'rt')

    enc = sys.__stdout__.encoding or 'utf-8'

    try:
        doc = Article(target.read(), url=url)
        # if args.metadata:
        # m = doc.summary_with_metadata()
        # print m.title()
        # print m.short_title()
        # print m.confidence
        # print m.html.encode(enc, 'replace')
        # else:
        # print doc.summary().encode(enc, 'replace')
        print unicode(doc)
    finally:
        target.close()


if __name__ == '__main__':
    main()

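The new client can be exercised end to end once installed, or driven directly from Python as in the minimal sketch below; the URL is made up for illustration.

# Usage sketch only; the URL below is hypothetical.
import sys
from breadability import client

sys.argv = ['breadability', 'http://example.com/article.html']
client.main()   # fetches the page, builds an Article, and prints the readable div
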
@@ -1,8 +1,9 @@
import re
from collections import namedtuple
from operator import attrgetter
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from breadability.document import OriginalDocument
from breadability.utils import cached_property
@@ -50,7 +51,12 @@ def build_base_document(html):
    :param html: Parsed Element object

    """
    found_body = html.find('.//body')
    if html.tag == 'body':
        html.tag = 'div'
        found_body = html
    else:
        found_body = html.find('.//body')
        found_body.tag = 'div'

    if found_body is None:
        fragment = fragment_fromstring('<div/>')
@@ -59,7 +65,8 @@ def build_base_document(html):
        return fragment
    else:
        found_body.set('id', 'readabilityBody')
    return html
    return found_body

def transform_misused_divs_into_paragraphs(doc):
@@ -81,10 +88,10 @@ def transform_misused_divs_into_paragraphs(doc):
            # We need to create a <p> and put all its contents in there
            # We'll just stringify it, then regex replace the first/last
            # div bits to turn them into <p> vs <div>.
            orig = tounicode(elem)
            orig = tounicode(elem).strip()
            started = re.sub(r'^<\s*div', '<p', orig)
            ended = re.sub(r'div>$', 'p>', started)
            elem.getparent().replace(elem, fragment_fromstring(ended))
            elem.getparent().replace(elem, fromstring(ended))

    return doc
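
As a standalone illustration of the stringify-and-regex rewrite used above, a small sketch assuming only lxml (the element markup is made up):

# Standalone sketch of the div -> p rewrite; the element is made up.
import re
from lxml.etree import tounicode
from lxml.html import fromstring

elem = fromstring('<div>just inline text, no block children</div>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
print ended   # '<p>just inline text, no block children</p>'
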
@@ -251,6 +258,12 @@ class Article(object):
    def __init__(self, html, url=None):
        self.orig = OriginalDocument(html, url=url)

    def __str__(self):
        return tostring(self.readable)

    def __unicode__(self):
        return tounicode(self.readable)

    @cached_property(ttl=600)
    def readable(self):
        """The readable parsed article"""

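Taken together, the div-based base document and the new __str__/__unicode__ methods mean callers can inspect and serialize the readable result directly, as the tests below exercise; a minimal sketch with made-up markup:

# Sketch only; the markup string is made up.
from breadability.readable import Article

doc = Article('<html><body><p>Some content worth keeping.</p></body></html>')
print doc.readable.tag          # 'div'
print doc.readable.get('id')    # 'readabilityBody'
print unicode(doc)              # the readable fragment, serialized
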
@@ -20,7 +20,7 @@ class TestReadableDocument(TestCase):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
        # We get back the document as a div tag currently by default.
        self.assertEqual(doc.readable.tag, 'html')
        self.assertEqual(doc.readable.tag, 'div')

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
@@ -36,6 +36,7 @@ class TestReadableDocument(TestCase):
        No sense processing anything other than the body content.

        """
        print "MIN DOCUMENT"
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable.tag, 'div')
        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
@@ -47,9 +48,8 @@ class TestReadableDocument(TestCase):
        """
        doc = Article(load_snippet('document_no_body.html'))
        self.assertEqual(doc.readable.tag, 'html')
        found_body = doc.readable.find('.//body')
        self.assertEqual(found_body.get('id'), 'readabilityBody')
        self.assertEqual(doc.readable.tag, 'div')
        self.assertEqual(doc.readable.get('id'), 'readabilityBody')

    def test_bare_content(self):
        """If the document is just pure content, no html tags we should be ok
