Update cmd line client/interface, update doc builders

- For now we're always getting a div back from the parser
- Update the client code; not all flags are enabled yet, but basic passing of a URL
works
pull/4/merge
Richard Harding 12 years ago
parent 5b3ef916ef
commit 5c1765a6ef

@@ -1,5 +1,5 @@
from setuptools import setup, find_packages
import sys, os
import os
here = os.path.abspath(os.path.dirname(__file__))
README = open(os.path.join(here, 'README.rst')).read()
@@ -7,7 +7,6 @@ NEWS = open(os.path.join(here, 'NEWS.txt')).read()
version = '0.1.0'
install_requires = [
# List your project dependencies here.
# For more details, see:
@@ -15,7 +14,6 @@ install_requires = [
'chardet',
'lxml',
]
tests_require = [
'coverage',
'nose',
@@ -47,6 +45,7 @@ setup(name='breadability',
},
entry_points={
'console_scripts':
['breadability=breadability:main']
['breadability=breadability:client.main']
}
)

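For reference, the "name = module:attrs" form of a console_scripts entry point makes setuptools import the breadability package and call the dotted attribute path client.main. The installed wrapper script behaves roughly like the sketch below (a simplification, not the literal generated file):

# Rough sketch of what the installed 'breadability' console script does;
# the real generated script also pins the exact distribution version.
import sys
from pkg_resources import load_entry_point

if __name__ == '__main__':
    sys.exit(load_entry_point('breadability', 'console_scripts', 'breadability')())
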
@@ -1,4 +1,3 @@
# Example package with a console entry point
VERSION = '0.1.0'
def main():
print "Hello World"
import client

@@ -0,0 +1,67 @@
import argparse
import sys

from breadability import VERSION
from breadability.readable import Article


def parse_args():
    desc = "A fast python port of arc90's readability tool"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('--version',
        action='version', version=VERSION)
    parser.add_argument('-v', '--verbose',
        action='store_true',
        default=False,
        help='Increase logging verbosity to DEBUG.')
    parser.add_argument('-m', '--metadata',
        action='store_true',
        default=False,
        help='print all metadata as well as the content')
    parser.add_argument('path', metavar='P', type=str, nargs=1,
        help="The url or file path to process in readable form.")
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    target = args.path[0]

    if target.startswith('http') or target.startswith('www'):
        is_url = True
        url = target
    else:
        is_url = False
        url = None

    if is_url:
        import urllib
        target = urllib.urlopen(target)
    else:
        target = open(target, 'rt')

    enc = sys.__stdout__.encoding or 'utf-8'

    try:
        doc = Article(target.read(), url=url)
        # if args.metadata:
        # m = doc.summary_with_metadata()
        # print m.title()
        # print m.short_title()
        # print m.confidence
        # print m.html.encode(enc, 'replace')
        # else:
        # print doc.summary().encode(enc, 'replace')
        print unicode(doc)
    finally:
        target.close()


if __name__ == '__main__':
    main()

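The new client can be exercised end to end once installed, or driven directly from Python as in the minimal sketch below; the URL is made up for illustration.

# Usage sketch only; the URL below is hypothetical.
import sys
from breadability import client

sys.argv = ['breadability', 'http://example.com/article.html']
client.main()   # fetches the page, builds an Article, and prints the readable div
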
@@ -1,8 +1,9 @@
import re
from collections import namedtuple
from operator import attrgetter
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from breadability.document import OriginalDocument
from breadability.utils import cached_property
@@ -50,7 +51,12 @@ def build_base_document(html):
    :param html: Parsed Element object

    """
    found_body = html.find('.//body')
    if html.tag == 'body':
        html.tag = 'div'
        found_body = html
    else:
        found_body = html.find('.//body')
        found_body.tag = 'div'

    if found_body is None:
        fragment = fragment_fromstring('<div/>')
@@ -59,7 +65,8 @@ def build_base_document(html):
        return fragment
    else:
        found_body.set('id', 'readabilityBody')
    return html
    return found_body

def transform_misused_divs_into_paragraphs(doc):
@@ -81,10 +88,10 @@ def transform_misused_divs_into_paragraphs(doc):
            # We need to create a <p> and put all its contents in there
            # We'll just stringify it, then regex replace the first/last
            # div bits to turn them into <p> vs <div>.
            orig = tounicode(elem)
            orig = tounicode(elem).strip()
            started = re.sub(r'^<\s*div', '<p', orig)
            ended = re.sub(r'div>$', 'p>', started)
            elem.getparent().replace(elem, fragment_fromstring(ended))
            elem.getparent().replace(elem, fromstring(ended))

    return doc
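
As a standalone illustration of the stringify-and-regex rewrite used above, a small sketch assuming only lxml (the element markup is made up):

# Standalone sketch of the div -> p rewrite; the element is made up.
import re
from lxml.etree import tounicode
from lxml.html import fromstring

elem = fromstring('<div>just inline text, no block children</div>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
print ended   # '<p>just inline text, no block children</p>'
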
@@ -251,6 +258,12 @@ class Article(object):
    def __init__(self, html, url=None):
        self.orig = OriginalDocument(html, url=url)

    def __str__(self):
        return tostring(self.readable)

    def __unicode__(self):
        return tounicode(self.readable)

    @cached_property(ttl=600)
    def readable(self):
        """The readable parsed article"""

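Taken together, the div-based base document and the new __str__/__unicode__ methods mean callers can inspect and serialize the readable result directly, as the tests below exercise; a minimal sketch with made-up markup:

# Sketch only; the markup string is made up.
from breadability.readable import Article

doc = Article('<html><body><p>Some content worth keeping.</p></body></html>')
print doc.readable.tag          # 'div'
print doc.readable.get('id')    # 'readabilityBody'
print unicode(doc)              # the readable fragment, serialized
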
@@ -20,7 +20,7 @@ class TestReadableDocument(TestCase):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
        # We get back the document as a div tag currently by default.
        self.assertEqual(doc.readable.tag, 'html')
        self.assertEqual(doc.readable.tag, 'div')

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
@@ -36,6 +36,7 @@ class TestReadableDocument(TestCase):
        No sense processing anything other than the body content.

        """
        print "MIN DOCUMENT"
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable.tag, 'div')
        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
@@ -47,9 +48,8 @@ class TestReadableDocument(TestCase):
        """
        doc = Article(load_snippet('document_no_body.html'))
        self.assertEqual(doc.readable.tag, 'html')
        found_body = doc.readable.find('.//body')
        self.assertEqual(found_body.get('id'), 'readabilityBody')
        self.assertEqual(doc.readable.tag, 'div')
        self.assertEqual(doc.readable.get('id'), 'readabilityBody')

    def test_bare_content(self):
        """If the document is just pure content, no html tags we should be ok
