Add -f and -b flags to client

- added a -f flag that overrides the default of returning only a <div>
  fragment and returns a fully constructed document instead
- added a -b flag that not only parses, but writes the result to a temp file
  and opens it in a browser; great for testing
- updated Article to support fragment=False so that you can get back a fully
  wrapped <html> document with a <head> that sets a utf-8 content type
pull/4/merge
Richard Harding 12 years ago
parent 8b77675ab2
commit 6b92dd2f83
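
(For orientation, here is roughly what the two flags boil down to in the
Python API, as a minimal sketch against the changes below. It assumes Article
is importable from breadability.readable and uses a hypothetical local file
page.html; Python 2, matching the codebase.)

    # Sketch only: what -f and -b map to in the API changed by this commit.
    import codecs
    import os
    import webbrowser
    from tempfile import mkstemp

    from breadability.readable import Article  # assumed import path

    html = codecs.open('page.html', 'r', 'utf-8').read()

    frag = Article(html)                  # default: <div id="readabilityBody">
    full = Article(html, fragment=False)  # what -f enables: full <html> doc

    # What -b does: dump the readable output to a temp file, open a browser.
    fd, pathname = mkstemp(suffix='.html')
    out = codecs.open(pathname, 'w', 'utf-8')
    out.write(full.readable)
    out.close()
    os.close(fd)
    webbrowser.open(pathname)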

@@ -1,5 +1,11 @@
 import argparse
+import codecs
+import os
 import sys
+import urllib
+import webbrowser
+from tempfile import mkstemp

 from breadability import VERSION
 from breadability.logconfig import LOG
@@ -20,10 +26,20 @@ def parse_args():
         default=False,
         help='Increase logging verbosity to DEBUG.')
-    parser.add_argument('-m', '--metadata',
+    parser.add_argument('-f', '--fragment',
+        action='store_false',
+        default=True,
+        help='Output html fragment by default.')
+    # parser.add_argument('-m', '--metadata',
+    #     action='store_true',
+    #     default=False,
+    #     help='print all metadata as well as content for the content')
+    parser.add_argument('-b', '--browser',
         action='store_true',
         default=False,
-        help='print all metadata as well as content for the content')
+        help='open the parsed content in your web browser')
     parser.add_argument('path', metavar='P', type=str, nargs=1,
         help="The url or file path to process in readable form.")
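
(Note on the hunk above: -f is wired with action='store_false', so
args.fragment defaults to True and passing -f flips it to False, which is the
value handed to Article(..., fragment=args.fragment) in the next hunk. A
standalone sketch of that behavior:)

    # Sketch only: how the store_false wiring for -f behaves.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fragment',
        action='store_false',
        default=True,
        help='Output html fragment by default.')

    print parser.parse_args([]).fragment      # True  -> <div> fragment
    print parser.parse_args(['-f']).fragment  # False -> full <html> document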
@@ -49,26 +65,22 @@ def main():
     url = None
     if is_url:
-        import urllib
-        target = urllib.urlopen(target)
+        req = urllib.urlopen(target)
+        ucontent = req.read().encode('utf-8')
     else:
-        target = open(target, 'rt')
+        ucontent = codecs.open(target, "r", "utf-8").read()

     enc = sys.__stdout__.encoding or 'utf-8'
-    try:
-        doc = Article(target.read(), url=url)
-        # if args.metadata:
-        #     m = doc.summary_with_metadata()
-        #     print m.title()
-        #     print m.short_title()
-        #     print m.confidence
-        #     print m.html.encode(enc, 'replace')
-        # else:
-        #     print doc.summary().encode(enc, 'replace')
-        print doc
-    finally:
-        target.close()
+    doc = Article(ucontent, url=url, fragment=args.fragment)
+
+    if args.browser:
+        fg, pathname = mkstemp(suffix='.html')
+        out = codecs.open(pathname, 'w', 'utf-8')
+        out.write(doc.readable)
+        out.close()
+        webbrowser.open(pathname)
+    else:
+        sys.stdout.write(doc.readable.encode(enc, 'replace'))


 if __name__ == '__main__':

@@ -1,7 +1,6 @@
 """Generate a clean nice starting html document to process for an article."""
 import chardet
 import logging
 import re
-from lxml.etree import tostring
 from lxml.etree import tounicode

@@ -3,6 +3,7 @@ from operator import attrgetter
 from lxml.etree import tounicode
 from lxml.etree import tostring
 from lxml.html.clean import Cleaner
+from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 from lxml.html import fromstring
 from pprint import PrettyPrinter
@@ -24,6 +25,17 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        remove_unknown_tags=False, safe_attrs_only=False)

+BASE_DOC = """
+<html>
+    <head>
+        <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
+    </head>
+    <body>
+    </body>
+</html>
+"""
+
+
 def drop_tag(doc, *tags):
     """Helper to just remove any nodes that match this html tag passed in
@@ -38,20 +50,23 @@ def drop_tag(doc, *tags):
     return doc


 def ok_embedded_video(node):
     """Check if this embed/video is an ok one to count."""
     keep_keywords = ['youtube', 'blip.tv', 'vimeo']
     node_str = tounicode(node)
     for key in keep_keywords:
-        if not allow and key in node_str:
+        if key in node_str:
             return True
     return False


-def build_base_document(html):
+def build_base_document(html, fragment=True):
     """Return a base document with the body as root.

     :param html: Parsed Element object
+    :param fragment: Should we return a <div> doc fragment or a full <html>
+    doc.

     """
     if html.tag == 'body':
@@ -61,15 +76,29 @@ def build_base_document(html):
     found_body = html.find('.//body')
     if found_body is None:
-        fragment = fragment_fromstring('<div/>')
-        fragment.set('id', 'readabilityBody')
-        fragment.append(html)
-        return fragment
+        frag = fragment_fromstring('<div/>')
+        frag.set('id', 'readabilityBody')
+        frag.append(html)
+
+        if not fragment:
+            output = fromstring(BASE_DOC)
+            insert_point = output.find('.//body')
+            insert_point.append(frag)
+        else:
+            output = frag
     else:
         found_body.tag = 'div'
         found_body.set('id', 'readabilityBody')
-        return found_body

+        if not fragment:
+            output = fromstring(BASE_DOC)
+            insert_point = output.find('.//body')
+            insert_point.append(found_body)
+        else:
+            output = found_body
+
+    return output


 def transform_misused_divs_into_paragraphs(doc):
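
(The net effect of the build_base_document changes above, as a quick sketch;
assumes the function is importable from breadability.readable:)

    # Sketch only: the two output shapes of build_base_document.
    from lxml.html import fragment_fromstring

    from breadability.readable import build_base_document  # assumed path

    div = build_base_document(fragment_fromstring('<p>article text</p>'))
    print div.tag, div.get('id')  # div readabilityBody (default fragment=True)

    full = build_base_document(
        fragment_fromstring('<p>article text</p>'), fragment=False)
    print full.tag                # html: the BASE_DOC skeleton, body filled in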
@@ -136,7 +165,7 @@ def check_siblings(candidate_node, candidate_list):
             content_length = len(content)

             if content_length > 80 and link_density < 0.25:
-                append = true
+                append = True
             elif content_length < 80 and link_density == 0:
                 if ". " in content:
                     append = True
@@ -328,18 +357,31 @@ def find_candidates(doc):
 class Article(object):
     """Parsed readable object"""

-    def __init__(self, html, url=None):
+    def __init__(self, html, url=None, fragment=True):
         """Create the Article we're going to use.

         :param html: The string of html we're going to parse.
         :param url: The url so we can adjust the links to still work.
+        :param fragment: Should we return a <div> fragment or a full <html>
+        doc.

         """
         LOG.debug('Url: ' + str(url))
         self.orig = OriginalDocument(html, url=url)
+        self.fragment = fragment

     def __str__(self):
-        return tostring(self.readable)
+        return tostring(self._readable)

     def __unicode__(self):
-        return tounicode(self.readable)
+        return tounicode(self._readable)

     @cached_property(ttl=600)
     def readable(self):
+        return tounicode(self._readable)
+
+    @cached_property(ttl=600)
+    def _readable(self):
         """The readable parsed article"""
         doc = self.orig.html
         # cleaning doesn't return, just wipes in place
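
(The readable/_readable split above means _readable stays the parsed lxml
element, used internally and by the tests further down, while the public
readable property serializes it to unicode. A sketch of the resulting API,
assuming the same import path as before:)

    # Sketch only: the Article surface after this change.
    from breadability.readable import Article

    doc = Article(u'<html><body><p>Some article text.</p></body></html>')

    print type(doc.readable)  # unicode: serialized markup, ready to write out
    print doc._readable.tag   # lxml element: 'div' (or 'html' if fragment=False)
    print unicode(doc)        # same as tounicode(doc._readable)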
@@ -364,7 +406,7 @@ class Article(object):
             updated_winner = check_siblings(winner, candidates)
             LOG.debug('Begin final prep of article')
             updated_winner.node = prep_article(updated_winner.node)
-            doc = build_base_document(updated_winner.node)
+            doc = build_base_document(updated_winner.node, self.fragment)
         else:
             LOG.warning('No candidates found: using document.')
             LOG.debug('Begin final prep of article')
@@ -372,6 +414,6 @@ class Article(object):
             # cleanup by removing the should_drop we spotted.
             [n.drop_tree() for n in should_drop]
             doc = prep_article(doc)
-            doc = build_base_document(doc)
+            doc = build_base_document(doc, self.fragment)

         return doc

@@ -20,12 +20,12 @@ class TestReadableDocument(TestCase):
         """We get back an element tree from our original doc"""
         doc = Article(load_snippet('document_min.html'))
         # We get back the document as a div tag currently by default.
-        self.assertEqual(doc.readable.tag, 'div')
+        self.assertEqual(doc._readable.tag, 'div')

     def test_doc_no_scripts_styles(self):
         """Step #1 remove all scripts from the document"""
         doc = Article(load_snippet('document_scripts.html'))
-        readable = doc.readable
+        readable = doc._readable
         self.assertEqual(readable.findall(".//script"), [])
         self.assertEqual(readable.findall(".//style"), [])
         self.assertEqual(readable.findall(".//link"), [])

@@ -37,8 +37,8 @@ class TestReadableDocument(TestCase):

         """
         doc = Article(load_snippet('document_min.html'))
-        self.assertEqual(doc.readable.tag, 'div')
-        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
+        self.assertEqual(doc._readable.tag, 'div')
+        self.assertEqual(doc._readable.get('id'), 'readabilityBody')

     def test_body_doesnt_exist(self):
         """If we can't find a body, then we create one.

@@ -47,8 +47,8 @@ class TestReadableDocument(TestCase):

         """
         doc = Article(load_snippet('document_no_body.html'))
-        self.assertEqual(doc.readable.tag, 'div')
-        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
+        self.assertEqual(doc._readable.tag, 'div')
+        self.assertEqual(doc._readable.get('id'), 'readabilityBody')

     def test_bare_content(self):
         """If the document is just pure content, no html tags we should be ok

@@ -57,8 +57,9 @@ class TestReadableDocument(TestCase):

         """
         doc = Article(load_snippet('document_only_content.html'))
-        self.assertEqual(doc.readable.tag, 'div')
-        self.assertEqual(doc.readable.get('id'), 'readabilityBody')
+        self.assertEqual(doc._readable.tag, 'div')
+        self.assertEqual(doc._readable.get('id'), 'readabilityBody')


 class TestCleaning(TestCase):

@@ -67,7 +68,7 @@ class TestCleaning(TestCase):
     def test_unlikely_hits(self):
         """Verify we wipe out things from our unlikely list."""
         doc = Article(load_snippet('test_readable_unlikely.html'))
-        readable = doc.readable
+        readable = doc._readable
         must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
             'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
             'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
