Merge pull request #21 from miso-belica/upstream-sync

Synchronize with miso-belica/readability.py fork
pull/23/head
Rick Harding 10 years ago
commit 6d8a76a2b9

2
.gitignore vendored

@ -1,4 +1,4 @@
*.pyc
*.py[co]
*.prof
.coverage

@ -6,6 +6,8 @@ python:
- "3.3"
before_install: sudo apt-get install libxml2-dev libxslt-dev
# command to install dependencies
install: pip install -r requirements.txt --use-mirrors
install:
- python setup.py install
- pip install -r requirements.txt --use-mirrors
# command to run tests
script: python setup.py install && nosetests tests
script: nosetests tests

@ -1,3 +1,4 @@
Rick Harding (original author)
Michal Belica (current maintainer)
nhnifong
Craig Maloney
Mišo Belica

@ -1,37 +1,28 @@
.. :changelog:
Changelog for readability
Changelog for breadability
==========================
0.1.17 (Jan 22nd 2014)
----------------------
- More log quieting down to INFO vs WARN
0.1.16 (Jan 22nd 2014)
----------------------
- Clean up logging output at warning when it's not a true warning
0.1.15 (Nov 29th 2013)
-----------------------
Merge changes from 0.1.14 of breadability with the fork
https://github.com/miso-belica/readability.py and tweaking to return to the
name breadability.
From the fork
~~~~~~~~~~~~~~
- Added property ``Article.main_text`` for getting text annotated with
- Merged changes from 0.1.14 of breadability with the fork https://github.com/miso-belica/readability.py and tweaked the result to return to the name breadability.
- Fork: Added property ``Article.main_text`` for getting text annotated with
semantic HTML tags (<em>, <strong>, ...).
- Join node with 1 child of the same type. From
- Fork: Join node with 1 child of the same type. From
``<div><div>...</div></div>`` we get ``<div>...</div>``.
- Don't change <div> to <p> if it contains <p> elements.
- Renamed test generation helper 'readability_newtest' -> 'readability_test'.
- Renamed package to readability. (Renamed back)
- Added support for Python >= 3.2.
- Py3k compatible package 'charade' is used instead of 'chardet'.
- Fork: Don't change <div> to <p> if it contains <p> elements.
- Fork: Renamed test generation helper 'readability_newtest' -> 'readability_test'.
- Fork: Renamed package to readability. (Renamed back)
- Fork: Added support for Python >= 3.2.
- Fork: Py3k compatible package 'charade' is used instead of 'chardet'.
0.1.14 (Nov 7th 2013)
----------------------

@ -1,4 +0,0 @@
Rick Harding
nhnifong
Craig Maloney
Mišo Belica

@ -1,4 +1,4 @@
Copyright (c) 2013 Rick Harding, Michal Belica and contributors
Copyright (c) 2013 Rick Harding and contributors
All rights reserved.

@ -1,3 +1,4 @@
include README.rst
include CHANGELOG.rst
include LICENSE.rst
include AUTHORS.txt

@ -19,8 +19,12 @@ This is a pretty straight port of the JS here:
- http://code.google.com/p/arc90labs-readability/source/browse/trunk/js/readability.js#82
- http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/
Some other ports:
Alternatives
------------
- https://github.com/codelucas/newspaper
- https://github.com/grangier/python-goose
- https://github.com/aidanf/BTE
- http://www.unixuser.org/~euske/python/webstemmer/#extract
- https://github.com/al3xandru/readability.py
@ -51,8 +55,7 @@ Tests
-----
.. code-block:: bash
$ nosetests --with-coverage --cover-package=breadability --cover-erase tests
$ nosetests-3.3 --with-coverage --cover-package=breadability --cover-erase tests
$ nosetests-2.6 tests && nosetests-3.2 tests && nosetests-2.7 tests && nosetests-3.3 tests
Usage

@ -91,15 +91,11 @@ def get_link_density(node, node_text=None):
if text_length == 0:
return 0.0
link_length = sum(
[len(a.text_content()) or 0 for a in node.findall(".//a")]
)
# For each img, give 50 bonus chars worth of length.
links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
# Give 50 bonus chars worth of length for each img.
# Tweaking this 50 down a notch should help if we hit false positives.
links_length = max(
link_length - sum([50 for img in node.findall(".//img")]), 0
)
img_bonuses = 50 * len(node.findall(".//img"))
links_length = max(0, links_length - img_bonuses)
return links_length / text_length

@ -1,8 +1,9 @@
charade
coverage
docopt>=0.6.1,<0.7
charade
lxml
nose
nose-selecttests
pep8
pylint
coverage
nose
pep8

@ -0,0 +1,7 @@
[nosetests]
with-coverage=1
cover-package=breadability
cover-erase=1
[wheel]
universal=1

@ -20,8 +20,11 @@ install_requires = [
"lxml>=2.0",
]
tests_require = [
"nose-selecttests",
"coverage",
"pylint",
"nose",
"pep8",
]
@ -80,7 +83,7 @@ setup(
zip_safe=False,
install_requires=install_requires,
tests_require=tests_require,
test_suite="tests.run_tests.run",
test_suite="nose.collector",
entry_points={
"console_scripts": console_script_targets,
}

@ -1,35 +0,0 @@
# -*- coding: utf8 -*-
from __future__ import print_function
import sys
import atexit
import nose
from os.path import dirname, abspath
# Leading entry of the argument list handed to ``nose.run``.
_PROGRAM_NAME = "nosetests"

# Coverage-related switches that follow the leading entry.
_COVERAGE_FLAGS = [
    "--with-coverage",
    "--cover-package=breadability",
    "--cover-erase",
]

# Base argument list; ``run`` appends any caller-supplied arguments to it.
DEFAULT_PARAMS = [_PROGRAM_NAME] + _COVERAGE_FLAGS
@atexit.register
def exit_function(msg="Shutting down"):
    """Print ``msg`` to standard error.

    The function is registered with ``atexit`` when this module is imported,
    so it is also called with the default message at interpreter exit.
    """
    stream = sys.stderr
    print(msg, file=stream)
def run(argv=None):
    """Invoke ``nose.run`` with the default parameters plus ``argv``.

    :param argv: optional sequence of extra command-line arguments appended
        to ``DEFAULT_PARAMS``. ``None`` (the default) appends nothing.
    :returns: ``None``; the value returned by ``nose.run`` is not propagated.
    """
    # A ``None`` default avoids sharing one mutable list between calls.
    extra_args = [] if argv is None else list(argv)

    # ``exit_function`` is already registered through ``atexit``. Assigning it
    # to ``sys.exitfunc`` as well is unnecessary, and on Python 2 it would
    # replace the hook that ``atexit`` relies on to run registered handlers.
    nose.run(
        argv=DEFAULT_PARAMS + extra_args,
        # Use the directory containing this file as the default test target.
        defaultTest=abspath(dirname(__file__)),
    )


if __name__ == "__main__":
    # Forward everything after the script name to the runner.
    run(sys.argv[1:])

File diff suppressed because it is too large Load Diff

@ -0,0 +1,39 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestArticle(unittest.TestCase):
    """
    Test the scoring and parsing of the article from URL below:
    http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
    """

    def setUp(self):
        """Load the saved article HTML and build an ``Article`` from it."""
        article_path = join(dirname(__file__), "article.html")
        url = "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8"
        # The fixture is read in binary mode and passed to Article undecoded.
        with open(article_path, "rb") as article_file:
            self.document = Article(article_file.read(), url)

    def tearDown(self):
        """Drop the article"""
        self.document = None

    def test_parses(self):
        """Verify we can parse the document."""
        self.assertIn('id="readabilityBody"', self.document.readable)

    def test_images_preserved(self):
        """The listed image file names should appear in the readable output."""
        images = [
            'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg',
            'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg',
        ]

        for image in images:
            # The image name doubles as the failure message.
            self.assertIn(image, self.document.readable, image)

@ -28,8 +28,8 @@
<script type="text/javascript" src="http://l-stat.livejournal.com/js/??jquery/jquery.lj.calendar.js,jquery/jquery.mask.js,jquery/jquery.lj.share.js,ljshare-init.js,controlstrip.js,jquery/jquery.calendarEvents.js,jquery/jquery.lj.repostbutton.js,s2.js,esn.js,jquery/jquery.lj.confirmbubble.js,jquery/jquery.lj.ljcut.js,fb-select-image.js,quickreply.js,md5.js,thread_expander.js,thread_expander.ex.js,commentmanage.js,jquery/jquery.lj.journalPromoStrip.js,lj.api.js?v=1377871128"></script>
<!--[if lte IE 9]><script type="text/javascript" src="http://l-stat.livejournal.com/js/??lib/json3.min.js?v=1377871128"></script><![endif]-->
<!--[if gte IE 9]><script type="text/javascript" src="http://l-stat.livejournal.com/js/??ie9pinned.js?v=1377871128"></script><![endif]-->
<script type="text/javascript">
Site.LJShareParams = {"ml":{"close":"Close","title":"Share"},"services":{"stumbleupon":{"bindLink":"http://www.livejournal.com/redirect/SHARING_stumbleupon?url=http%3A%2F%2Fwww.stumbleupon.com%2Fsubmit%3Furl%3D{url}","title":"StumbleUpon"},"moimir":{"bindLink":"http://www.livejournal.com/redirect/SHARING_moimir?url=http%3A%2F%2Fconnect.mail.ru%2Fshare%3Furl%3D{url}","title":"Moi mir"},"twitter":{"bindLink":"http://www.livejournal.com/redirect/SHARING_twitter?url=http%3A%2F%2Ftwitter.com%2Fshare%3Furl%3D{url}%26text%3D{title}%26hashtags%3D{hashtags}","title":"Twitter"},"digg":{"bindLink":"http://www.livejournal.com/redirect/SHARING_digg?url=http%3A%2F%2Fdigg.com%2Fsubmit%3Furl%3D{url}","title":"Digg"},"email":{"bindLink":"http://www.livejournal.com/redirect/SHARING_email?url=http%3A%2F%2Fapi.addthis.com%2Foexchange%2F0.8%2Fforward%2Femail%2Foffer%3Fusername%3Dinternal%26url%3D{url}%26title%3D{title}","title":"E-mail"},"livejournal":{"bindLink":"http://www.livejournal.com/redirect/SHARING_livejournal?url=http%3A%2F%2Fwww.livejournal.com%2Fupdate.bml%3Frepost_type%3Dc%26repost%3D{url}","openInTab":1,"title":"LiveJournal"},"vkontakte":{"bindLink":"http://www.livejournal.com/redirect/SHARING_vkontakte?url=http%3A%2F%2Fvkontakte.ru%2Fshare.php%3Furl%3D{url}","title":"VKontakte"},"facebook":{"bindLink":"http://www.livejournal.com/redirect/SHARING_facebook?url=http%3A%2F%2Fwww.facebook.com%2Fsharer.php%3Fu%3D{url}","title":"Facebook"},"odnoklassniki":{"bindLink":"http://www.livejournal.com/redirect/SHARING_odnoklassniki?url=http%3A%2F%2Fwww.odnoklassniki.ru%2Fdk%3Fst.cmd%3DaddShare%26st.s%3D1%26st._surl%3D{url}","title":"Odnoklassniki"},"tumblr":{"bindLink":"http://www.livejournal.com/redirect/SHARING_tumblr?url=http%3A%2F%2Fwww.tumblr.com%2Fshare%2Flink%3Furl%3D{url}%26name%3D{title}%26description%3D{text}","title":"Tumblr"}},"links":["livejournal","facebook","twitter","digg","tumblr","stumbleupon","email"]};</script>
<script type="text/javascript">
Site.LJShareParams = {"ml":{"close":"Close","title":"Share"},"services":{"stumbleupon":{"bindLink":"http://www.livejournal.com/redirect/SHARING_stumbleupon?url=http%3A%2F%2Fwww.stumbleupon.com%2Fsubmit%3Furl%3D{url}","title":"StumbleUpon"},"moimir":{"bindLink":"http://www.livejournal.com/redirect/SHARING_moimir?url=http%3A%2F%2Fconnect.mail.ru%2Fshare%3Furl%3D{url}","title":"Moi mir"},"twitter":{"bindLink":"http://www.livejournal.com/redirect/SHARING_twitter?url=http%3A%2F%2Ftwitter.com%2Fshare%3Furl%3D{url}%26text%3D{title}%26hashtags%3D{hashtags}","title":"Twitter"},"digg":{"bindLink":"http://www.livejournal.com/redirect/SHARING_digg?url=http%3A%2F%2Fdigg.com%2Fsubmit%3Furl%3D{url}","title":"Digg"},"email":{"bindLink":"http://www.livejournal.com/redirect/SHARING_email?url=http%3A%2F%2Fapi.addthis.com%2Foexchange%2F0.8%2Fforward%2Femail%2Foffer%3Fusername%3Dinternal%26url%3D{url}%26title%3D{title}","title":"E-mail"},"livejournal":{"bindLink":"http://www.livejournal.com/redirect/SHARING_livejournal?url=http%3A%2F%2Fwww.livejournal.com%2Fupdate.bml%3Frepost_type%3Dc%26repost%3D{url}","openInTab":1,"title":"LiveJournal"},"vkontakte":{"bindLink":"http://www.livejournal.com/redirect/SHARING_vkontakte?url=http%3A%2F%2Fvkontakte.ru%2Fshare.php%3Furl%3D{url}","title":"VKontakte"},"facebook":{"bindLink":"http://www.livejournal.com/redirect/SHARING_facebook?url=http%3A%2F%2Fwww.facebook.com%2Fsharer.php%3Fu%3D{url}","title":"Facebook"},"odnoklassniki":{"bindLink":"http://www.livejournal.com/redirect/SHARING_odnoklassniki?url=http%3A%2F%2Fwww.odnoklassniki.ru%2Fdk%3Fst.cmd%3DaddShare%26st.s%3D1%26st._surl%3D{url}","title":"Odnoklassniki"},"tumblr":{"bindLink":"http://www.livejournal.com/redirect/SHARING_tumblr?url=http%3A%2F%2Fwww.tumblr.com%2Fshare%2Flink%3Furl%3D{url}%26name%3D{title}%26description%3D{text}","title":"Tumblr"}},"links":["livejournal","facebook","twitter","digg","tumblr","stumbleupon","email"]};</script>
<script type="text/javascript" src="http://l-stat.livejournal.com/tmpl/??Widgets/bubble.tmpl,Widgets/share.tmpl,CleanHtml/reposted.tmpl,CleanHtml/Repost.tmpl,CleanHtml/PaidRepost.tmpl,Widgets/popupcontent.tmpl?v=1354174850&tm=1531085;uselang=en_LJ"></script>
<script type="text/javascript" src="http://l-stat.livejournal.com/tmpl/??Widgets/contextualhover.jqtmpl?v=1343758569&tm=1531085;uselang=en_LJ"></script>
<script>
@ -60,10 +60,10 @@ var LJ_cmtinfo = {"form_auth":"c0%3A1377975600%3A1336%3A86400%3ARgZomE12Pp-0-%3A
class="w-cs-login"
action="https://www.livejournal.com/login.bml?ret=1"
method="post">
<input type="hidden" name="mode" value="login" />
<ul class="w-cs-signin">
<li class="w-cs-signin-item">
@ -262,12 +262,12 @@ var LJ_cmtinfo = {"form_auth":"c0%3A1377975600%3A1336%3A86400%3ARgZomE12Pp-0-%3A
</div>
<form class="w-cs-search" action="http://www.livejournal.com/search/">
<input
type="hidden"
name="journal"
value="sweetshark">
<fieldset>
<input
@ -294,7 +294,7 @@ var LJ_cmtinfo = {"form_auth":"c0%3A1377975600%3A1336%3A86400%3ARgZomE12Pp-0-%3A
</div>
</div><!-- w-cs-user-controls -->
<script>
jQuery( 'input.text' ).labeledPlaceholder();
</script>
@ -358,7 +358,7 @@ var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(e,s);
<div class="asset-meta asset-entry-date">
<ul class="asset-meta-list">
<li class="item"><span><abbr class="datetime">May. 5th, 2012 at 8:18 PM</abbr></span></li>
</ul>
</div>
</div></div></div>
@ -556,18 +556,18 @@ var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(e,s);
</script>
<!-- tns-counter.ru -->
<script language="JavaScript" type="text/javascript">
<!-- tns-counter.ru -->
<script language="JavaScript" type="text/javascript">
var img = new Image();
img.src = 'http://www.tns-counter.ru/V13a***R>' + document.referrer.replace(/\*/g,'%2a') + '*sup_ru/ru/UTF-8/tmsec=lj_noncyr/' + Math.round(Math.random() * 1000000000);
</script>
<noscript>
</script>
<noscript>
<img src="http://www.tns-counter.ru/V13a****sup_ru/ru/UTF-8/tmsec=lj_noncyr/" width="1" height="1" alt="">
</noscript>
<!--/ tns-counter.ru -->
<!-- Begin comScore Tag 1.1111.15 -->
<script type="text/javascript">
// <![CDATA[
// <![CDATA[
Site.page.comscore = {};
Site.page.comscore.url = 'http'+(document.location.href.charAt(4)=='s'?'s://sb':'://b')+'.scorecardresearch.com/b';
Site.page.comscore.query = '?c1=2&c2=7602110&sm_vd_cyrillic_status=nonCyr&sm_vd_view_own_journal=undef&sm_vd_id=undef&sm_vd_login_status=logout&sm_vd_account_level=undef&sm_vd_premium_package=undef&sm_vd_early_adopter=undef&sm_vd_log_in_service=undef&sm_vd_viewing_scheme=lanzelot&sm_vd_view_in_my_style=undef&sm_pd_visited_journal_account_type=personal&sm_pd_visited_journal_log_in_service=lj&sm_pd_ads_onpage=2&sm_pd_ad_eligible=yes&sm_pd_ad_1=demand_media_728x90&sm_pd_ad_2=demand_media_728x90&sm_pd_adult_content=none&sm_pd_comments_style=s2&sm_pd_error_pages=undef&sm_pd_visited_journal_name=sweetshark&sm_pd_page_type=journal&sm_pd_style_layout=Expressive&sm_pd_style_design=undef&sm_pd_style_system=s2&sm_pd_visited_journal_account_level=plus&sm_pd_early_adopter=no&sm_pd_visited_journal_premium_package=no&sm_pd_page_group=PostMainPage&category=undef&sm_pd_geotargeting=noncyr&sm_pd_rating_user_duplication=show&sm_pd_rating_friends=hide&sm_pd_rating_hidden_post=hide';
@ -582,7 +582,7 @@ udm_(Site.page.comscore.url + Site.page.comscore.query);
<!-- End comScore Tag --><!-- LiveJournal COUNTER -->
<img src="http://xc3.services.livejournal.com/ljcounter?d=srv:bil1-ws32,r:0,j:37450766,uri:%22%2F11564.html%22,vig:0,extra:Ajt0DgI7dA4AAC0s" alt="" />
<!-- /COUNTER -->
<!-- begin of yandex code -->
<!-- begin of yandex code -->
<script language=JavaScript>
<!--
var seed=Math.round(Math.random()*65535);
@ -591,7 +591,7 @@ udm_(Site.page.comscore.url + Site.page.comscore.query);
</script>
<noscript>
<img src=http://awaps.yandex.ru/0/9999/001001.gif?subsection=0 width=1 height=1 border=0>
</noscript>
</noscript>
<!-- end of yandex code -->
<!-- begin of Bogun code -->
<script>
@ -664,4 +664,4 @@ LiveJournal.injectScript('http://l-stat.livejournal.com/js/ads/xtcore.js');
</noscript>
<!-- End ATI Basic Tracking Code -->
</div></body>
</html>
</html>

@ -1,31 +1,33 @@
import os
try:
# Python < 2.7
import unittest2 as unittest
except ImportError:
import unittest
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestSweetsharkBlog(unittest.TestCase):
"""Test the scoring and parsing of the Blog Post"""
"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""
def setUp(self):
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
article_path = join(dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html")
def tearDown(self):
"""Drop the article"""
self.article = None
self.document = None
def test_parses(self):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
self.assertIn('id="readabilityBody"', self.document.readable)
def test_content_after_video(self):
"""The div with the comments should be removed."""
doc = Article(self.article)
self.assertTrue('Stay hungry, Stay foolish' in doc.readable)
self.assertIn('Stay hungry, Stay foolish', self.document.readable)

Loading…
Cancel
Save