Merge pull request #21 from miso-belica/upstream-sync

Synchronize with miso-belica/readability.py fork
10 years ago · 6d8a76a2b9
parent 6906f3b2fa 66022e2503
commit 6d8a76a2b9
18 changed files with 3879 additions and 109 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-*.pyc
+*.py[co]
 *.prof
 .coverage

--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,8 @@ python:
  - "3.3"
 before_install: sudo apt-get install libxml2-dev libxslt-dev
 # command to install dependencies
-install: pip install -r requirements.txt --use-mirrors
+install:
+  - python setup.py install
+  - pip install -r requirements.txt --use-mirrors
 # command to run tests
-script: python setup.py install && nosetests tests
+script: nosetests tests
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@@ -1,3 +1,4 @@
 Rick Harding (original author)
-Michal Belica (current maintainer)
 nhnifong
+Craig Maloney
+Mišo Belica
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,37 +1,28 @@
 .. :changelog:

-Changelog for readability
+Changelog for breadability
 ==========================

 0.1.17 (Jan 22nd 2014)
 ----------------------
-
 - More log quieting down to INFO vs WARN

 0.1.16 (Jan 22nd 2014)
 ----------------------
-
 - Clean up logging output at warning when it's not a true warning

 0.1.15 (Nov 29th 2013)
 -----------------------
-
-Merge changes from 0.1.14 of breadability with the fork
-https://github.com/miso-belica/readability.py and tweaking to return to the
-name breadability.
-
-
-From the fork
-~~~~~~~~~~~~~~
- Added property ``Article.main_text`` for getting text annotated with
+- Merge changes from 0.1.14 of breadability with the fork https://github.com/miso-belica/readability.py and tweaking to return to the name breadability.
+- Fork: Added property ``Article.main_text`` for getting text annotated with
  semantic HTML tags (<em>, <strong>, ...).
- Join node with 1 child of the same type. From
+- Fork: Join node with 1 child of the same type. From
  ``<div><div>...</div></div>`` we get ``<div>...</div>``.
- Don't change <div> to <p> if it contains <p> elements.
- Renamed test generation helper 'readability_newtest' -> 'readability_test'.
- Renamed package to readability. (Renamed back)
- Added support for Python >= 3.2.
- Py3k compatible package 'charade' is used instead of 'chardet'.
+- Fork: Don't change <div> to <p> if it contains <p> elements.
+- Fork: Renamed test generation helper 'readability_newtest' -> 'readability_test'.
+- Fork: Renamed package to readability. (Renamed back)
+- Fork: Added support for Python >= 3.2.
+- Fork: Py3k compatible package 'charade' is used instead of 'chardet'.

 0.1.14 (Nov 7th 2013)
 ----------------------
--- a/CREDITS.txt
+++ b/CREDITS.txt
@@ -1,4 +0,0 @@
-Rick Harding
-nhnifong
-Craig Maloney
-Mišo Belica
--- a/LICENSE.rst
+++ b/LICENSE.rst
@@ -1,4 +1,4 @@
-Copyright (c) 2013 Rick Harding, Michal Belica and contributors
+Copyright (c) 2013 Rick Harding and contributors

 All rights reserved.

--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 include README.rst
 include CHANGELOG.rst
 include LICENSE.rst
+include AUTHORS.txt
--- a/README.rst
+++ b/README.rst
@@ -19,8 +19,12 @@ This is a pretty straight port of the JS here:
 - http://code.google.com/p/arc90labs-readability/source/browse/trunk/js/readability.js#82
 - http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/

-Some other ports:

+Alternatives
+------------
+
+- https://github.com/codelucas/newspaper
+- https://github.com/grangier/python-goose
 - https://github.com/aidanf/BTE
 - http://www.unixuser.org/~euske/python/webstemmer/#extract
 - https://github.com/al3xandru/readability.py
@@ -51,8 +55,7 @@ Tests
 -----
 .. code-block:: bash

-    $ nosetests --with-coverage --cover-package=breadability --cover-erase tests
-    $ nosetests-3.3 --with-coverage --cover-package=breadability --cover-erase tests
+    $ nosetests-2.6 tests && nosetests-3.2 tests && nosetests-2.7 tests && nosetests-3.3 tests


 Usage
--- a/breadability/scoring.py
+++ b/breadability/scoring.py
@@ -91,15 +91,11 @@ def get_link_density(node, node_text=None):
    if text_length == 0:
        return 0.0

-    link_length = sum(
-        [len(a.text_content()) or 0 for a in node.findall(".//a")]
-    )
-
-    # For each img, give 50 bonus chars worth of length.
+    links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
+    # Give 50 bonus chars worth of length for each img.
    # Tweaking this 50 down a notch should help if we hit false positives.
-    links_length = max(
-        link_length - sum([50 for img in node.findall(".//img")]), 0
-    )
+    img_bonuses = 50 * len(node.findall(".//img"))
+    links_length = max(0, links_length - img_bonuses)

    return links_length / text_length

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
-charade
-coverage
 docopt>=0.6.1,<0.7
+charade
 lxml
-nose
+
 nose-selecttests
-pep8
 pylint
+coverage
+nose
+pep8
--- a/setup.cfg
+++ b/setup.cfg
@@ -0,0 +1,7 @@
+[nosetests]
+with-coverage=1
+cover-package=breadability
+cover-erase=1
+
+[wheel]
+universal=1
--- a/setup.py
+++ b/setup.py
@@ -20,8 +20,11 @@ install_requires = [
    "lxml>=2.0",
 ]
 tests_require = [
+    "nose-selecttests",
    "coverage",
+    "pylint",
    "nose",
+    "pep8",
 ]


@@ -80,7 +83,7 @@ setup(
    zip_safe=False,
    install_requires=install_requires,
    tests_require=tests_require,
-    test_suite="tests.run_tests.run",
+    test_suite="nose.collector",
    entry_points={
        "console_scripts": console_script_targets,
    }
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -1,35 +0,0 @@
-# -*- coding: utf8 -*-
-
-from __future__ import print_function
-
-import sys
-import atexit
-import nose
-
-from os.path import dirname, abspath
-
-
-DEFAULT_PARAMS = [
-    "nosetests",
-    "--with-coverage",
-    "--cover-package=breadability",
-    "--cover-erase",
-]
-
-
-@atexit.register
-def exit_function(msg="Shutting down"):
-    print(msg, file=sys.stderr)
-
-
-def run(argv=[]):
-    sys.exitfunc = exit_function
-
-    nose.run(
-        argv=DEFAULT_PARAMS + argv,
-        defaultTest=abspath(dirname(__file__)),
-    )
-
-
-if __name__ == "__main__":
-    run(sys.argv[1:])
--- a/tests/test_articles/test_businessinsider_com/__init__.py
+++ b/tests/test_articles/test_businessinsider_com/__init__.py
--- a/tests/test_articles/test_businessinsider_com/article.html
+++ b/tests/test_articles/test_businessinsider_com/article.html
--- a/tests/test_articles/test_businessinsider_com/test.py
+++ b/tests/test_articles/test_businessinsider_com/test.py
@@ -0,0 +1,39 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from os.path import join, dirname
+from breadability.readable import Article
+from ...compat import unittest
+
+
+class TestArticle(unittest.TestCase):
+    """
+    Test the scoring and parsing of the article from URL below:
+    http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
+    """
+
+    def setUp(self):
+        """Load up the article for us"""
+        article_path = join(dirname(__file__), "article.html")
+        with open(article_path, "rb") as file:
+            self.document = Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")
+
+    def tearDown(self):
+        """Drop the article"""
+        self.document = None
+
+    def test_parses(self):
+        """Verify we can parse the document."""
+        self.assertIn('id="readabilityBody"', self.document.readable)
+
+    def test_images_preserved(self):
+        """Images from the article should be preserved in the readable output."""
+        images = [
+            'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg',
+            'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg',
+        ]
+
+        for image in images:
+            self.assertIn(image, self.document.readable, image)
--- a/tests/test_articles/test_sweetshark/article.html
+++ b/tests/test_articles/test_sweetshark/article.html
@@ -28,8 +28,8 @@
 <script type="text/javascript" src="http://l-stat.livejournal.com/js/??jquery/jquery.lj.calendar.js,jquery/jquery.mask.js,jquery/jquery.lj.share.js,ljshare-init.js,controlstrip.js,jquery/jquery.calendarEvents.js,jquery/jquery.lj.repostbutton.js,s2.js,esn.js,jquery/jquery.lj.confirmbubble.js,jquery/jquery.lj.ljcut.js,fb-select-image.js,quickreply.js,md5.js,thread_expander.js,thread_expander.ex.js,commentmanage.js,jquery/jquery.lj.journalPromoStrip.js,lj.api.js?v=1377871128"></script>
 <!--[if lte IE 9]><script type="text/javascript" src="http://l-stat.livejournal.com/js/??lib/json3.min.js?v=1377871128"></script><![endif]-->
 <!--[if gte IE 9]><script type="text/javascript" src="http://l-stat.livejournal.com/js/??ie9pinned.js?v=1377871128"></script><![endif]-->
-<script type="text/javascript">
-Site.LJShareParams = {"ml":{"close":"Close","title":"Share"},"services":{"stumbleupon":{"bindLink":"http://www.livejournal.com/redirect/SHARING_stumbleupon?url=http%3A%2F%2Fwww.stumbleupon.com%2Fsubmit%3Furl%3D{url}","title":"StumbleUpon"},"moimir":{"bindLink":"http://www.livejournal.com/redirect/SHARING_moimir?url=http%3A%2F%2Fconnect.mail.ru%2Fshare%3Furl%3D{url}","title":"Moi mir"},"twitter":{"bindLink":"http://www.livejournal.com/redirect/SHARING_twitter?url=http%3A%2F%2Ftwitter.com%2Fshare%3Furl%3D{url}%26text%3D{title}%26hashtags%3D{hashtags}","title":"Twitter"},"digg":{"bindLink":"http://www.livejournal.com/redirect/SHARING_digg?url=http%3A%2F%2Fdigg.com%2Fsubmit%3Furl%3D{url}","title":"Digg"},"email":{"bindLink":"http://www.livejournal.com/redirect/SHARING_email?url=http%3A%2F%2Fapi.addthis.com%2Foexchange%2F0.8%2Fforward%2Femail%2Foffer%3Fusername%3Dinternal%26url%3D{url}%26title%3D{title}","title":"E-mail"},"livejournal":{"bindLink":"http://www.livejournal.com/redirect/SHARING_livejournal?url=http%3A%2F%2Fwww.livejournal.com%2Fupdate.bml%3Frepost_type%3Dc%26repost%3D{url}","openInTab":1,"title":"LiveJournal"},"vkontakte":{"bindLink":"http://www.livejournal.com/redirect/SHARING_vkontakte?url=http%3A%2F%2Fvkontakte.ru%2Fshare.php%3Furl%3D{url}","title":"VKontakte"},"facebook":{"bindLink":"http://www.livejournal.com/redirect/SHARING_facebook?url=http%3A%2F%2Fwww.facebook.com%2Fsharer.php%3Fu%3D{url}","title":"Facebook"},"odnoklassniki":{"bindLink":"http://www.livejournal.com/redirect/SHARING_odnoklassniki?url=http%3A%2F%2Fwww.odnoklassniki.ru%2Fdk%3Fst.cmd%3DaddShare%26st.s%3D1%26st._surl%3D{url}","title":"Odnoklassniki"},"tumblr":{"bindLink":"http://www.livejournal.com/redirect/SHARING_tumblr?url=http%3A%2F%2Fwww.tumblr.com%2Fshare%2Flink%3Furl%3D{url}%26name%3D{title}%26description%3D{text}","title":"Tumblr"}},"links":["livejournal","facebook","twitter","digg","tumblr","stumbleupon","email"]};</script>
+<script type="text/javascript">
+Site.LJShareParams = {"ml":{"close":"Close","title":"Share"},"services":{"stumbleupon":{"bindLink":"http://www.livejournal.com/redirect/SHARING_stumbleupon?url=http%3A%2F%2Fwww.stumbleupon.com%2Fsubmit%3Furl%3D{url}","title":"StumbleUpon"},"moimir":{"bindLink":"http://www.livejournal.com/redirect/SHARING_moimir?url=http%3A%2F%2Fconnect.mail.ru%2Fshare%3Furl%3D{url}","title":"Moi mir"},"twitter":{"bindLink":"http://www.livejournal.com/redirect/SHARING_twitter?url=http%3A%2F%2Ftwitter.com%2Fshare%3Furl%3D{url}%26text%3D{title}%26hashtags%3D{hashtags}","title":"Twitter"},"digg":{"bindLink":"http://www.livejournal.com/redirect/SHARING_digg?url=http%3A%2F%2Fdigg.com%2Fsubmit%3Furl%3D{url}","title":"Digg"},"email":{"bindLink":"http://www.livejournal.com/redirect/SHARING_email?url=http%3A%2F%2Fapi.addthis.com%2Foexchange%2F0.8%2Fforward%2Femail%2Foffer%3Fusername%3Dinternal%26url%3D{url}%26title%3D{title}","title":"E-mail"},"livejournal":{"bindLink":"http://www.livejournal.com/redirect/SHARING_livejournal?url=http%3A%2F%2Fwww.livejournal.com%2Fupdate.bml%3Frepost_type%3Dc%26repost%3D{url}","openInTab":1,"title":"LiveJournal"},"vkontakte":{"bindLink":"http://www.livejournal.com/redirect/SHARING_vkontakte?url=http%3A%2F%2Fvkontakte.ru%2Fshare.php%3Furl%3D{url}","title":"VKontakte"},"facebook":{"bindLink":"http://www.livejournal.com/redirect/SHARING_facebook?url=http%3A%2F%2Fwww.facebook.com%2Fsharer.php%3Fu%3D{url}","title":"Facebook"},"odnoklassniki":{"bindLink":"http://www.livejournal.com/redirect/SHARING_odnoklassniki?url=http%3A%2F%2Fwww.odnoklassniki.ru%2Fdk%3Fst.cmd%3DaddShare%26st.s%3D1%26st._surl%3D{url}","title":"Odnoklassniki"},"tumblr":{"bindLink":"http://www.livejournal.com/redirect/SHARING_tumblr?url=http%3A%2F%2Fwww.tumblr.com%2Fshare%2Flink%3Furl%3D{url}%26name%3D{title}%26description%3D{text}","title":"Tumblr"}},"links":["livejournal","facebook","twitter","digg","tumblr","stumbleupon","email"]};</script>
 <script type="text/javascript" src="http://l-stat.livejournal.com/tmpl/??Widgets/bubble.tmpl,Widgets/share.tmpl,CleanHtml/reposted.tmpl,CleanHtml/Repost.tmpl,CleanHtml/PaidRepost.tmpl,Widgets/popupcontent.tmpl?v=1354174850&tm=1531085;uselang=en_LJ"></script>
 <script type="text/javascript" src="http://l-stat.livejournal.com/tmpl/??Widgets/contextualhover.jqtmpl?v=1343758569&tm=1531085;uselang=en_LJ"></script>
 <script>
@@ -60,10 +60,10 @@ var LJ_cmtinfo = {"form_auth":"c0%3A1377975600%3A1336%3A86400%3ARgZomE12Pp-0-%3A
                class="w-cs-login"
                action="https://www.livejournal.com/login.bml?ret=1"
                method="post">
-                
+
                <input type="hidden" name="mode" value="login" />
-                
-                
+
+

                <ul class="w-cs-signin">
                    <li class="w-cs-signin-item">
@@ -262,12 +262,12 @@ var LJ_cmtinfo = {"form_auth":"c0%3A1377975600%3A1336%3A86400%3ARgZomE12Pp-0-%3A
            </div>

            <form class="w-cs-search" action="http://www.livejournal.com/search/">
-                
+
                    <input
                        type="hidden"
                        name="journal"
                        value="sweetshark">
-                
+

                <fieldset>
                    <input
@@ -294,7 +294,7 @@ var LJ_cmtinfo = {"form_auth":"c0%3A1377975600%3A1336%3A86400%3ARgZomE12Pp-0-%3A
        </div>

    </div><!-- w-cs-user-controls -->
-	
+
    <script>
        jQuery( 'input.text' ).labeledPlaceholder();
    </script>
@@ -358,7 +358,7 @@ var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(e,s);
          <div class="asset-meta asset-entry-date">
            <ul class="asset-meta-list">
              <li class="item"><span><abbr class="datetime">May. 5th, 2012 at 8:18 PM</abbr></span></li>
-              
+
            </ul>
          </div>
        </div></div></div>
@@ -556,18 +556,18 @@ var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(e,s);

 </script>

-<!-- tns-counter.ru --> 
-<script language="JavaScript" type="text/javascript"> 
+<!-- tns-counter.ru -->
+<script language="JavaScript" type="text/javascript">
 var img = new Image();
 img.src = 'http://www.tns-counter.ru/V13a***R>' + document.referrer.replace(/\*/g,'%2a') + '*sup_ru/ru/UTF-8/tmsec=lj_noncyr/' + Math.round(Math.random() * 1000000000);
-</script> 
-<noscript> 
+</script>
+<noscript>
 <img src="http://www.tns-counter.ru/V13a****sup_ru/ru/UTF-8/tmsec=lj_noncyr/" width="1" height="1" alt="">
 </noscript>
 <!--/ tns-counter.ru -->
 <!-- Begin comScore Tag 1.1111.15 -->
 <script type="text/javascript">
-// <![CDATA[ 
+// <![CDATA[
 Site.page.comscore = {};
 Site.page.comscore.url = 'http'+(document.location.href.charAt(4)=='s'?'s://sb':'://b')+'.scorecardresearch.com/b';
 Site.page.comscore.query = '?c1=2&c2=7602110&sm_vd_cyrillic_status=nonCyr&sm_vd_view_own_journal=undef&sm_vd_id=undef&sm_vd_login_status=logout&sm_vd_account_level=undef&sm_vd_premium_package=undef&sm_vd_early_adopter=undef&sm_vd_log_in_service=undef&sm_vd_viewing_scheme=lanzelot&sm_vd_view_in_my_style=undef&sm_pd_visited_journal_account_type=personal&sm_pd_visited_journal_log_in_service=lj&sm_pd_ads_onpage=2&sm_pd_ad_eligible=yes&sm_pd_ad_1=demand_media_728x90&sm_pd_ad_2=demand_media_728x90&sm_pd_adult_content=none&sm_pd_comments_style=s2&sm_pd_error_pages=undef&sm_pd_visited_journal_name=sweetshark&sm_pd_page_type=journal&sm_pd_style_layout=Expressive&sm_pd_style_design=undef&sm_pd_style_system=s2&sm_pd_visited_journal_account_level=plus&sm_pd_early_adopter=no&sm_pd_visited_journal_premium_package=no&sm_pd_page_group=PostMainPage&category=undef&sm_pd_geotargeting=noncyr&sm_pd_rating_user_duplication=show&sm_pd_rating_friends=hide&sm_pd_rating_hidden_post=hide';
@@ -582,7 +582,7 @@ udm_(Site.page.comscore.url + Site.page.comscore.query);
 <!-- End comScore Tag --><!-- LiveJournal COUNTER -->
 <img src="http://xc3.services.livejournal.com/ljcounter?d=srv:bil1-ws32,r:0,j:37450766,uri:%22%2F11564.html%22,vig:0,extra:Ajt0DgI7dA4AAC0s" alt="" />
 <!-- /COUNTER -->
-<!-- begin of yandex code --> 
+<!-- begin of yandex code -->
 <script language=JavaScript>
 <!--
    var seed=Math.round(Math.random()*65535);
@@ -591,7 +591,7 @@ udm_(Site.page.comscore.url + Site.page.comscore.query);
 </script>
 <noscript>
    <img src=http://awaps.yandex.ru/0/9999/001001.gif?subsection=0 width=1 height=1 border=0>
-</noscript> 
+</noscript>
 <!-- end of yandex code -->
 <!-- begin of Bogun code -->
 <script>
@@ -664,4 +664,4 @@ LiveJournal.injectScript('http://l-stat.livejournal.com/js/ads/xtcore.js');
 </noscript>
 <!-- End ATI Basic Tracking Code -->
 </div></body>
-</html>
+</html>
--- a/tests/test_articles/test_sweetshark/test.py
+++ b/tests/test_articles/test_sweetshark/test.py
@@ -1,31 +1,33 @@
-import os
-try:
-    # Python < 2.7
-    import unittest2 as unittest
-except ImportError:
-    import unittest
+# -*- coding: utf8 -*-

+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from os.path import join, dirname
 from breadability.readable import Article
+from ...compat import unittest


 class TestSweetsharkBlog(unittest.TestCase):
-    """Test the scoring and parsing of the Blog Post"""
+    """
+    Test the scoring and parsing of the article from URL below:
+    http://sweetshark.livejournal.com/11564.html
+    """

    def setUp(self):
        """Load up the article for us"""
-        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
-        self.article = open(article_path).read()
+        article_path = join(dirname(__file__), "article.html")
+        with open(article_path, "rb") as file:
+            self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html")

    def tearDown(self):
        """Drop the article"""
-        self.article = None
+        self.document = None

    def test_parses(self):
        """Verify we can parse the document."""
-        doc = Article(self.article)
-        self.assertTrue('id="readabilityBody"' in doc.readable)
+        self.assertIn('id="readabilityBody"', self.document.readable)

    def test_content_after_video(self):
        """The div with the comments should be removed."""
-        doc = Article(self.article)
-        self.assertTrue('Stay hungry, Stay foolish' in doc.readable)
+        self.assertIn('Stay hungry, Stay foolish', self.document.readable)