Merge pull request #21 from miso-belica/upstream-sync

Synchronize with miso-belica/readability.py fork
11 years ago · 6d8a76a2b9
parent 6906f3b2fa 66022e2503
commit 6d8a76a2b9
18 changed files with 3879 additions and 109 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,4 @@
-*.pyc
+*.py[co]
 *.prof
 .coverage

--- a/.travis.yml
+++ b/.travis.yml
@ -6,6 +6,8 @@ python:
  - "3.3"
 before_install: sudo apt-get install libxml2-dev libxslt-dev
 # command to install dependencies
-install: pip install -r requirements.txt --use-mirrors
+install:
+  - python setup.py install
+  - pip install -r requirements.txt --use-mirrors
 # command to run tests
-script: python setup.py install && nosetests tests
+script: nosetests tests
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@ -1,3 +1,4 @@
 Rick Harding (original author)
-Michal Belica (current maintainer)
 nhnifong
+Craig Maloney
+Mišo Belica
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@ -1,37 +1,28 @@
 .. :changelog:

-Changelog for readability
+Changelog for breadability
 ==========================

 0.1.17 (Jan 22nd 2014)
 ----------------------
-
 - More log quieting down to INFO vs WARN

 0.1.16 (Jan 22nd 2014)
 ----------------------
-
 - Clean up logging output at warning when it's not a true warning

 0.1.15 (Nov 29th 2013)
 -----------------------
-
-Merge changes from 0.1.14 of breadability with the fork
-https://github.com/miso-belica/readability.py and tweaking to return to the
-name breadability.
-
-
-From the fork
-~~~~~~~~~~~~~~
- Added property ``Article.main_text`` for getting text annotated with
+- Merge changes from 0.1.14 of breadability with the fork https://github.com/miso-belica/readability.py and tweaking to return to the name breadability.
+- Fork: Added property ``Article.main_text`` for getting text annotated with
  semantic HTML tags (<em>, <strong>, ...).
- Join node with 1 child of the same type. From
+- Fork: Join node with 1 child of the same type. From
  ``<div><div>...</div></div>`` we get ``<div>...</div>``.
- Don't change <div> to <p> if it contains <p> elements.
- Renamed test generation helper 'readability_newtest' -> 'readability_test'.
- Renamed package to readability. (Renamed back)
- Added support for Python >= 3.2.
- Py3k compatible package 'charade' is used instead of 'chardet'.
+- Fork: Don't change <div> to <p> if it contains <p> elements.
+- Fork: Renamed test generation helper 'readability_newtest' -> 'readability_test'.
+- Fork: Renamed package to readability. (Renamed back)
+- Fork: Added support for Python >= 3.2.
+- Fork: Py3k compatible package 'charade' is used instead of 'chardet'.

 0.1.14 (Nov 7th 2013)
 ----------------------
--- a/CREDITS.txt
+++ b/CREDITS.txt
@ -1,4 +0,0 @@
-Rick Harding
-nhnifong
-Craig Maloney
-Mišo Belica
--- a/LICENSE.rst
+++ b/LICENSE.rst
@ -1,4 +1,4 @@
-Copyright (c) 2013 Rick Harding, Michal Belica and contributors
+Copyright (c) 2013 Rick Harding and contributors

 All rights reserved.

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,3 +1,4 @@
 include README.rst
 include CHANGELOG.rst
 include LICENSE.rst
+include AUTHORS.txt
--- a/README.rst
+++ b/README.rst
@ -19,8 +19,12 @@ This is a pretty straight port of the JS here:
 - http://code.google.com/p/arc90labs-readability/source/browse/trunk/js/readability.js#82
 - http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/

-Some other ports:

+Alternatives
+------------
+
+- https://github.com/codelucas/newspaper
+- https://github.com/grangier/python-goose
 - https://github.com/aidanf/BTE
 - http://www.unixuser.org/~euske/python/webstemmer/#extract
 - https://github.com/al3xandru/readability.py
@ -51,8 +55,7 @@ Tests
 -----
 .. code-block:: bash

-    $ nosetests --with-coverage --cover-package=breadability --cover-erase tests
-    $ nosetests-3.3 --with-coverage --cover-package=breadability --cover-erase tests
+    $ nosetests-2.6 tests && nosetests-3.2 tests && nosetests-2.7 tests && nosetests-3.3 tests


 Usage
--- a/breadability/scoring.py
+++ b/breadability/scoring.py
@ -91,15 +91,11 @@ def get_link_density(node, node_text=None):
    if text_length == 0:
        return 0.0

-    link_length = sum(
-        [len(a.text_content()) or 0 for a in node.findall(".//a")]
-    )
-
-    # For each img, give 50 bonus chars worth of length.
+    links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
+    # Give 50 bonus chars worth of length for each img.
    # Tweaking this 50 down a notch should help if we hit false positives.
-    links_length = max(
-        link_length - sum([50 for img in node.findall(".//img")]), 0
-    )
+    img_bonuses = 50 * len(node.findall(".//img"))
+    links_length = max(0, links_length - img_bonuses)

    return links_length / text_length

--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,9 @@
-charade
-coverage
 docopt>=0.6.1,<0.7
+charade
 lxml
-nose
+
 nose-selecttests
-pep8
 pylint
+coverage
+nose
+pep8
--- a/setup.cfg
+++ b/setup.cfg
@ -0,0 +1,7 @@
+[nosetests]
+with-coverage=1
+cover-package=breadability
+cover-erase=1
+
+[wheel]
+universal=1
--- a/setup.py
+++ b/setup.py
@ -20,8 +20,11 @@ install_requires = [
    "lxml>=2.0",
 ]
 tests_require = [
+    "nose-selecttests",
    "coverage",
+    "pylint",
    "nose",
+    "pep8",
 ]


@ -80,7 +83,7 @@ setup(
    zip_safe=False,
    install_requires=install_requires,
    tests_require=tests_require,
-    test_suite="tests.run_tests.run",
+    test_suite="nose.collector",
    entry_points={
        "console_scripts": console_script_targets,
    }
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@ -1,35 +0,0 @@
-# -*- coding: utf8 -*-
-
-from __future__ import print_function
-
-import sys
-import atexit
-import nose
-
-from os.path import dirname, abspath
-
-
-DEFAULT_PARAMS = [
-    "nosetests",
-    "--with-coverage",
-    "--cover-package=breadability",
-    "--cover-erase",
-]
-
-
-@atexit.register
-def exit_function(msg="Shutting down"):
-    print(msg, file=sys.stderr)
-
-
-def run(argv=[]):
-    sys.exitfunc = exit_function
-
-    nose.run(
-        argv=DEFAULT_PARAMS + argv,
-        defaultTest=abspath(dirname(__file__)),
-    )
-
-
-if __name__ == "__main__":
-    run(sys.argv[1:])
--- a/tests/test_articles/test_businessinsider_com/__init__.py
+++ b/tests/test_articles/test_businessinsider_com/__init__.py
--- a/tests/test_articles/test_businessinsider_com/article.html
+++ b/tests/test_articles/test_businessinsider_com/article.html
--- a/tests/test_articles/test_businessinsider_com/test.py
+++ b/tests/test_articles/test_businessinsider_com/test.py
@ -0,0 +1,39 @@
+# -*- coding: utf8 -*-
+
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from os.path import join, dirname
+from breadability.readable import Article
+from ...compat import unittest
+
+
+class TestArticle(unittest.TestCase):
+    """
+    Test the scoring and parsing of the article from URL below:
+    http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
+    """
+
+    def setUp(self):
+        """Load up the article for us"""
+        article_path = join(dirname(__file__), "article.html")
+        with open(article_path, "rb") as file:
+            self.document = Article(file.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")
+
+    def tearDown(self):
+        """Drop the article"""
+        self.document = None
+
+    def test_parses(self):
+        """Verify we can parse the document."""
+        self.assertIn('id="readabilityBody"', self.document.readable)
+
+    def test_images_preserved(self):
+        """The images belonging to the article should be preserved."""
+        images = [
+            'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg',
+            'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg',
+        ]
+
+        for image in images:
+            self.assertIn(image, self.document.readable, image)
--- a/tests/test_articles/test_sweetshark/test.py
+++ b/tests/test_articles/test_sweetshark/test.py
@ -1,31 +1,33 @@
-import os
-try:
-    # Python < 2.7
-    import unittest2 as unittest
-except ImportError:
-    import unittest
+# -*- coding: utf8 -*-

+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+from os.path import join, dirname
 from breadability.readable import Article
+from ...compat import unittest


 class TestSweetsharkBlog(unittest.TestCase):
-    """Test the scoring and parsing of the Blog Post"""
+    """
+    Test the scoring and parsing of the article from URL below:
+    http://sweetshark.livejournal.com/11564.html
+    """

    def setUp(self):
        """Load up the article for us"""
-        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
-        self.article = open(article_path).read()
+        article_path = join(dirname(__file__), "article.html")
+        with open(article_path, "rb") as file:
+            self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html")

    def tearDown(self):
        """Drop the article"""
-        self.article = None
+        self.document = None

    def test_parses(self):
        """Verify we can parse the document."""
-        doc = Article(self.article)
-        self.assertTrue('id="readabilityBody"' in doc.readable)
+        self.assertIn('id="readabilityBody"', self.document.readable)

    def test_content_after_video(self):
        """The div with the comments should be removed."""
-        doc = Article(self.article)
-        self.assertTrue('Stay hungry, Stay foolish' in doc.readable)
+        self.assertIn('Stay hungry, Stay foolish', self.document.readable)