Merge pull request #21 from miso-belica/upstream-sync

Synchronize with miso-belica/readability.py fork
pull/23/head
Rick Harding 11 years ago
commit 6d8a76a2b9

2
.gitignore vendored

@ -1,4 +1,4 @@
*.pyc
*.py[co]
*.prof
.coverage

@ -6,6 +6,8 @@ python:
- "3.3"
before_install: sudo apt-get install libxml2-dev libxslt-dev
# command to install dependencies
install: pip install -r requirements.txt --use-mirrors
install:
- python setup.py install
- pip install -r requirements.txt --use-mirrors
# command to run tests
script: python setup.py install && nosetests tests
script: nosetests tests

@ -1,3 +1,4 @@
Rick Harding (original author)
Michal Belica (current maintainer)
nhnifong
Craig Maloney
Mišo Belica

@ -1,37 +1,28 @@
.. :changelog:
Changelog for readability
Changelog for breadability
==========================
0.1.17 (Jan 22nd 2014)
----------------------
- Quiet more log output by lowering it from WARN to INFO
0.1.16 (Jan 22nd 2014)
----------------------
- Stop logging at WARNING level for messages that are not true warnings
0.1.15 (Nov 29th 2013)
-----------------------
Merge changes from 0.1.14 of breadability with the fork
https://github.com/miso-belica/readability.py and tweaking to return to the
name breadability.
From the fork
~~~~~~~~~~~~~~
- Added property ``Article.main_text`` for getting text annotated with
- Merge changes from 0.1.14 of breadability with the fork https://github.com/miso-belica/readability.py and tweaking to return to the name breadability.
- Fork: Added property ``Article.main_text`` for getting text annotated with
semantic HTML tags (<em>, <strong>, ...).
- Join node with 1 child of the same type. From
- Fork: Join node with 1 child of the same type. From
``<div><div>...</div></div>`` we get ``<div>...</div>``.
- Don't change <div> to <p> if it contains <p> elements.
- Renamed test generation helper 'readability_newtest' -> 'readability_test'.
- Renamed package to readability. (Renamed back)
- Added support for Python >= 3.2.
- Py3k compatible package 'charade' is used instead of 'chardet'.
- Fork: Don't change <div> to <p> if it contains <p> elements.
- Fork: Renamed test generation helper 'readability_newtest' -> 'readability_test'.
- Fork: Renamed package to readability. (Renamed back)
- Fork: Added support for Python >= 3.2.
- Fork: Py3k compatible package 'charade' is used instead of 'chardet'.
0.1.14 (Nov 7th 2013)
----------------------

@ -1,4 +0,0 @@
Rick Harding
nhnifong
Craig Maloney
Mišo Belica

@ -1,4 +1,4 @@
Copyright (c) 2013 Rick Harding, Michal Belica and contributors
Copyright (c) 2013 Rick Harding and contributors
All rights reserved.

@ -1,3 +1,4 @@
include README.rst
include CHANGELOG.rst
include LICENSE.rst
include AUTHORS.txt

@ -19,8 +19,12 @@ This is a pretty straight port of the JS here:
- http://code.google.com/p/arc90labs-readability/source/browse/trunk/js/readability.js#82
- http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/
Some other ports:
Alternatives
------------
- https://github.com/codelucas/newspaper
- https://github.com/grangier/python-goose
- https://github.com/aidanf/BTE
- http://www.unixuser.org/~euske/python/webstemmer/#extract
- https://github.com/al3xandru/readability.py
@ -51,8 +55,7 @@ Tests
-----
.. code-block:: bash
$ nosetests --with-coverage --cover-package=breadability --cover-erase tests
$ nosetests-3.3 --with-coverage --cover-package=breadability --cover-erase tests
$ nosetests-2.6 tests && nosetests-3.2 tests && nosetests-2.7 tests && nosetests-3.3 tests
Usage

@ -91,15 +91,11 @@ def get_link_density(node, node_text=None):
if text_length == 0:
return 0.0
link_length = sum(
[len(a.text_content()) or 0 for a in node.findall(".//a")]
)
# For each img, give 50 bonus chars worth of length.
links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
# Give 50 bonus chars worth of length for each img.
# Tweaking this 50 down a notch should help if we hit false positives.
links_length = max(
link_length - sum([50 for img in node.findall(".//img")]), 0
)
img_bonuses = 50 * len(node.findall(".//img"))
links_length = max(0, links_length - img_bonuses)
return links_length / text_length

@ -1,8 +1,9 @@
charade
coverage
docopt>=0.6.1,<0.7
charade
lxml
nose
nose-selecttests
pep8
pylint
coverage
nose
pep8

@ -0,0 +1,7 @@
[nosetests]
with-coverage=1
cover-package=breadability
cover-erase=1
[wheel]
universal=1

@ -20,8 +20,11 @@ install_requires = [
"lxml>=2.0",
]
tests_require = [
"nose-selecttests",
"coverage",
"pylint",
"nose",
"pep8",
]
@ -80,7 +83,7 @@ setup(
zip_safe=False,
install_requires=install_requires,
tests_require=tests_require,
test_suite="tests.run_tests.run",
test_suite="nose.collector",
entry_points={
"console_scripts": console_script_targets,
}

@ -1,35 +0,0 @@
# -*- coding: utf8 -*-
from __future__ import print_function
import sys
import atexit
import nose
from os.path import dirname, abspath
DEFAULT_PARAMS = [
"nosetests",
"--with-coverage",
"--cover-package=breadability",
"--cover-erase",
]
@atexit.register
def exit_function(msg="Shutting down"):
    """Write ``msg`` followed by a newline to standard error.

    The ``atexit.register`` decorator registers this function when the
    module is loaded, so it is also called with the default message at
    interpreter exit.
    """
    sys.stderr.write("{0}\n".format(msg))
def run(argv=None):
    """Run the tests located next to this file through ``nose.run``.

    :param argv: Optional sequence of extra command-line arguments that are
        appended after ``DEFAULT_PARAMS``. ``None`` (the default) adds
        nothing.
    """
    # A None default avoids sharing one mutable list object between calls.
    extra_args = [] if argv is None else list(argv)

    # sys.exitfunc is deliberately not assigned here: exit_function is
    # already registered through atexit, and on Python 2 overwriting
    # sys.exitfunc replaces the atexit dispatcher so that no other
    # registered exit handler would run.
    nose.run(
        argv=DEFAULT_PARAMS + extra_args,
        defaultTest=abspath(dirname(__file__)),
    )
if __name__ == "__main__":
run(sys.argv[1:])

File diff suppressed because it is too large Load Diff

@ -0,0 +1,39 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestArticle(unittest.TestCase):
    """
    Test the scoring and parsing of the article from URL below:
    http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8
    """

    def setUp(self):
        """Build an Article from the ``article.html`` file next to this module."""
        article_path = join(dirname(__file__), "article.html")
        # The fixture is read as raw bytes and handed to Article unchanged.
        # The second argument is presumably the source URL of the page;
        # confirm against Article's signature.
        with open(article_path, "rb") as fixture:
            self.document = Article(fixture.read(), "http://www.businessinsider.com/tech-ceos-favorite-productivity-hacks-2013-8")

    def tearDown(self):
        """Drop the article"""
        self.document = None

    def test_parses(self):
        """Verify we can parse the document."""
        self.assertIn('id="readabilityBody"', self.document.readable)

    def test_images_preserved(self):
        """The listed image file names should appear in the readable output."""
        images = [
            'bharath-kumar-a-co-founder-at-pugmarksme-suggests-working-on-a-sunday-late-night.jpg',
            'bryan-guido-hassin-a-university-professor-and-startup-junkie-uses-airplane-days.jpg',
        ]

        for image in images:
            # The file name doubles as the failure message so a missing
            # image is identified directly in the test report.
            self.assertIn(image, self.document.readable, image)

@ -1,31 +1,33 @@
import os
try:
# Python < 2.7
import unittest2 as unittest
except ImportError:
import unittest
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from os.path import join, dirname
from breadability.readable import Article
from ...compat import unittest
class TestSweetsharkBlog(unittest.TestCase):
"""Test the scoring and parsing of the Blog Post"""
"""
Test the scoring and parsing of the article from URL below:
http://sweetshark.livejournal.com/11564.html
"""
def setUp(self):
"""Load up the article for us"""
article_path = os.path.join(os.path.dirname(__file__), 'article.html')
self.article = open(article_path).read()
article_path = join(dirname(__file__), "article.html")
with open(article_path, "rb") as file:
self.document = Article(file.read(), "http://sweetshark.livejournal.com/11564.html")
def tearDown(self):
"""Drop the article"""
self.article = None
self.document = None
def test_parses(self):
"""Verify we can parse the document."""
doc = Article(self.article)
self.assertTrue('id="readabilityBody"' in doc.readable)
self.assertIn('id="readabilityBody"', self.document.readable)
def test_content_after_video(self):
"""The content that follows the video should be kept."""
doc = Article(self.article)
self.assertTrue('Stay hungry, Stay foolish' in doc.readable)
self.assertIn('Stay hungry, Stay foolish', self.document.readable)

Loading…
Cancel
Save