From dc0493f99bdcbdb8f8e356d0a777ee0ea9b326de Mon Sep 17 00:00:00 2001
From: Richard Harding
Date: Fri, 29 Nov 2013 12:08:26 -0500
Subject: [PATCH] Update to catch back up to craig's image helper

---
 readability/scoring.py                               | 10 +++++++++-
 readability/scripts/client.py                        |  5 ++++-
 tests/test_articles/test_businessinsider-com/test.py |  2 +-
 tests/test_articles/test_sweetshark/test.py          |  2 +-
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/readability/scoring.py b/readability/scoring.py
index 65344c3..3b90db8 100644
--- a/readability/scoring.py
+++ b/readability/scoring.py
@@ -90,7 +90,15 @@ def get_link_density(node, node_text=None):
     if text_length == 0:
         return 0.0
 
-    links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
+
+    link_length = sum([len(a.text_content()) or 0
+                       for a in node.findall(".//a")])
+
+    # For each img, give 50 bonus chars worth of length.
+    # Tweaking this 50 down a notch should help if we hit false positives.
+    links_length = max(link_length -
+                       sum([50 for img in node.findall(".//img")]), 0)
+
     return links_length / text_length
 
 
diff --git a/readability/scripts/client.py b/readability/scripts/client.py
index acb1783..8025b98 100644
--- a/readability/scripts/client.py
+++ b/readability/scripts/client.py
@@ -37,7 +37,10 @@ from ..readable import Article
 
 
 HEADERS = {
-    "User-Agent": "Readability (Readable content parser) Version/%s" % __version__,
+    "User-Agent": 'breadability/{version} ({url})'.format(
+        url="https://github.com/bookieio/breadability",
+        version=__version__
+    )
 }
 
 
diff --git a/tests/test_articles/test_businessinsider-com/test.py b/tests/test_articles/test_businessinsider-com/test.py
index 54d5570..0d850be 100644
--- a/tests/test_articles/test_businessinsider-com/test.py
+++ b/tests/test_articles/test_businessinsider-com/test.py
@@ -5,7 +5,7 @@ try:
 except ImportError:
     import unittest
 
-from breadability.readable import Article
+from readability.readable import Article
 
 
 class TestBusinessInsiderArticle(unittest.TestCase):
diff --git a/tests/test_articles/test_sweetshark/test.py b/tests/test_articles/test_sweetshark/test.py
index e4e498c..7580e05 100644
--- a/tests/test_articles/test_sweetshark/test.py
+++ b/tests/test_articles/test_sweetshark/test.py
@@ -5,7 +5,7 @@ try:
 except ImportError:
     import unittest
 
-from breadability.readable import Article
+from readability.readable import Article
 
 
 class TestSweetsharkBlog(unittest.TestCase):