Rename it back. Respect others

pull/23/head
Richard Harding 11 years ago
parent 4cbde9cb5a
commit 1fc153d850

@@ -34,8 +34,8 @@ clean_venv:
 rm -rf bin include lib local man share
 .PHONY: develop
-develop: lib/python*/site-packages/readability.egg-link
-lib/python*/site-packages/readability.egg-link:
+develop: lib/python*/site-packages/breadability.egg-link
+lib/python*/site-packages/breadability.egg-link:
 $(PY) setup.py develop

@@ -1,4 +1,4 @@
-Readability.py - another readability Python port
+breadability - another readability Python port
 =================================================
 .. image:: https://api.travis-ci.org/bookieio/breadability.png?branch=master
 :target: https://travis-ci.org/bookieio/breadability.py

@@ -8,4 +8,4 @@ from __future__ import (
 )
 import pkg_resources
-__version__ = pkg_resources.get_distribution("readability").version
+__version__ = pkg_resources.get_distribution("breadability").version

@@ -56,7 +56,6 @@ def to_bytes(object):
 return to_bytes(repr(object))
 def to_unicode(object):
 try:
 if isinstance(object, unicode):

@@ -15,7 +15,7 @@ from ._compat import unicode, to_bytes, to_unicode, unicode_compatible
 from .utils import cached_property
-logger = logging.getLogger("readability")
+logger = logging.getLogger("breadability")
 TAG_MARK_PATTERN = re.compile(to_bytes(r"</?[^>]*>\s*"))

@@ -13,12 +13,17 @@ from lxml.html import fragment_fromstring, fromstring
 from .document import OriginalDocument
 from .annotated_text import AnnotatedTextHandler
-from .scoring import (score_candidates, get_link_density, get_class_weight,
-is_unlikely_node)
+from .scoring import (
+get_class_weight,
+get_link_density,
+is_unlikely_node,
+score_candidates,
+)
 from .utils import cached_property, shrink_text
-html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
+html_cleaner = Cleaner(
+scripts=True, javascript=True, comments=True,
 style=True, links=True, meta=False, add_nofollow=False,
 page_structure=False, processing_instructions=True,
 embedded=False, frames=False, forms=False,
@@ -44,7 +49,7 @@ NULL_DOCUMENT = """
 </html>
 """
-logger = logging.getLogger("readability")
+logger = logging.getLogger("breadability")
 def ok_embedded_video(node):
@@ -129,7 +134,8 @@ def check_siblings(candidate_node, candidate_list):
 content_bonus += candidate_node.content_score * 0.2
 if sibling in candidate_list:
-adjusted_score = candidate_list[sibling].content_score + content_bonus
+adjusted_score = \
+candidate_list[sibling].content_score + content_bonus
 if adjusted_score >= sibling_target_score:
 append = True
@@ -146,7 +152,8 @@ def check_siblings(candidate_node, candidate_list):
 append = True
 if append:
-logger.debug("Sibling appended: %s %r", sibling.tag, sibling.attrib)
+logger.debug(
+"Sibling appended: %s %r", sibling.tag, sibling.attrib)
 if sibling.tag not in ("div", "p"):
 # We have a node that isn't a common block level element, like
 # a form or td tag. Turn it into a div so it doesn't get
@@ -191,7 +198,8 @@ def clean_document(node):
 if n.tag in ("div", "p"):
 text_content = shrink_text(n.text_content())
 if len(text_content) < 5 and not n.getchildren():
-logger.debug("Dropping %s %r without content.", n.tag, n.attrib)
+logger.debug(
+"Dropping %s %r without content.", n.tag, n.attrib)
 to_drop.append(n)
 # finally try out the conditional cleaning of the target node
@@ -206,7 +214,8 @@ def clean_document(node):
 def drop_nodes_with_parents(nodes):
 for node in nodes:
 if node.getparent() is not None:
-logger.debug("Droping node with parent %s %r", node.tag, node.attrib)
+logger.debug(
+"Droping node with parent %s %r", node.tag, node.attrib)
 node.drop_tree()
@@ -231,7 +240,8 @@ def clean_conditionally(node):
 commas_count = node.text_content().count(',')
 if commas_count < 10:
-logger.debug("There are %d commas so we're processing more.", commas_count)
+logger.debug(
+"There are %d commas so we're processing more.", commas_count)
 # If there are not very many commas, and the number of
 # non-paragraph elements is more than paragraphs or other ominous
@@ -267,7 +277,8 @@ def clean_conditionally(node):
 logger.debug('Conditional drop: weight big but link heavy')
 remove_node = True
 elif (embed == 1 and content_length < 75) or embed > 1:
-logger.debug('Conditional drop: embed w/o much content or many embed')
+logger.debug(
+'Conditional drop: embed w/o much content or many embed')
 remove_node = True
 if remove_node:
@@ -305,10 +316,12 @@ def find_candidates(document):
 for node in document.iter():
 if is_unlikely_node(node):
-logger.debug("We should drop unlikely: %s %r", node.tag, node.attrib)
+logger.debug(
+"We should drop unlikely: %s %r", node.tag, node.attrib)
 should_remove.add(node)
 elif is_bad_link(node):
-logger.debug("We should drop bad link: %s %r", node.tag, node.attrib)
+logger.debug(
+"We should drop bad link: %s %r", node.tag, node.attrib)
 should_remove.add(node)
 elif node.tag in SCORABLE_TAGS:
 nodes_to_score.add(node)
@@ -403,7 +416,8 @@ class Article(object):
 return self._handle_no_candidates()
 # right now we return the highest scoring candidate content
-best_candidates = sorted((c for c in self.candidates.values()),
+best_candidates = sorted(
+(c for c in self.candidates.values()),
 key=attrgetter("content_score"), reverse=True)
 printer = PrettyPrinter(indent=2)
@@ -415,9 +429,11 @@ class Article(object):
 updated_winner = check_siblings(winner, self.candidates)
 updated_winner.node = prep_article(updated_winner.node)
 if updated_winner.node is not None:
-dom = build_base_document(updated_winner.node, self._return_fragment)
+dom = build_base_document(
+updated_winner.node, self._return_fragment)
 else:
-logger.warning('Had candidates but failed to find a cleaned winning DOM.')
+logger.warning(
+'Had candidates but failed to find a cleaned winning DOM.')
 dom = self._handle_no_candidates()
 return self._remove_orphans(dom.get_element_by_id("readabilityBody"))
@@ -437,7 +453,8 @@ class Article(object):
 if self.dom is not None and len(self.dom):
 dom = prep_article(self.dom)
 dom = build_base_document(dom, self._return_fragment)
-return self._remove_orphans(dom.get_element_by_id("readabilityBody"))
+return self._remove_orphans(
+dom.get_element_by_id("readabilityBody"))
 else:
 logger.warning("No document to use.")
 return build_error_document(self._return_fragment)
@@ -454,7 +471,8 @@ def leaf_div_elements_into_paragraphs(document):
 for element in document.iter(tag="div"):
 child_tags = tuple(n.tag for n in element.getchildren())
 if "div" not in child_tags and "p" not in child_tags:
-logger.debug("Changing leaf block element <%s> into <p>", element.tag)
+logger.debug(
+"Changing leaf block element <%s> into <p>", element.tag)
 element.tag = "p"
 return document

@@ -17,9 +17,9 @@ from .utils import normalize_whitespace
 # A series of sets of attributes we check to help in determining if a node is
 # a potential candidate or not.
 CLS_UNLIKELY = re.compile(
-"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|"
-"sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|tweet|"
-"twitter|social|breadcrumb",
+"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|"
+"shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|perma|popup|"
+"tweet|twitter|social|breadcrumb",
 re.IGNORECASE
 )
 CLS_MAYBE = re.compile(
@@ -32,12 +32,12 @@ CLS_WEIGHT_POSITIVE = re.compile(
 )
 CLS_WEIGHT_NEGATIVE = re.compile(
 "combx|comment|com-|contact|foot|footer|footnote|head|masthead|media|meta|"
-"outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|"
-"widget",
+"outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|"
+"tool|widget",
 re.IGNORECASE
 )
-logger = logging.getLogger("readability")
+logger = logging.getLogger("breadability")
 def check_node_attributes(pattern, node, *attributes):
@@ -52,6 +52,7 @@ def check_node_attributes(pattern, node, *attributes):
 return False
 def generate_hash_id(node):
 """
 Generates a hash_id for the node in question.
@@ -60,7 +61,7 @@ def generate_hash_id(node):
 """
 try:
 content = tostring(node)
-except Exception as e:
+except Exception:
 logger.exception("Generating of hash failed")
 content = to_bytes(repr(node))
@@ -90,14 +91,15 @@ def get_link_density(node, node_text=None):
 if text_length == 0:
 return 0.0
-link_length = sum([len(a.text_content()) or 0
-for a in node.findall(".//a")])
+link_length = sum(
+[len(a.text_content()) or 0 for a in node.findall(".//a")]
+)
 # For each img, give 50 bonus chars worth of length.
 # Tweaking this 50 down a notch should help if we hit false positives.
-links_length = max(link_length -
-sum([50 for img in node.findall(".//img")]), 0)
+links_length = max(
+link_length - sum([50 for img in node.findall(".//img")]), 0
+)
 return links_length / text_length
@@ -148,8 +150,8 @@ def score_candidates(nodes):
 for node in nodes:
 logger.debug("* Scoring candidate %s %r", node.tag, node.attrib)
-# if the node has no parent it knows of
-# then it ends up creating a body & html tag to parent the html fragment
+# if the node has no parent it knows of then it ends up creating a
+# body & html tag to parent the html fragment
 parent = node.getparent()
 if parent is None:
 logger.debug("Skipping candidate - parent node is 'None'.")
@@ -163,7 +165,9 @@ def score_candidates(nodes):
 # if paragraph is < `MIN_HIT_LENTH` characters don't even count it
 inner_text = node.text_content().strip()
 if len(inner_text) < MIN_HIT_LENTH:
-logger.debug("Skipping candidate - inner text < %d characters.", MIN_HIT_LENTH)
+logger.debug(
+"Skipping candidate - inner text < %d characters.",
+MIN_HIT_LENTH)
 continue
 # initialize readability data for the parent
@@ -186,7 +190,8 @@ def score_candidates(nodes):
 # subtract 0.5 points for each double quote within this paragraph
 double_quotes_count = inner_text.count('"')
 content_score += double_quotes_count * -0.5
-logger.debug("Penalty points for %d double-quotes.", double_quotes_count)
+logger.debug(
+"Penalty points for %d double-quotes.", double_quotes_count)
 # for every 100 characters in this paragraph, add another point
 # up to 3 points
@@ -195,12 +200,14 @@ def score_candidates(nodes):
 logger.debug("Bonus points for length of text: %f", length_points)
 # add the score to the parent
-logger.debug("Bonus points for parent %s %r with score %f: %f",
+logger.debug(
+"Bonus points for parent %s %r with score %f: %f",
 parent.tag, parent.attrib, candidates[parent].content_score,
 content_score)
 candidates[parent].content_score += content_score
 # the grand node gets half
-logger.debug("Bonus points for grand %s %r with score %f: %f",
+logger.debug(
+"Bonus points for grand %s %r with score %f: %f",
 grand.tag, grand.attrib, candidates[grand].content_score,
 content_score / 2.0)
 candidates[grand].content_score += content_score / 2.0
@@ -212,7 +219,8 @@ def score_candidates(nodes):
 for candidate in candidates.values():
 adjustment = 1.0 - get_link_density(candidate.node)
 candidate.content_score *= adjustment
-logger.debug("Link density adjustment for %s %r: %f",
+logger.debug(
+"Link density adjustment for %s %r: %f",
 candidate.node.tag, candidate.node.attrib, adjustment)
 return candidates

@@ -4,9 +4,9 @@
 A fast python port of arc90's readability tool
 Usage:
-readability [options] <resource>
-readability --version
-readability --help
+breadability [options] <resource>
+breadability --version
+breadability --help
 Arguments:
 <resource> URL or file path to process in readable form.
@@ -50,7 +50,7 @@ def parse_args():
 def main():
 args = parse_args()
-logger = logging.getLogger("readability")
+logger = logging.getLogger("breadability")
 if args["--verbose"]:
 logger.setLevel(logging.DEBUG)

@@ -1,12 +1,12 @@
 # -*- coding: utf8 -*-
 """
-Helper to generate a new set of article test files for readability.
+Helper to generate a new set of article test files for breadability.
 Usage:
-readability_test --name <name> <url>
-readability_test --version
-readability_test --help
+breadability_test --name <name> <url>
+breadability_test --version
+breadability_test --help
 Arguments:
 <url> The url of content to fetch for the article.html
@@ -39,7 +39,7 @@ from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals
 from os.path import join, dirname
-from readability.readable import Article
+from breadability.readable import Article
 from ...compat import unittest

@@ -6,6 +6,9 @@ from __future__ import division, print_function, unicode_literals
 import re
+MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
 def is_blank(text):
 """
 Returns ``True`` if string contains only whitespace characters
@@ -18,7 +21,6 @@ def shrink_text(text):
 return normalize_whitespace(text.strip())
-MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)
 def normalize_whitespace(text):
 """
 Translates multiple whitespace into single space character.

@@ -29,10 +29,10 @@ if sys.version_info < (2, 7):
 install_requires.append("unittest2")
 console_script_targets = [
-"readability = readability.scripts.client:main",
-"readability-{0} = readability.scripts.client:main",
-"readability_test = readability.scripts.test_helper:main",
-"readability_test-{0} = readability.scripts.test_helper:main",
+"breadability = breadability.scripts.client:main",
+"breadability-{0} = breadability.scripts.client:main",
+"breadability_test = breadability.scripts.test_helper:main",
+"breadability_test-{0} = breadability.scripts.test_helper:main",
 ]
 console_script_targets = [
 target.format(VERSION_SUFFIX) for target in console_script_targets
@@ -40,21 +40,22 @@ console_script_targets = [
 setup(
-name="readability",
+name="breadability",
 version=VERSION,
 description="Port of Readability HTML parser in Python",
 long_description=long_description,
 keywords=[
+"bookie",
+"breadability",
+"content",
+"HTML",
+"parsing",
 "readability",
 "readable",
-"parsing",
-"HTML",
-"content",
-"bookie",
 ],
 author="Rick Harding",
 author_email="rharding@mitechie.com",
-url="https://github.com/bookieio/readability",
+url="https://github.com/bookieio/breadability",
 license="BSD",
 classifiers=[
 "Development Status :: 5 - Production/Stable",

@@ -12,7 +12,7 @@ from os.path import dirname, abspath
 DEFAULT_PARAMS = [
 "nosetests",
 "--with-coverage",
-"--cover-package=readability",
+"--cover-package=breadability",
 "--cover-erase",
 ]

@@ -1,11 +1,15 @@
 # -*- coding: utf8 -*-
-from __future__ import absolute_import
-from __future__ import division, print_function, unicode_literals
+from __future__ import (
+absolute_import,
+division,
+print_function,
+unicode_literals
+)
 from lxml.html import fragment_fromstring, document_fromstring
-from readability.readable import Article
-from readability.annotated_text import AnnotatedTextHandler
+from breadability.readable import Article
+from breadability.annotated_text import AnnotatedTextHandler
 from .compat import unittest
 from .utils import load_snippet, load_article

@@ -5,7 +5,7 @@ from __future__ import division, print_function, unicode_literals
 import os
-from readability.readable import Article
+from breadability.readable import Article
 from ...compat import unittest

@@ -5,7 +5,7 @@ try:
 except ImportError:
 import unittest
-from readability.readable import Article
+from breadability.readable import Article
 class TestBusinessInsiderArticle(unittest.TestCase):

@@ -4,8 +4,8 @@ from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals
 from os.path import join, dirname
-from readability.readable import Article
-from readability._compat import unicode
+from breadability.readable import Article
+from breadability._compat import unicode
 from ...compat import unittest

@@ -1,14 +1,18 @@
 # -*- coding: utf8 -*-
-from __future__ import absolute_import
-from __future__ import division, print_function, unicode_literals
+from __future__ import (
+absolute_import,
+division,
+print_function,
+unicode_literals
+)
 import os
 from operator import attrgetter
-from readability.readable import Article
-from readability.readable import check_siblings
-from readability.readable import prep_article
+from breadability.readable import Article
+from breadability.readable import check_siblings
+from breadability.readable import prep_article
 from ...compat import unittest
@@ -57,7 +61,8 @@ class TestArticle(unittest.TestCase):
 for node in doc._should_drop:
 self.assertFalse(node == found.node)
-by_score = sorted([c for c in doc.candidates.values()],
+by_score = sorted(
+[c for c in doc.candidates.values()],
 key=attrgetter('content_score'), reverse=True)
 self.assertTrue(by_score[0].node == found.node)

@@ -5,7 +5,7 @@ try:
 except ImportError:
 import unittest
-from readability.readable import Article
+from breadability.readable import Article
 class TestSweetsharkBlog(unittest.TestCase):

@@ -4,9 +4,12 @@ from __future__ import absolute_import
 from __future__ import division, print_function, unicode_literals
 from collections import defaultdict
-from readability._compat import to_unicode, to_bytes
-from readability.document import (OriginalDocument, determine_encoding,
-convert_breaks_to_paragraphs)
+from breadability._compat import to_unicode, to_bytes
+from breadability.document import (
+convert_breaks_to_paragraphs,
+determine_encoding,
+OriginalDocument,
+)
 from .compat import unittest
 from .utils import load_snippet
@@ -18,14 +21,16 @@ class TestOriginalDocument(unittest.TestCase):
 returned = convert_breaks_to_paragraphs(
 "<div>HI<br><br>How are you?<br><br> \t \n <br>Fine\n I guess</div>")
-self.assertEqual(returned,
+self.assertEqual(
+returned,
 "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")
 def test_convert_hr_tags_to_paragraphs(self):
 returned = convert_breaks_to_paragraphs(
 "<div>HI<br><br>How are you?<hr/> \t \n <br>Fine\n I guess</div>")
-self.assertEqual(returned,
+self.assertEqual(
+returned,
 "<div>HI</p><p>How are you?</p><p>Fine\n I guess</div>")
 def test_readin_min_document(self):
@@ -79,7 +84,7 @@ class TestOriginalDocument(unittest.TestCase):
 def test_encoding(self):
 text = "ľščťžýáíéäúňôůě".encode("iso-8859-2")
-encoding = determine_encoding(text)
+determine_encoding(text)
 def test_encoding_short(self):
 text = "ľščťžýáíé".encode("iso-8859-2")

@@ -6,14 +6,16 @@ from __future__ import division, print_function, unicode_literals
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
-from readability._compat import to_unicode
-from readability.readable import Article
-from readability.readable import get_class_weight
-from readability.readable import get_link_density
-from readability.readable import is_bad_link
-from readability.readable import score_candidates
-from readability.readable import leaf_div_elements_into_paragraphs
-from readability.scoring import ScoredNode
+from breadability._compat import to_unicode
+from breadability.readable import (
+Article,
+get_class_weight,
+get_link_density,
+is_bad_link,
+leaf_div_elements_into_paragraphs,
+score_candidates,
+)
+from breadability.scoring import ScoredNode
 from .compat import unittest
 from .utils import load_snippet, load_article
@@ -65,7 +67,6 @@ class TestReadableDocument(unittest.TestCase):
 self.assertEqual(doc.readable_dom.tag, 'div')
 self.assertEqual(doc.readable_dom.get('id'), 'readabilityBody')
 def test_no_content(self):
 """Without content we supply an empty unparsed doc."""
 doc = Article('')
@@ -81,10 +82,11 @@ class TestCleaning(unittest.TestCase):
 """Verify we wipe out things from our unlikely list."""
 doc = Article(load_snippet('test_readable_unlikely.html'))
 readable = doc.readable_dom
-must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
-'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
-'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
-'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']
+must_not_appear = [
+'comment', 'community', 'disqus', 'extra', 'foot',
+'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
+'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
+'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']
 want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']
@@ -127,9 +129,8 @@ class TestCleaning(unittest.TestCase):
 '</div></body></html>')
 test_doc2 = document_fromstring(test_html2)
 self.assertEqual(
-tounicode(
-leaf_div_elements_into_paragraphs(test_doc2)),
-to_unicode('<html><body><p>simple<a href="">link</a></p></body></html>')
+tounicode(leaf_div_elements_into_paragraphs(test_doc2)),
+to_unicode('<html><body><p>simple<a href="">link</a></p></body></html>')
 )
 def test_dont_transform_div_with_div(self):

@@ -8,14 +8,18 @@ import re
 from operator import attrgetter
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
-from readability.readable import Article
-from readability.scoring import check_node_attributes
-from readability.scoring import get_class_weight
-from readability.scoring import ScoredNode
-from readability.scoring import score_candidates
-from readability.scoring import generate_hash_id
-from readability.readable import get_link_density
-from readability.readable import is_unlikely_node
+from breadability.readable import Article
+from breadability.scoring import (
+check_node_attributes,
+generate_hash_id,
+get_class_weight,
+score_candidates,
+ScoredNode,
+)
+from breadability.readable import (
+get_link_density,
+is_unlikely_node,
+)
 from .compat import unittest
 from .utils import load_snippet
@@ -60,7 +64,8 @@ class TestCheckNodeAttr(unittest.TestCase):
 test_node = fragment_fromstring('<div/>')
 test_node.set('class', 'test2 comment')
-self.assertTrue(check_node_attributes(test_pattern, test_node, 'class'))
+self.assertTrue(
+check_node_attributes(test_pattern, test_node, 'class'))
 def test_has_id(self):
 """Verify that a node has an id in our set."""
@@ -75,7 +80,8 @@ class TestCheckNodeAttr(unittest.TestCase):
 test_pattern = re.compile('test1|test2', re.I)
 test_node = fragment_fromstring('<div/>')
 test_node.set('class', 'test4 comment')
-self.assertFalse(check_node_attributes(test_pattern, test_node, 'class'))
+self.assertFalse(
+check_node_attributes(test_pattern, test_node, 'class'))
 def test_lacks_id(self):
 """Verify that a node does not have an id in our set."""
@@ -266,7 +272,8 @@ class TestScoreCandidates(unittest.TestCase):
 div_nodes = dom.findall(".//div")
 candidates = score_candidates(div_nodes)
-ordered = sorted((c for c in candidates.values()), reverse=True,
+ordered = sorted(
+(c for c in candidates.values()), reverse=True,
 key=attrgetter("content_score"))
 self.assertEqual(ordered[0].node.tag, "div")
