Property of ``Article`` with annotated text

11 years ago · 0df3a95c1e
parent 7337e2fb38
commit 0df3a95c1e
3 changed files with 67 additions and 0 deletions
--- a/readability/readable.py
+++ b/readability/readable.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import
 import re
 import logging
 from copy import deepcopy
 from operator import attrgetter
 from pprint import PrettyPrinter
 from lxml.html.clean import Cleaner
@@ -26,6 +27,13 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
 SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
# Tag names that ``Article.readable_annotated_text`` leaves in place; any
# other descendant element of the readable body is unwrapped via ``drop_tag()``.
# NOTE(review): "h" does not look like a standard HTML element name, and
# "dfn" (used by the annotated_1.html test snippet) is not listed - confirm
# both are intentional.
ANNOTATION_TAGS = (
    "a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite",
    "code", "dd", "del", "dir", "dl", "dt", "em", "font", "h", "h1", "h2",
    "h3", "h4", "h5", "h6", "hr", "i", "ins", "kbd", "li", "marquee", "menu",
    "ol", "p", "pre", "q", "s", "samp", "span", "strike", "strong", "sub",
    "sup", "tt", "u", "ul", "var",
 )
 NULL_DOCUMENT = """
 <html>
    <head>
@@ -393,6 +401,15 @@ class Article(object):
        candidates, self._should_drop = find_candidates(dom)
        return candidates
    @cached_property
    def readable_annotated_text(self):
        """
        Return a copy of the readable DOM reduced to annotation markup.

        The work is done on a deep copy, so ``self.readable_dom`` is not
        modified. Every descendant of the element with id ``readabilityBody``
        whose tag is not listed in ``ANNOTATION_TAGS`` is unwrapped with
        ``drop_tag()``; the copied DOM is returned.
        """
        dom = deepcopy(self.readable_dom)
        body = dom.get_element_by_id("readabilityBody")

        # Snapshot the descendants before touching the tree: ``drop_tag()``
        # re-parents children, and mutating a tree while a live iterator is
        # walking it is undefined for ElementTree-style APIs.
        for node in list(body.iterdescendants()):
            if node.tag not in ANNOTATION_TAGS:
                node.drop_tag()

        return dom
    @cached_property
    def readable(self):
        """Return ``self.readable_dom`` serialized through ``tounicode``."""
        readable_dom = self.readable_dom
        return tounicode(readable_dom)
--- a/tests/data/snippets/annotated_1.html
+++ b/tests/data/snippets/annotated_1.html
@@ -0,0 +1,21 @@
 <html>
 <head>
 	<meta http-equiv="charset" content="utf-8"/>
 	<title>This is title of document</title>
 </head>
 <body>
 	<div>Inline text is not so good, but it's here.</div>
 	<div class="article">
 		<div class="wrapper">
 			<p>
 				Paragraph is more <em>better</em>.
 				This text is very <strong>pretty</strong> 'cause she's girl.
 			</p>
 			<p>
 				This is not <big>crap</big> so <dfn title="Make me readable">readability</dfn> me :)
 			</p>
 		</div>
 	</div>
 	<div>And some next not so <b>good</b> text.</div>
 </body>
 </html>
--- a/tests/test_readable.py
+++ b/tests/test_readable.py
@@ -285,3 +285,32 @@ class TestSiblings(unittest.TestCase):
    # Placeholder test: skipped unconditionally by the decorator; the body
    # only raises so that removing the skip fails loudly until it is written.
    @unittest.skip("Not implemented yet.")
    def test_good_siblings_counted(self):
        raise NotImplementedError()
 class TestAnnotatedText(unittest.TestCase):
    """Checks for ``Article.readable_annotated_text``."""

    def _annotated_dom(self, html):
        # Build an article from raw HTML and hand back its annotated DOM.
        return Article(html).readable_annotated_text

    def test_empty(self):
        # An empty document yields the parsing-error placeholder element.
        expected = '<div id="readabilityBody" class="parsing-error"/>'
        annotated = self._annotated_dom("")
        self.assertEqual(tounicode(annotated), expected)

    def test_no_annotations(self):
        # Plain paragraph markup passes through unchanged.
        expected = '<div id="readabilityBody"><p>This is text with no annotations</p></div>'
        annotated = self._annotated_dom(
            "<div><p>This is text with no annotations</p></div>")
        self.assertEqual(tounicode(annotated), expected)

    def test_one_annotation(self):
        # A single annotation tag (<del>) is kept in the output.
        expected = '<div id="readabilityBody"><p>This is text with <del>no</del> annotations</p></div>'
        annotated = self._annotated_dom(
            "<div><p>This is text with <del>no</del> annotations</p></div>")
        self.assertEqual(tounicode(annotated), expected)

    def test_simple_document(self):
        # Text of the main article survives; the surrounding filler does not.
        annotated = self._annotated_dom(load_snippet("annotated_1.html"))
        content = annotated.text_content()
        self.assertIn("Paragraph is more better", content)
        self.assertIn("This is not crap so readability me :)", content)
        self.assertNotIn("not so good", content)