Property of ``Article`` with annotated text

11 years ago · 0df3a95c1e
parent 7337e2fb38
commit 0df3a95c1e
3 changed files with 67 additions and 0 deletions
--- a/readability/readable.py
+++ b/readability/readable.py
@ -5,6 +5,7 @@ from __future__ import absolute_import
 import re
 import logging

+from copy import deepcopy
 from operator import attrgetter
 from pprint import PrettyPrinter
 from lxml.html.clean import Cleaner
@ -26,6 +27,13 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,


 SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
+# Tags kept by ``Article.readable_annotated_text``; every other descendant of
+# the readable body is unwrapped there with ``drop_tag()``.
+ANNOTATION_TAGS = (
+    "a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite",
+    "code", "dd", "del", "dir", "dl", "dt", "em", "font", "h", "h1", "h2",
+    "h3", "h4", "h5", "h6", "hr", "i", "ins", "kbd", "li", "marquee", "menu",
+    "ol", "p", "pre", "q", "s", "samp", "span", "strike", "strong", "sub",
+    "sup", "tt", "u", "ul", "var",
+)
 NULL_DOCUMENT = """
 <html>
    <head>
@ -393,6 +401,15 @@ class Article(object):
        candidates, self._should_drop = find_candidates(dom)
        return candidates

+    @cached_property
+    def readable_annotated_text(self):
+        """Return a copy of the readable DOM keeping only annotation markup.
+
+        Works on a deep copy, so ``self.readable_dom`` is left untouched.
+        Every descendant of the ``readabilityBody`` element whose tag is not
+        listed in ``ANNOTATION_TAGS`` is unwrapped with ``drop_tag()``, which
+        removes the tag itself but keeps its text and children in the tree.
+        """
+        dom = deepcopy(self.readable_dom)
+        body = dom.get_element_by_id("readabilityBody")
+        # Snapshot the descendants first: ``drop_tag()`` restructures the tree
+        # and should not run while the lazy iterator is still walking it.
+        for node in list(body.iterdescendants()):
+            if node.tag not in ANNOTATION_TAGS:
+                node.drop_tag()
+
+        return dom
+
    @cached_property
    def readable(self):
        return tounicode(self.readable_dom)
--- a/tests/data/snippets/annotated_1.html
+++ b/tests/data/snippets/annotated_1.html
@ -0,0 +1,21 @@
+<html>
+<head>
+	<meta http-equiv="charset" content="utf-8"/>
+	<title>This is title of document</title>
+</head>
+<body>
+	<div>Inline text is not so good, but it's here.</div>
+	<div class="article">
+		<div class="wrapper">
+			<p>
+				Paragraph is more <em>better</em>.
+				This text is very <strong>pretty</strong> 'cause she's girl.
+			</p>
+			<p>
+				This is not <big>crap</big> so <dfn title="Make me readable">readability</dfn> me :)
+			</p>
+		</div>
+	</div>
+	<div>And some next not so <b>good</b> text.</div>
+</body>
+</html>
--- a/tests/test_readable.py
+++ b/tests/test_readable.py
@ -285,3 +285,32 @@ class TestSiblings(unittest.TestCase):
    @unittest.skip("Not implemented yet.")
    def test_good_siblings_counted(self):
        raise NotImplementedError()
+
+
+class TestAnnotatedText(unittest.TestCase):
+    """Checks for the ``Article.readable_annotated_text`` property."""
+
+    def _serialized(self, html):
+        """Return the annotated DOM built from ``html``, passed to ``tounicode``."""
+        annotated = Article(html).readable_annotated_text
+        return tounicode(annotated)
+
+    def test_empty(self):
+        """Empty input serializes to the ``parsing-error`` body element."""
+        expected = '<div id="readabilityBody" class="parsing-error"/>'
+        self.assertEqual(self._serialized(""), expected)
+
+    def test_no_annotations(self):
+        """A plain paragraph comes through unchanged inside the body element."""
+        source = "<div><p>This is text with no annotations</p></div>"
+        expected = '<div id="readabilityBody"><p>This is text with no annotations</p></div>'
+        self.assertEqual(self._serialized(source), expected)
+
+    def test_one_annotation(self):
+        """A ``<del>`` element inside the paragraph is preserved."""
+        source = "<div><p>This is text with <del>no</del> annotations</p></div>"
+        expected = '<div id="readabilityBody"><p>This is text with <del>no</del> annotations</p></div>'
+        self.assertEqual(self._serialized(source), expected)
+
+    def test_simple_document(self):
+        """The snippet's article text is present and the filler text is not."""
+        snippet = load_snippet("annotated_1.html")
+        text = Article(snippet).readable_annotated_text.text_content()
+
+        self.assertIn("Paragraph is more better", text)
+        self.assertIn("This is not crap so readability me :)", text)
+
+        self.assertNotIn("not so good", text)