diff --git a/readability/readable.py b/readability/readable.py
index 146fbd0..b5ff924 100644
--- a/readability/readable.py
+++ b/readability/readable.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import
 
 import re
 import logging
+from copy import deepcopy
 from operator import attrgetter
 from pprint import PrettyPrinter
 from lxml.html.clean import Cleaner
@@ -26,6 +27,15 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
 
 
 SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
+# Tags preserved by Article.readable_annotated_text; descendants with any
+# other tag are unwrapped via drop_tag().
+ANNOTATION_TAGS = (
+    "a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite",
+    "code", "dd", "del", "dir", "dl", "dt", "em", "font", "h", "h1", "h2",
+    "h3", "h4", "h5", "h6", "hr", "i", "ins", "kbd", "li", "marquee", "menu",
+    "ol", "p", "pre", "q", "s", "samp", "span", "strike", "strong", "sub",
+    "sup", "tt", "u", "ul", "var",
+)
 
 
 NULL_DOCUMENT = """
@@ -393,6 +403,27 @@ class Article(object):
         candidates, self._should_drop = find_candidates(dom)
         return candidates
 
+    @cached_property
+    def readable_annotated_text(self):
+        """
+        Return a copy of the readable DOM reduced to annotation tags.
+
+        Works on a deep copy of ``self.readable_dom``. Every descendant
+        of the element with ID "readabilityBody" whose tag is not listed
+        in ``ANNOTATION_TAGS`` is unwrapped with ``drop_tag()``. Despite
+        the property name, the value is a DOM tree, not a string.
+        """
+        dom = deepcopy(self.readable_dom)
+        body = dom.get_element_by_id("readabilityBody")
+
+        # Collect the descendants before touching the tree: drop_tag()
+        # restructures it, and mutating a tree while iterating is fragile.
+        for node in list(body.iterdescendants()):
+            if node.tag not in ANNOTATION_TAGS:
+                node.drop_tag()
+
+        return dom
+
     @cached_property
     def readable(self):
         return tounicode(self.readable_dom)
diff --git a/tests/data/snippets/annotated_1.html b/tests/data/snippets/annotated_1.html
new file mode 100644
index 0000000..1eadf0d
--- /dev/null
+++ b/tests/data/snippets/annotated_1.html
@@ -0,0 +1,21 @@
+
+
+
+ This is title of document
+
+
+
Inline text is not so good, but it's here.
+
+
+

+ Paragraph is more better. + This text is very pretty 'cause she's girl. +

+

+ This is not crap so readability me :) +

+
+
+
And some next not so good text.
+ + diff --git a/tests/test_readable.py b/tests/test_readable.py index 6fe6c8c..b296052 100644 --- a/tests/test_readable.py +++ b/tests/test_readable.py @@ -285,3 +285,32 @@ class TestSiblings(unittest.TestCase): @unittest.skip("Not implemented yet.") def test_good_siblings_counted(self): raise NotImplementedError() + + +class TestAnnotatedText(unittest.TestCase): + def test_empty(self): + article = Article("") + dom = article.readable_annotated_text + self.assertEqual(tounicode(dom), + '
') + + def test_no_annotations(self): + article = Article("

This is text with no annotations

") + dom = article.readable_annotated_text + self.assertEqual(tounicode(dom), + '

This is text with no annotations

') + + def test_one_annotation(self): + article = Article("

This is text with no annotations

") + dom = article.readable_annotated_text + self.assertEqual(tounicode(dom), + '

This is text with no annotations

') + + def test_simple_document(self): + article = Article(load_snippet("annotated_1.html")) + dom = article.readable_annotated_text + + self.assertIn("Paragraph is more better", dom.text_content()) + self.assertIn("This is not crap so readability me :)", dom.text_content()) + + self.assertNotIn("not so good", dom.text_content())