diff --git a/readability/readable.py b/readability/readable.py index 146fbd0..b5ff924 100644 --- a/readability/readable.py +++ b/readability/readable.py @@ -5,6 +5,7 @@ from __future__ import absolute_import import re import logging +from copy import deepcopy from operator import attrgetter from pprint import PrettyPrinter from lxml.html.clean import Cleaner @@ -26,6 +27,13 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, SCORABLE_TAGS = ("div", "p", "td", "pre", "article") +ANNOTATION_TAGS = ( + "a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite", + "code", "dd", "del", "dir", "dl", "dt", "em", "font", "h", "h1", "h2", + "h3", "h4", "h5", "h6", "hr", "i", "ins", "kbd", "li", "marquee", "menu", + "ol", "p", "pre", "q", "s", "samp", "span", "strike", "strong", "sub", + "sup", "tt", "u", "ul", "var", +) NULL_DOCUMENT = """
@@ -393,6 +401,28 @@ class Article(object):
         candidates, self._should_drop = find_candidates(dom)
         return candidates
 
+    @cached_property
+    def readable_annotated_text(self):
+        """Return a copy of the readable DOM keeping only annotation tags.
+
+        Despite the name, the value is an element tree, not a string.
+        Each descendant of the ``readabilityBody`` element whose tag is
+        not listed in ``ANNOTATION_TAGS`` is removed with ``drop_tag()``.
+        The work is done on a deep copy, so ``readable_dom`` itself is
+        left untouched. ``get_element_by_id`` is called without a
+        default, so a missing ``readabilityBody`` propagates its error.
+        """
+        dom = deepcopy(self.readable_dom)
+        body = dom.get_element_by_id("readabilityBody")
+        # Snapshot the descendants before editing the tree: drop_tag()
+        # restructures it, and stepping a live iterator across such
+        # changes is not something to rely on.
+        for node in list(body.iterdescendants()):
+            if node.tag not in ANNOTATION_TAGS:
+                node.drop_tag()
+
+        return dom
+
     @cached_property
     def readable(self):
         return tounicode(self.readable_dom)
diff --git a/tests/data/snippets/annotated_1.html b/tests/data/snippets/annotated_1.html
new file mode 100644
index 0000000..1eadf0d
--- /dev/null
+++ b/tests/data/snippets/annotated_1.html
@@ -0,0 +1,21 @@
+ + + ++ Paragraph is more better. + This text is very pretty 'cause she's girl. +
++ This is not crap so readability me :) +
+This is text with no annotations
This is text with no annotations
This is text with no annotations
This is text with no annotations