Property of ``Article`` with annotated text

pull/21/head
Mišo Belica 11 years ago
parent 7337e2fb38
commit 0df3a95c1e

@ -5,6 +5,7 @@ from __future__ import absolute_import
import re
import logging
from copy import deepcopy
from operator import attrgetter
from pprint import PrettyPrinter
from lxml.html.clean import Cleaner
@ -26,6 +27,13 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
# Tag names presumably treated as scorable content containers.
# NOTE(review): the code that reads this tuple is not visible in this hunk — confirm usage.
SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
# Tag names that ``Article.readable_annotated_text`` leaves in place; every
# descendant of the readable body whose tag is NOT listed here has
# ``drop_tag()`` called on it.
# NOTE(review): "h" is not a standard HTML element name (h1-h6 are listed
# separately) — confirm it is intentional.
ANNOTATION_TAGS = (
"a", "abbr", "acronym", "b", "big", "blink", "blockquote", "br", "cite",
"code", "dd", "del", "dir", "dl", "dt", "em", "font", "h", "h1", "h2",
"h3", "h4", "h5", "h6", "hr", "i", "ins", "kbd", "li", "marquee", "menu",
"ol", "p", "pre", "q", "s", "samp", "span", "strike", "strong", "sub",
"sup", "tt", "u", "ul", "var",
)
NULL_DOCUMENT = """
<html>
<head>
@ -393,6 +401,15 @@ class Article(object):
candidates, self._should_drop = find_candidates(dom)
return candidates
@cached_property
def readable_annotated_text(self):
    """Return a copy of the readable DOM reduced to annotation tags.

    Works on a deep copy of ``self.readable_dom`` so the original DOM is
    not modified. Every descendant of the element with id
    ``readabilityBody`` whose tag is not in ``ANNOTATION_TAGS`` has
    ``drop_tag()`` called on it.

    Despite the name, the returned value is the copied DOM object, not a
    string.

    NOTE(review): ``get_element_by_id`` is called without a default, so a
    DOM lacking ``readabilityBody`` presumably raises ``KeyError`` —
    confirm against the lxml version in use.
    """
    dom = deepcopy(self.readable_dom)
    body = dom.get_element_by_id("readabilityBody")

    # Snapshot the descendants before mutating: ``drop_tag()`` restructures
    # the tree, and walking a live iterator over a tree that is being
    # modified relies on iterator internals.
    for node in list(body.iterdescendants()):
        if node.tag not in ANNOTATION_TAGS:
            node.drop_tag()

    return dom
@cached_property
def readable(self):
    """Return ``self.readable_dom`` serialized with ``tounicode``."""
    readable_dom = self.readable_dom
    return tounicode(readable_dom)

@ -0,0 +1,21 @@
<html>
<head>
<meta http-equiv="charset" content="utf-8"/>
<title>This is title of document</title>
</head>
<body>
<div>Inline text is not so good, but it's here.</div>
<div class="article">
<div class="wrapper">
<p>
Paragraph is more <em>better</em>.
This text is very <strong>pretty</strong> 'cause she's girl.
</p>
<p>
This is not <big>crap</big> so <dfn title="Make me readable">readability</dfn> me :)
</p>
</div>
</div>
<div>And some next not so <b>good</b> text.</div>
</body>
</html>

@ -285,3 +285,32 @@ class TestSiblings(unittest.TestCase):
# Placeholder for a test that has not been written yet. The skip decorator
# makes the runner report it as skipped with the given reason; the raise
# below is only reached if the decorator is removed before a real body
# is added.
@unittest.skip("Not implemented yet.")
def test_good_siblings_counted(self):
raise NotImplementedError()
class TestAnnotatedText(unittest.TestCase):
    """Tests for ``Article.readable_annotated_text``."""

    def test_empty(self):
        """An empty input serializes to the parsing-error body."""
        annotated = Article("").readable_annotated_text
        expected = '<div id="readabilityBody" class="parsing-error"/>'
        self.assertEqual(tounicode(annotated), expected)

    def test_no_annotations(self):
        """A plain paragraph is serialized inside the readable body."""
        markup = "<div><p>This is text with no annotations</p></div>"
        annotated = Article(markup).readable_annotated_text
        expected = (
            '<div id="readabilityBody">'
            '<p>This is text with no annotations</p>'
            '</div>'
        )
        self.assertEqual(tounicode(annotated), expected)

    def test_one_annotation(self):
        """A ``<del>`` element inside the paragraph is preserved."""
        markup = "<div><p>This is text with <del>no</del> annotations</p></div>"
        annotated = Article(markup).readable_annotated_text
        expected = (
            '<div id="readabilityBody">'
            '<p>This is text with <del>no</del> annotations</p>'
            '</div>'
        )
        self.assertEqual(tounicode(annotated), expected)

    def test_simple_document(self):
        """Article paragraphs are present; the inline ``div`` text is not."""
        markup = load_snippet("annotated_1.html")
        text = Article(markup).readable_annotated_text.text_content()

        self.assertIn("Paragraph is more better", text)
        self.assertIn("This is not crap so readability me :)", text)
        self.assertNotIn("not so good", text)

Loading…
Cancel
Save