Property of ``Article`` with annotated text

pull/21/head
Mišo Belica 11 years ago
parent 7337e2fb38
commit 0df3a95c1e

@ -5,6 +5,7 @@ from __future__ import absolute_import
import re import re
import logging import logging
from copy import deepcopy
from operator import attrgetter from operator import attrgetter
from pprint import PrettyPrinter from pprint import PrettyPrinter
from lxml.html.clean import Cleaner from lxml.html.clean import Cleaner
@ -26,6 +27,13 @@ html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
SCORABLE_TAGS = ("div", "p", "td", "pre", "article") SCORABLE_TAGS = ("div", "p", "td", "pre", "article")
# Tags that ``Article.readable_annotated_text`` keeps in its output; any
# descendant of the readable body whose tag is not listed here is dropped
# with ``drop_tag()``. Entries are kept in alphabetical order, one initial
# letter per line.
ANNOTATION_TAGS = (
    "a", "abbr", "acronym",
    "b", "big", "blink", "blockquote", "br",
    "cite", "code",
    "dd", "del", "dir", "dl", "dt",
    "em",
    "font",
    "h", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
    "i", "ins",
    "kbd",
    "li",
    "marquee", "menu",
    "ol",
    "p", "pre",
    "q",
    "s", "samp", "span", "strike", "strong", "sub", "sup",
    "tt",
    "u", "ul", "var",
)
NULL_DOCUMENT = """ NULL_DOCUMENT = """
<html> <html>
<head> <head>
@ -393,6 +401,15 @@ class Article(object):
candidates, self._should_drop = find_candidates(dom) candidates, self._should_drop = find_candidates(dom)
return candidates return candidates
@cached_property
def readable_annotated_text(self):
    """
    Return a copy of the readable DOM reduced to annotation markup.

    The work is done on a deep copy of ``self.readable_dom``, so the
    readable DOM itself is not modified. Every descendant of the element
    with id ``readabilityBody`` whose tag is not listed in
    ``ANNOTATION_TAGS`` is removed with ``drop_tag()``.

    :returns: The root of the copied and filtered DOM.
    """
    dom = deepcopy(self.readable_dom)
    body = dom.get_element_by_id("readabilityBody")

    # Snapshot the descendants before touching the tree: ``drop_tag()``
    # restructures it, and mutating a tree while ``iterdescendants()`` is
    # still walking it is not guaranteed to visit every node.
    for node in list(body.iterdescendants()):
        if node.tag not in ANNOTATION_TAGS:
            node.drop_tag()

    return dom
@cached_property @cached_property
def readable(self): def readable(self):
return tounicode(self.readable_dom) return tounicode(self.readable_dom)

@ -0,0 +1,21 @@
<html>
<head>
<meta http-equiv="charset" content="utf-8"/>
<title>This is title of document</title>
</head>
<body>
<div>Inline text is not so good, but it's here.</div>
<div class="article">
<div class="wrapper">
<p>
Paragraph is more <em>better</em>.
This text is very <strong>pretty</strong> 'cause she's girl.
</p>
<p>
This is not <big>crap</big> so <dfn title="Make me readable">readability</dfn> me :)
</p>
</div>
</div>
<div>And some next not so <b>good</b> text.</div>
</body>
</html>

@ -285,3 +285,32 @@ class TestSiblings(unittest.TestCase):
@unittest.skip("Not implemented yet.") @unittest.skip("Not implemented yet.")
def test_good_siblings_counted(self): def test_good_siblings_counted(self):
raise NotImplementedError() raise NotImplementedError()
class TestAnnotatedText(unittest.TestCase):
    """Checks for the ``Article.readable_annotated_text`` property."""

    def test_empty(self):
        """An empty input serializes to the parsing-error body."""
        annotated = Article("").readable_annotated_text

        self.assertEqual(
            tounicode(annotated),
            '<div id="readabilityBody" class="parsing-error"/>',
        )

    def test_no_annotations(self):
        """A paragraph without inline markup is serialized as-is."""
        html = "<div><p>This is text with no annotations</p></div>"
        annotated = Article(html).readable_annotated_text

        self.assertEqual(
            tounicode(annotated),
            '<div id="readabilityBody"><p>This is text with no annotations</p></div>',
        )

    def test_one_annotation(self):
        """An inline ``<del>`` annotation is kept in the serialized output."""
        html = "<div><p>This is text with <del>no</del> annotations</p></div>"
        annotated = Article(html).readable_annotated_text

        self.assertEqual(
            tounicode(annotated),
            '<div id="readabilityBody"><p>This is text with <del>no</del> annotations</p></div>',
        )

    def test_simple_document(self):
        """The article paragraphs are present and the outer divs are not."""
        article = Article(load_snippet("annotated_1.html"))
        text = article.readable_annotated_text.text_content()

        self.assertIn("Paragraph is more better", text)
        self.assertIn("This is not crap so readability me :)", text)
        self.assertNotIn("not so good", text)

Loading…
Cancel
Save