Start to add some processing for the readable content
- Add removal of style, script, etc. bits in the content
pull/4/merge
parent
2e7fb0aa89
commit
e93a52a748
@ -1,10 +1,23 @@
|
||||
from breadable.document import OriginalDocument
|
||||
from breadability.document import OriginalDocument
|
||||
from breadability.utils import cached_property
|
||||
|
||||
def drop_tag(doc, *tags):
    """Remove every element carrying one of the given tag names from doc.

    Matches are looked up with ``doc.iterfind(".//" + tag)`` and removed by
    calling ``drop_tree()`` on each of them, so ``doc`` is changed in place.

    :param doc: node exposing ``iterfind``; matched nodes must expose
        ``drop_tree``.
    :param tags: tag names to strip, e.g. ``'script', 'style'``.
    :returns: the same ``doc`` object that was passed in.
    """
    for tag in tags:
        # Snapshot the matches before removing anything: dropping nodes while
        # the lazy iterfind() iterator is still walking the tree is not safe
        # and may cause matches to be missed.
        for node in list(doc.iterfind(".//" + tag)):
            node.drop_tree()
    return doc
|
||||
|
||||
|
||||
class Article(object):
    """Parsed readable object"""

    def __init__(self, html, url=None):
        """Wrap the incoming html in an OriginalDocument.

        :param html: the html content handed to ``OriginalDocument``.
        :param url: optional url, passed through to ``OriginalDocument``.
        """
        self.orig = OriginalDocument(html, url=url)

    # NOTE(review): ttl=600 presumably caches the result for 600 seconds;
    # confirm against breadability.utils.cached_property.
    @cached_property(ttl=600)
    def readable(self):
        """The readable parsed article

        Takes ``self.orig.html`` and strips the ``script``, ``link``,
        ``style`` and ``noscript`` elements from it.
        """
        doc = self.orig.html
        # NOTE(review): no copy is made here, so whatever tree
        # self.orig.html returns is the one that gets stripped -- confirm
        # that modifying it is intended.
        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
        return doc
|
||||
|
||||
|
@ -0,0 +1,9 @@
|
||||
from os import path
|
||||
|
||||
|
||||
TEST_DIR = path.dirname(__file__)
|
||||
|
||||
|
||||
def load_snippet(filename):
    """Helper to fetch in the content of a test snippet

    :param filename: name of a file inside the ``test_snippets`` directory
        located under ``TEST_DIR``.
    :returns: the full content of that file as returned by ``read()``.
    """
    snippet_path = path.join(TEST_DIR, 'test_snippets', filename)
    # Read inside a with-block so the handle is closed as soon as we are
    # done, rather than whenever the file object happens to be collected.
    with open(snippet_path) as snippet:
        return snippet.read()
|
@ -0,0 +1,22 @@
|
||||
from collections import defaultdict
|
||||
from unittest import TestCase
|
||||
|
||||
from breadability.readable import Article
|
||||
from breadability.tests import load_snippet
|
||||
|
||||
|
||||
class TestOriginalDocument(TestCase):
    """Verify Article turns html into a cleaned up readable document."""

    def test_load_doc(self):
        """We get back an element tree from our original doc"""
        doc = Article(load_snippet('document_min.html'))
        self.assertEqual(doc.readable.tag, 'html')

    def test_doc_no_scripts_styles(self):
        """Step #1 remove all scripts from the document"""
        doc = Article(load_snippet('document_scripts.html'))
        readable = doc.readable
        # Check every tag that Article.readable strips, including noscript,
        # so the test stays in step with the implementation.
        for tag in ('script', 'style', 'link', 'noscript'):
            self.assertEqual(readable.findall(".//" + tag), [])
|
@ -0,0 +1,24 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>Min Document Title</title>
|
||||
<script src="something.js"></script>
|
||||
<script src="something.js" />
|
||||
<link rel="stylesheet" href="style.css" type="text/css">
|
||||
<style type="text/css">
|
||||
body {
|
||||
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Min Document</h1>
|
||||
<a href="/about.hml">About Us</a>
|
||||
<a href="http://blog.mitechie.com/test.hml">About Us</a>
|
||||
<a href="http://amazon.com/test.hml">Amazon</a>
|
||||
<div id="footer">
|
||||
<script type="text/javascript">
|
||||
// please go away for readability
|
||||
</script>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue