Start process of adding a newtest script for generating test cases

- Add the new breadability_newtest tool for generating test cases.
- Add fixes for the scripting.com test failure.
pull/11/head
Richard Harding 12 years ago
parent 3b00d33ad3
commit 5704eb4c15

@ -45,6 +45,8 @@ setup(name='breadability',
},
entry_points={
'console_scripts':
['breadability=breadability:client.main']
['breadability=breadability:client.main',
'breadability_newtest=breadability:newtest.main',
]
}
)

@ -1,2 +1,3 @@
# Package version string; the newtest script reports it through --version.
VERSION = '0.1.3'
import client
from scripts import newtest

@ -106,6 +106,10 @@ class LogHelper(object):
"""Turn on this logger."""
self._active = True
def deactivate(self):
    """Turn this logger off by clearing its active flag."""
    self._active = False
def log(self, node, action, description):
"""Write out our log info based on the node and event specified.

@ -11,6 +11,7 @@ from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import generate_hash_id
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
@ -252,6 +253,7 @@ def clean_conditionally(node):
if node.tag not in target_tags:
# this is not the tag you're looking for
LNODE.log(node, 2, 'Node cleared.')
return
weight = get_class_weight(node)
@ -261,6 +263,7 @@ def clean_conditionally(node):
if (weight + content_score < 0):
LNODE.log(node, 2, 'Dropping conditional node')
LNODE.log(node, 2, 'Weight + score < 0')
return True
if node.text_content().count(',') < 10:
@ -284,16 +287,7 @@ def clean_conditionally(node):
remove_node = False
if img > p:
# this one has shown to do some extra image removals.
# we could get around this by checking for caption info in the
# images to try to do some scoring of good v. bad images.
# failing example:
# arstechnica.com/science/news/2012/05/1859s
# -great-auroral-stormthe-week-the-sun-touched-the-earth.ars
LNODE.log(node, 2, 'Conditional drop: img > p')
remove_node = True
elif li > p and node.tag != 'ul' and node.tag != 'ol':
if li > p and node.tag != 'ul' and node.tag != 'ol':
LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
remove_node = True
elif inputs > p / 3.0:
@ -315,9 +309,15 @@ def clean_conditionally(node):
LNODE.log(node, 2,
'Conditional drop: embed w/o much content or many embed')
remove_node = True
if remove_node:
LNODE.log(node, 2, 'Node will be removed')
else:
LNODE.log(node, 2, 'Node cleared')
return remove_node
# nope, don't remove anything
LNODE.log(node, 2, 'Node Cleared final.')
return False

@ -0,0 +1,105 @@
import argparse
import codecs
import urllib2
from os import mkdir
from os import path
from breadability import VERSION
# Directory that receives one sub-directory per generated article test:
# ``tests/test_articles`` under the parent of this module's directory.
TESTPATH = path.join(
    path.dirname(path.dirname(__file__)), 'tests', 'test_articles')
# Source text that ``make_files`` writes verbatim to ``test.py`` in every new
# test directory.  The template is itself Python, so it has to carry its own
# indentation; without it the generated module cannot even be imported.
TESTTPL = """
import os

from unittest import TestCase

from breadability.readable import Article


class TestArticle(TestCase):
    \"\"\"Test the scoring and parsing of the Article\"\"\"

    def setUp(self):
        \"\"\"Load up the article for us\"\"\"
        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
        self.article = open(article_path).read()

    def tearDown(self):
        \"\"\"Drop the article\"\"\"
        self.article = None

    def test_parses(self):
        \"\"\"Verify we can parse the document.\"\"\"
        doc = Article(self.article)
        self.assertTrue('id="readabilityBody"' in doc.readable)

    def test_content_exists(self):
        \"\"\"Verify that some content exists.\"\"\"
        pass

    def test_content_does_not_exist(self):
        \"\"\"Verify we cleaned out some content that shouldn't exist.\"\"\"
        pass
"""
def parse_args():
    """Parse the command line of the new-test generator.

    Returns the ``argparse`` namespace.  It carries ``name`` (the required
    ``-n/--name`` option) and ``url`` (a one-item list holding the
    positional URL argument).
    """
    parser = argparse.ArgumentParser(
        description=(
            "breadability helper to generate a new set of article test files."
        ))

    parser.add_argument(
        '--version',
        action='version',
        version=VERSION)
    parser.add_argument(
        '-n', '--name',
        action='store',
        required=True,
        help='Name of the test directory')
    parser.add_argument(
        'url',
        metavar='URL',
        type=str,
        nargs=1,
        help='The url of content to fetch for the article.html')

    return parser.parse_args()
def make_dir(name):
    """Create the directory for a new article test and return its path.

    Spaces in *name* are replaced with underscores and the result is
    prefixed with ``test_``.  The directory is created directly under
    ``TESTPATH``.
    """
    target = path.join(TESTPATH, 'test_' + name.replace(' ', '_'))
    mkdir(target)
    return target
def make_files(dirname):
    """Put the package marker and the test skeleton into *dirname*.

    ``__init__.py`` is opened in append mode and closed again, and
    ``test.py`` is written from the ``TESTTPL`` template.
    """
    # Append mode creates the file when it is missing and leaves any
    # existing content alone.
    open(path.join(dirname, '__init__.py'), "a").close()

    with open(path.join(dirname, 'test.py'), 'w') as test_fh:
        test_fh.write(TESTTPL)
def fetch_article(dirname, url):
    """Download *url* and store its body as ``article.html`` in *dirname*.

    The request carries an ``Accept-Charset: utf-8`` header.  The response
    bytes are decoded as UTF-8 and written back out UTF-8 encoded.  Both the
    HTTP response and the output file are closed even when reading, decoding
    or writing raises.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('Accept-Charset', 'utf-8')]

    url_response = opener.open(url)
    try:
        dl_html = url_response.read().decode('utf-8')
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        url_response.close()

    with codecs.open(path.join(dirname, 'article.html'), "w", "utf-8") as fh:
        fh.write(dl_html)
def main():
    """Build a new article test directory from the command line arguments.

    Parses the arguments, creates the test directory, writes the skeleton
    files into it and finally downloads the article HTML.
    """
    options = parse_args()
    target_dir = make_dir(options.name)
    make_files(target_dir)
    # ``url`` is declared with nargs=1, so argparse hands back a one-item
    # list rather than a bare string.
    fetch_article(target_dir, options.url[0])


if __name__ == '__main__':
    main()

@ -36,5 +36,3 @@ class TestAntipopeBlog(TestCase):
"""
doc = Article(self.article)
self.assertTrue('id="beta"' not in doc.readable)

File diff suppressed because one or more lines are too long

@ -0,0 +1,66 @@
import os
from operator import attrgetter
from unittest import TestCase
from breadability.readable import Article
from breadability.readable import check_siblings
from breadability.readable import prep_article
class TestArticle(TestCase):
    """Test the scoring and parsing of the Article"""

    def setUp(self):
        """Read the saved ``article.html`` that sits next to this module."""
        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
        # A context manager closes the fixture's file handle right away
        # instead of leaving it to the garbage collector.
        with open(article_path) as article_fh:
            self.article = article_fh.read()

    def tearDown(self):
        """Drop the article"""
        self.article = None

    def test_parses(self):
        """Verify we can parse the document."""
        doc = Article(self.article)
        self.assertTrue('id="readabilityBody"' in doc.readable)

    def test_content_exists(self):
        """Verify article text is kept and 'Linkblog updated' is not."""
        doc = Article(self.article)
        self.assertTrue('Amazon and Google' in doc.readable)
        self.assertFalse('Linkblog updated' in doc.readable)

    def test_candidates(self):
        """Verify the wanted candidate exists, is kept, and scores highest."""
        doc = Article(self.article)
        wanted_hash = '04e46055'

        # Debugging aid: to trace the node handling while this test runs,
        # import LNODE and set_logging_level from breadability.logconfig,
        # call set_logging_level('DEBUG') and LNODE.activate() here, then
        # set_logging_level('INFO') and LNODE.deactivate() at the end.
        found = False
        for node in doc.candidates.values():
            if node.hash_id == wanted_hash:
                found = node

        self.assertTrue(found)

        # we have the right node, it must be deleted for some reason if it's
        # not still there when we need it to be.
        # Make sure it's not in our to drop list.
        for node in doc._should_drop:
            self.assertFalse(node == found.node)

        by_score = sorted(
            doc.candidates.values(),
            key=attrgetter('content_score'),
            reverse=True)
        self.assertTrue(by_score[0].node == found.node)

        # No assertion follows: these two calls only need to complete
        # without raising.
        updated_winner = check_siblings(by_score[0], doc.candidates)
        updated_winner.node = prep_article(updated_winner.node)

        # This article hits up against the img > p conditional filtering
        # because of the many .gif images in the content. We've removed that
        # rule.

@ -1,5 +1,6 @@
import time
#
# Copyright (c) 2011 Christopher Arndt, MIT License
#

Loading…
Cancel
Save