Start process of adding a newtest script for generating test cases

- Adds new breadability_newtest tool for generating test cases.
- Adds fixes for the scripting.com test failure.
12 years ago · 5704eb4c15
parent 3b00d33ad3
commit 5704eb4c15
11 changed files with 214 additions and 13 deletions
--- a/setup.py
+++ b/setup.py
@@ -45,6 +45,8 @@ setup(name='breadability',
    },
    entry_points={
        'console_scripts':
-            ['breadability=breadability:client.main']
+            ['breadability=breadability:client.main',
+             'breadability_newtest=breadability:newtest.main',
+            ]
    }
 )
--- a/src/breadability/__init__.py
+++ b/src/breadability/__init__.py
@@ -1,2 +1,3 @@
 VERSION = '0.1.3'
 import client
+from scripts import newtest
--- a/src/breadability/logconfig.py
+++ b/src/breadability/logconfig.py
@@ -106,6 +106,10 @@ class LogHelper(object):
        """Turn on this logger."""
        self._active = True

+    def deactivate(self):
+        """Turn off the logger"""
+        self._active = False
+
    def log(self, node, action, description):
        """Write out our log info based on the node and event specified.

--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@@ -11,6 +11,7 @@ from breadability.document import OriginalDocument
 from breadability.logconfig import LOG
 from breadability.logconfig import LNODE
 from breadability.scoring import score_candidates
+from breadability.scoring import generate_hash_id
 from breadability.scoring import get_link_density
 from breadability.scoring import get_class_weight
 from breadability.scoring import is_unlikely_node
@@ -252,6 +253,7 @@ def clean_conditionally(node):

    if node.tag not in target_tags:
        # this is not the tag you're looking for
+        LNODE.log(node, 2, 'Node cleared.')
        return

    weight = get_class_weight(node)
@@ -261,6 +263,7 @@ def clean_conditionally(node):

    if (weight + content_score < 0):
        LNODE.log(node, 2, 'Dropping conditional node')
+        LNODE.log(node, 2, 'Weight + score < 0')
        return True

    if node.text_content().count(',') < 10:
@@ -284,16 +287,7 @@ def clean_conditionally(node):

        remove_node = False

-        if img > p:
-            # this one has shown to do some extra image removals.
-            # we could get around this by checking for caption info in the
-            # images to try to do some scoring of good v. bad images.
-            # failing example:
-            # arstechnica.com/science/news/2012/05/1859s
-            # -great-auroral-stormthe-week-the-sun-touched-the-earth.ars
-            LNODE.log(node, 2, 'Conditional drop: img > p')
-            remove_node = True
-        elif li > p and node.tag != 'ul' and node.tag != 'ol':
+        if li > p and node.tag != 'ul' and node.tag != 'ol':
            LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
            remove_node = True
        elif inputs > p / 3.0:
@@ -315,9 +309,15 @@ def clean_conditionally(node):
            LNODE.log(node, 2,
                'Conditional drop: embed w/o much content or many embed')
            remove_node = True
+
+        if remove_node:
+            LNODE.log(node, 2, 'Node will be removed')
+        else:
+            LNODE.log(node, 2, 'Node cleared')
        return remove_node

    # nope, don't remove anything
+    LNODE.log(node, 2, 'Node Cleared final.')
    return False


--- a/src/breadability/scripts/__init__.py
+++ b/src/breadability/scripts/__init__.py
--- a/src/breadability/scripts/newtest.py
+++ b/src/breadability/scripts/newtest.py
@@ -0,0 +1,105 @@
+import argparse
+import codecs
+import urllib2
+from os import mkdir
+from os import path
+
+from breadability import VERSION
+
+
+TESTPATH = path.join(
+            path.dirname(path.dirname(__file__)),
+            'tests', 'test_articles')
+
+TESTTPL = """
+import os
+from unittest import TestCase
+
+from breadability.readable import Article
+
+
+class TestArticle(TestCase):
+    \"\"\"Test the scoring and parsing of the Article\"\"\"
+
+    def setUp(self):
+        \"\"\"Load up the article for us\"\"\"
+        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
+        self.article = open(article_path).read()
+
+    def tearDown(self):
+        \"\"\"Drop the article\"\"\"
+        self.article = None
+
+    def test_parses(self):
+        \"\"\"Verify we can parse the document.\"\"\"
+        doc = Article(self.article)
+        self.assertTrue('id="readabilityBody"' in doc.readable)
+
+    def test_content_exists(self):
+        \"\"\"Verify that some content exists.\"\"\"
+        pass
+
+    def test_content_does_not_exist(self):
+        \"\"\"Verify we cleaned out some content that shouldn't exist.\"\"\"
+        pass
+"""
+
+
+def parse_args():
+    desc = "breadability helper to generate a new set of article test files."
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument('--version',
+        action='version', version=VERSION)
+
+    parser.add_argument('-n', '--name',
+        action='store',
+        required=True,
+        help='Name of the test directory')
+
+    parser.add_argument('url', metavar='URL', type=str, nargs=1,
+        help='The url of content to fetch for the article.html')
+
+    args = parser.parse_args()
+    return args
+
+
+def make_dir(name):
+    """Generate a new directory for tests.
+
+    """
+    dir_name = 'test_' + name.replace(' ', '_')
+    updated_name = path.join(TESTPATH, dir_name)
+    mkdir(updated_name)
+    return updated_name
+
+
+def make_files(dirname):
+    init_file = path.join(dirname, '__init__.py')
+    test_file = path.join(dirname, 'test.py')
+    open(init_file, "a").close()
+    with open(test_file, 'w') as f:
+        f.write(TESTTPL)
+
+
+def fetch_article(dirname, url):
+    """Get the content of the url and make it the article.html"""
+    opener = urllib2.build_opener()
+    opener.addheaders = [('Accept-Charset', 'utf-8')]
+    url_response = opener.open(url)
+    dl_html = url_response.read().decode('utf-8')
+
+    fh = codecs.open(path.join(dirname, 'article.html'), "w", "utf-8")
+    fh.write(dl_html)
+    fh.close()
+
+
+def main():
+    """Run the script."""
+    args = parse_args()
+    new_dir = make_dir(args.name)
+    make_files(new_dir)
+    fetch_article(new_dir, args.url[0])
+
+
+if __name__ == '__main__':
+    main()
--- a/src/breadability/tests/test_articles/test_antipope_org/test.py
+++ b/src/breadability/tests/test_articles/test_antipope_org/test.py
@@ -36,5 +36,3 @@ class TestAntipopeBlog(TestCase):
        """
        doc = Article(self.article)
        self.assertTrue('id="beta"' not in doc.readable)
-
-
--- a/src/breadability/tests/test_articles/test_scripting-com/__init__.py
+++ b/src/breadability/tests/test_articles/test_scripting-com/__init__.py
--- a/src/breadability/tests/test_articles/test_scripting-com/article.html
+++ b/src/breadability/tests/test_articles/test_scripting-com/article.html
--- a/src/breadability/tests/test_articles/test_scripting-com/test.py
+++ b/src/breadability/tests/test_articles/test_scripting-com/test.py
@@ -0,0 +1,66 @@
+import os
+from operator import attrgetter
+from unittest import TestCase
+
+from breadability.readable import Article
+from breadability.readable import check_siblings
+from breadability.readable import prep_article
+
+
+class TestArticle(TestCase):
+    """Test the scoring and parsing of the Article"""
+
+    def setUp(self):
+        """Load up the article for us"""
+        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
+        self.article = open(article_path).read()
+
+    def tearDown(self):
+        """Drop the article"""
+        self.article = None
+
+    def test_parses(self):
+        """Verify we can parse the document."""
+        doc = Article(self.article)
+        self.assertTrue('id="readabilityBody"' in doc.readable)
+
+    def test_content_exists(self):
+        """Verify that some content exists."""
+        doc = Article(self.article)
+        self.assertTrue('Amazon and Google' in doc.readable)
+        self.assertFalse('Linkblog updated' in doc.readable)
+
+    def test_candidates(self):
+        """Verify we have candidates."""
+        doc = Article(self.article)
+        from lxml.etree import tounicode
+        found = False
+        wanted_hash = '04e46055'
+        # from breadability.logconfig import LNODE
+        # from breadability.logconfig import set_logging_level
+        # set_logging_level('DEBUG')
+        # LNODE.activate()
+        for node in doc.candidates.values():
+            if node.hash_id == wanted_hash:
+                found = node
+
+        self.assertTrue(found)
+
+        # we have the right node, it must be deleted for some reason if it's
+        # not still there when we need it to be.
+        # Make sure it's not in our to drop list.
+        for node in doc._should_drop:
+            self.assertFalse(node == found.node)
+
+        by_score = sorted([c for c in doc.candidates.values()],
+            key=attrgetter('content_score'), reverse=True)
+        self.assertTrue(by_score[0].node == found.node)
+
+        updated_winner = check_siblings(by_score[0], doc.candidates)
+        updated_winner.node = prep_article(updated_winner.node)
+
+        # This article hits up against the img > p conditional filtering
+        # because of the many .gif images in the content. We've removed that
+        # rule.
+        # set_logging_level('INFO')
+        # LNODE.deactivate()
--- a/src/breadability/utils.py
+++ b/src/breadability/utils.py
@@ -1,5 +1,6 @@
 import time

+
 #
 # © 2011 Christopher Arndt, MIT License
 #