From 5704eb4c1592832c9071ae9ebcae60dfb8741f6f Mon Sep 17 00:00:00 2001 From: Richard Harding Date: Sat, 16 Jun 2012 07:58:13 -0400 Subject: [PATCH] Start process of adding a newtest script for generating test cases - Adds new breadability_newtest tool for generating test cases. - Add fixes for the scripting.com test failure. --- setup.py | 4 +- src/breadability/__init__.py | 1 + src/breadability/logconfig.py | 4 + src/breadability/readable.py | 20 ++-- src/breadability/scripts/__init__.py | 0 src/breadability/scripts/newtest.py | 105 ++++++++++++++++++ .../test_articles/test_antipope_org/test.py | 2 - .../test_scripting-com/__init__.py | 0 .../test_scripting-com/article.html | 24 ++++ .../test_articles/test_scripting-com/test.py | 66 +++++++++++ src/breadability/utils.py | 1 + 11 files changed, 214 insertions(+), 13 deletions(-) create mode 100644 src/breadability/scripts/__init__.py create mode 100644 src/breadability/scripts/newtest.py create mode 100644 src/breadability/tests/test_articles/test_scripting-com/__init__.py create mode 100644 src/breadability/tests/test_articles/test_scripting-com/article.html create mode 100644 src/breadability/tests/test_articles/test_scripting-com/test.py diff --git a/setup.py b/setup.py index c749f01..e023eef 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,8 @@ setup(name='breadability', }, entry_points={ 'console_scripts': - ['breadability=breadability:client.main'] + ['breadability=breadability:client.main', + 'breadability_newtest=breadability:newtest.main', + ] } ) diff --git a/src/breadability/__init__.py b/src/breadability/__init__.py index 0e4d71d..fce23f5 100644 --- a/src/breadability/__init__.py +++ b/src/breadability/__init__.py @@ -1,2 +1,3 @@ VERSION = '0.1.3' import client +from scripts import newtest diff --git a/src/breadability/logconfig.py b/src/breadability/logconfig.py index 36eabf6..704b7da 100644 --- a/src/breadability/logconfig.py +++ b/src/breadability/logconfig.py @@ -106,6 +106,10 @@ class 
LogHelper(object): """Turn on this logger.""" self._active = True + def deactivate(self): + """Turn off the logger""" + self._active = False + def log(self, node, action, description): """Write out our log info based on the node and event specified. diff --git a/src/breadability/readable.py b/src/breadability/readable.py index 8a54dba..1b43ecc 100644 --- a/src/breadability/readable.py +++ b/src/breadability/readable.py @@ -11,6 +11,7 @@ from breadability.document import OriginalDocument from breadability.logconfig import LOG from breadability.logconfig import LNODE from breadability.scoring import score_candidates +from breadability.scoring import generate_hash_id from breadability.scoring import get_link_density from breadability.scoring import get_class_weight from breadability.scoring import is_unlikely_node @@ -252,6 +253,7 @@ def clean_conditionally(node): if node.tag not in target_tags: # this is not the tag you're looking for + LNODE.log(node, 2, 'Node cleared.') return weight = get_class_weight(node) @@ -261,6 +263,7 @@ def clean_conditionally(node): if (weight + content_score < 0): LNODE.log(node, 2, 'Dropping conditional node') + LNODE.log(node, 2, 'Weight + score < 0') return True if node.text_content().count(',') < 10: @@ -284,16 +287,7 @@ def clean_conditionally(node): remove_node = False - if img > p: - # this one has shown to do some extra image removals. - # we could get around this by checking for caption info in the - # images to try to do some scoring of good v. bad images. 
- # failing example: - # arstechnica.com/science/news/2012/05/1859s - # -great-auroral-stormthe-week-the-sun-touched-the-earth.ars - LNODE.log(node, 2, 'Conditional drop: img > p') - remove_node = True - elif li > p and node.tag != 'ul' and node.tag != 'ol': + if li > p and node.tag != 'ul' and node.tag != 'ol': LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol') remove_node = True elif inputs > p / 3.0: @@ -315,9 +309,15 @@ def clean_conditionally(node): LNODE.log(node, 2, 'Conditional drop: embed w/o much content or many embed') remove_node = True + + if remove_node: + LNODE.log(node, 2, 'Node will be removed') + else: + LNODE.log(node, 2, 'Node cleared') return remove_node # nope, don't remove anything + LNODE.log(node, 2, 'Node Cleared final.') return False diff --git a/src/breadability/scripts/__init__.py b/src/breadability/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/breadability/scripts/newtest.py b/src/breadability/scripts/newtest.py new file mode 100644 index 0000000..d09c3e5 --- /dev/null +++ b/src/breadability/scripts/newtest.py @@ -0,0 +1,105 @@ +import argparse +import codecs +import urllib2 +from os import mkdir +from os import path + +from breadability import VERSION + + +TESTPATH = path.join( + path.dirname(path.dirname(__file__)), + 'tests', 'test_articles') + +TESTTPL = """ +import os +from unittest import TestCase + +from breadability.readable import Article + + +class TestArticle(TestCase): + \"\"\"Test the scoring and parsing of the Article\"\"\" + + def setUp(self): + \"\"\"Load up the article for us\"\"\" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + self.article = open(article_path).read() + + def tearDown(self): + \"\"\"Drop the article\"\"\" + self.article = None + + def test_parses(self): + \"\"\"Verify we can parse the document.\"\"\" + doc = Article(self.article) + self.assertTrue('id="readabilityBody"' in doc.readable) + + def test_content_exists(self): + 
\"\"\"Verify that some content exists.\"\"\" + pass + + def test_content_does_not_exist(self): + \"\"\"Verify we cleaned out some content that shouldn't exist.\"\"\" + pass +""" + + +def parse_args(): + desc = "breadability helper to generate a new set of article test files." + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('--version', + action='version', version=VERSION) + + parser.add_argument('-n', '--name', + action='store', + required=True, + help='Name of the test directory') + + parser.add_argument('url', metavar='URL', type=str, nargs=1, + help='The url of content to fetch for the article.html') + + args = parser.parse_args() + return args + + +def make_dir(name): + """Generate a new directory for tests. + + """ + dir_name = 'test_' + name.replace(' ', '_') + updated_name = path.join(TESTPATH, dir_name) + mkdir(updated_name) + return updated_name + + +def make_files(dirname): + init_file = path.join(dirname, '__init__.py') + test_file = path.join(dirname, 'test.py') + open(init_file, "a").close() + with open(test_file, 'w') as f: + f.write(TESTTPL) + + +def fetch_article(dirname, url): + """Get the content of the url and make it the article.html""" + opener = urllib2.build_opener() + opener.addheaders = [('Accept-Charset', 'utf-8')] + url_response = opener.open(url) + dl_html = url_response.read().decode('utf-8') + + fh = codecs.open(path.join(dirname, 'article.html'), "w", "utf-8") + fh.write(dl_html) + fh.close() + + +def main(): + """Run the script.""" + args = parse_args() + new_dir = make_dir(args.name) + make_files(new_dir) + fetch_article(new_dir, args.url[0]) + + +if __name__ == '__main__': + main() diff --git a/src/breadability/tests/test_articles/test_antipope_org/test.py b/src/breadability/tests/test_articles/test_antipope_org/test.py index 2f2761a..4053cb0 100644 --- a/src/breadability/tests/test_articles/test_antipope_org/test.py +++ b/src/breadability/tests/test_articles/test_antipope_org/test.py @@ -36,5 +36,3 @@ 
class TestAntipopeBlog(TestCase): """ doc = Article(self.article) self.assertTrue('id="beta"' not in doc.readable) - - diff --git a/src/breadability/tests/test_articles/test_scripting-com/__init__.py b/src/breadability/tests/test_articles/test_scripting-com/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/breadability/tests/test_articles/test_scripting-com/article.html b/src/breadability/tests/test_articles/test_scripting-com/article.html new file mode 100644 index 0000000..0f4261d --- /dev/null +++ b/src/breadability/tests/test_articles/test_scripting-com/article.html @@ -0,0 +1,24 @@ + Scripting News: Tech press misses Google/Amazon name grab
Click here to show or hide the menubar.

Home >  +Archive >  +2012 >  +June >  +15 +

Previous / Next

Tech press misses Google/Amazon name grab
By Dave Winer on Friday, June 15, 2012 at 5:52 PM.
+
+
Amazon and Google have made an audacious grab of namespace on the Internet. As far as I can see there's been no mention of this in the tech press. permalink
+
An example. Google doesn't intend to share .blog and it will only be used to point to Blogger sites. If you have a Tumblr or WordPress blog, you can't have a .blog domain. Here is the public listing of Google's application. permalink
+
+
+
The purpose of the proposed gTLD, .blog, is to provide a dedicated Internet space where Google can continue to innovate on its Blogger offerings. The mission of the proposed gTLD is to provide a dedicated domain space in which users can publish blogs. All registered domains in the .blog gTLD will automatically be delegated to Google DNS servers, which will in turn provide authoritative DNS responses pointing to the Google Blogger service. The mission of the proposed gTLD is to simplify the Blogger user experience. Users will be able to publish content on a unique .blog domain (e.g., myname.blog) which will serve as a short and memorable URL for a particular Blogger account. This mission will enhance consumer choice by providing new availability in the second-level domain space, creating new layers of organization on the Internet, improving the Google user experience, and signaling the kind of content available in the domain. permalink
+
+
+
Amazon plans to do the same with .search. So if you have a search site and it's not Amazon's you can't be part of .search.  permalink
+
Google is going to be exclusive about .cloud.  permalink
+
There are lots more new proposed TLDs like this. permalink
+
Seems like a huge story to me. A big surprise. Did you think this is how it would work? I sure didn't. permalink
+
I tweeted this, followed by a pointer to a blog post written by Michele Neylon, all before 8AM Eastern this morning. It's now 6PM, and there have been no reports about it in the tech press. It'll be interesting to see when (or if) this becomes a story.  permalink
+
Another angle on this, the ICANN people must have known about these applications long before they were made public. How could they continue this process, knowing that is how Google and Amazon interpreted the idea of new TLDs? permalink
+
BTW, this also happened on Wednesday morning when we had a story here, at 8AM, about a fundamental change in the way Twitter works. It used to have a 140-character limit, but that limit was lifted for Twitter's media partners. A press release ran later in the day. That's when the reports started appearing in the tech press. Even though the story was in their Twitter timelines, and here on Scripting News. permalink
+
+
+
RSS feed for Scripting News
This site contributes to the scripting.com community river.


© Copyright 1997-2012 Dave Winer. Last update: Friday, June 15, 2012 at 6:27 PM Eastern. Last build: 6/16/2012; 9:03:27 AM. "It's even worse than it appears."

RSS feed for Scripting News

Previous / Next

\ No newline at end of file diff --git a/src/breadability/tests/test_articles/test_scripting-com/test.py b/src/breadability/tests/test_articles/test_scripting-com/test.py new file mode 100644 index 0000000..42fa958 --- /dev/null +++ b/src/breadability/tests/test_articles/test_scripting-com/test.py @@ -0,0 +1,66 @@ +import os +from operator import attrgetter +from unittest import TestCase + +from breadability.readable import Article +from breadability.readable import check_siblings +from breadability.readable import prep_article + + +class TestArticle(TestCase): + """Test the scoring and parsing of the Article""" + + def setUp(self): + """Load up the article for us""" + article_path = os.path.join(os.path.dirname(__file__), 'article.html') + self.article = open(article_path).read() + + def tearDown(self): + """Drop the article""" + self.article = None + + def test_parses(self): + """Verify we can parse the document.""" + doc = Article(self.article) + self.assertTrue('id="readabilityBody"' in doc.readable) + + def test_content_exists(self): + """Verify that some content exists.""" + doc = Article(self.article) + self.assertTrue('Amazon and Google' in doc.readable) + self.assertFalse('Linkblog updated' in doc.readable) + + def test_candidates(self): + """Verify we have candidates.""" + doc = Article(self.article) + from lxml.etree import tounicode + found = False + wanted_hash = '04e46055' + # from breadability.logconfig import LNODE + # from breadability.logconfig import set_logging_level + # set_logging_level('DEBUG') + # LNODE.activate() + for node in doc.candidates.values(): + if node.hash_id == wanted_hash: + found = node + + self.assertTrue(found) + + # we have the right node, it must be deleted for some reason if it's + # not still there when we need it to be. + # Make sure it's not in our to drop list. 
+ for node in doc._should_drop: + self.assertFalse(node == found.node) + + by_score = sorted([c for c in doc.candidates.values()], + key=attrgetter('content_score'), reverse=True) + self.assertTrue(by_score[0].node == found.node) + + updated_winner = check_siblings(by_score[0], doc.candidates) + updated_winner.node = prep_article(updated_winner.node) + + # This article hits up against the img > p conditional filtering + # because of the many .gif images in the content. We've removed that + # rule. + # set_logging_level('INFO') + # LNODE.deactivate() diff --git a/src/breadability/utils.py b/src/breadability/utils.py index 8986176..6c2b100 100644 --- a/src/breadability/utils.py +++ b/src/breadability/utils.py @@ -1,5 +1,6 @@ import time + # # © 2011 Christopher Arndt, MIT License #