From 5704eb4c1592832c9071ae9ebcae60dfb8741f6f Mon Sep 17 00:00:00 2001
From: Richard Harding <rharding@mitechie.com>
Date: Sat, 16 Jun 2012 07:58:13 -0400
Subject: [PATCH] Start process of adding a newtest script for generating test
 cases

- Add the new breadability_newtest tool for generating test cases.
- Fix the scripting.com test failure by removing the "img > p" conditional drop rule.
---
 setup.py                                      |   4 +-
 src/breadability/__init__.py                  |   1 +
 src/breadability/logconfig.py                 |   4 +
 src/breadability/readable.py                  |  20 ++--
 src/breadability/scripts/__init__.py          |   0
 src/breadability/scripts/newtest.py           | 105 ++++++++++++++++++
 .../test_articles/test_antipope_org/test.py   |   2 -
 .../test_scripting-com/__init__.py            |   0
 .../test_scripting-com/article.html           |  24 ++++
 .../test_articles/test_scripting-com/test.py  |  66 +++++++++++
 src/breadability/utils.py                     |   1 +
 11 files changed, 214 insertions(+), 13 deletions(-)
 create mode 100644 src/breadability/scripts/__init__.py
 create mode 100644 src/breadability/scripts/newtest.py
 create mode 100644 src/breadability/tests/test_articles/test_scripting-com/__init__.py
 create mode 100644 src/breadability/tests/test_articles/test_scripting-com/article.html
 create mode 100644 src/breadability/tests/test_articles/test_scripting-com/test.py

diff --git a/setup.py b/setup.py
index c749f01..e023eef 100644
--- a/setup.py
+++ b/setup.py
@@ -45,6 +45,8 @@ setup(name='breadability',
     },
     entry_points={
         'console_scripts':
-            ['breadability=breadability:client.main']
+            ['breadability=breadability:client.main',
+             'breadability_newtest=breadability:newtest.main',
+            ]
     }
 )
diff --git a/src/breadability/__init__.py b/src/breadability/__init__.py
index 0e4d71d..fce23f5 100644
--- a/src/breadability/__init__.py
+++ b/src/breadability/__init__.py
@@ -1,2 +1,3 @@
 VERSION = '0.1.3'
 import client
+from scripts import newtest
diff --git a/src/breadability/logconfig.py b/src/breadability/logconfig.py
index 36eabf6..704b7da 100644
--- a/src/breadability/logconfig.py
+++ b/src/breadability/logconfig.py
@@ -106,6 +106,10 @@ class LogHelper(object):
         """Turn on this logger."""
         self._active = True
 
+    def deactivate(self):
+        """Turn off this logger."""
+        self._active = False
+
     def log(self, node, action, description):
         """Write out our log info based on the node and event specified.
 
diff --git a/src/breadability/readable.py b/src/breadability/readable.py
index 8a54dba..1b43ecc 100644
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@@ -11,6 +11,7 @@ from breadability.document import OriginalDocument
 from breadability.logconfig import LOG
 from breadability.logconfig import LNODE
 from breadability.scoring import score_candidates
+from breadability.scoring import generate_hash_id
 from breadability.scoring import get_link_density
 from breadability.scoring import get_class_weight
 from breadability.scoring import is_unlikely_node
@@ -252,6 +253,7 @@ def clean_conditionally(node):
 
     if node.tag not in target_tags:
         # this is not the tag you're looking for
+        LNODE.log(node, 2, 'Node cleared.')
         return
 
     weight = get_class_weight(node)
@@ -261,6 +263,7 @@ def clean_conditionally(node):
 
     if (weight + content_score < 0):
         LNODE.log(node, 2, 'Dropping conditional node')
+        LNODE.log(node, 2, 'Weight + score < 0')
         return True
 
     if node.text_content().count(',') < 10:
@@ -284,16 +287,7 @@ def clean_conditionally(node):
 
         remove_node = False
 
-        if img > p:
-            # this one has shown to do some extra image removals.
-            # we could get around this by checking for caption info in the
-            # images to try to do some scoring of good v. bad images.
-            # failing example:
-            # arstechnica.com/science/news/2012/05/1859s
-            # -great-auroral-stormthe-week-the-sun-touched-the-earth.ars
-            LNODE.log(node, 2, 'Conditional drop: img > p')
-            remove_node = True
-        elif li > p and node.tag != 'ul' and node.tag != 'ol':
+        if li > p and node.tag != 'ul' and node.tag != 'ol':
             LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
             remove_node = True
         elif inputs > p / 3.0:
@@ -315,9 +309,15 @@ def clean_conditionally(node):
             LNODE.log(node, 2,
                 'Conditional drop: embed w/o much content or many embed')
             remove_node = True
+
+        if remove_node:
+            LNODE.log(node, 2, 'Node will be removed')
+        else:
+            LNODE.log(node, 2, 'Node cleared')
         return remove_node
 
     # nope, don't remove anything
+    LNODE.log(node, 2, 'Node Cleared final.')
     return False
 
 
diff --git a/src/breadability/scripts/__init__.py b/src/breadability/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/breadability/scripts/newtest.py b/src/breadability/scripts/newtest.py
new file mode 100644
index 0000000..d09c3e5
--- /dev/null
+++ b/src/breadability/scripts/newtest.py
@@ -0,0 +1,105 @@
+import argparse
+import codecs
+import urllib2
+from os import mkdir
+from os import path
+
+from breadability import VERSION
+
+
+_PACKAGE_DIR = path.dirname(path.dirname(__file__))
+# Every generated test_<name> directory is created under this path.
+TESTPATH = path.join(_PACKAGE_DIR, 'tests', 'test_articles')
+
+TESTTPL = """
+import os
+from unittest import TestCase
+
+from breadability.readable import Article
+
+
+class TestArticle(TestCase):
+    \"\"\"Test the scoring and parsing of the Article\"\"\"
+
+    def setUp(self):
+        \"\"\"Load up the article for us\"\"\"
+        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
+        self.article = open(article_path).read()
+
+    def tearDown(self):
+        \"\"\"Drop the article\"\"\"
+        self.article = None
+
+    def test_parses(self):
+        \"\"\"Verify we can parse the document.\"\"\"
+        doc = Article(self.article)
+        self.assertTrue('id="readabilityBody"' in doc.readable)
+
+    def test_content_exists(self):
+        \"\"\"Verify that some content exists.\"\"\"
+        pass
+
+    def test_content_does_not_exist(self):
+        \"\"\"Verify we cleaned out some content that shouldn't exist.\"\"\"
+        pass
+"""
+
+
+def parse_args():
+    """Parse the command line: --version, -n/--name (required) and URL."""
+    about = "breadability helper to generate a new set of article test files."
+    cli = argparse.ArgumentParser(description=about)
+    cli.add_argument('--version', action='version', version=VERSION)
+    cli.add_argument(
+        '-n', '--name',
+        action='store',
+        required=True,
+        help='Name of the test directory')
+    # nargs=1 keeps the parsed url wrapped in a one-element list.
+    cli.add_argument(
+        'url', metavar='URL', type=str, nargs=1,
+        help='The url of content to fetch for the article.html')
+
+    return cli.parse_args()
+
+
+def make_dir(name):
+    """Create TESTPATH/test_<name>, with spaces in name turned into '_'.
+
+    Returns the path of the directory that was created.
+    """
+    target = path.join(TESTPATH, 'test_' + name.replace(' ', '_'))
+    mkdir(target)
+    return target
+
+
+def make_files(dirname):
+    """Ensure dirname/__init__.py exists and write TESTTPL to test.py."""
+    # Append mode creates __init__.py without truncating an existing one.
+    open(path.join(dirname, '__init__.py'), "a").close()
+    with open(path.join(dirname, 'test.py'), 'w') as test_fh:
+        test_fh.write(TESTTPL)
+
+
+def fetch_article(dirname, url):
+    """Fetch url, decode it as UTF-8 and save it as dirname/article.html."""
+    from contextlib import closing
+    opener = urllib2.build_opener()
+    opener.addheaders = [('Accept-Charset', 'utf-8')]
+    # urllib2 responses are not context managers, so wrap one in closing().
+    with closing(opener.open(url)) as url_response:
+        dl_html = url_response.read().decode('utf-8')
+    with codecs.open(path.join(dirname, 'article.html'), "w", "utf-8") as fh:
+        fh.write(dl_html)
+
+
+def main():
+    """Create the test directory named by -n/--name and fill it from URL."""
+    options = parse_args()
+    target_dir = make_dir(options.name)
+    make_files(target_dir)
+    fetch_article(target_dir, options.url[0])  # url is a 1-item list
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/breadability/tests/test_articles/test_antipope_org/test.py b/src/breadability/tests/test_articles/test_antipope_org/test.py
index 2f2761a..4053cb0 100644
--- a/src/breadability/tests/test_articles/test_antipope_org/test.py
+++ b/src/breadability/tests/test_articles/test_antipope_org/test.py
@@ -36,5 +36,3 @@ class TestAntipopeBlog(TestCase):
         """
         doc = Article(self.article)
         self.assertTrue('id="beta"' not in doc.readable)
-
-
diff --git a/src/breadability/tests/test_articles/test_scripting-com/__init__.py b/src/breadability/tests/test_articles/test_scripting-com/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/breadability/tests/test_articles/test_scripting-com/article.html b/src/breadability/tests/test_articles/test_scripting-com/article.html
new file mode 100644
index 0000000..0f4261d
--- /dev/null
+++ b/src/breadability/tests/test_articles/test_scripting-com/article.html
@@ -0,0 +1,24 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html xmlns:scripting2="http://scripting2.com/namespace.html">	<head>		<title>Scripting News: Tech press misses Google/Amazon name grab</title>		<scripting2:navigation prev="http://scripting.com/stories/2012/06/15/podcastOnThursday.html" next="http://scripting.com/stories/2012/06/16/vectorsOnParade.html" parent="" child="" />		<link rel="alternate" type="application/rss+xml" title="RSS" href="http://scripting.com/rss.xml" />		<link rel="alternate" type="application/rss+xml" title="RSS/link-blog" href="http://links.scripting.com/rss.xml" />		<link rel="alternate" type="application/opml+xml" title="blogroll" href="http://scripting.com/misc/blogroll.opml" /> 		<link rel="alternate" type="application/opml+xml" title="source" href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.opml" /> 		<link rel="stylesheet" href="http://scripting.com/stylesheet.css" type="text/css" />					<link href="http://static.scripting.com/github/bootstrap2/css/bootstrap.css" rel="stylesheet">		<link href="http://static.scripting.com/github/bootstrap2/css/prettify.css" rel="stylesheet">				<script src="http://static.scripting.com/github/bootstrap2/js/jquery.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/prettify.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-transition.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-alert.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-modal.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-dropdown.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-scrollspy.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-tab.js"></script>		<script 
src="http://static.scripting.com/github/bootstrap2/js/bootstrap-tooltip.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-popover.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-button.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-collapse.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-carousel.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/bootstrap-typeahead.js"></script>		<script src="http://static.scripting.com/github/bootstrap2/js/application.js"></script>				<script src="http://static.opml.org/scripts.js"></script>		<link href="http://static.opml.org/styles.css" rel="stylesheet">		<script src="http://static.opml.org/scripts.js"></script>		<link href="http://static.opml.org/styles.css" rel="stylesheet">		<meta name="generator" content="OPML Editor v0.75">		<script language="javaScript">	<!--	function expandCollapse(id,showClassName){		var imgId = 'img_'+ id;		if(document.getElementById(id).className == 'hide'){			document.getElementById(id).className=showClassName;			document.getElementById(imgId).src='http://scripting.com/mktree/minus.gif';					}		else{			document.getElementById(id).className='hide';			document.getElementById(imgId).src='http://scripting.com/mktree/plus.gif';			}		}	--></script> 		<style>			table th { 				border: none; 				padding: 0;				}			table td { 				border: none; 				padding: 0;				}			table td + td {				border: none; 				}			#disqus_thread {				width: 700px;				}			</style>		</head>	<body>		<script>	function toggleMenubar () {		if (document.getElementById ("mainMenuBar").style.display == ''){			setCookie ("flMenuVisible", "0", 1000)			document.getElementById ("mainMenuBar").style.display = "none";			document.getElementById ("wedge").src = 'http://scripting.com/images/2011/12/16/downWedge.gif';				
	}		else {			setCookie ("flMenuVisible", "1", 1000)			document.getElementById("mainMenuBar").style.display = '';			document.getElementById("wedge").src = 'http://scripting.com/images/2011/12/16/upWedge.gif';			}		}	</script><style>	.divMenubarToggle img {		position: absolute;		top: 5px;		left: 1px;		z-index:100000;		}	</style><div class="divMenubarToggle">	<a title="Click here to show or hide the menubar." onclick="toggleMenubar()"><img src="http://scripting.com/images/2011/12/16/upWedge.gif" id="wedge" width="16" height="16" alt="Click here to show or hide the menubar."></a>	</div><div class="divOpmlMenubar" id="mainMenuBar">	<div class="topbar-wrapper" style="z-index: 5;">		<div class="navbar" data-dropdown="dropdown" >			<div class="navbar-inner">				<div class="container">					<a class="brand" href="/"><white>Scripting News</white></a>					<div class="nav-collapse">						<ul class="nav">							<li class="divOpmlMenuItem"><a href="http://scripting.com/">Blog</a></li>							<li class="divOpmlMenuItem"><a href="http://links.scripting.com/">Linkblog</a></li>							<li class="divOpmlMenuItem"><a href="http://links.scripting.com/topLinks.html">Top-40</a></li>							<li class="divOpmlMenuItem"><a href="http://photos.scripting.com/">Photos</a></li>							<li class="divOpmlMenuItem"><a href="http://threads.scripting.com/">Threads</a></li>							<li class="divOpmlMenuItem"><a href="http://daveriver.com/">River</a></li>							<li class="divOpmlMenuItem"><a href="http://davewiner.com/">Dave</a></li>							</ul>						</div>					</div>				</div>			</div>		</div>	</div>		<div class="divOpmlWebpage">			<div class="divOpmlWebpageBody">				<div class="divScriptingNavigation">					<p class="crumbTrail"><a href="http://scripting.com/">Home</a>&nbsp;>&nbsp;
+<a href="http://scripting.com/toc.html">Archive</a>&nbsp;>&nbsp;
+<a href="http://scripting.com/toc.html#y2012">2012</a>&nbsp;>&nbsp;
+<a href="http://scripting.com/2012/06.html">June</a>&nbsp;>&nbsp;
+<a href="http://scripting.com/2012/06/15.html">15</a>
+</p>					<p class="nextPrev"><a href="http://scripting.com/stories/2012/06/15/podcastOnThursday.html">Previous</a> / <a href="http://scripting.com/stories/2012/06/16/vectorsOnParade.html">Next</a></p>					 </div>				<div class="divScriptingStoryBody">					<div class="storyTitle">Tech press misses Google/Amazon name grab</div>					<div class="storyByline">By <a href="http://davewiner.com/">Dave Winer</a> on Friday, June 15, 2012 at 5:52 PM.</div>					<table cellspacing="0" cellpadding="0">						<tr>							<td valign="top">								<div class="storyBody">									<div class="divOutlineBody">
+	<div class="divOutlineList" style="padding-left: 0;">
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14575"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">Amazon and Google have made an audacious grab of namespace on the Internet. As far as I can see there's been no mention of this in the tech press.</span><a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a><a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14583"><a href="javascript:ec('anExampleGoogleDoesn25374','show','http://scripting.com/images/2011/11/29/blank.gif','http://scripting.com/images/2011/11/29/blank.gif');"><img class="expandIcon" id="img_anExampleGoogleDoesn25374" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"></a><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">An example. Google doesn't intend to share .blog and it will only be used to point to Blogger sites. If you have a Tumblr or WordPress blog, you can't have a .blog domain. Here is the <a href="http://gtldresult.icann.org/application-result/applicationstatus/applicationdetails/527">public listing</a> of Google's <a href="http://dropbox.scripting.com/dave/misc/googleBlogApplication.html">application</a>.</span><a name="anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa">&nbsp;</a><a href="#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="show" id="anExampleGoogleDoesn25374" name="anExampleGoogleDoesn25374">
+			<div class="divOutlineList" style="padding-left: 30px;">
+				<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14593"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;"><i>The purpose of the proposed gTLD, .blog, is to provide a dedicated Internet space where Google can continue to innovate on its Blogger offerings.  The mission of the proposed gTLD is to provide a dedicated domain space in which users can publish blogs.  All registered domains in the .blog gTLD will automatically be delegated to Google DNS servers, which will in turn provide authoritative DNS responses pointing to the Google Blogger service.  The mission of the proposed gTLD is to simplify the Blogger user experience.  Users will be able to publish content on a unique .blog domain (e.g., myname.blog) which will serve as a short and memorable URL for a particular Blogger account. This mission will enhance consumer choice by providing new availability in the second-level domain space, creating new layers of organization on the Internet, improving the Google user experience, and signaling the kind of content available in the domain.</i></span><a 
name="ithePurposeOfTheProposedGtldBlogIsToProvideADedicatedInternetSpaceWhereGoogleCanContinueToInnovateOnItsBloggerOfferingsTheMissionOfTheProposedGtldIsToProvideADedicatedDomainSpaceInWhichUsersCanPublishBlogsAllRegisteredDomainsInTheBlogGtldWillAutomaticallyBeDelegatedToGoogleDnsServersWhichWillInTurnProvideAuthoritativeDnsResponsesPointingToTheGoogleBloggerServiceTheMissionOfTheProposedGtldIsToSimplifyTheBloggerUserExperienceUsersWillBeAbleToPublishContentOnAUniqueBlogDomainEgMynameblogWhichWillServeAsAShortAndMemorableUrlForAParticularBloggerAccountThisMissionWillEnhanceConsumerChoiceByProvidingNewAvailabilityInTheSecondlevelDomainSpaceCreatingNewLayersOfOrganizationOnTheInternetImprovingTheGoogleUserExperienceAndSignalingTheKindOfContentAvailableInTheDomaini">&nbsp;</a><a href="#ithePurposeOfTheProposedGtldBlogIsToProvideADedicatedInternetSpaceWhereGoogleCanContinueToInnovateOnItsBloggerOfferingsTheMissionOfTheProposedGtldIsToProvideADedicatedDomainSpaceInWhichUsersCanPublishBlogsAllRegisteredDomainsInTheBlogGtldWillAutomaticallyBeDelegatedToGoogleDnsServersWhichWillInTurnProvideAuthoritativeDnsResponsesPointingToTheGoogleBloggerServiceTheMissionOfTheProposedGtldIsToSimplifyTheBloggerUserExperienceUsersWillBeAbleToPublishContentOnAUniqueBlogDomainEgMynameblogWhichWillServeAsAShortAndMemorableUrlForAParticularBloggerAccountThisMissionWillEnhanceConsumerChoiceByProvidingNewAvailabilityInTheSecondlevelDomainSpaceCreatingNewLayersOfOrganizationOnTheInternetImprovingTheGoogleUserExperienceAndSignalingTheKindOfContentAvailableInTheDomaini"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+				</div>
+			</div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14584"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">Amazon plans to do the same with .search. So if you have a search site and it's not Amazon's you can't be part of .search. </span><a name="amazonPlansToDoTheSameWithSearchSoIfYouHaveASearchSiteAndItsNotAmazonsYouCantBePartOfSearch">&nbsp;</a><a href="#amazonPlansToDoTheSameWithSearchSoIfYouHaveASearchSiteAndItsNotAmazonsYouCantBePartOfSearch"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14594"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">Google is going to be exclusive about .cloud. </span><a name="googleIsGoingToBeExclusiveAboutCloud">&nbsp;</a><a href="#googleIsGoingToBeExclusiveAboutCloud"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14586"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">There are lots more new proposed TLDs like this.</span><a name="thereAreLotsMoreNewProposedTldsLikeThis">&nbsp;</a><a href="#thereAreLotsMoreNewProposedTldsLikeThis"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14587"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">Seems like a huge story to me. A big surprise. Did you think this is how it would work? I sure didn't.</span><a name="seemsLikeAHugeStoryToMeABigSurpriseDidYouThinkThisIsHowItWouldWorkISureDidnt">&nbsp;</a><a href="#seemsLikeAHugeStoryToMeABigSurpriseDidYouThinkThisIsHowItWouldWorkISureDidnt"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14578"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;"><a href="https://twitter.com/davewiner/status/213644043740131330">I tweeted this</a>, followed by a pointer to a <a href="http://www.internetnews.me/2012/06/14/big-brands-trying-to-corner-generic-namespaces/">blog post</a> written by Michele Neylon, all before 8AM Eastern this morning. It's now 6PM, and there have been no reports about it in the tech press. It'll be interesting to see when (or if) this becomes a story. </span><a name="aHrefhttpstwittercomdavewinerstatus213644043740131330iTweetedThisaFollowedByAPointerToAAHrefhttpwwwinternetnewsme20120614bigbrandstryingtocornergenericnamespacesblogPostaWrittenByMicheleNeylonAllBefore8amEasternThisMorningItsNow6pmAndThereHaveBeenNoReportsAboutItInTheTechPressItllBeInterestingToSeeWhenOrIfThisBecomesAStory">&nbsp;</a><a href="#aHrefhttpstwittercomdavewinerstatus213644043740131330iTweetedThisaFollowedByAPointerToAAHrefhttpwwwinternetnewsme20120614bigbrandstryingtocornergenericnamespacesblogPostaWrittenByMicheleNeylonAllBefore8amEasternThisMorningItsNow6pmAndThereHaveBeenNoReportsAboutItInTheTechPressItllBeInterestingToSeeWhenOrIfThisBecomesAStory"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14579"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">Another angle on this, the ICANN people must have known about these applications long before they were made public. How could they continue this process, knowing that is how Google and Amazon interpreted the idea of new TLDs?</span><a name="anotherAngleOnThisTheIcannPeopleMustHaveKnownAboutTheseApplicationsLongBeforeTheyWereMadePublicHowCouldTheyContinueThisProcessKnowingThatIsHowGoogleAndAmazonInterpretedTheIdeaOfNewTlds">&nbsp;</a><a href="#anotherAngleOnThisTheIcannPeopleMustHaveKnownAboutTheseApplicationsLongBeforeTheyWereMadePublicHowCouldTheyContinueThisProcessKnowingThatIsHowGoogleAndAmazonInterpretedTheIdeaOfNewTlds"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		<div class="divOutlineItem" style="padding-bottom: 10px;" id="i14576"><img class="expandIcon" src="http://scripting.com/images/2011/11/29/blank.gif" border="0"><span class="spanOutlineText" style="font-family: Georgia; font-size: 18px; line-height: 140%;">BTW, this also happened on Wednesday morning when we had a <a href="http://scripting.com/stories/2012/06/13/twittersLevelPlayingField.html">story here</a>, at 8AM, about a fundamental change in the way Twitter works. It used to have a 140-character limit, but that limit was lifted for Twitter's media partners. A press release ran later in the day. That's when the reports started appearing in the tech press. Even though the story was in their Twitter timelines, and here on Scripting News.</span><a name="btwThisAlsoHappenedOnWednesdayMorningWhenWeHadAAHrefhttpscriptingcomstories20120613twitterslevelplayingfieldhtmlstoryHereaAt8amAboutAFundamentalChangeInTheWayTwitterWorksItUsedToHaveA140characterLimitButThatLimitWasLiftedForTwittersMediaPartnersAPressReleaseRanLaterInTheDayThatsWhenTheReportsStartedAppearingInTheTechPressEvenThoughTheStoryWasInTheirTwitterTimelinesAndHereOnScriptingNews">&nbsp;</a><a href="#btwThisAlsoHappenedOnWednesdayMorningWhenWeHadAAHrefhttpscriptingcomstories20120613twitterslevelplayingfieldhtmlstoryHereaAt8amAboutAFundamentalChangeInTheWayTwitterWorksItUsedToHaveA140characterLimitButThatLimitWasLiftedForTwittersMediaPartnersAPressReleaseRanLaterInTheDayThatsWhenTheReportsStartedAppearingInTheTechPressEvenThoughTheStoryWasInTheirTwitterTimelinesAndHereOnScriptingNews"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a></div>
+		</div>
+	</div>
+																		</div>								</td>							<td valign="top">								<div class="scriptingRightSidebar">									<div class="divTwitterFollowButton">										<a href="http://scripting.com/rss.xml"><img src="http://www.scripting.com/images/xml.gif" width="36" height="14" border="0" style="float: right;" alt="RSS feed for Scripting News"></a>										<a href="https://twitter.com/davewiner" class="twitter-follow-button" data-show-count="true" data-show-screen-name="false">Follow @davewiner</a>										<script>!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0];if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src="//platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>										</div>									<script type="text/javascript" src="http://static.reallysimple.org/users/dave/linkblogxml/miniLinkBlog.js"></script>									<div class="divScriptingCommunityBadge">										<center><a href="http://river.scripting.com/"><img src="http://scripting.com/images/2011/11/28/badge.gif" width="200" height="25" border="0" alt="This site contributes to the scripting.com community river."></a></center>										</div>									<p><script type="text/javascript" src="http://127.0.0.1:5337/scripting2/editor/controls?username=davewiner&url=http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html"></script></p>									</div>								</td>							</tr>						</table>					<div class="scriptingFooter">						<hr size=1 noshade>&copy; Copyright 1997-2012 Dave Winer. Last update: Friday, June 15, 2012 at 6:27 PM Eastern. Last build: 6/16/2012; 9:03:27 AM. "It's even worse than it appears." 						
<p><a href="http://scripting.com/rss.xml"><img src="http://www.scripting.com/images/xml.gif" width="36" height="14" border="0" alt="RSS feed for Scripting News"></a></p>						<p><script type="text/javascript" src="http://127.0.0.1:5337/scripting2/editor/controls?username=davewiner&url=http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html"></script></p>						<p><a href="http://scripting.com/stories/2012/06/15/podcastOnThursday.html">Previous</a> / <a href="http://scripting.com/stories/2012/06/16/vectorsOnParade.html">Next</a></p>						<script language="JavaScript" type="text/javascript"><!--							var imageUrl = "http://counters.scripting.com/counters/count.gif";							var imageTag = "<img src=\"" + imageUrl + "?group=scripting2&referer=" + escape (document.referrer) + "\" height=\"1\" width=\"1\">";							document.write (imageTag);							--></script>					</div>				</div>			</div>		</body>	</html>
\ No newline at end of file
diff --git a/src/breadability/tests/test_articles/test_scripting-com/test.py b/src/breadability/tests/test_articles/test_scripting-com/test.py
new file mode 100644
index 0000000..42fa958
--- /dev/null
+++ b/src/breadability/tests/test_articles/test_scripting-com/test.py
@@ -0,0 +1,66 @@
+import os
+from operator import attrgetter
+from unittest import TestCase
+
+from breadability.readable import Article
+from breadability.readable import check_siblings
+from breadability.readable import prep_article
+
+
+class TestArticle(TestCase):
+    """Test the scoring and parsing of the scripting.com article fixture."""
+
+    def setUp(self):
+        """Read article.html, which sits next to this test module."""
+        article_path = os.path.join(os.path.dirname(__file__), 'article.html')
+        with open(article_path) as fh:
+            self.article = fh.read()
+
+    def tearDown(self):
+        """Drop the article"""
+        self.article = None
+
+    def test_parses(self):
+        """Verify we can parse the document."""
+        doc = Article(self.article)
+        self.assertTrue('id="readabilityBody"' in doc.readable)
+
+    def test_content_exists(self):
+        """Verify expected text is present and unwanted text is absent."""
+        doc = Article(self.article)
+        self.assertTrue('Amazon and Google' in doc.readable)
+        self.assertFalse('Linkblog updated' in doc.readable)
+
+    def test_candidates(self):
+        """Verify the wanted candidate exists, is kept and scores highest."""
+        doc = Article(self.article)
+        found = False
+        wanted_hash = '04e46055'
+        # from breadability.logconfig import LNODE
+        # from breadability.logconfig import set_logging_level
+        # set_logging_level('DEBUG')
+        # LNODE.activate()
+        for node in doc.candidates.values():
+            if node.hash_id == wanted_hash:
+                found = node
+
+        self.assertTrue(found)
+
+        # we have the right node, it must be deleted for some reason if it's
+        # not still there when we need it to be.
+        # Make sure it's not in our to drop list.
+        for node in doc._should_drop:
+            self.assertFalse(node == found.node)
+
+        by_score = sorted(doc.candidates.values(),
+            key=attrgetter('content_score'), reverse=True)
+        self.assertTrue(by_score[0].node == found.node)
+
+        updated_winner = check_siblings(by_score[0], doc.candidates)
+        updated_winner.node = prep_article(updated_winner.node)
+
+        # This article hits up against the img > p conditional filtering
+        # because of the many .gif images in the content. We've removed that
+        # rule.
+        # set_logging_level('INFO')
+        # LNODE.deactivate()
diff --git a/src/breadability/utils.py b/src/breadability/utils.py
index 8986176..6c2b100 100644
--- a/src/breadability/utils.py
+++ b/src/breadability/utils.py
@@ -1,5 +1,6 @@
 import time
 
+
 #
 # ? 2011 Christopher Arndt, MIT License
 #