Add checks to links to remove really bad links from the scripting site

pull/11/head
Richard Harding 12 years ago
parent 5157b4570d
commit ff37f3169f

@@ -251,6 +251,16 @@ def clean_conditionally(node):
LNODE.log(node, 2, 'Cleaning conditionally node.')
# Clean out links with really large href anchor values.
if node.tag == 'a':
name = node.get('name')
href = node.get('href')
if name and not href:
return True
if href and href.startswith('#') and len(href) > 50:
return True
if node.tag not in target_tags:
# this is not the tag you're looking for
LNODE.log(node, 2, 'Node cleared.')

@@ -29,11 +29,13 @@ class TestArticle(TestCase):
doc = Article(self.article)
self.assertTrue('Amazon and Google' in doc.readable)
self.assertFalse('Linkblog updated' in doc.readable)
self.assertFalse(
'#anExampleGoogleDoesntIntendToShareBlogAndItWill' in doc.readable)
def test_candidates(self):
"""Verify we have candidates."""
doc = Article(self.article)
from lxml.etree import tounicode
# from lxml.etree import tounicode
found = False
wanted_hash = '04e46055'
# from breadability.logconfig import LNODE

Loading…
Cancel
Save