Do some link filtering to drop stupid permalinks from the content.

pull/11/head
Richard Harding 12 years ago
parent 9cf19d9970
commit bf35e3410e
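
In short: the new is_bad_link helper below treats an anchor as permalink pollution when it has a name attribute but no href, or when its href carries a '#fragment' longer than 25 characters. A minimal standalone sketch of that heuristic, for illustration only (looks_like_permalink and the sample strings are invented here, not part of the patch):

# Illustrative sketch of the heuristic this commit introduces; it mirrors
# is_bad_link but is not part of the patch itself.
from lxml.html import fragment_fromstring

def looks_like_permalink(html):
    node = fragment_fromstring(html)
    if node.tag != 'a':
        return False
    # Named anchors with no destination are just permalink targets.
    if node.get('name') and not node.get('href'):
        return True
    # Links whose '#fragment' is absurdly long are permalink pointers.
    href = node.get('href') or ''
    bits = href.split('#')
    return len(bits) == 2 and len(bits[1]) > 25

print(looks_like_permalink('<a name="aVeryLongCamelCasedAnchorUsedAsAPermalinkTarget">&nbsp;</a>'))  # True
print(looks_like_permalink('<a href="http://example.com/post">a real link</a>'))  # False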

@@ -52,6 +52,27 @@ def drop_tag(doc, *tags):
     return doc
 
 
+def is_bad_link(a_node):
+    """Helper to determine if the link is something to clean out
+
+    We've hit articles with many multiple links that should be cleaned out
+    because they're just there to pollute the space. See tests for examples.
+    """
+    if a_node.tag == 'a':
+        name = a_node.get('name')
+        href = a_node.get('href')
+        if name and not href:
+            return True
+
+        if href:
+            url_bits = href.split('#')
+            if len(url_bits) == 2:
+                if len(url_bits[1]) > 25:
+                    return True
+
+    return False
+
+
 def ok_embedded_video(node):
     """Check if this embed/video is an ok one to count."""
     keep_keywords = ['youtube', 'blip.tv', 'vimeo']
@@ -251,16 +272,6 @@ def clean_conditionally(node):
     LNODE.log(node, 2, 'Cleaning conditionally node.')
 
-    # Clean out links with really large href anchor values.
-    if node.tag == 'a':
-        name = node.get('name')
-        href = node.get('href')
-        if name and not href:
-            return True
-
-        if href and href.startswith('#') and len(href) > 50:
-            return True
-
     if node.tag not in target_tags:
         # this is not the tag you're looking for
         LNODE.log(node, 2, 'Node cleared.')
@@ -361,6 +372,10 @@ def find_candidates(doc):
             LOG.debug('We should drop unlikely: ' + str(node))
             should_remove.append(node)
             continue
+        if node.tag == 'a' and is_bad_link(node):
+            LOG.debug('We should drop bad link: ' + str(node))
+            should_remove.append(node)
+            continue
         if node.tag in scorable_node_tags and node not in nodes_to_score:
             nodes_to_score.append(node)
     return score_candidates(nodes_to_score), should_remove

@@ -10,7 +10,7 @@ from breadability.logconfig import LOG
 # a potential candidate or not.
 CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
     'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
-    'pager|popup|tweet|twitter'), re.I)
+    'pager|perma|popup|tweet|twitter'), re.I)
 CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
 CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
     'page|pagination|post|text|blog|story'), re.I)

@@ -6,6 +6,7 @@ from unittest import TestCase
 from breadability.readable import Article
 from breadability.readable import get_class_weight
 from breadability.readable import get_link_density
+from breadability.readable import is_bad_link
 from breadability.readable import score_candidates
 from breadability.readable import transform_misused_divs_into_paragraphs
 from breadability.scoring import ScoredNode
@@ -71,7 +72,7 @@ class TestCleaning(TestCase):
         must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
             'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
             'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
-            'popup', 'tweet', 'twitter']
+            'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']
         want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']
@@ -119,6 +120,18 @@ class TestCleaning(TestCase):
             u'<html><body><p>simple<a href="">link</a></p></body></html>'
         )
 
+    def test_bad_links(self):
+        """Some links should just not belong."""
+        bad_links = [
+            '<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
+            '<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
+            '<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
+        ]
+
+        for l in bad_links:
+            link = fragment_fromstring(l)
+            self.assertTrue(is_bad_link(link))
+
 
 class TestCandidateNodes(TestCase):
     """Candidate nodes are scoring containers we use."""

@@ -20,6 +20,7 @@
     <div id="harticleeader">Gone</div>
     <div class="article header">Gone</div>
     <div class="column header">Gone</div>
+    <a class="imgBlogpostPermalink">Gone</a>
     <!-- And this will stick around for final -->
     <div>Final content.</div>
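
As a footnote, the 'perma' term added to CLS_UNLIKELY means any class or id containing that substring (such as imgBlogpostPermalink) is now flagged as an unlikely candidate. A quick standalone check of that effect, illustrative only and not part of the patch:

import re

# Same alternation as the updated CLS_UNLIKELY in breadability.scoring,
# reproduced here only to show what the new 'perma' term catches.
CLS_UNLIKELY = re.compile(
    'combx|comment|community|disqus|extra|foot|header|'
    'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
    'pager|perma|popup|tweet|twitter', re.I)

print(bool(CLS_UNLIKELY.search('imgBlogpostPermalink')))  # True: dropped as unlikely
print(bool(CLS_UNLIKELY.search('article-content')))       # False: kept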
