Do some link filtering to drop stupid permalinks from the content.

pull/11/head
Richard Harding 12 years ago
parent 9cf19d9970
commit bf35e3410e
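
In short: the new is_bad_link helper below treats an anchor as permalink pollution when it has a name attribute but no href, or when its href carries a '#fragment' longer than 25 characters. A minimal standalone sketch of that heuristic, for illustration only (looks_like_permalink and the sample strings are invented here, not part of the patch):

# Illustrative sketch of the heuristic this commit introduces; it mirrors
# is_bad_link but is not part of the patch itself.
from lxml.html import fragment_fromstring

def looks_like_permalink(html):
    node = fragment_fromstring(html)
    if node.tag != 'a':
        return False
    # Named anchors with no destination are just permalink targets.
    if node.get('name') and not node.get('href'):
        return True
    # Links whose '#fragment' is absurdly long are permalink pointers.
    href = node.get('href') or ''
    bits = href.split('#')
    return len(bits) == 2 and len(bits[1]) > 25

print(looks_like_permalink('<a name="aVeryLongCamelCasedAnchorUsedAsAPermalinkTarget">&nbsp;</a>'))  # True
print(looks_like_permalink('<a href="http://example.com/post">a real link</a>'))  # False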

@@ -52,6 +52,27 @@ def drop_tag(doc, *tags):
     return doc
 
 
+def is_bad_link(a_node):
+    """Helper to determine if the link is something to clean out
+
+    We've hit articles with many multiple links that should be cleaned out
+    because they're just there to pollute the space. See tests for examples.
+    """
+    if a_node.tag == 'a':
+        name = a_node.get('name')
+        href = a_node.get('href')
+        if name and not href:
+            return True
+
+        if href:
+            url_bits = href.split('#')
+            if len(url_bits) == 2:
+                if len(url_bits[1]) > 25:
+                    return True
+
+    return False
+
+
 def ok_embedded_video(node):
     """Check if this embed/video is an ok one to count."""
     keep_keywords = ['youtube', 'blip.tv', 'vimeo']
@@ -251,16 +272,6 @@ def clean_conditionally(node):
     LNODE.log(node, 2, 'Cleaning conditionally node.')
 
-    # Clean out links with really large href anchor values.
-    if node.tag == 'a':
-        name = node.get('name')
-        href = node.get('href')
-        if name and not href:
-            return True
-
-        if href and href.startswith('#') and len(href) > 50:
-            return True
-
     if node.tag not in target_tags:
         # this is not the tag you're looking for
         LNODE.log(node, 2, 'Node cleared.')
@@ -361,6 +372,10 @@ def find_candidates(doc):
             LOG.debug('We should drop unlikely: ' + str(node))
             should_remove.append(node)
             continue
+        if node.tag == 'a' and is_bad_link(node):
+            LOG.debug('We should drop bad link: ' + str(node))
+            should_remove.append(node)
+            continue
         if node.tag in scorable_node_tags and node not in nodes_to_score:
             nodes_to_score.append(node)
     return score_candidates(nodes_to_score), should_remove

@@ -10,7 +10,7 @@ from breadability.logconfig import LOG
 # a potential candidate or not.
 CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
     'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
-    'pager|popup|tweet|twitter'), re.I)
+    'pager|perma|popup|tweet|twitter'), re.I)
 CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
 CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
     'page|pagination|post|text|blog|story'), re.I)

@@ -6,6 +6,7 @@ from unittest import TestCase
 from breadability.readable import Article
 from breadability.readable import get_class_weight
 from breadability.readable import get_link_density
+from breadability.readable import is_bad_link
 from breadability.readable import score_candidates
 from breadability.readable import transform_misused_divs_into_paragraphs
 from breadability.scoring import ScoredNode
@@ -71,7 +72,7 @@ class TestCleaning(TestCase):
         must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
             'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
             'sponsor', 'ad-break', 'agegate', 'pagination' '', 'pager',
-            'popup', 'tweet', 'twitter']
+            'popup', 'tweet', 'twitter', 'imgBlogpostPermalink']
         want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']
@@ -119,6 +120,18 @@ class TestCleaning(TestCase):
             u'<html><body><p>simple<a href="">link</a></p></body></html>'
         )
 
+    def test_bad_links(self):
+        """Some links should just not belong."""
+        bad_links = [
+            '<a name="amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress">&nbsp;</a>',
+            '<a href="#amazonAndGoogleHaveMadeAnAudaciousGrabOfNamespaceOnTheInternetAsFarAsICanSeeTheresBeenNoMentionOfThisInTheTechPress"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>',
+            '<a href="http://scripting.com/stories/2012/06/15/theTechPressIsOutToLunch.html#anExampleGoogleDoesntIntendToShareBlogAndItWillOnlyBeUsedToPointToBloggerSitesIfYouHaveATumblrOrWordpressBlogYouCantHaveABlogDomainHereIsTheAHrefhttpgtldresulticannorgapplicationresultapplicationstatusapplicationdetails527publicListingaOfGooglesAHrefhttpdropboxscriptingcomdavemiscgoogleblogapplicationhtmlapplicationa"><img src="http://scripting.com/images/2001/09/20/sharpPermaLink3.gif" class="imgBlogpostPermalink" width="6" height="9" border="0" alt="permalink"></a>'
+        ]
+
+        for l in bad_links:
+            link = fragment_fromstring(l)
+            self.assertTrue(is_bad_link(link))
+
 
 class TestCandidateNodes(TestCase):
     """Candidate nodes are scoring containers we use."""

@@ -20,6 +20,7 @@
     <div id="harticleeader">Gone</div>
     <div class="article header">Gone</div>
     <div class="column header">Gone</div>
+    <a class="imgBlogpostPermalink">Gone</a>
     <!-- And this will stick around for final -->
     <div>Final content.</div>
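
As a footnote, the 'perma' term added to CLS_UNLIKELY means any class or id containing that substring (such as imgBlogpostPermalink) is now flagged as an unlikely candidate. A quick standalone check of that effect, illustrative only and not part of the patch:

import re

# Same alternation as the updated CLS_UNLIKELY in breadability.scoring,
# reproduced here only to show what the new 'perma' term catches.
CLS_UNLIKELY = re.compile(
    'combx|comment|community|disqus|extra|foot|header|'
    'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
    'pager|perma|popup|tweet|twitter', re.I)

print(bool(CLS_UNLIKELY.search('imgBlogpostPermalink')))  # True: dropped as unlikely
print(bool(CLS_UNLIKELY.search('article-content')))       # False: kept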
