Work on tweaking our parser algorithm to help find the right candidate: fixes #2

pull/4/merge
Richard Harding 12 years ago
parent b78ea49c5a
commit 9e6835bd92

@@ -223,7 +223,11 @@ def prep_article(doc):
continue
# finally try out the conditional cleaning of the target node
clean_conditionally(n)
if clean_conditionally(n):
# For some reason the parent is none so we can't drop, we're
# not in a tree that can take dropping this node.
if n.getparent() is not None:
n.drop_tree()
return node
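This hunk moves the actual drop into the caller: clean_conditionally only reports whether the node should go, and the drop happens only while the node is still attached to a parent. A minimal standalone sketch of why that getparent() guard matters (plain lxml, not breadability's own code):

from lxml import html

doc = html.fromstring('<div><p class="ad">junk</p><p>keep this</p></div>')
ad = doc.find('.//p')               # the first <p>, the one flagged for removal
if ad.getparent() is not None:      # drop_tree() requires an attached parent
    ad.drop_tree()
print(html.tostring(doc))           # b'<div><p>keep this</p></div>'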
@@ -242,7 +246,7 @@ def prep_article(doc):
if (weight + content_score < 0):
LOG.debug('Dropping conditional node: ' + str(node))
node.drop_tree()
return True
if node.text_content().count(',') < 10:
LOG.debug("There aren't 10 ,s so we're processing more")
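The comma count is the usual readability shortcut: real prose tends to contain commas, so nodes with fewer than ten get the stricter conditional checks. In plain form (text is made up):

text = "A short boilerplate block, with hardly any punctuation"
if text.count(',') < 10:
    print("There aren't 10 ,s so we're processing more")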
@@ -269,7 +273,7 @@ def prep_article(doc):
# this one has shown to do some extra image removals.
# we could get around this by checking for caption info in the
# images to try to do some scoring of good v. bad images.
# failing example:
# arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
LOG.debug('Conditional drop: img > p')
remove_node = True
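For reference, a self-contained sketch of the img-versus-p comparison this debug line refers to (a simplification; the comment above notes that caption-aware scoring would do better):

from lxml import html

node = html.fromstring(
    '<div><img src="a.jpg"><img src="b.jpg"><p>one short caption</p></div>')
if len(node.findall('.//img')) > len(node.findall('.//p')):
    print('Conditional drop: img > p')    # two images against one paragraph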
@@ -291,12 +295,10 @@ def prep_article(doc):
elif (embed == 1 and content_length < 75) or embed > 1:
LOG.debug('Conditional drop: embed without much content or many embed')
remove_node = True
return remove_node
if remove_node:
# For some reason the parent is none so we can't drop, we're
# not in a tree that can take dropping this node.
if node.getparent() is not None:
node.drop_tree()
# nope, don't remove anything
return False
doc = clean_document(doc)
return doc
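The embed branch flags a node that has a single embed with very little surrounding text, or more than one embed. In toy numbers (values invented, same condition as above):

embed, content_length = 1, 40
remove_node = (embed == 1 and content_length < 75) or embed > 1
print(remove_node)    # True: one embed and only 40 characters of content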
@@ -309,16 +311,18 @@ def find_candidates(doc):
clean up and return the final best match.
"""
scorable_node_tags = ['p', 'td', 'pre']
scorable_node_tags = ['div', 'p', 'td', 'pre']
nodes_to_score = []
should_remove = []
for node in doc.iter():
if is_unlikely_node(node):
LOG.debug('Dropping unlikely: ' + str(node))
node.drop_tree()
elif node.tag in scorable_node_tags:
LOG.debug('We should drop unlikely: ' + str(node))
should_remove.append(node)
continue
if node.tag in scorable_node_tags:
nodes_to_score.append(node)
return score_candidates(nodes_to_score)
return score_candidates(nodes_to_score), should_remove
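find_candidates now defers dropping unlikely nodes and hands them back alongside the scored candidates. A toy, self-contained version of that deferred-drop pattern (the class check here is a stand-in for is_unlikely_node, not breadability's implementation):

from lxml import html

def toy_find_candidates(doc):
    scorable, should_remove = [], []
    for node in doc.iter():
        if (node.get('class') or '') == 'sidebar':    # stand-in for is_unlikely_node
            should_remove.append(node)
            continue
        if node.tag in ('div', 'p', 'td', 'pre'):
            scorable.append(node)
    return scorable, should_remove

doc = html.fromstring('<div><div class="sidebar">ads</div><p>real text</p></div>')
candidates, should_drop = toy_find_candidates(doc)
if not candidates:                     # the drop is only a fallback, as below
    for n in should_drop:
        n.drop_tree()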
class Article(object):
@@ -342,7 +346,7 @@ class Article(object):
html_cleaner(doc)
doc = drop_tag(doc, 'noscript')
doc = transform_misused_divs_into_paragraphs(doc)
candidates = find_candidates(doc)
candidates, should_drop = find_candidates(doc)
if candidates:
LOG.debug('Candidates found:')
@@ -364,6 +368,9 @@ class Article(object):
else:
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
# since we've not found a good candidate we should help this
# cleanup by removing the should_drop nodes we spotted.
[n.drop_tree() for n in should_drop]
doc = prep_article(doc)
doc = build_base_document(doc)

@@ -1,28 +1,25 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re
from breadability.logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
CLS_UNLIKELY = set([
'combx', 'comment', 'community', 'disqus', 'extra', 'foot', 'header',
'menu', '' 'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor', 'ad-break',
'agegate', 'pagination' '', 'pager', 'popup', 'tweet', 'twitter',
])
CLS_MAYBE = set([
'and', 'article', 'body', 'column', 'main', 'shadow',
])
CLS_WEIGHT_POSITIVE = set(['article', 'body', 'content', 'entry', 'hentry',
'main', 'page', 'pagination', 'post', 'text', 'blog', 'story'])
CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
'footer', 'footnote', 'masthead', 'media', 'meta', 'outbrain', 'promo',
'related', 'scroll', 'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags',
'tool', 'widget'])
CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
'pager|popup|tweet|twitter'), re.I)
CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
'page|pagination|post|text|blog|story'), re.I)
CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
'footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
'sidebar|sponsor|shopping|tags|tool|widget'), re.I)
def check_node_attr(node, attr, checkset):
attr = node.get(attr) or ""
check = set(attr.lower().split(' '))
if check.intersection(checkset):
value = node.get(attr) or ""
check = checkset.search(value)
if check:
return True
else:
return False
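Two things worth noting about the regex rewrite. First, the behavioural difference: the old set intersection needed an exact class/id token, while re.search also hits substrings inside compound class names. Second, the attribute value now goes into value instead of shadowing attr. A self-contained sketch of both (pattern shortened from CLS_UNLIKELY above, markup invented):

import re
from lxml import html

UNLIKELY = re.compile('comment|sidebar|footer|twitter', re.I)

def check_node_attr(node, attr, checkset):
    value = node.get(attr) or ""
    return checkset.search(value) is not None

node = html.fromstring('<div class="main-sidebar left">hi</div>')
print(check_node_attr(node, 'class', UNLIKELY))                  # True: substring hit
print(bool({'sidebar'} & set('main-sidebar left'.split(' '))))   # False: old exact-token check
print(check_node_attr(node, 'id', UNLIKELY))                     # False: no id attribute set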
@@ -88,13 +85,15 @@ def score_candidates(nodes):
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text
innertext = node.text_content()
if parent is None or grand is None:
LOG.debug("Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LOG.debug("Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
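The switch from node.text to node.text_content() matters for the 25-character minimum just above: .text stops at the first child element, while .text_content() flattens the whole subtree. For example:

from lxml import html

p = html.fromstring('<p>short <a href="#">but the link text counts as well</a></p>')
print(repr(p.text))        # 'short '  -- looks like too little content
print(p.text_content())    # short but the link text counts as well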
@@ -116,11 +115,10 @@ def score_candidates(nodes):
length_points = len(innertext) % 100 if innertext else 0
content_score = length_points if length_points > 3 else 3
# Add the score to the parent. The grandparent gets half. */
if parent is not None:
candidates[parent].content_score += content_score
if grand is not None:
candidates[grand].content_score += content_score
# Add the score to the parent.
candidates[parent].content_score += content_score
# The grandparent gets half.
candidates[grand].content_score += content_score / 2.0
for candidate in candidates.values():
candidate.content_score = candidate.content_score * (1 -

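In toy numbers, the split this hunk makes explicit: the parent gets the full content_score and the grandparent half of it (the loop that follows, cut off in this view, then scales each candidate's score further):

content_score = 8
scores = {'parent': 0.0, 'grand': 0.0}
scores['parent'] += content_score          # parent gets the full score
scores['grand'] += content_score / 2.0     # grandparent gets half
print(scores)                              # {'parent': 8.0, 'grand': 4.0}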