Update scoring and tests for the antipope article

12 years ago · d3c83b7255
parent 3f70a49a22
commit d3c83b7255
2 changed files with 28 additions and 9 deletions
--- a/src/breadability/readable.py
+++ b/src/breadability/readable.py
@ -1,4 +1,5 @@
 import re
+from lxml.etree import Element
 from lxml.etree import tounicode
 from lxml.etree import tostring
 from lxml.html.clean import Cleaner
@ -11,6 +12,7 @@ from breadability.document import OriginalDocument
 from breadability.logconfig import LOG
 from breadability.logconfig import LNODE
 from breadability.scoring import score_candidates
+from breadability.scoring import generate_hash_id
 from breadability.scoring import get_link_density
 from breadability.scoring import get_class_weight
 from breadability.scoring import is_unlikely_node
@ -206,7 +208,8 @@ def prep_article(doc):
            LOG.debug('Adding H2 to list of nodes to clean.')
            clean_list.append('h2')

-        for n in node.iter():
+        for n in node.iter(tag=Element):
+            LNODE.log(n, 2, "Cleaning iter node")
             # clean out any inline style properties
            if 'style' in n.attrib:
                n.set('style', '')
@ -265,6 +268,10 @@ def prep_article(doc):
        """Remove the clean_el if it looks like bad content based on rules."""
        target_tags = ['form', 'table', 'ul', 'div', 'p']

+        LNODE.log(node, 2, 'Cleaning conditionally node.')
+        if generate_hash_id(node) == '6d63f9d5':
+            import ipdb;from pprint import pprint; ipdb.set_trace()
+
        if node.tag not in target_tags:
            # this is not the tag you're looking for
            return
@ -411,6 +418,9 @@ class Article(object):
            LOG.debug('Candidates found:')
            pp = PrettyPrinter(indent=2)

+            # cleanup by removing the should_drop we spotted.
+            [n.drop_tree() for n in self._should_drop]
+
            # right now we return the highest scoring candidate content
            by_score = sorted([c for c in self.candidates.values()],
                key=attrgetter('content_score'), reverse=True)
--- a/src/breadability/scoring.py
+++ b/src/breadability/scoring.py
@ -28,6 +28,22 @@ def check_node_attr(node, attr, checkset):
        return False


+def generate_hash_id(node):
+    """Generate a short hash_id for the node in question.
+
+    The node is serialized with ``tounicode``, encoded as UTF-8 and fed to
+    md5; the first eight hex characters of the digest are returned.
+
+    :param node: lxml etree node
+    :returns: 8 character hexadecimal string
+
+    """
+    content = tounicode(node)
+    hashed = md5()
+    try:
+        hashed.update(content.encode('utf-8', errors="replace"))
+    except Exception as e:
+        # Best effort: log and carry on, so the returned id is the digest of
+        # whatever has been hashed so far (nothing, if the update failed).
+        LOG.error("BOOM! " + str(e))
+
+    return hashed.hexdigest()[0:8]
+
+
 def get_link_density(node, node_text=None):
    """Generate a value for the number of links in the node.

@ -206,11 +222,4 @@ class ScoredNode(object):

    @property
    def hash_id(self):
-        content = tounicode(self.node)
-        hashed = md5()
-        try:
-            hashed.update(content.encode('utf-8', errors="replace"))
-        except Exception, e:
-            LOG.error("BOOM! " + str(e))
-
-        return hashed.hexdigest()[0:8]
+        return generate_hash_id(self.node)