Create LNODE and fix bugs in parsing

- Add the concept of an LNODE logger that outputs information about node
    scoring and removal, and generates a hash_id for the node content so we
    can track it (a sketch follows this list)
- Add a `-d` flag to the cmd line client to turn on the LNODE logging
- Update the client to read in http content as unicode
- Wrap stdout with a unicode-happy stream so we can pipe unicode to less/grep,
    etc.
- Add the html article tag to the scorable tags we work with
- Make sure we drop iframe along with noscript
- Fix scoring bugs around length points
- Add the hash_id as a @property on scored nodes
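
A minimal sketch of the hash_id idea, mirroring the @property added to
scoring.py below (the node content here is illustrative):

    from hashlib import md5
    from lxml.etree import tounicode
    from lxml.html import fragment_fromstring

    node = fragment_fromstring(u'<p>Some node content</p>')  # illustrative node
    hashed = md5()
    # Hash the serialized node so identical content always maps to the same id.
    hashed.update(tounicode(node).encode('utf-8', errors='replace'))
    hash_id = hashed.hexdigest()[0:8]  # short, stable id for tracking the node in logs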
pull/4/merge
Richard Harding 12 years ago
parent f1623fc3e3
commit 32350fc3a1

@@ -1,6 +1,6 @@
import argparse
import codecs
import os
import locale
import sys
import urllib
import webbrowser
@@ -9,6 +9,7 @@ from tempfile import mkstemp
from breadability import VERSION
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.logconfig import set_logging_level
from breadability.readable import Article
@@ -41,6 +42,11 @@ def parse_args():
default=False,
help='open the parsed content in your web browser')
parser.add_argument('-d', '--debug',
action='store_true',
default=False,
help='Output the detailed scoring information for debugging parsing')
parser.add_argument('path', metavar='P', type=str, nargs=1,
help="The url or file path to process in readable form.")
@@ -54,6 +60,9 @@ def main():
if args.verbose:
set_logging_level('DEBUG')
if args.debug:
LNODE.activate()
target = args.path[0]
LOG.debug("Target: " + target)
@@ -66,12 +75,11 @@ def main():
if is_url:
req = urllib.urlopen(target)
ucontent = req.read().encode('utf-8')
content = req.read()
ucontent = unicode(content, 'utf-8')
else:
ucontent = codecs.open(target, "r", "utf-8").read()
enc = sys.__stdout__.encoding or 'utf-8'
doc = Article(ucontent, url=url, fragment=args.fragment)
if args.browser:
fg, pathname = mkstemp(suffix='.html')
@@ -80,7 +88,9 @@ def main():
out.close()
webbrowser.open(pathname)
else:
sys.stdout(doc.readable.encode(enc, 'replace'))
# Wrap sys.stdout into a StreamWriter to allow writing unicode.
sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
sys.stdout.write(doc.readable)
if __name__ == '__main__':
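
The stdout change above is the usual Python 2 recipe for writing unicode to a
pipe; a self-contained sketch of the same pattern:

    import codecs
    import locale
    import sys

    # When stdout is a pipe (e.g. piping to less/grep), sys.stdout.encoding is
    # None and writing unicode raises UnicodeEncodeError. Wrapping stdout in a
    # StreamWriter for the locale's preferred encoding avoids that, assuming
    # the locale can actually represent the characters written.
    sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
    sys.stdout.write(u'unicode-safe output\n')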

@@ -9,6 +9,8 @@ import logging
import sys
import time
from collections import namedtuple
from hashlib import md5
from lxml.etree import tounicode
# For pretty log messages, if available
@@ -19,6 +21,9 @@ except ImportError:
LOGLEVEL = "WARNING"
# Logging bits stolen and adapted from:
# http://www.tornadoweb.org/documentation/_modules/tornado/options.html
LogOptions = namedtuple('LogOptions', [
@@ -38,6 +43,7 @@ options = LogOptions(
)
def set_logging_level(level):
"""Adjust the current logging level.
@@ -77,6 +83,54 @@ def enable_pretty_logging():
root_logger.addHandler(channel)
class LogHelper(object):
"""Helper to allow us to log as we want for debugging"""
scoring = 1
removing = 2
_active = False
_actions = None
def __init__(self, log, actions=None, content=False):
if actions is None:
self._actions = tuple()
else:
self._actions = actions
self._log = log
self.content = content
@property
def actions(self):
"""Return a tuple of the actions we want to log"""
return self._actions
def activate(self):
"""Turn on this logger."""
self._active = True
def log(self, node, action, description):
"""Write out our log info based on the node and event specified.
We only log this information if the logger has been activated.
"""
if self._active:
content = tounicode(node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception, e:
LOG.error("Cannot hash the current node.")
hash_id = hashed.hexdigest()[0:8]
# if hash_id in ['9c880b27', '8393b7d7', '69bfebdd']:
print(u"{0} :: {1}\n{2}".format(
hash_id,
description,
content.replace(u"\n", u"")[0:202],
))
class _LogFormatter(logging.Formatter):
def __init__(self, color, *args, **kwargs):
logging.Formatter.__init__(self, *args, **kwargs)
@@ -129,3 +183,7 @@ class _LogFormatter(logging.Formatter):
logging.getLogger('breadable').setLevel(getattr(logging, LOGLEVEL))
enable_pretty_logging()
LOG = logging.getLogger('breadable')
LNODE = LogHelper(LOG,
actions=(LogHelper.scoring, LogHelper.removing),
content=True
)
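
Taken together with the readable.py and scoring.py hunks below, the logger is
used roughly like this; a hedged sketch (the node construction is
illustrative):

    from lxml.html import fragment_fromstring
    from breadability.logconfig import LNODE
    from breadability.logconfig import LogHelper

    LNODE.activate()  # normally flipped on by the client's -d/--debug flag
    node = fragment_fromstring(u'<div class="content">Example text</div>')
    # Prints "<hash_id> :: Scoring Node" followed by the node's first ~200 chars.
    LNODE.log(node, LogHelper.scoring, 'Scoring Node')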

@@ -3,13 +3,13 @@ from operator import attrgetter
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from pprint import PrettyPrinter
from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
@@ -34,6 +34,7 @@ BASE_DOC = """
</body>
</html>
"""
SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article']
def drop_tag(doc, *tags):
@@ -45,7 +46,7 @@ def drop_tag(doc, *tags):
for tag in tags:
found = doc.iterfind(".//" + tag)
for n in found:
LOG.debug("Dropping tag: " + str(n))
LNODE.log(n, 1, "Dropping tag")
n.drop_tree()
return doc
@@ -98,6 +99,7 @@ def build_base_document(html, fragment=True):
else:
output = found_body
output.doctype = "<!DOCTYPE html>"
return output
@@ -120,7 +122,7 @@ def transform_misused_divs_into_paragraphs(doc):
# We need to create a <p> and put all its contents in there
# We'll just stringify it, then regex replace the first/last
# div bits to turn them into <p> vs <div>.
LOG.debug('Turning leaf <div> into <p>')
LNODE.log(elem, 1, 'Turning leaf <div> into <p>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
@@ -145,6 +147,7 @@ def check_siblings(candidate_node, candidate_list):
content_bonus = 0
if sibling is candidate_node.node:
LNODE.log(sibling, 1, 'Sibling is the node so append')
append = True
# Give a bonus if sibling nodes and top candidates have the exact same classname
@@ -171,7 +174,7 @@ def check_siblings(candidate_node, candidate_list):
append = True
if append:
LOG.debug('Sibling being appended' + str(sibling))
LNODE.log(sibling, 1, 'Sibling being appended')
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
@@ -223,7 +226,7 @@ def prep_article(doc):
allow = True
if not allow:
LOG.debug('Dropping node: ' + str(n))
LNODE.log(n, 2, "Dropping Node")
n.drop_tree()
# go on with next loop, this guy is gone
continue
@@ -235,8 +238,7 @@ def prep_article(doc):
if get_class_weight(n) < 0 or get_link_density(n) > .33:
# for some reason we get nodes here without a parent
if n.getparent() is not None:
LOG.debug(
"Dropping <hX>, it's insignificant: " + str(n))
LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
n.drop_tree()
# go on with next loop, this guy is gone
continue
@@ -246,7 +248,7 @@ def prep_article(doc):
# if the p has no children and has no content...well then down
# with it.
if not n.getchildren() and len(n.text_content()) < 5:
LOG.debug('Dropping extra <p>: ' + str(n))
LNODE.log(n, 2, 'Dropping extra <p>')
n.drop_tree()
# go on with next loop, this guy is gone
continue
@@ -274,7 +276,7 @@ def prep_article(doc):
content_score = 0
if (weight + content_score < 0):
LOG.debug('Dropping conditional node: ' + str(node))
LNODE.log(node, 2, 'Dropping conditional node')
return True
if node.text_content().count(',') < 10:
@@ -304,25 +306,25 @@ def prep_article(doc):
# images to try to do some scoring of good v. bad images.
# failing example:
# arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
LOG.debug('Conditional drop: img > p')
LNODE.log(node, 2, 'Conditional drop: img > p')
remove_node = True
elif li > p and node.tag != 'ul' and node.tag != 'ol':
LOG.debug('Conditional drop: li > p and not ul/ol')
LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
remove_node = True
elif inputs > p / 3.0:
LOG.debug('Conditional drop: inputs > p/3.0')
LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
remove_node = True
elif content_length < 25 and (img == 0 or img > 2):
LOG.debug('Conditional drop: len < 25 and 0/>2 images')
LNODE.log(node, 2, 'Conditional drop: len < 25 and 0/>2 images')
remove_node = True
elif weight < 25 and link_density > 0.2:
LOG.debug('Conditional drop: weight small and link is dense')
LNODE.log(node, 2, 'Conditional drop: weight small and link is dense')
remove_node = True
elif weight >= 25 and link_density > 0.5:
LOG.debug('Conditional drop: weight big but link heavy')
LNODE.log(node, 2, 'Conditional drop: weight big but link heavy')
remove_node = True
elif (embed == 1 and content_length < 75) or embed > 1:
LOG.debug('Conditional drop: embed without much content or many embed')
LNODE.log(node, 2, 'Conditional drop: embed without much content or many embed')
remove_node = True
return remove_node
@@ -340,7 +342,7 @@ def find_candidates(doc):
clean up and return the final best match.
"""
scorable_node_tags = ['div', 'p', 'td', 'pre']
scorable_node_tags = SCORABLE_TAGS
nodes_to_score = []
should_remove = []
@@ -349,7 +351,7 @@ def find_candidates(doc):
LOG.debug('We should drop unlikely: ' + str(node))
should_remove.append(node)
continue
if node.tag in scorable_node_tags:
if node.tag in scorable_node_tags and node not in nodes_to_score:
nodes_to_score.append(node)
return score_candidates(nodes_to_score), should_remove
@@ -386,7 +388,7 @@ class Article(object):
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript')
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
candidates, should_drop = find_candidates(doc)
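
The drop_tag change above sweeps iframe along with noscript; a standalone
sketch of the same lxml drop_tree behavior:

    from lxml.etree import tounicode
    from lxml.html import document_fromstring

    doc = document_fromstring(
        u'<html><body><p>keep me</p>'
        u'<noscript>skip</noscript><iframe src="ad"></iframe></body></html>')
    for tag in ('noscript', 'iframe'):
        # Materialize matches first since drop_tree() mutates the tree.
        for n in list(doc.iterfind('.//' + tag)):
            n.drop_tree()  # removes the element and its whole subtree
    print(tounicode(doc))  # only the <p> survives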

@@ -1,6 +1,9 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re
from hashlib import md5
from lxml.etree import tounicode
from breadability.logconfig import LNODE
from breadability.logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
@@ -25,15 +28,20 @@ def check_node_attr(node, attr, checkset):
return False
def get_link_density(node):
def get_link_density(node, node_text=None):
"""Generate a value for the number of links in the node.
:param node: parsed ElementTree node
:param node_text: pass in the node's text_content() if we already have it,
to save computing it again.
:returns float:
"""
link_length = sum([len(a.text_content()) or 0 for a in node.findall(".//a")])
text_length = len(node.text_content())
if node_text:
text_length = len(node_text)
else:
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
@@ -82,18 +90,20 @@ def score_candidates(nodes):
candidates = {}
for node in nodes:
LNODE.log(node, 1, "Scoring Node")
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text_content()
if parent is None or grand is None:
LOG.debug("Skipping candidate because parent/grand are none")
LNODE.log(node, 1, "Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LOG.debug("Skipping candidate because not enough content.")
LNODE.log(node, 1, "Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
@@ -109,20 +119,30 @@ def score_candidates(nodes):
# Add points for any commas within this paragraph
content_score += innertext.count(',') if innertext else 0
LNODE.log(node, 1, "Bonus points for ,: " + str(innertext.count(',')))
# For every 100 characters in this paragraph, add another point. Up to
# 3 points.
length_points = len(innertext) % 100 if innertext else 0
content_score = length_points if length_points > 3 else 3
if length_points > 3:
content_score += 3
else:
content_score += length_points
LNODE.log(node, 1, "Length/content points: {0} : {1}".format(length_points, content_score))
# Add the score to the parent.
LNODE.log(node, 1, "From this current node.")
candidates[parent].content_score += content_score
LNODE.log(candidates[parent].node, 1, "Giving parent bonus points: " + str(candidates[parent].content_score))
# The grandparent gets half.
candidates[grand].content_score += content_score / 2.0
LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
LNODE.log(candidates[parent].node, 1, "Giving grand bonus points: " + str(candidates[grand].content_score))
for candidate in candidates.values():
candidate.content_score = candidate.content_score * (1 -
get_link_density(candidate.node))
for candidate in candidates.values():
LNODE.log(candidate.node, 1, "Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, (1 - get_link_density(candidate.node))))
candidate.content_score = candidate.content_score * (1 - get_link_density(candidate.node))
return candidates
@@ -138,7 +158,10 @@ class ScoredNode(object):
def __repr__(self):
"""Helpful representation of our Scored Node"""
return "{0:0.1F}\t{1}".format(self.content_score, self.node)
return "{0}: {1:0.1F}\t{2}".format(
self.hash_id,
self.content_score,
self.node)
def __init__(self, node):
"""Given node, set an initial score and weigh based on css and id"""
@@ -157,3 +180,14 @@ class ScoredNode(object):
content_score = -5
content_score += get_class_weight(node)
self.content_score = content_score
@property
def hash_id(self):
content = tounicode(self.node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception, e:
LOG.error("BOOM! " + str(e))
return hashed.hexdigest()[0:8]
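
The length-points fix above stops the score from being overwritten rather than
accumulated; a hedged sketch of the corrected step (the '//' is an assumption
drawn from the "every 100 characters, up to 3 points" comment, since the hunk
itself computes length_points with '% 100'):

    innertext = u'x' * 250  # illustrative paragraph text
    content_score = 0
    content_score += innertext.count(',')  # bonus point per comma

    # One point per full 100 characters, capped at 3; min() is equivalent to
    # the if/else in the hunk above.
    length_points = len(innertext) // 100  # assumption: '//' per the comment
    content_score += min(length_points, 3)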

File diff suppressed because one or more lines are too long