Simplify logging

pull/21/head
Mišo Belica 11 years ago
parent 81be8ccbfb
commit 101950478e

@@ -3,6 +3,7 @@
from __future__ import absolute_import
import argparse
import logging
import codecs
import locale
import sys
@@ -12,15 +13,9 @@ import webbrowser
from tempfile import mkstemp
from ._version import VERSION
from .logconfig import LOG
from .logconfig import LNODE
from .logconfig import set_logging_level
from .readable import Article
LOGLEVEL = 'WARNING'
def parse_args():
desc = "A fast python port of arc90's readability tool"
parser = argparse.ArgumentParser(description=desc)
@@ -61,15 +56,14 @@ def parse_args():
def main():
args = parse_args()
logger = logging.getLogger("breadability")
if args.verbose:
set_logging_level('DEBUG')
logger.setLevel(logging.DEBUG)
if args.debug:
LNODE.activate()
target = args.path[0]
LOG.debug("Target: " + target)
logger.debug("Target: %r", target)
if target.startswith('http') or target.startswith('www'):
is_url = True

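The whole logconfig machinery reduces to the stdlib pattern shown in this hunk. A minimal sketch of the new CLI wiring (args.verbose follows the diff; the --verbose flag definition and script scaffolding are assumptions):

    import argparse
    import logging

    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument('--verbose', action='store_true')
        args = parser.parse_args()

        logging.basicConfig()  # default stderr handler on the root logger
        logger = logging.getLogger("breadability")
        if args.verbose:
            logger.setLevel(logging.DEBUG)
        logger.debug("Debug logging enabled")

    if __name__ == '__main__':
        main()
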
@@ -5,6 +5,7 @@
from __future__ import absolute_import
import re
import logging
import charade
from lxml.etree import tostring
@@ -14,11 +15,11 @@ from lxml.html import document_fromstring
from lxml.html import HTMLParser
from ._py3k import unicode, to_string, to_bytes
from .logconfig import LOG
from .utils import cached_property
utf8_parser = HTMLParser(encoding='utf-8')
logger = logging.getLogger("breadability")
def get_encoding(page):
@@ -46,7 +47,7 @@ def get_encoding(page):
def replace_multi_br_to_paragraphs(html):
"""Convert multiple <br>s into paragraphs"""
LOG.debug('Replacing multiple <br/> to <p>')
logger.debug('Replacing multiple <br/> to <p>')
rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
return rep.sub('</p><p>', html)
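
For a concrete feel of replace_multi_br_to_paragraphs, the regex above collapses any run of two or more <br>s into a paragraph break (a sketch; the sample markup is invented):

    import re

    rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
    html = "<p>intro<br/><br/>next paragraph</p>"
    print(rep.sub('</p><p>', html))
    # -> <p>intro</p><p>next paragraph</p>
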
@@ -54,7 +55,7 @@ def replace_multi_br_to_paragraphs(html):
def build_doc(page):
"""Requires that the `page` not be None"""
if page is None:
LOG.error("Page content is None, can't build_doc")
logger.error("Page content is None, can't build_doc")
return ''
if isinstance(page, unicode):
page_unicode = page
@@ -67,7 +68,7 @@ def build_doc(page):
parser=utf8_parser)
return doc
except XMLSyntaxError as exc:
LOG.error('Failed to parse: ' + str(exc))
logger.error('Failed to parse: ' + str(exc))
raise ValueError('Failed to parse document contents.')
@@ -95,7 +96,7 @@ class OriginalDocument(object):
# doc = html_cleaner.clean_html(doc)
base_href = self.url
if base_href:
LOG.debug('Making links absolute')
logger.debug('Making links absolute')
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()

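Beyond swapping names, the hunks above also move from eager string concatenation to logging's lazy %-formatting, so the message is only built when a handler will actually emit it. A small illustrative sketch:

    import logging

    logger = logging.getLogger("breadability")
    target = "http://example.com/article"

    logger.debug('Target: ' + target)   # old style: string built unconditionally
    logger.debug("Target: %r", target)  # new style: formatted only if DEBUG is on
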
@@ -1,192 +0,0 @@
"""Setup a logging helper for our module.
Helpers:
LOG - our active logger instance
set_logging_level(level) - adjust the current logging level
"""
import logging
import sys
import time
from collections import namedtuple
from hashlib import md5
from lxml.etree import tounicode
from breadability._py3k import to_unicode
# For pretty log messages, if available
try:
import curses
except ImportError:
curses = None
LOGLEVEL = "WARNING"
# Logging bits stolen and adapted from:
# http://www.tornadoweb.org/documentation/_modules/tornado/options.html
LogOptions = namedtuple('LogOptions', [
'loglevel',
'log_file_prefix',
'log_file_max_size',
'log_file_num_backups',
'log_to_stderr',
])
options = LogOptions(
loglevel=LOGLEVEL,
log_file_prefix="",
log_file_max_size=100 * 1000 * 1000,
log_file_num_backups=5,
log_to_stderr=True,
)
def set_logging_level(level):
"""Adjust the current logging level.
Expect a string of DEBUG, WARNING, INFO, etc.
"""
logging.getLogger('breadable').setLevel(getattr(logging, level))
def enable_pretty_logging():
"""Turns on formatted logging output as configured.
This is called automatically by `parse_command_line`.
"""
root_logger = logging.getLogger()
if options.log_file_prefix:
channel = logging.handlers.RotatingFileHandler(
filename=options.log_file_prefix,
maxBytes=options.log_file_max_size,
backupCount=options.log_file_num_backups)
channel.setFormatter(_LogFormatter(color=False))
root_logger.addHandler(channel)
if (options.log_to_stderr or
(options.log_to_stderr is None and not root_logger.handlers)):
# Set up color if we are in a tty and curses is installed
color = False
if curses and sys.stderr.isatty():
try:
curses.setupterm()
if curses.tigetnum("colors") > 0:
color = True
except Exception:
pass
channel = logging.StreamHandler()
channel.setFormatter(_LogFormatter(color=color))
root_logger.addHandler(channel)
class LogHelper(object):
"""Helper to allow us to log as we want for debugging"""
scoring = 1
removing = 2
_active = False
_actions = None
def __init__(self, log, actions=None, content=False):
if actions is None:
self._actions = tuple()
else:
self._actions = actions
self._log = log
self.content = content
@property
def actions(self):
"""Return a tuple of the actions we want to log"""
return self._actions
def activate(self):
"""Turn on this logger."""
self._active = True
def deactivate(self):
"""Turn off the logger"""
self._active = False
def log(self, node, action, description):
"""Write out our log info based on the node and event specified.
We only log this information if we're at DEBUG loglevel
"""
if self._active:
content = tounicode(node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception as exc:
LOG.error("Cannot hash the current node." + str(exc))
hash_id = hashed.hexdigest()[0:8]
# if hash_id in ['9c880b27', '8393b7d7', '69bfebdd']:
print(to_unicode("{0} :: {1}\n{2}").format(
hash_id,
description,
content.replace(to_unicode("\n"), to_unicode(""))[0:202],
))
class _LogFormatter(logging.Formatter):
def __init__(self, color, *args, **kwargs):
logging.Formatter.__init__(self, *args, **kwargs)
self._color = color
if color:
# The curses module has some str/bytes confusion in python3.
# Most methods return bytes, but only accept strings.
# The explicit calls to unicode() below are harmless in python2,
# but will do the right conversion in python3.
fg_color = unicode(curses.tigetstr("setaf") or
curses.tigetstr("setf") or "", "ascii")
self._colors = {
logging.DEBUG: unicode(
curses.tparm(fg_color, curses.COLOR_CYAN),
"ascii"),
logging.INFO: unicode(
curses.tparm(fg_color, curses.COLOR_GREEN),
"ascii"),
logging.WARNING: unicode(
curses.tparm(fg_color, curses.COLOR_YELLOW), # Yellow
"ascii"),
logging.ERROR: unicode(
curses.tparm(fg_color, curses.COLOR_RED), # Red
"ascii"),
}
self._normal = unicode(curses.tigetstr("sgr0"), "ascii")
def format(self, record):
try:
record.message = record.getMessage()
except Exception as e:
record.message = "Bad message (%r): %r" % (e, record.__dict__)
record.asctime = time.strftime(
"%y%m%d %H:%M:%S", self.converter(record.created))
prefix = '[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]' % \
record.__dict__
if self._color:
prefix = (self._colors.get(record.levelno, self._normal) +
prefix + self._normal)
formatted = prefix + " " + record.message
if record.exc_info:
if not record.exc_text:
record.exc_text = self.formatException(record.exc_info)
if record.exc_text:
formatted = formatted.rstrip() + "\n" + record.exc_text
return formatted.replace("\n", "\n ")
# Set up log level and pretty console logging by default
logging.getLogger('breadable').setLevel(getattr(logging, LOGLEVEL))
enable_pretty_logging()
LOG = logging.getLogger('breadable')
LNODE = LogHelper(LOG,
actions=(LogHelper.scoring, LogHelper.removing),
content=True
)

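All ~190 deleted lines (rotating file handler, curses colors, the LNODE node tracer) are replaced by stdlib defaults. For anyone who still wants the old timestamped prefix on stderr, a minimal sketch of an equivalent setup (the format string mirrors _LogFormatter above but is otherwise an assumption, not part of the commit):

    import logging

    logging.basicConfig(
        level=logging.WARNING,
        format="[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d] %(message)s",
        datefmt="%y%m%d %H:%M:%S",
    )
    logging.getLogger("breadability").warning("formatted like the old _LogFormatter")
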
@@ -3,6 +3,8 @@
from __future__ import absolute_import
import re
import logging
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
@@ -12,8 +14,6 @@ from operator import attrgetter
from pprint import PrettyPrinter
from .document import OriginalDocument
from .logconfig import LOG
from .logconfig import LNODE
from .scoring import score_candidates
from .scoring import get_link_density
from .scoring import get_class_weight
@@ -40,6 +40,8 @@ BASE_DOC = """
"""
SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article']
logger = logging.getLogger("breadability")
def drop_tag(doc, *tags):
"""Helper to just remove any nodes that match this html tag passed in
@@ -50,7 +52,7 @@ def drop_tag(doc, *tags):
for tag in tags:
found = doc.iterfind(".//" + tag)
for n in found:
LNODE.log(n, 1, "Dropping tag")
logger.debug("Dropping tag %s", tag)
n.drop_tree()
return doc
@@ -168,7 +170,7 @@ def transform_misused_divs_into_paragraphs(doc):
# We need to create a <p> and put all its contents in there
# We'll just stringify it, then regex replace the first/last
# div bits to turn them into <p> vs <div>.
LNODE.log(elem, 1, 'Turning leaf <div> into <p>')
logger.debug('Turning leaf <div> into <p>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
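
Concretely, those two regexes rewrite only the outer tag of the serialized leaf node (a sketch; the sample markup is invented):

    import re

    orig = '<div>just text, no block children</div>'
    started = re.sub(r'^<\s*div', '<p', orig)
    ended = re.sub(r'div>$', 'p>', started)
    print(ended)  # -> <p>just text, no block children</p>
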
@@ -193,7 +195,7 @@ def check_siblings(candidate_node, candidate_list):
content_bonus = 0
if sibling is candidate_node.node:
LNODE.log(sibling, 1, 'Sibling is the node so append')
logger.debug('Sibling is the node so append')
append = True
# Give a bonus if sibling nodes and top candidates have the same class name
@@ -220,7 +222,7 @@ def check_siblings(candidate_node, candidate_list):
append = True
if append:
LNODE.log(sibling, 1, 'Sibling being appended')
logger.debug('Sibling being appended')
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
@@ -237,18 +239,18 @@ def clean_document(node):
if node is None or len(node) == 0:
return
LNODE.log(node, 2, "Processing doc")
logger.debug("Processing doc")
clean_list = ['object', 'h1']
to_drop = []
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
if len(node.findall('.//h2')) == 1:
LOG.debug('Adding H2 to list of nodes to clean.')
logger.debug('Adding H2 to list of nodes to clean.')
clean_list.append('h2')
for n in node.iter():
LNODE.log(n, 2, "Cleaning iter node")
logger.debug("Cleaning iter node")
# clean out any in-line style properties
if 'style' in n.attrib:
n.set('style', '')
@@ -267,7 +269,7 @@ def clean_document(node):
allow = True
if not allow:
LNODE.log(n, 2, "Dropping Node")
logger.debug("Dropping Node")
to_drop.append(n)
if n.tag in ['h1', 'h2', 'h3', 'h4']:
@@ -275,7 +277,7 @@ def clean_document(node):
# if the heading has no css weight or a high link density,
# remove it
if get_class_weight(n) < 0 or get_link_density(n) > .33:
LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
logger.debug("Dropping <hX>, it's insignificant")
to_drop.append(n)
# clean out extra <p>
@@ -283,7 +285,7 @@ def clean_document(node):
# if the p has no children and has no content...well then down
# with it.
if not n.getchildren() and len(n.text_content()) < 5:
LNODE.log(n, 2, 'Dropping extra <p>')
logger.debug('Dropping extra <p>')
to_drop.append(n)
# finally try out the conditional cleaning of the target node
@@ -298,11 +300,11 @@ def clean_conditionally(node):
"""Remove the clean_el if it looks like bad content based on rules."""
target_tags = ['form', 'table', 'ul', 'div', 'p']
LNODE.log(node, 2, 'Cleaning conditionally node.')
logger.debug('Cleaning conditionally node.')
if node.tag not in target_tags:
# this is not the tag you're looking for
LNODE.log(node, 2, 'Node cleared.')
logger.debug('Node cleared.')
return
weight = get_class_weight(node)
@@ -311,12 +313,12 @@ def clean_conditionally(node):
content_score = 0
if (weight + content_score < 0):
LNODE.log(node, 2, 'Dropping conditional node')
LNODE.log(node, 2, 'Weight + score < 0')
logger.debug('Dropping conditional node')
logger.debug('Weight + score < 0')
return True
if node.text_content().count(',') < 10:
LOG.debug("There aren't 10 ,s so we're processing more")
logger.debug("There aren't 10 ,s so we're processing more")
# If there are not very many commas, and the number of
# non-paragraph elements is more than paragraphs or other ominous
@@ -337,36 +339,32 @@ def clean_conditionally(node):
remove_node = False
if li > p and node.tag != 'ul' and node.tag != 'ol':
LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
logger.debug('Conditional drop: li > p and not ul/ol')
remove_node = True
elif inputs > p / 3.0:
LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
logger.debug('Conditional drop: inputs > p/3.0')
remove_node = True
elif content_length < 25 and (img == 0 or img > 2):
LNODE.log(node, 2,
'Conditional drop: len < 25 and 0/>2 images')
logger.debug('Conditional drop: len < 25 and 0/>2 images')
remove_node = True
elif weight < 25 and link_density > 0.2:
LNODE.log(node, 2,
'Conditional drop: weight small and link is dense')
logger.debug('Conditional drop: weight small and link is dense')
remove_node = True
elif weight >= 25 and link_density > 0.5:
LNODE.log(node, 2,
'Conditional drop: weight big but link heavy')
logger.debug('Conditional drop: weight big but link heavy')
remove_node = True
elif (embed == 1 and content_length < 75) or embed > 1:
LNODE.log(node, 2,
'Conditional drop: embed w/o much content or many embed')
logger.debug('Conditional drop: embed w/o much content or many embed')
remove_node = True
if remove_node:
LNODE.log(node, 2, 'Node will be removed')
logger.debug('Node will be removed')
else:
LNODE.log(node, 2, 'Node cleared')
logger.debug('Node cleared')
return remove_node
# nope, don't remove anything
LNODE.log(node, 2, 'Node Cleared final.')
logger.debug('Node Cleared final.')
return False
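
Since the elif chain above is the heart of clean_conditionally, the drop rules restated as ordered data may be easier to scan (a sketch, not how the module organizes it; all counts come from the node under inspection):

    def drop_reason(li, p, inputs, img, embed, content_length, weight, link_density, tag):
        rules = [
            ('li > p and not ul/ol', li > p and tag not in ('ul', 'ol')),
            ('inputs > p/3.0', inputs > p / 3.0),
            ('len < 25 and 0/>2 images', content_length < 25 and (img == 0 or img > 2)),
            ('weight small and link is dense', weight < 25 and link_density > 0.2),
            ('weight big but link heavy', weight >= 25 and link_density > 0.5),
            ('embed w/o much content or many embeds',
             (embed == 1 and content_length < 75) or embed > 1),
        ]
        for reason, hit in rules:
            if hit:
                return reason  # first matching rule wins, as in the elif chain
        return None
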
@@ -397,11 +395,11 @@ def find_candidates(doc):
for node in doc.iter():
if is_unlikely_node(node):
LOG.debug('We should drop unlikely: ' + str(node))
logger.debug('We should drop unlikely: ' + str(node))
should_remove.append(node)
continue
if node.tag == 'a' and is_bad_link(node):
LOG.debug('We should drop bad link: ' + str(node))
logger.debug('We should drop bad link: ' + str(node))
should_remove.append(node)
continue
if node.tag in scorable_node_tags and node not in nodes_to_score:
@@ -422,7 +420,7 @@ class Article(object):
doc.
"""
LOG.debug('Url: ' + str(url))
logger.debug('Url: ' + str(url))
self.orig = OriginalDocument(html, url=url)
self.fragment = fragment
@@ -464,7 +462,7 @@ class Article(object):
def _readable(self):
"""The readable parsed article"""
if self.candidates:
LOG.debug('Candidates found:')
logger.debug('Candidates found:')
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
@@ -474,23 +472,23 @@ class Article(object):
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
key=attrgetter('content_score'), reverse=True)
LOG.debug(pp.pformat(by_score))
logger.debug(pp.pformat(by_score))
# since we have several candidates, check the winner's siblings
# for extra content
winner = by_score[0]
LOG.debug('Selected winning node: ' + str(winner))
logger.debug('Selected winning node: ' + str(winner))
updated_winner = check_siblings(winner, self.candidates)
LOG.debug('Begin final prep of article')
logger.debug('Begin final prep of article')
updated_winner.node = prep_article(updated_winner.node)
if updated_winner.node is not None:
doc = build_base_document(updated_winner.node, self.fragment)
else:
LOG.warning('Had candidates but failed to find a cleaned winning doc.')
logger.warning('Had candidates but failed to find a cleaned winning doc.')
doc = self._handle_no_candidates()
else:
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
logger.warning('No candidates found: using document.')
logger.debug('Begin final prep of article')
doc = self._handle_no_candidates()
return doc
@@ -505,7 +503,7 @@ class Article(object):
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
else:
LOG.warning('No document to use.')
logger.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc

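From a caller's perspective nothing changes; only the debug plumbing does. A usage sketch (the constructor signature is inferred from the html/url/fragment names above, and readable as the public accessor wrapping _readable is an assumption):

    import logging
    from breadability.readable import Article

    logging.basicConfig(level=logging.DEBUG)  # surfaces the logger.debug traces above
    some_html = "<html><body><p>" + "words, " * 60 + "</p></body></html>"
    article = Article(some_html, url="http://example.com/post")
    print(article.readable)
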
@@ -5,11 +5,10 @@
from __future__ import absolute_import
import re
import logging
from hashlib import md5
from lxml.etree import tounicode
from .logconfig import LNODE
from .logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
@@ -23,6 +22,8 @@ CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
'sidebar|sponsor|shopping|tags|tool|widget'), re.I)
logger = logging.getLogger("breadability")
def check_node_attr(node, attr, checkset):
value = node.get(attr) or ""
@@ -44,7 +45,7 @@ def generate_hash_id(node):
try:
hashed.update(content.encode('utf-8', "replace"))
except Exception as e:
LOG.error("BOOM! " + str(e))
logger.exception("BOOM! %r", e)
return hashed.hexdigest()[0:8]
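
Note the quiet upgrade here from LOG.error to logger.exception: called inside an except block, exception() logs at ERROR level and appends the active traceback automatically. A sketch:

    import logging

    logger = logging.getLogger("breadability")
    try:
        b"\xff".decode('ascii')  # raises UnicodeDecodeError
    except Exception as e:
        # equivalent to logger.error(..., exc_info=True)
        logger.exception("BOOM! %r", e)
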
@@ -112,7 +113,7 @@ def score_candidates(nodes):
candidates = {}
for node in nodes:
LNODE.log(node, 1, "Scoring Node")
logger.debug("Scoring Node")
content_score = 0
# if the node has no parent it knows of, then it ends up creating a
@@ -122,16 +123,12 @@ def score_candidates(nodes):
innertext = node.text_content()
if parent is None or grand is None:
LNODE.log(
node, 1,
"Skipping candidate because parent/grand are none")
logger.debug("Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LNODE.log(
node, 1,
"Skipping candidate because not enough content.")
logger.debug("Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
@@ -148,13 +145,11 @@ def score_candidates(nodes):
if innertext:
# Add 0.25 points for any commas within this paragraph
content_score += innertext.count(',') * 0.25
LNODE.log(node, 1,
"Bonus points for ,: " + str(innertext.count(',')))
logger.debug("Bonus points for ,: " + str(innertext.count(',')))
# Subtract 0.5 points for each double quote within this paragraph
content_score += innertext.count('"') * (-0.5)
LNODE.log(node, 1,
'Penalty points for ": ' + str(innertext.count('"')))
logger.debug('Penalty points for ": ' + str(innertext.count('"')))
# For every 100 characters in this paragraph, add another point.
# Up to 3 points.
@@ -164,35 +159,22 @@ def score_candidates(nodes):
content_score += 3
else:
content_score += length_points
LNODE.log(
node, 1,
"Length/content points: {0} : {1}".format(length_points,
content_score))
logger.debug("Length/content points: %r : %r", length_points,
content_score)
# Add the score to the parent.
LNODE.log(node, 1, "From this current node.")
logger.debug("From this current node.")
candidates[parent].content_score += content_score
LNODE.log(
candidates[parent].node,
1,
"Giving parent bonus points: " + str(
candidates[parent].content_score))
logger.debug("Giving parent bonus points: %r", candidates[parent].content_score)
# The grandparent gets half.
LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
logger.debug("Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
LNODE.log(
candidates[parent].node,
1,
"Giving grand bonus points: " + str(
candidates[grand].content_score))
logger.debug("Giving grand bonus points: %r", candidates[grand].content_score)
for candidate in candidates.values():
adjustment = 1 - get_link_density(candidate.node)
LNODE.log(
candidate.node,
1,
"Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, adjustment))
logger.debug("Getting link density adjustment: %r * %r",
candidate.content_score, adjustment)
candidate.content_score = candidate.content_score * (adjustment)
return candidates

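The scoring arithmetic above is easy to check by hand. A worked example for an invented 240-character paragraph with 24 commas and no double quotes (length_points = len // 100 is assumed from the "every 100 characters" comment, since that line sits outside the hunk):

    innertext = "lorem, ipsum, dolor sit amet, " * 8  # 240 chars, 24 commas

    content_score = 0
    content_score += innertext.count(',') * 0.25    # +6.0 for commas
    content_score += innertext.count('"') * (-0.5)  # -0.0, no quotes
    length_points = len(innertext) // 100           # 2, capped at 3
    content_score += min(length_points, 3)          # total: 8.0

    parent_bonus = content_score        # parent receives the full score
    grand_bonus = content_score / 2.0   # grandparent receives half: 4.0
    # Each candidate is then scaled by (1 - link_density) at the end.
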
@@ -44,10 +44,7 @@ class TestArticle(unittest.TestCase):
# from lxml.etree import tounicode
found = False
wanted_hash = '04e46055'
# from breadability.logconfig import LNODE
# from breadability.logconfig import set_logging_level
# set_logging_level('DEBUG')
# LNODE.activate()
for node in doc.candidates.values():
if node.hash_id == wanted_hash:
found = node
@@ -70,5 +67,3 @@ class TestArticle(unittest.TestCase):
# This article hits up against the img > p conditional filtering
# because of the many .gif images in the content. We've removed that
# rule.
# set_logging_level('INFO')
# LNODE.deactivate()

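With LNODE and set_logging_level gone, the commented-out debug hooks above have a short stdlib replacement when a test needs the trace output (sketch):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("breadability").setLevel(logging.DEBUG)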