Simplify logging

pull/21/head
Mišo Belica 11 years ago
parent 81be8ccbfb
commit 101950478e

@@ -3,6 +3,7 @@
from __future__ import absolute_import
import argparse
import logging
import codecs
import locale
import sys
@@ -12,15 +13,9 @@ import webbrowser
from tempfile import mkstemp
from ._version import VERSION
from .logconfig import LOG
from .logconfig import LNODE
from .logconfig import set_logging_level
from .readable import Article
LOGLEVEL = 'WARNING'
def parse_args():
desc = "A fast python port of arc90's readability tool"
parser = argparse.ArgumentParser(description=desc)
@@ -61,15 +56,14 @@ def parse_args():
def main():
args = parse_args()
logger = logging.getLogger("breadability")
if args.verbose:
set_logging_level('DEBUG')
logger.setLevel(logging.DEBUG)
if args.debug:
LNODE.activate()
target = args.path[0]
LOG.debug("Target: " + target)
logger.debug("Target: %r", target)
if target.startswith('http') or target.startswith('www'):
is_url = True

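The whole logconfig machinery reduces to the stdlib pattern shown in this hunk. A minimal sketch of the new CLI wiring (args.verbose follows the diff; the --verbose flag definition and script scaffolding are assumptions):

    import argparse
    import logging

    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument('--verbose', action='store_true')
        args = parser.parse_args()

        logging.basicConfig()  # default stderr handler on the root logger
        logger = logging.getLogger("breadability")
        if args.verbose:
            logger.setLevel(logging.DEBUG)
        logger.debug("Debug logging enabled")

    if __name__ == '__main__':
        main()
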
@@ -5,6 +5,7 @@
from __future__ import absolute_import
import re
import logging
import charade
from lxml.etree import tostring
@@ -14,11 +15,11 @@ from lxml.html import document_fromstring
from lxml.html import HTMLParser
from ._py3k import unicode, to_string, to_bytes
from .logconfig import LOG
from .utils import cached_property
utf8_parser = HTMLParser(encoding='utf-8')
logger = logging.getLogger("breadability")
def get_encoding(page):
@@ -46,7 +47,7 @@ def get_encoding(page):
def replace_multi_br_to_paragraphs(html):
"""Convert multiple <br>s into paragraphs"""
LOG.debug('Replacing multiple <br/> to <p>')
logger.debug('Replacing multiple <br/> to <p>')
rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
return rep.sub('</p><p>', html)
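
For a concrete feel of replace_multi_br_to_paragraphs, the regex above collapses any run of two or more <br>s into a paragraph break (a sketch; the sample markup is invented):

    import re

    rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
    html = "<p>intro<br/><br/>next paragraph</p>"
    print(rep.sub('</p><p>', html))
    # -> <p>intro</p><p>next paragraph</p>
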
@@ -54,7 +55,7 @@ def replace_multi_br_to_paragraphs(html):
def build_doc(page):
"""Requires that the `page` not be None"""
if page is None:
LOG.error("Page content is None, can't build_doc")
logger.error("Page content is None, can't build_doc")
return ''
if isinstance(page, unicode):
page_unicode = page
@@ -67,7 +68,7 @@ def build_doc(page):
parser=utf8_parser)
return doc
except XMLSyntaxError as exc:
LOG.error('Failed to parse: ' + str(exc))
logger.error('Failed to parse: ' + str(exc))
raise ValueError('Failed to parse document contents.')
@@ -95,7 +96,7 @@ class OriginalDocument(object):
# doc = html_cleaner.clean_html(doc)
base_href = self.url
if base_href:
LOG.debug('Making links absolute')
logger.debug('Making links absolute')
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()

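Beyond swapping names, the hunks above also move from eager string concatenation to logging's lazy %-formatting, so the message is only built when a handler will actually emit it. A small illustrative sketch:

    import logging

    logger = logging.getLogger("breadability")
    target = "http://example.com/article"

    logger.debug('Target: ' + target)   # old style: string built unconditionally
    logger.debug("Target: %r", target)  # new style: formatted only if DEBUG is on
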
@@ -1,192 +0,0 @@
"""Setup a logging helper for our module.
Helpers:
LOG - our active logger instance
set_logging_level(level) - adjust the current logging level
"""
import logging
import sys
import time
from collections import namedtuple
from hashlib import md5
from lxml.etree import tounicode
from breadability._py3k import to_unicode
# For pretty log messages, if available
try:
import curses
except ImportError:
curses = None
LOGLEVEL = "WARNING"
# Logging bits stolen and adapted from:
# http://www.tornadoweb.org/documentation/_modules/tornado/options.html
LogOptions = namedtuple('LogOptions', [
'loglevel',
'log_file_prefix',
'log_file_max_size',
'log_file_num_backups',
'log_to_stderr',
])
options = LogOptions(
loglevel=LOGLEVEL,
log_file_prefix="",
log_file_max_size=100 * 1000 * 1000,
log_file_num_backups=5,
log_to_stderr=True,
)
def set_logging_level(level):
"""Adjust the current logging level.
Expect a string of DEBUG, WARNING, INFO, etc.
"""
logging.getLogger('breadable').setLevel(getattr(logging, level))
def enable_pretty_logging():
"""Turns on formatted logging output as configured.
This is called automatically by `parse_command_line`.
"""
root_logger = logging.getLogger()
if options.log_file_prefix:
channel = logging.handlers.RotatingFileHandler(
filename=options.log_file_prefix,
maxBytes=options.log_file_max_size,
backupCount=options.log_file_num_backups)
channel.setFormatter(_LogFormatter(color=False))
root_logger.addHandler(channel)
if (options.log_to_stderr or
(options.log_to_stderr is None and not root_logger.handlers)):
# Set up color if we are in a tty and curses is installed
color = False
if curses and sys.stderr.isatty():
try:
curses.setupterm()
if curses.tigetnum("colors") > 0:
color = True
except Exception:
pass
channel = logging.StreamHandler()
channel.setFormatter(_LogFormatter(color=color))
root_logger.addHandler(channel)
class LogHelper(object):
"""Helper to allow us to log as we want for debugging"""
scoring = 1
removing = 2
_active = False
_actions = None
def __init__(self, log, actions=None, content=False):
if actions is None:
self._actions = tuple()
else:
self._actions = actions
self._log = log
self.content = content
@property
def actions(self):
"""Return a tuple of the actions we want to log"""
return self._actions
def activate(self):
"""Turn on this logger."""
self._active = True
def deactivate(self):
"""Turn off the logger"""
self._active = False
def log(self, node, action, description):
"""Write out our log info based on the node and event specified.
We only log this information if we're at DEBUG loglevel
"""
if self._active:
content = tounicode(node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception as exc:
LOG.error("Cannot hash the current node." + str(exc))
hash_id = hashed.hexdigest()[0:8]
# if hash_id in ['9c880b27', '8393b7d7', '69bfebdd']:
print(to_unicode("{0} :: {1}\n{2}").format(
hash_id,
description,
content.replace(to_unicode("\n"), to_unicode(""))[0:202],
))
class _LogFormatter(logging.Formatter):
def __init__(self, color, *args, **kwargs):
logging.Formatter.__init__(self, *args, **kwargs)
self._color = color
if color:
# The curses module has some str/bytes confusion in python3.
# Most methods return bytes, but only accept strings.
# The explicit calls to unicode() below are harmless in python2,
# but will do the right conversion in python3.
fg_color = unicode(curses.tigetstr("setaf") or
curses.tigetstr("setf") or "", "ascii")
self._colors = {
logging.DEBUG: unicode(
curses.tparm(fg_color, curses.COLOR_CYAN),
"ascii"),
logging.INFO: unicode(
curses.tparm(fg_color, curses.COLOR_GREEN),
"ascii"),
logging.WARNING: unicode(
curses.tparm(fg_color, curses.COLOR_YELLOW), # Yellow
"ascii"),
logging.ERROR: unicode(
curses.tparm(fg_color, curses.COLOR_RED), # Red
"ascii"),
}
self._normal = unicode(curses.tigetstr("sgr0"), "ascii")
def format(self, record):
try:
record.message = record.getMessage()
except Exception as e:
record.message = "Bad message (%r): %r" % (e, record.__dict__)
record.asctime = time.strftime(
"%y%m%d %H:%M:%S", self.converter(record.created))
prefix = '[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]' % \
record.__dict__
if self._color:
prefix = (self._colors.get(record.levelno, self._normal) +
prefix + self._normal)
formatted = prefix + " " + record.message
if record.exc_info:
if not record.exc_text:
record.exc_text = self.formatException(record.exc_info)
if record.exc_text:
formatted = formatted.rstrip() + "\n" + record.exc_text
return formatted.replace("\n", "\n ")
# Set up log level and pretty console logging by default
logging.getLogger('breadable').setLevel(getattr(logging, LOGLEVEL))
enable_pretty_logging()
LOG = logging.getLogger('breadable')
LNODE = LogHelper(LOG,
actions=(LogHelper.scoring, LogHelper.removing),
content=True
)

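All ~190 deleted lines (rotating file handler, curses colors, the LNODE node tracer) are replaced by stdlib defaults. For anyone who still wants the old timestamped prefix on stderr, a minimal sketch of an equivalent setup (the format string mirrors _LogFormatter above but is otherwise an assumption, not part of the commit):

    import logging

    logging.basicConfig(
        level=logging.WARNING,
        format="[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d] %(message)s",
        datefmt="%y%m%d %H:%M:%S",
    )
    logging.getLogger("breadability").warning("formatted like the old _LogFormatter")
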
@@ -3,6 +3,8 @@
from __future__ import absolute_import
import re
import logging
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
@@ -12,8 +14,6 @@ from operator import attrgetter
from pprint import PrettyPrinter
from .document import OriginalDocument
from .logconfig import LOG
from .logconfig import LNODE
from .scoring import score_candidates
from .scoring import get_link_density
from .scoring import get_class_weight
@@ -40,6 +40,8 @@ BASE_DOC = """
"""
SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article']
logger = logging.getLogger("breadability")
def drop_tag(doc, *tags):
"""Helper to just remove any nodes that match this html tag passed in
@@ -50,7 +52,7 @@ def drop_tag(doc, *tags):
for tag in tags:
found = doc.iterfind(".//" + tag)
for n in found:
LNODE.log(n, 1, "Dropping tag")
logger.debug("Dropping tag %s", tag)
n.drop_tree()
return doc
@@ -168,7 +170,7 @@ def transform_misused_divs_into_paragraphs(doc):
# We need to create a <p> and put all its contents in there
# We'll just stringify it, then regex replace the first/last
# div bits to turn them into <p> vs <div>.
LNODE.log(elem, 1, 'Turning leaf <div> into <p>')
logger.debug('Turning leaf <div> into <p>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
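
Concretely, those two regexes rewrite only the outer tag of the serialized leaf node (a sketch; the sample markup is invented):

    import re

    orig = '<div>just text, no block children</div>'
    started = re.sub(r'^<\s*div', '<p', orig)
    ended = re.sub(r'div>$', 'p>', started)
    print(ended)  # -> <p>just text, no block children</p>
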
@@ -193,7 +195,7 @@ def check_siblings(candidate_node, candidate_list):
content_bonus = 0
if sibling is candidate_node.node:
LNODE.log(sibling, 1, 'Sibling is the node so append')
logger.debug('Sibling is the node so append')
append = True
# Give a bonus if sibling nodes and top candidates have the same class name
@@ -220,7 +222,7 @@ def check_siblings(candidate_node, candidate_list):
append = True
if append:
LNODE.log(sibling, 1, 'Sibling being appended')
logger.debug('Sibling being appended')
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
@@ -237,18 +239,18 @@ def clean_document(node):
if node is None or len(node) == 0:
return
LNODE.log(node, 2, "Processing doc")
logger.debug("Processing doc")
clean_list = ['object', 'h1']
to_drop = []
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
if len(node.findall('.//h2')) == 1:
LOG.debug('Adding H2 to list of nodes to clean.')
logger.debug('Adding H2 to list of nodes to clean.')
clean_list.append('h2')
for n in node.iter():
LNODE.log(n, 2, "Cleaning iter node")
logger.debug("Cleaning iter node")
# clean out any in-line style properties
if 'style' in n.attrib:
n.set('style', '')
@@ -267,7 +269,7 @@ def clean_document(node):
allow = True
if not allow:
LNODE.log(n, 2, "Dropping Node")
logger.debug("Dropping Node")
to_drop.append(n)
if n.tag in ['h1', 'h2', 'h3', 'h4']:
@@ -275,7 +277,7 @@ def clean_document(node):
# if the heading has no css weight or a high link density,
# remove it
if get_class_weight(n) < 0 or get_link_density(n) > .33:
LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
logger.debug("Dropping <hX>, it's insignificant")
to_drop.append(n)
# clean out extra <p>
@@ -283,7 +285,7 @@ def clean_document(node):
# if the p has no children and has no content...well then down
# with it.
if not n.getchildren() and len(n.text_content()) < 5:
LNODE.log(n, 2, 'Dropping extra <p>')
logger.debug('Dropping extra <p>')
to_drop.append(n)
# finally try out the conditional cleaning of the target node
@@ -298,11 +300,11 @@ def clean_conditionally(node):
"""Remove the clean_el if it looks like bad content based on rules."""
target_tags = ['form', 'table', 'ul', 'div', 'p']
LNODE.log(node, 2, 'Cleaning conditionally node.')
logger.debug('Cleaning conditionally node.')
if node.tag not in target_tags:
# this is not the tag you're looking for
LNODE.log(node, 2, 'Node cleared.')
logger.debug('Node cleared.')
return
weight = get_class_weight(node)
@@ -311,12 +313,12 @@ def clean_conditionally(node):
content_score = 0
if (weight + content_score < 0):
LNODE.log(node, 2, 'Dropping conditional node')
LNODE.log(node, 2, 'Weight + score < 0')
logger.debug('Dropping conditional node')
logger.debug('Weight + score < 0')
return True
if node.text_content().count(',') < 10:
LOG.debug("There aren't 10 ,s so we're processing more")
logger.debug("There aren't 10 ,s so we're processing more")
# If there are not very many commas, and the number of
# non-paragraph elements is more than paragraphs or other ominous
@@ -337,36 +339,32 @@ def clean_conditionally(node):
remove_node = False
if li > p and node.tag != 'ul' and node.tag != 'ol':
LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
logger.debug('Conditional drop: li > p and not ul/ol')
remove_node = True
elif inputs > p / 3.0:
LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
logger.debug('Conditional drop: inputs > p/3.0')
remove_node = True
elif content_length < 25 and (img == 0 or img > 2):
LNODE.log(node, 2,
'Conditional drop: len < 25 and 0/>2 images')
logger.debug('Conditional drop: len < 25 and 0/>2 images')
remove_node = True
elif weight < 25 and link_density > 0.2:
LNODE.log(node, 2,
'Conditional drop: weight small and link is dense')
logger.debug('Conditional drop: weight small and link is dense')
remove_node = True
elif weight >= 25 and link_density > 0.5:
LNODE.log(node, 2,
'Conditional drop: weight big but link heavy')
logger.debug('Conditional drop: weight big but link heavy')
remove_node = True
elif (embed == 1 and content_length < 75) or embed > 1:
LNODE.log(node, 2,
'Conditional drop: embed w/o much content or many embed')
logger.debug('Conditional drop: embed w/o much content or many embed')
remove_node = True
if remove_node:
LNODE.log(node, 2, 'Node will be removed')
logger.debug('Node will be removed')
else:
LNODE.log(node, 2, 'Node cleared')
logger.debug('Node cleared')
return remove_node
# nope, don't remove anything
LNODE.log(node, 2, 'Node Cleared final.')
logger.debug('Node Cleared final.')
return False
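
Since the elif chain above is the heart of clean_conditionally, the drop rules restated as ordered data may be easier to scan (a sketch, not how the module organizes it; all counts come from the node under inspection):

    def drop_reason(li, p, inputs, img, embed, content_length, weight, link_density, tag):
        rules = [
            ('li > p and not ul/ol', li > p and tag not in ('ul', 'ol')),
            ('inputs > p/3.0', inputs > p / 3.0),
            ('len < 25 and 0/>2 images', content_length < 25 and (img == 0 or img > 2)),
            ('weight small and link is dense', weight < 25 and link_density > 0.2),
            ('weight big but link heavy', weight >= 25 and link_density > 0.5),
            ('embed w/o much content or many embeds',
             (embed == 1 and content_length < 75) or embed > 1),
        ]
        for reason, hit in rules:
            if hit:
                return reason  # first matching rule wins, as in the elif chain
        return None
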
@@ -397,11 +395,11 @@ def find_candidates(doc):
for node in doc.iter():
if is_unlikely_node(node):
LOG.debug('We should drop unlikely: ' + str(node))
logger.debug('We should drop unlikely: ' + str(node))
should_remove.append(node)
continue
if node.tag == 'a' and is_bad_link(node):
LOG.debug('We should drop bad link: ' + str(node))
logger.debug('We should drop bad link: ' + str(node))
should_remove.append(node)
continue
if node.tag in scorable_node_tags and node not in nodes_to_score:
@@ -422,7 +420,7 @@ class Article(object):
doc.
"""
LOG.debug('Url: ' + str(url))
logger.debug('Url: ' + str(url))
self.orig = OriginalDocument(html, url=url)
self.fragment = fragment
@@ -464,7 +462,7 @@ class Article(object):
def _readable(self):
"""The readable parsed article"""
if self.candidates:
LOG.debug('Candidates found:')
logger.debug('Candidates found:')
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
@@ -474,23 +472,23 @@ class Article(object):
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
key=attrgetter('content_score'), reverse=True)
LOG.debug(pp.pformat(by_score))
logger.debug(pp.pformat(by_score))
# since we have several candidates, check the winner's siblings
# for extra content
winner = by_score[0]
LOG.debug('Selected winning node: ' + str(winner))
logger.debug('Selected winning node: ' + str(winner))
updated_winner = check_siblings(winner, self.candidates)
LOG.debug('Begin final prep of article')
logger.debug('Begin final prep of article')
updated_winner.node = prep_article(updated_winner.node)
if updated_winner.node is not None:
doc = build_base_document(updated_winner.node, self.fragment)
else:
LOG.warning('Had candidates but failed to find a cleaned winning doc.')
logger.warning('Had candidates but failed to find a cleaned winning doc.')
doc = self._handle_no_candidates()
else:
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
logger.warning('No candidates found: using document.')
logger.debug('Begin final prep of article')
doc = self._handle_no_candidates()
return doc
@@ -505,7 +503,7 @@ class Article(object):
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
else:
LOG.warning('No document to use.')
logger.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc

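From a caller's perspective nothing changes; only the debug plumbing does. A usage sketch (the constructor signature is inferred from the html/url/fragment names above, and readable as the public accessor wrapping _readable is an assumption):

    import logging
    from breadability.readable import Article

    logging.basicConfig(level=logging.DEBUG)  # surfaces the logger.debug traces above
    some_html = "<html><body><p>" + "words, " * 60 + "</p></body></html>"
    article = Article(some_html, url="http://example.com/post")
    print(article.readable)
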
@@ -5,11 +5,10 @@
from __future__ import absolute_import
import re
import logging
from hashlib import md5
from lxml.etree import tounicode
from .logconfig import LNODE
from .logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
@@ -23,6 +22,8 @@ CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
'sidebar|sponsor|shopping|tags|tool|widget'), re.I)
logger = logging.getLogger("breadability")
def check_node_attr(node, attr, checkset):
value = node.get(attr) or ""
@@ -44,7 +45,7 @@ def generate_hash_id(node):
try:
hashed.update(content.encode('utf-8', "replace"))
except Exception as e:
LOG.error("BOOM! " + str(e))
logger.exception("BOOM! %r", e)
return hashed.hexdigest()[0:8]
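
Note the quiet upgrade here from LOG.error to logger.exception: called inside an except block, exception() logs at ERROR level and appends the active traceback automatically. A sketch:

    import logging

    logger = logging.getLogger("breadability")
    try:
        b"\xff".decode('ascii')  # raises UnicodeDecodeError
    except Exception as e:
        # equivalent to logger.error(..., exc_info=True)
        logger.exception("BOOM! %r", e)
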
@@ -112,7 +113,7 @@ def score_candidates(nodes):
candidates = {}
for node in nodes:
LNODE.log(node, 1, "Scoring Node")
logger.debug("Scoring Node")
content_score = 0
# if the node has no parent it knows of, then it ends up creating a
@@ -122,16 +123,12 @@ def score_candidates(nodes):
innertext = node.text_content()
if parent is None or grand is None:
LNODE.log(
node, 1,
"Skipping candidate because parent/grand are none")
logger.debug("Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LNODE.log(
node, 1,
"Skipping candidate because not enough content.")
logger.debug("Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
@@ -148,13 +145,11 @@ def score_candidates(nodes):
if innertext:
# Add 0.25 points for any commas within this paragraph
content_score += innertext.count(',') * 0.25
LNODE.log(node, 1,
"Bonus points for ,: " + str(innertext.count(',')))
logger.debug("Bonus points for ,: " + str(innertext.count(',')))
# Subtract 0.5 points for each double quote within this paragraph
content_score += innertext.count('"') * (-0.5)
LNODE.log(node, 1,
'Penalty points for ": ' + str(innertext.count('"')))
logger.debug('Penalty points for ": ' + str(innertext.count('"')))
# For every 100 characters in this paragraph, add another point.
# Up to 3 points.
@@ -164,35 +159,22 @@ def score_candidates(nodes):
content_score += 3
else:
content_score += length_points
LNODE.log(
node, 1,
"Length/content points: {0} : {1}".format(length_points,
content_score))
logger.debug("Length/content points: %r : %r", length_points,
content_score)
# Add the score to the parent.
LNODE.log(node, 1, "From this current node.")
logger.debug("From this current node.")
candidates[parent].content_score += content_score
LNODE.log(
candidates[parent].node,
1,
"Giving parent bonus points: " + str(
candidates[parent].content_score))
logger.debug("Giving parent bonus points: %r", candidates[parent].content_score)
# The grandparent gets half.
LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
logger.debug("Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
LNODE.log(
candidates[parent].node,
1,
"Giving grand bonus points: " + str(
candidates[grand].content_score))
logger.debug("Giving grand bonus points: %r", candidates[grand].content_score)
for candidate in candidates.values():
adjustment = 1 - get_link_density(candidate.node)
LNODE.log(
candidate.node,
1,
"Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, adjustment))
logger.debug("Getting link density adjustment: %r * %r",
candidate.content_score, adjustment)
candidate.content_score = candidate.content_score * (adjustment)
return candidates

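The scoring arithmetic above is easy to check by hand. A worked example for an invented 240-character paragraph with 24 commas and no double quotes (length_points = len // 100 is assumed from the "every 100 characters" comment, since that line sits outside the hunk):

    innertext = "lorem, ipsum, dolor sit amet, " * 8  # 240 chars, 24 commas

    content_score = 0
    content_score += innertext.count(',') * 0.25    # +6.0 for commas
    content_score += innertext.count('"') * (-0.5)  # -0.0, no quotes
    length_points = len(innertext) // 100           # 2, capped at 3
    content_score += min(length_points, 3)          # total: 8.0

    parent_bonus = content_score        # parent receives the full score
    grand_bonus = content_score / 2.0   # grandparent receives half: 4.0
    # Each candidate is then scaled by (1 - link_density) at the end.
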
@@ -44,10 +44,7 @@ class TestArticle(unittest.TestCase):
# from lxml.etree import tounicode
found = False
wanted_hash = '04e46055'
# from breadability.logconfig import LNODE
# from breadability.logconfig import set_logging_level
# set_logging_level('DEBUG')
# LNODE.activate()
for node in doc.candidates.values():
if node.hash_id == wanted_hash:
found = node
@@ -70,5 +67,3 @@ class TestArticle(unittest.TestCase):
# This article hits up against the img > p conditional filtering
# because of the many .gif images in the content. We've removed that
# rule.
# set_logging_level('INFO')
# LNODE.deactivate()

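With LNODE and set_logging_level gone, the commented-out debug hooks above have a short stdlib replacement when a test needs the trace output (sketch):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("breadability").setLevel(logging.DEBUG)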