Create LNODE and fix bugs in parsing

- Add the concept of an LNODE logger that outputs information about node
    scoring and removal, and generates a hash_id for the node content so we
    can track it (a sketch follows this list)
- Add a `-d` flag to the cmd line client to turn on the LNODE logging
- Update the client to read in http content as unicode
- Wrap stdout with a unicode-happy stream so we can pipe unicode to less/grep,
    etc.
- Add the html article tag to the scorable tags we work with
- Make sure we drop iframe along with noscript
- Fix scoring bugs around length points
- Add the hash_id as a @property on scored nodes
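
A minimal sketch of the hash_id idea, mirroring the @property added to
scoring.py below (the node content here is illustrative):

    from hashlib import md5
    from lxml.etree import tounicode
    from lxml.html import fragment_fromstring

    node = fragment_fromstring(u'<p>Some node content</p>')  # illustrative node
    hashed = md5()
    # Hash the serialized node so identical content always maps to the same id.
    hashed.update(tounicode(node).encode('utf-8', errors='replace'))
    hash_id = hashed.hexdigest()[0:8]  # short, stable id for tracking the node in logs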
pull/4/merge
Richard Harding 12 years ago
parent f1623fc3e3
commit 32350fc3a1

@@ -1,6 +1,6 @@
import argparse
import codecs
import os
import locale
import sys
import urllib
import webbrowser
@@ -9,6 +9,7 @@ from tempfile import mkstemp
from breadability import VERSION
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.logconfig import set_logging_level
from breadability.readable import Article
@@ -41,6 +42,11 @@ def parse_args():
default=False,
help='open the parsed content in your web browser')
parser.add_argument('-d', '--debug',
action='store_true',
default=False,
help='Output the detailed scoring information for debugging parsing')
parser.add_argument('path', metavar='P', type=str, nargs=1,
help="The url or file path to process in readable form.")
@@ -54,6 +60,9 @@ def main():
if args.verbose:
set_logging_level('DEBUG')
if args.debug:
LNODE.activate()
target = args.path[0]
LOG.debug("Target: " + target)
@@ -66,12 +75,11 @@ def main():
if is_url:
req = urllib.urlopen(target)
ucontent = req.read().encode('utf-8')
content = req.read()
ucontent = unicode(content, 'utf-8')
else:
ucontent = codecs.open(target, "r", "utf-8").read()
enc = sys.__stdout__.encoding or 'utf-8'
doc = Article(ucontent, url=url, fragment=args.fragment)
if args.browser:
fg, pathname = mkstemp(suffix='.html')
@@ -80,7 +88,9 @@ def main():
out.close()
webbrowser.open(pathname)
else:
sys.stdout(doc.readable.encode(enc, 'replace'))
# Wrap sys.stdout into a StreamWriter to allow writing unicode.
sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
sys.stdout.write(doc.readable)
if __name__ == '__main__':
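
The stdout change above is the usual Python 2 recipe for writing unicode to a
pipe; a self-contained sketch of the same pattern:

    import codecs
    import locale
    import sys

    # When stdout is a pipe (e.g. piping to less/grep), sys.stdout.encoding is
    # None and writing unicode raises UnicodeEncodeError. Wrapping stdout in a
    # StreamWriter for the locale's preferred encoding avoids that, assuming
    # the locale can actually represent the characters written.
    sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
    sys.stdout.write(u'unicode-safe output\n')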

@@ -9,6 +9,8 @@ import logging
import sys
import time
from collections import namedtuple
from hashlib import md5
from lxml.etree import tounicode
# For pretty log messages, if available
@@ -19,6 +21,9 @@ except ImportError:
LOGLEVEL = "WARNING"
# Logging bits stolen and adapted from:
# http://www.tornadoweb.org/documentation/_modules/tornado/options.html
LogOptions = namedtuple('LogOptions', [
@@ -38,6 +43,7 @@ options = LogOptions(
)
def set_logging_level(level):
"""Adjust the current logging level.
@@ -77,6 +83,54 @@ def enable_pretty_logging():
root_logger.addHandler(channel)
class LogHelper(object):
"""Helper to allow us to log as we want for debugging"""
scoring = 1
removing = 2
_active = False
_actions = None
def __init__(self, log, actions=None, content=False):
if actions is None:
self._actions = tuple()
else:
self._actions = actions
self._log = log
self.content = content
@property
def actions(self):
"""Return a tuple of the actions we want to log"""
return self._actions
def activate(self):
"""Turn on this logger."""
self._active = True
def log(self, node, action, description):
"""Write out our log info based on the node and event specified.
We only log this information if the logger has been activated.
"""
if self._active:
content = tounicode(node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception, e:
LOG.error("Cannot hash the current node.")
hash_id = hashed.hexdigest()[0:8]
# if hash_id in ['9c880b27', '8393b7d7', '69bfebdd']:
print(u"{0} :: {1}\n{2}".format(
hash_id,
description,
content.replace(u"\n", u"")[0:202],
))
class _LogFormatter(logging.Formatter):
def __init__(self, color, *args, **kwargs):
logging.Formatter.__init__(self, *args, **kwargs)
@@ -129,3 +183,7 @@ class _LogFormatter(logging.Formatter):
logging.getLogger('breadable').setLevel(getattr(logging, LOGLEVEL))
enable_pretty_logging()
LOG = logging.getLogger('breadable')
LNODE = LogHelper(LOG,
actions=(LogHelper.scoring, LogHelper.removing),
content=True
)
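
Taken together with the readable.py and scoring.py hunks below, the logger is
used roughly like this; a hedged sketch (the node construction is
illustrative):

    from lxml.html import fragment_fromstring
    from breadability.logconfig import LNODE
    from breadability.logconfig import LogHelper

    LNODE.activate()  # normally flipped on by the client's -d/--debug flag
    node = fragment_fromstring(u'<div class="content">Example text</div>')
    # Prints "<hash_id> :: Scoring Node" followed by the node's first ~200 chars.
    LNODE.log(node, LogHelper.scoring, 'Scoring Node')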

@@ -3,13 +3,13 @@ from operator import attrgetter
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from pprint import PrettyPrinter
from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
@@ -34,6 +34,7 @@ BASE_DOC = """
</body>
</html>
"""
SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article']
def drop_tag(doc, *tags):
@@ -45,7 +46,7 @@ def drop_tag(doc, *tags):
for tag in tags:
found = doc.iterfind(".//" + tag)
for n in found:
LOG.debug("Dropping tag: " + str(n))
LNODE.log(n, 1, "Dropping tag")
n.drop_tree()
return doc
@@ -98,6 +99,7 @@ def build_base_document(html, fragment=True):
else:
output = found_body
output.doctype = "<!DOCTYPE html>"
return output
@@ -120,7 +122,7 @@ def transform_misused_divs_into_paragraphs(doc):
# We need to create a <p> and put all its contents in there
# We'll just stringify it, then regex replace the first/last
# div bits to turn them into <p> vs <div>.
LOG.debug('Turning leaf <div> into <p>')
LNODE.log(elem, 1, 'Turning leaf <div> into <p>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
@@ -145,6 +147,7 @@ def check_siblings(candidate_node, candidate_list):
content_bonus = 0
if sibling is candidate_node.node:
LNODE.log(sibling, 1, 'Sibling is the node so append')
append = True
# Give a bonus if sibling nodes and top candidates have the exact same classname
@@ -171,7 +174,7 @@ def check_siblings(candidate_node, candidate_list):
append = True
if append:
LOG.debug('Sibling being appended' + str(sibling))
LNODE.log(sibling, 1, 'Sibling being appended')
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
@@ -223,7 +226,7 @@ def prep_article(doc):
allow = True
if not allow:
LOG.debug('Dropping node: ' + str(n))
LNODE.log(n, 2, "Dropping Node")
n.drop_tree()
# go on with next loop, this guy is gone
continue
@@ -235,8 +238,7 @@ def prep_article(doc):
if get_class_weight(n) < 0 or get_link_density(n) > .33:
# for some reason we get nodes here without a parent
if n.getparent() is not None:
LOG.debug(
"Dropping <hX>, it's insignificant: " + str(n))
LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
n.drop_tree()
# go on with next loop, this guy is gone
continue
@@ -246,7 +248,7 @@ def prep_article(doc):
# if the p has no children and has no content...well then down
# with it.
if not n.getchildren() and len(n.text_content()) < 5:
LOG.debug('Dropping extra <p>: ' + str(n))
LNODE.log(n, 2, 'Dropping extra <p>')
n.drop_tree()
# go on with next loop, this guy is gone
continue
@@ -274,7 +276,7 @@ def prep_article(doc):
content_score = 0
if (weight + content_score < 0):
LOG.debug('Dropping conditional node: ' + str(node))
LNODE.log(node, 2, 'Dropping conditional node')
return True
if node.text_content().count(',') < 10:
@@ -304,25 +306,25 @@ def prep_article(doc):
# images to try to do some scoring of good v. bad images.
# failing example:
# arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
LOG.debug('Conditional drop: img > p')
LNODE.log(node, 2, 'Conditional drop: img > p')
remove_node = True
elif li > p and node.tag != 'ul' and node.tag != 'ol':
LOG.debug('Conditional drop: li > p and not ul/ol')
LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
remove_node = True
elif inputs > p / 3.0:
LOG.debug('Conditional drop: inputs > p/3.0')
LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
remove_node = True
elif content_length < 25 and (img == 0 or img > 2):
LOG.debug('Conditional drop: len < 25 and 0/>2 images')
LNODE.log(node, 2, 'Conditional drop: len < 25 and 0/>2 images')
remove_node = True
elif weight < 25 and link_density > 0.2:
LOG.debug('Conditional drop: weight small and link is dense')
LNODE.log(node, 2, 'Conditional drop: weight small and link is dense')
remove_node = True
elif weight >= 25 and link_density > 0.5:
LOG.debug('Conditional drop: weight big but link heavy')
LNODE.log(node, 2, 'Conditional drop: weight big but link heavy')
remove_node = True
elif (embed == 1 and content_length < 75) or embed > 1:
LOG.debug('Conditional drop: embed without much content or many embed')
LNODE.log(node, 2, 'Conditional drop: embed without much content or many embed')
remove_node = True
return remove_node
@@ -340,7 +342,7 @@ def find_candidates(doc):
clean up and return the final best match.
"""
scorable_node_tags = ['div', 'p', 'td', 'pre']
scorable_node_tags = SCORABLE_TAGS
nodes_to_score = []
should_remove = []
@@ -349,7 +351,7 @@ def find_candidates(doc):
LOG.debug('We should drop unlikely: ' + str(node))
should_remove.append(node)
continue
if node.tag in scorable_node_tags:
if node.tag in scorable_node_tags and node not in nodes_to_score:
nodes_to_score.append(node)
return score_candidates(nodes_to_score), should_remove
@@ -386,7 +388,7 @@ class Article(object):
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript')
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
candidates, should_drop = find_candidates(doc)
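
The drop_tag change above sweeps iframe along with noscript; a standalone
sketch of the same lxml drop_tree behavior:

    from lxml.etree import tounicode
    from lxml.html import document_fromstring

    doc = document_fromstring(
        u'<html><body><p>keep me</p>'
        u'<noscript>skip</noscript><iframe src="ad"></iframe></body></html>')
    for tag in ('noscript', 'iframe'):
        # Materialize matches first since drop_tree() mutates the tree.
        for n in list(doc.iterfind('.//' + tag)):
            n.drop_tree()  # removes the element and its whole subtree
    print(tounicode(doc))  # only the <p> survives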

@@ -1,6 +1,9 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re
from hashlib import md5
from lxml.etree import tounicode
from breadability.logconfig import LNODE
from breadability.logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
@@ -25,15 +28,20 @@ def check_node_attr(node, attr, checkset):
return False
def get_link_density(node):
def get_link_density(node, node_text=None):
"""Generate a value for the number of links in the node.
:param node: parsed ElementTree node
:param node_text: pass in the node's text_content() if we already have it,
to save computing it again.
:returns float:
"""
link_length = sum([len(a.text_content()) or 0 for a in node.findall(".//a")])
text_length = len(node.text_content())
if node_text:
text_length = len(node_text)
else:
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
@@ -82,18 +90,20 @@ def score_candidates(nodes):
candidates = {}
for node in nodes:
LNODE.log(node, 1, "Scoring Node")
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text_content()
if parent is None or grand is None:
LOG.debug("Skipping candidate because parent/grand are none")
LNODE.log(node, 1, "Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LOG.debug("Skipping candidate because not enough content.")
LNODE.log(node, 1, "Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
@@ -109,20 +119,30 @@ def score_candidates(nodes):
# Add points for any commas within this paragraph
content_score += innertext.count(',') if innertext else 0
LNODE.log(node, 1, "Bonus points for ,: " + str(innertext.count(',')))
# For every 100 characters in this paragraph, add another point. Up to
# 3 points.
length_points = len(innertext) % 100 if innertext else 0
content_score = length_points if length_points > 3 else 3
if length_points > 3:
content_score += 3
else:
content_score += length_points
LNODE.log(node, 1, "Length/content points: {0} : {1}".format(length_points, content_score))
# Add the score to the parent.
LNODE.log(node, 1, "From this current node.")
candidates[parent].content_score += content_score
LNODE.log(candidates[parent].node, 1, "Giving parent bonus points: " + str(candidates[parent].content_score))
# The grandparent gets half.
candidates[grand].content_score += content_score / 2.0
LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
LNODE.log(candidates[parent].node, 1, "Giving grand bonus points: " + str(candidates[grand].content_score))
for candidate in candidates.values():
candidate.content_score = candidate.content_score * (1 -
get_link_density(candidate.node))
for candidate in candidates.values():
LNODE.log(candidate.node, 1, "Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, (1 - get_link_density(candidate.node))))
candidate.content_score = candidate.content_score * (1 - get_link_density(candidate.node))
return candidates
@@ -138,7 +158,10 @@ class ScoredNode(object):
def __repr__(self):
"""Helpful representation of our Scored Node"""
return "{0:0.1F}\t{1}".format(self.content_score, self.node)
return "{0}: {1:0.1F}\t{2}".format(
self.hash_id,
self.content_score,
self.node)
def __init__(self, node):
"""Given node, set an initial score and weigh based on css and id"""
@@ -157,3 +180,14 @@ class ScoredNode(object):
content_score = -5
content_score += get_class_weight(node)
self.content_score = content_score
@property
def hash_id(self):
content = tounicode(self.node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception, e:
LOG.error("BOOM! " + str(e))
return hashed.hexdigest()[0:8]
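
The length-points fix above stops the score from being overwritten rather than
accumulated; a hedged sketch of the corrected step (the '//' is an assumption
drawn from the "every 100 characters, up to 3 points" comment, since the hunk
itself computes length_points with '% 100'):

    innertext = u'x' * 250  # illustrative paragraph text
    content_score = 0
    content_score += innertext.count(',')  # bonus point per comma

    # One point per full 100 characters, capped at 3; min() is equivalent to
    # the if/else in the hunk above.
    length_points = len(innertext) // 100  # assumption: '//' per the comment
    content_score += min(length_points, 3)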

File diff suppressed because one or more lines are too long