Add more debug logging to trace what the extractor did and why

pull/4/merge
Richard Harding 12 years ago
parent 00ba7e5164
commit 14bbe701eb

@ -8,10 +8,10 @@ from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import HTMLParser
from breadability.logconfig import LOG
from breadability.utils import cached_property
LOG = logging.getLogger(__name__)
utf8_parser = HTMLParser(encoding='utf-8')
@ -38,6 +38,7 @@ def get_encoding(page):
def replace_multi_br_to_paragraphs(html):
    """Collapse each run of two or more ``<br>`` tags into a paragraph break.

    Any sequence of at least two ``<br>`` tags (optionally separated by
    whitespace) is replaced with ``</p><p>``, turning br-separated text
    into real paragraphs.  Matching is case-insensitive.
    """
    LOG.debug('Replacing multiple <br/> to <p>')
    # Two-or-more <br> tags, each optionally followed by whitespace.
    return re.sub("(<br[^>]*>[ \n\r\t]*){2,}", '</p><p>', html, flags=re.I)
@ -81,6 +82,7 @@ class OriginalDocument(object):
# doc = html_cleaner.clean_html(doc)
base_href = self.url
if base_href:
LOG.debug('Making links absolute')
doc.make_links_absolute(base_href, resolve_base_href=True)
else:
doc.resolve_base_href()

@ -5,6 +5,8 @@ from lxml.etree import tostring
from lxml.html.clean import Cleaner
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from pprint import PrettyPrinter
from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.scoring import score_candidates
@ -30,8 +32,9 @@ def drop_tag(doc, *tags):
"""
for tag in tags:
found = doc.iterfind(".//" + tag)
if found:
[n.drop_tree for n in found]
for n in found:
LOG.debug("Dropping tag: " + str(n))
n.drop_tree()
return doc
@ -78,11 +81,11 @@ def transform_misused_divs_into_paragraphs(doc):
# We need to create a <p> and put all it's contents in there
# We'll just stringify it, then regex replace the first/last
# div bits to turn them into <p> vs <div>.
LOG.debug('Turning leaf <div> into <p>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
elem.getparent().replace(elem, fromstring(ended))
return doc
@ -129,6 +132,7 @@ def check_siblings(candidate_node, candidate_list):
append = True
if append:
LOG.debug('Sibling being appended' + str(sibling))
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
@ -152,12 +156,14 @@ def prep_article(doc):
"""
def clean_document(node):
"""Clean up the final document we return as the readable article"""
LOG.debug('Cleaning document')
clean_list = ['object', 'h1']
keep_keywords = ['youtube', 'blip.tv', 'vimeo']
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
if len(node.findall('.//h2')) == 1:
LOG.debug('Adding H2 to list of nodes to clean.')
clean_list.append('h2')
for n in node.iter():
@ -183,7 +189,10 @@ def prep_article(doc):
if not allow and key in node_str:
allow = True
if not allow:
LOG.debug('Dropping node: ' + str(n))
n.drop_tree()
# go on with next loop, this guy is gone
continue
if n.tag in ['h1', 'h2', 'h3', 'h4']:
# clean headings
@ -192,15 +201,22 @@ def prep_article(doc):
if get_class_weight(n) < 0 or get_link_density(n) > .33:
# for some reason we get nodes here without a parent
if n.getparent() is not None:
LOG.debug(
"Dropping <hX>, it's insignificant: " + str(n))
n.drop_tree()
# go on with next loop, this guy is gone
continue
# clean out extra <p>
if n.tag == 'p':
# if the p has no children and has no content...well then down
# with it.
if not n.getchildren() and len(n.text_content()) < 5:
LOG.debug('Dropping extra <p>: ' + str(n))
n.drop_tree()
# go on with next loop, this guy is gone
continue
return node
def clean_conditionally(doc, clean_el):
@ -222,6 +238,7 @@ def find_candidates(doc):
for node in doc.iter():
if is_unlikely_node(node):
LOG.debug('Dropping unlikely: ' + str(node))
node.drop_tree()
elif node.tag in scorable_node_tags:
nodes_to_score.append(node)
@ -252,6 +269,10 @@ class Article(object):
candidates = find_candidates(doc)
if candidates:
LOG.debug('Candidates found:')
pp = PrettyPrinter(indent=2)
LOG.debug(pp.pformat(candidates))
# right now we return the highest scoring candidate content
by_score = sorted([c for c in candidates.values()],
key=attrgetter('content_score'), reverse=True)
@ -259,10 +280,14 @@ class Article(object):
# since we have several candidates, check the winner's siblings
# for extra content
winner = by_score[0]
LOG.debug('Selected winning node: ' + str(winner))
updated_winner = check_siblings(winner, candidates)
LOG.debug('Begin final prep of article')
updated_winner.node = prep_article(updated_winner.node)
doc = build_base_document(updated_winner.node)
else:
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
doc = prep_article(doc)
doc = build_base_document(doc)

@ -138,6 +138,10 @@ class ScoredNode(object):
"""
__slots__ = ['node', 'content_score']
def __repr__(self):
"""Helpful representation of our Scored Node"""
return "{0:0.1F}\t{1}".format(self.content_score, self.node)
def __init__(self, node):
"""Given node, set an initial score and weigh based on css and id"""
self.node = node

Loading…
Cancel
Save