Update scoring and tests for the antipope article

pull/11/head
Richard Harding 12 years ago
parent 3f70a49a22
commit d3c83b7255

@ -1,4 +1,5 @@
import re
from lxml.etree import Element
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
@ -11,6 +12,7 @@ from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import generate_hash_id
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
@ -206,7 +208,8 @@ def prep_article(doc):
LOG.debug('Adding H2 to list of nodes to clean.')
clean_list.append('h2')
for n in node.iter():
for n in node.iter(tag=Element):
LNODE.log(n, 2, "Cleaning iter node")
# clean out any incline style properties
if 'style' in n.attrib:
n.set('style', '')
@ -265,6 +268,10 @@ def prep_article(doc):
"""Remove the clean_el if it looks like bad content based on rules."""
target_tags = ['form', 'table', 'ul', 'div', 'p']
LNODE.log(node, 2, 'Cleaning conditionally node.')
if generate_hash_id(node) == '6d63f9d5':
import ipdb;from pprint import pprint; ipdb.set_trace()
if node.tag not in target_tags:
# this is not the tag you're looking for
return
@ -411,6 +418,9 @@ class Article(object):
LOG.debug('Candidates found:')
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop]
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
key=attrgetter('content_score'), reverse=True)

@ -28,6 +28,22 @@ def check_node_attr(node, attr, checkset):
return False
def generate_hash_id(node):
    """Generate a hash_id for the node in question.

    The node is serialized with ``tounicode``, encoded as UTF-8 and fed to
    md5; the first 8 hex characters of the digest are used as the id.

    :param node: lxml etree node
    :returns: 8 character hex string

    """
    content = tounicode(node)
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', errors="replace"))
    except Exception as e:
        # Best effort: a failure here is logged, not raised, so the returned
        # id is then the digest prefix of whatever was fed to md5 (nothing).
        LOG.error("BOOM! " + str(e))
    return hashed.hexdigest()[0:8]
def get_link_density(node, node_text=None):
"""Generate a value for the number of links in the node.
@ -206,11 +222,4 @@ class ScoredNode(object):
@property
def hash_id(self):
    """Return the short hash id for this scored node's underlying node.

    Delegates to the module-level ``generate_hash_id`` helper so the
    hashing logic lives in exactly one place.

    """
    return generate_hash_id(self.node)

Loading…
Cancel
Save