Garden and lint

pull/11/head
Richard Harding 12 years ago
parent 6d380712c5
commit e83a753b82

@ -22,8 +22,6 @@ except ImportError:
LOGLEVEL = "WARNING"
# Logging bits stolen and adapted from:
# http://www.tornadoweb.org/documentation/_modules/tornado/options.html
LogOptions = namedtuple('LogOptions', [
@ -43,7 +41,6 @@ options = LogOptions(
)
def set_logging_level(level):
"""Adjust the current logging level.
@ -120,8 +117,8 @@ class LogHelper(object):
hashed = md5()
try:
hashed.update(content.encode('utf-8', errors="replace"))
except Exception, e:
LOG.error("Cannot hash the current node.")
except Exception, exc:
LOG.error("Cannot hash the current node." + str(exc))
hash_id = hashed.hexdigest()[0:8]
# if hash_id in ['9c880b27', '8393b7d7', '69bfebdd']:
print(u"{0} :: {1}\n{2}".format(

@ -51,11 +51,10 @@ def drop_tag(doc, *tags):
return doc
def ok_embedded_video(node):
"""Check if this embed/video is an ok one to count."""
keep_keywords = ['youtube', 'blip.tv', 'vimeo']
node_str = tounicode(n)
node_str = tounicode(node)
for key in keep_keywords:
if key in node_str:
return True
@ -305,7 +304,8 @@ def prep_article(doc):
# we could get around this by checking for caption info in the
# images to try to do some scoring of good v. bad images.
# failing example:
# arstechnica.com/science/news/2012/05/1859s-great-auroral-stormthe-week-the-sun-touched-the-earth.ars
# arstechnica.com/science/news/2012/05/1859s
# -great-auroral-stormthe-week-the-sun-touched-the-earth.ars
LNODE.log(node, 2, 'Conditional drop: img > p')
remove_node = True
elif li > p and node.tag != 'ul' and node.tag != 'ol':
@ -315,16 +315,20 @@ def prep_article(doc):
LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
remove_node = True
elif content_length < 25 and (img == 0 or img > 2):
LNODE.log(node, 2, 'Conditional drop: len < 25 and 0/>2 images')
LNODE.log(node, 2,
'Conditional drop: len < 25 and 0/>2 images')
remove_node = True
elif weight < 25 and link_density > 0.2:
LNODE.log(node, 2, 'Conditional drop: weight small and link is dense')
LNODE.log(node, 2,
'Conditional drop: weight small and link is dense')
remove_node = True
elif weight >= 25 and link_density > 0.5:
LNODE.log(node, 2, 'Conditional drop: weight big but link heavy')
LNODE.log(node, 2,
'Conditional drop: weight big but link heavy')
remove_node = True
elif (embed == 1 and content_length < 75) or embed > 1:
LNODE.log(node, 2, 'Conditional drop: embed without much content or many embed')
LNODE.log(node, 2,
'Conditional drop: embed w/o much content or many embed')
remove_node = True
return remove_node

@ -37,7 +37,8 @@ def get_link_density(node, node_text=None):
:returns float:
"""
link_length = sum([len(a.text_content()) or 0 for a in node.findall(".//a")])
link_length = sum([len(a.text_content()) or 0
for a in node.findall(".//a")])
if node_text:
text_length = len(node_text)
else:
@ -98,12 +99,16 @@ def score_candidates(nodes):
innertext = node.text_content()
if parent is None or grand is None:
LNODE.log(node, 1, "Skipping candidate because parent/grand are none")
LNODE.log(
node, 1,
"Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LNODE.log(node, 1, "Skipping candidate because not enough content.")
LNODE.log(
node, 1,
"Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
@ -128,21 +133,36 @@ def score_candidates(nodes):
content_score += 3
else:
content_score += length_points
LNODE.log(node, 1, "Length/content points: {0} : {1}".format(length_points, content_score))
LNODE.log(
node, 1,
"Length/content points: {0} : {1}".format(length_points,
content_score))
# Add the score to the parent.
LNODE.log(node, 1, "From this current node.")
candidates[parent].content_score += content_score
LNODE.log(candidates[parent].node, 1, "Giving parent bonus points: " + str(candidates[parent].content_score))
LNODE.log(
candidates[parent].node,
1,
"Giving parent bonus points: " + str(
candidates[parent].content_score))
# The grandparent gets half.
LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
LNODE.log(candidates[parent].node, 1, "Giving grand bonus points: " + str(candidates[grand].content_score))
LNODE.log(
candidates[parent].node,
1,
"Giving grand bonus points: " + str(
candidates[grand].content_score))
for candidate in candidates.values():
LNODE.log(candidate.node, 1, "Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, (1 - get_link_density(candidate.node))))
candidate.content_score = candidate.content_score * (1 - get_link_density(candidate.node))
adjustment = 1 - get_link_density(candidate.node)
LNODE.log(
candidate.node,
1,
"Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, adjustment))
candidate.content_score = candidate.content_score * (adjustment)
return candidates

@ -1,12 +1,11 @@
import time
#
# © 2011 Christopher Arndt, MIT License
#
class cached_property(object):
'''Decorator for read-only properties evaluated only once within TTL period.
'''Decorator for read-only properties evaluated only once within TTL
period.
It can be used to create a cached property like this::
@ -15,8 +14,7 @@ class cached_property(object):
# the class containing the property must be a new-style class
class MyClass(object):
# create property whose value is cached for ten minutes
@cached_property(ttl=600)
def randint(self):
@cached_property(ttl=600) def randint(self):
# will only be evaluated every 10 min. at maximum.
return random.randint(0, 100)
@ -32,7 +30,7 @@ class cached_property(object):
zero for the cached value to never expire.
To expire a cached property value manually just do::
del instance._cache[<property name>]
'''

Loading…
Cancel
Save