s
- for el in self.reverse_tags(node, "table", "ul", "div"):
- if el in allowed:
- continue
- weight = self.class_weight(el)
- if el in candidates:
- content_score = candidates[el]['content_score']
- #print '!',el, '-> %6.3f' % content_score
- else:
- content_score = 0
- tag = el.tag
-
- if weight + content_score < 0:
- self.debug("Cleaned %s with score %6.3f and weight %-3s" %
- (describe(el), content_score, weight, ))
- el.drop_tree()
- elif el.text_content().count(",") < 10:
- counts = {}
- for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
- counts[kind] = len(el.findall('.//%s' %kind))
- counts["li"] -= 100
-
- content_length = text_length(el) # Count the text length excluding any surrounding whitespace
- link_density = self.get_link_density(el)
- parent_node = el.getparent()
- if parent_node is not None:
- if parent_node in candidates:
- content_score = candidates[parent_node]['content_score']
- else:
- content_score = 0
- #if parent_node is not None:
- #pweight = self.class_weight(parent_node) + content_score
- #pname = describe(parent_node)
- #else:
- #pweight = 0
- #pname = "no parent"
- to_remove = False
- reason = ""
-
- #if el.tag == 'div' and counts["img"] >= 1:
- # continue
- if counts["p"] and counts["img"] > counts["p"]:
- reason = "too many images (%s)" % counts["img"]
- to_remove = True
- elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
- reason = "more
s than s"
- to_remove = True
- elif counts["input"] > (counts["p"] / 3):
- reason = "less than 3x
s than s"
- to_remove = True
- elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
- reason = "too short content length %s without a single image" % content_length
- to_remove = True
- elif weight < 25 and link_density > 0.2:
- reason = "too many links %.3f for its weight %s" % (link_density, weight)
- to_remove = True
- elif weight >= 25 and link_density > 0.5:
- reason = "too many links %.3f for its weight %s" % (link_density, weight)
- to_remove = True
- elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
- reason = "