pull/4/merge
Richard Harding 12 years ago
parent 8e96cb7844
commit e843940549

@ -7,8 +7,7 @@ from breadability.document import OriginalDocument
from breadability.utils import cached_property
RegexList = namedtuple('RegexList',
['unlikely', 'maybe', 'positive', 'negative'])
RegexList = namedtuple('RegexList', ['unlikely', 'maybe'])
READABLERE = RegexList(
@ -17,10 +16,9 @@ READABLERE = RegexList(
'remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination'
'|pager|popup|tweet|twitter', re.I)),
maybe=(re.compile('and|article|body|column|main|shadow', re.I)),
positive=(),
negative=()
)
CLS_WEIGHT_POSITIVE = set(['article', 'body', 'content', 'entry', 'hentry',
'main', 'page', 'pagination', 'post', 'text', 'blog', 'story'])
CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
@ -30,6 +28,11 @@ CLS_WEIGHT_NEGATIVE = set(['combx', 'comment', 'com-', 'contact', 'foot',
def drop_tag(doc, *tags):
    """Helper to just remove any nodes that match the html tags passed in.

    Every descendant of ``doc`` found via ``doc.iterfind(".//" + tag)`` has
    its ``drop_tree()`` method called.

    :param doc: parsed document/element to search; must provide ``iterfind``
    :param *tags: one or more html tag strings to remove e.g. style, script
    :returns: the same ``doc`` object that was passed in

    """
    for tag in tags:
        # Snapshot the matches first so nothing is dropped while the lazy
        # ``iterfind`` iterator is still walking the tree.
        for node in list(doc.iterfind(".//" + tag)):
            node.drop_tree()
    return doc
@ -38,7 +41,7 @@ def drop_tag(doc, *tags):
def build_base_document(html):
"""Return a base document with the body as root.
html should be a parsed Element object.
:param html: Parsed Element object
"""
found_body = html.find('.//body')
@ -52,6 +55,7 @@ def build_base_document(html):
found_body.set('id', 'readabilityBody')
return html
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@ -81,6 +85,7 @@ def transform_misused_divs_into_paragraphs(doc):
###### SCORING
def get_class_weight(node):
"""Get an elements class/id weight.
@ -113,7 +118,7 @@ def score_candidates(nodes):
for node in nodes:
content_score = 0
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
grand = parent.getparent() if parent is not None else None
innertext = node.text
if parent is None or grand is None:
@ -132,7 +137,7 @@ def score_candidates(nodes):
candidates[grand] = CandidateNode(grand)
# Add a point for the paragraph itself as a base.
content_score += 1;
content_score += 1
# Add points for any commas within this paragraph
content_score += innertext.count(',') if innertext else 0

Loading…
Cancel
Save