python-readability/readability/debug.py
2015-07-27 11:59:17 +06:00

54 lines
1.3 KiB
Python

import re
# FIXME: use with caution, can leak memory -- `uids` holds a reference to every
# node that has been numbered, and `uids_document` holds the root of the last
# described document, until describe() is called on a different document.
#
# Maps a node to the sequential number describe_node() assigned to it; used to
# tell apart otherwise identical labels such as plain 'div' or 'p'.
uids = {}
# Root element of the document the numbers in `uids` belong to; describe()
# replaces both globals when it sees a node from another document.
uids_document = None
def describe_node(node):
    """Return a short, CSS-selector-like label for a single node.

    The label is ``tag#id.class1.class2``.  A leading ``div`` is dropped when
    an id or class follows it (``div#main`` becomes ``#main``).  Labels that
    end up as a bare ``tr``, ``td``, ``div`` or ``p`` get a ``{NN}`` suffix
    taken from the module-level ``uids`` mapping, so that otherwise identical
    nodes can be told apart; a node seen for the first time is assigned the
    next free number.

    :param node: the node to describe; expected to expose ``tag`` and
        ``get(name, default)``.
    :returns: ``''`` for ``None``, ``'[<type>]'`` for objects that are not
        regular elements, otherwise the label described above.
    """
    if node is None:
        return ''
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)

    name = node.tag
    if callable(name):
        # Comments, processing instructions and entities (as exposed by lxml)
        # carry a factory callable instead of a string tag and have no
        # attributes; describe them by type rather than crashing below.
        return "[%s]" % type(node)

    node_id = node.get('id', '')
    if node_id:
        name += '#' + node_id

    classes = node.get('class', '').split()
    if classes:
        name += '.' + '.'.join(classes)

    # "div" is implied when an id or class is present, so keep labels short.
    if name.startswith(('div#', 'div.')):
        name = name[3:]

    # Bare structural tags are ambiguous: number them per document.
    if name in ('tr', 'td', 'div', 'p'):
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        name += "{%02d}" % uid
    return name
def describe(node, depth=1):
    """Return the label of ``node`` prefixed by the labels of its ancestors.

    Labels come from describe_node() and are joined with ``'>'``, outermost
    ancestor first.  ``depth`` is the number of ancestor levels to include;
    ``0`` yields the node's own label only.

    As a side effect, the module-level numbering state (``uids`` and
    ``uids_document``) is reset whenever ``node`` belongs to a different
    document root than the one described previously.
    """
    global uids, uids_document

    root = node.getroottree().getroot()
    if root != uids_document:
        # New document: forget the numbers handed out for the previous one.
        uids_document = root
        uids = {}

    if depth:
        ancestor = node.getparent()
        if ancestor is not None:
            # Describe the ancestor chain first so ancestors are numbered
            # before the node itself.
            prefix = describe(ancestor, depth=depth - 1) + '>'
            return prefix + describe_node(node)
    return '' + describe_node(node)
# Any run of whitespace (Unicode-aware); used to squeeze text onto one line.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)


def text_content(elem, length=40):
    """Return a one-line preview of the text inside ``elem``.

    Carriage returns are removed and every run of whitespace is collapsed to
    a single space.  Text longer than ``length`` characters is cut to
    ``length`` characters and marked with a trailing ``'...'``; text that
    fits is returned unchanged.

    :param elem: object exposing a ``text_content()`` method returning a string.
    :param length: maximum number of characters kept before truncation.
    :returns: the collapsed, possibly truncated text.
    """
    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
    # "<=" so that text of exactly ``length`` characters, which is not
    # truncated at all, does not get a misleading ellipsis.
    if len(content) <= length:
        return content
    return content[:length] + '...'