54 lines
1.3 KiB
Python
54 lines
1.3 KiB
Python
import re
|
|
|
|
|
|
# FIXME: use with caution, can leak memory.
# `uids` maps every node that has been given a sequence number to that number,
# and `uids_document` remembers the document root those numbers belong to.
# Both hold strong references; describe() only replaces them when it is called
# on a node that belongs to a different document.
uids = {}
uids_document = None


def describe_node(node):
    """Return a short, CSS-selector-like label for a single node.

    The label is built as ``tag#id.class1.class2``. A leading ``div`` is
    dropped when an id or class follows it. Bare ``tr``, ``td``, ``div`` and
    ``p`` nodes (no id, no class) get a ``{NN}`` suffix: a sequence number
    stored in the module-level ``uids`` mapping, so that the same node always
    gets the same number.

    :param node: an element-like object exposing ``tag`` and ``get()``,
        or ``None``, or any other object.
    :returns: ``''`` for ``None``; ``"[<type>]"`` for objects without a
        ``tag`` attribute or whose ``tag`` is not a string; otherwise the
        label described above.
    """
    if node is None:
        return ''
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)

    tag = node.tag
    # Some ElementTree-style nodes (e.g. comments) carry a non-string tag;
    # slicing it below would raise, so fall back to the type-based label.
    # (str, type(u'')) covers the text types on both Python 2 and Python 3.
    if not isinstance(tag, (str, type(u''))):
        return "[%s]" % type(node)

    name = tag

    node_id = node.get('id', '')
    if node_id:
        name += '#' + node_id

    # split() with no argument discards all surrounding/repeated whitespace,
    # so a blank class attribute yields an empty list and adds nothing.
    classes = (node.get('class') or '').split()
    if classes:
        name += '.' + '.'.join(classes)

    # "div#main" / "div.content" read just as well as "#main" / ".content".
    if name.startswith(('div#', 'div.')):
        name = name[3:]

    # Anonymous structural nodes are told apart by a per-document number.
    if name in ('tr', 'td', 'div', 'p'):
        uid = uids.setdefault(node, len(uids) + 1)
        name += "{%02d}" % uid

    return name
|
|
|
|
|
|
def describe(node, depth=1):
    """Return a label for ``node`` prefixed by up to ``depth`` ancestors.

    Labels come from ``describe_node`` and are joined with ``'>'``, outermost
    ancestor first (e.g. ``parent>child``). When ``node`` belongs to a
    different document root than the previous call, the module-level ``uids``
    numbering is reset and ``uids_document`` is updated to the new root.

    :param node: a node exposing ``getroottree()`` and ``getparent()``;
        ``None`` and objects without ``getroottree`` are accepted as well.
    :param depth: how many ancestor levels to include; ``0`` describes only
        ``node`` itself.
    :returns: the description string; ``''`` when ``node`` is ``None``.
    """
    global uids, uids_document

    # describe_node() tolerates None and non-element objects; do the same
    # here instead of failing on the getroottree() lookup.
    if node is None:
        return ''
    if not hasattr(node, 'getroottree'):
        return describe_node(node)

    root = node.getroottree().getroot()
    if root is not uids_document:
        # New document: start numbering anonymous nodes from scratch.
        uids = {}
        uids_document = root

    parent = node.getparent()
    if depth and parent is not None:
        # The ancestor is described first so it is numbered before the child.
        return describe(parent, depth=depth - 1) + '>' + describe_node(node)
    return describe_node(node)
|
|
|
|
|
|
# Matches any run of whitespace. Raw string so that "\s" reaches the regex
# engine as written instead of being an invalid string escape.
RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)


def text_content(elem, length=40):
    """Return the text of ``elem`` on one line, truncated to ``length``.

    Carriage returns are removed and every run of whitespace is collapsed to
    a single space. Text longer than ``length`` characters is cut to
    ``length`` characters and ``'...'`` is appended; text that fits
    (including text of exactly ``length`` characters) is returned unchanged.

    :param elem: object exposing a ``text_content()`` method returning a
        string.
    :param length: maximum number of characters kept before the ellipsis.
    :returns: the normalised, possibly truncated text.
    """
    raw = elem.text_content().replace('\r', '')
    content = RE_COLLAPSE_WHITESPACES.sub(' ', raw)
    # "<=": only add the ellipsis when something was actually cut off.
    if len(content) <= length:
        return content
    return content[:length] + '...'
|
|
|
|
|