just adding documentation and comments

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@743 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
emijrp 12 years ago
parent d616bcc2e6
commit c8ca525ff7

@ -47,11 +47,9 @@ def delay(config={}):
time.sleep(config['delay']) time.sleep(config['delay'])
def cleanHTML(raw=''): def cleanHTML(raw=''):
""" """ """ Extract only the real wiki content and remove rubbish. This function is only used to retrieve page titles and file names when no API is available """
#<!-- bodytext --> <!-- /bodytext --> """ DO NOT use this function to extract page content """
#<!-- start content --> <!-- end content --> #different "tags" used by different MediaWiki versions to mark where content starts and ends
#<!-- Begin Content Area --> <!-- End Content Area -->
#<!-- content --> <!-- mw_content -->
if re.search('<!-- bodytext -->', raw): if re.search('<!-- bodytext -->', raw):
raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0] raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
elif re.search('<!-- start content -->', raw): elif re.search('<!-- start content -->', raw):
@ -95,7 +93,6 @@ def getNamespaces(config={}):
else: else:
namespaces = [0] namespaces = [0]
#retrieve all titles from Special:Allpages, if the wiki is big, perhaps there are sub-Allpages to explore
namespaces = [i for i in set(namespaces)] #uniques namespaces = [i for i in set(namespaces)] #uniques
print '%d namespaces found' % (len(namespaces)) print '%d namespaces found' % (len(namespaces))
return namespaces, namespacenames return namespaces, namespacenames

Loading…
Cancel
Save