From c8ca525ff70a8416fb401adb4da25c30c133b555 Mon Sep 17 00:00:00 2001 From: emijrp Date: Mon, 6 Aug 2012 14:54:31 +0000 Subject: [PATCH] just adding documentation and comments git-svn-id: https://wikiteam.googlecode.com/svn/trunk@743 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- dumpgenerator.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index d0c510f..93f79ca 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -47,11 +47,9 @@ def delay(config={}): time.sleep(config['delay']) def cleanHTML(raw=''): - """ """ - # - # - # - # + """ Extract only the real wiki content and remove rubbish. This function is only used to retrieve page titles and file names when no API is available """ + """ DO NOT use this function to extract page content """ + #different "tags" used by different MediaWiki versions to mark where starts and ends content if re.search('', raw): raw = raw.split('')[1].split('')[0] elif re.search('', raw): @@ -95,7 +93,6 @@ def getNamespaces(config={}): else: namespaces = [0] - #retrieve all titles from Special:Allpages, if the wiki is big, perhaps there are sub-Allpages to explore namespaces = [i for i in set(namespaces)] #uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames