diff --git a/dumpgenerator.py b/dumpgenerator.py
index 83854b0..d7e9c6d 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -62,14 +62,12 @@ def cleanHTML(raw=''):
raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
else:
print raw[:250]
- print 'This wiki doesn\'t use marks to split contain'
+ print 'This wiki doesn\'t use marks to split content'
sys.exit()
return raw
def getNamespaces(config={}):
- """ """
- #fix get namespaces from a random Special:Export page, it is better
- #too from API http://wikiindex.org/api.php?action=query&meta=siteinfo&siprop=general|namespaces
+ """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages. Function called if no API is available. """
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
@@ -99,14 +97,45 @@ def getNamespaces(config={}):
namespaces = [i for i in set(namespaces)] #uniques
print '%d namespaces found' % (len(namespaces))
return namespaces, namespacenames
+
+def getNamespacesAPI(config={}):
+ """ Uses the API to get the list of namespaces names and ids """
+ namespaces = config['namespaces']
+ namespacenames = {0:''} # main is 0, no prefix
+ if namespaces:
+ req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'xml'}), headers={'User-Agent': getUserAgent()})
+ f = urllib2.urlopen(req)
+ raw = f.read()
+ f.close()
+
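+ # the siteinfo XML lists namespaces in lines like, e.g., <ns id="4" case="first-letter" canonical="Project">Project</ns> (illustrative values)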
+ m = re.compile(r'<ns id="(?P<namespaceid>\d+)"[^>]*?/?>(?P<namespacename>[^<]+)?(</ns>)?').finditer(raw) # [^>]*? to include case="first-letter" canonical= etc.
+ if 'all' in namespaces:
+ namespaces = []
+ for i in m:
+ namespaces.append(int(i.group("namespaceid")))
+ namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
+ else:
+ #check if those namespaces really exist in this wiki
+ namespaces2 = []
+ for i in m:
+ if int(i.group("namespaceid")) in namespaces:
+ namespaces2.append(int(i.group("namespaceid")))
+ namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
+ namespaces = namespaces2
+ else:
+ namespaces = [0]
+
+ namespaces = [i for i in set(namespaces)] #uniques
+ print '%d namespaces found' % (len(namespaces))
+ return namespaces, namespacenames
def getPageTitlesAPI(config={}):
- """ """
+ """ Uses the API to get the list of page titles """
titles = []
- namespaces, namespacenames = getNamespaces(config=config)
+ namespaces, namespacenames = getNamespacesAPI(config=config)
for namespace in namespaces:
if namespace in config['exnamespaces']:
- print ' Skiping namespace =', namespace
+ print ' Skipping namespace =', namespace
continue
c = 0
@@ -456,7 +485,7 @@ def getImageFilenamesURL(config={}):
f = urllib2.urlopen(req)
raw = f.read()
f.close()
- if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicated wiki
+ if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
if limit > 10:
print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
limit = limit/10
@@ -520,6 +549,58 @@ def getImageFilenamesURL(config={}):
images.sort()
return images
+def getImageFilenamesURLAPI(config={}):
+ """ Retrieve file list: filename, url, uploader """
+ print 'Retrieving image filenames'
+ headers = {'User-Agent': getUserAgent()}
+ aifrom = '!'
+ images = []
+ while aifrom:
+ sys.stderr.write('.') #progress
+ params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'xml', 'ailimit': 500}
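+ # aiprop requests the file URL and uploader; ailimit=500 is typically the largest batch size allowed for non-bot accounts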
+ data = urllib.urlencode(params)
+ req = urllib2.Request(url=config['api'], data=data, headers=headers)
+ try:
+ f = urllib2.urlopen(req)
+ except:
+ try:
+ print 'Server is slow... Waiting some seconds and retrying...'
+ time.sleep(10)
+ f = urllib2.urlopen(req)
+ except:
+ print 'An error has occurred while retrieving image filenames with API'
+ print 'Please resume the dump with --resume'
+ sys.exit()
+ xml = f.read()
+ f.close()
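+ # when more results remain, the response carries a continuation element such as <allimages aifrom="Next title" /> inside <query-continue>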
+ m = re.findall(r'<allimages aifrom="([^>]+)" />', xml)
+ if m:
+ aifrom = undoHTMLEntities(text=m[0]) #&quot; = ", etc
+ else:
+ aifrom = ''
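+ # each result line looks roughly like: <img name="Foo.png" ... user="SomeUser" ... url="http://wiki.example.org/images/a/ab/Foo.png" ... descriptionurl="..." /> (illustrative values)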
+ m = re.compile(r'(?im)<img name="(?P<filename>[^"]+)"[^>]*user="(?P<uploader>[^"]+)"[^>]* url="(?P<url>[^"]+)"[^>]*/>').finditer(xml) # Retrieves a filename, uploader, url triple from the name, user and url fields of the XML line; the space before url is needed to avoid matching the descriptionurl field instead.
+ for i in m:
+ url = i.group('url')
+ if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
+ if url[0] == '/': #slash is added later
+ url = url[1:]
+ domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
+ url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
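+ # e.g. with index 'http://wiki.example.org/index.php' and relative url 'images/a/ab/Foo.png', this yields 'http://wiki.example.org/images/a/ab/Foo.png' (hypothetical values)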
+ url = undoHTMLEntities(text=url)
+ #url = urllib.unquote(url) #do not use unquote with url, it breaks some urls with odd chars
+ url = re.sub(' ', '_', url)
+ filename = re.sub('_', ' ', i.group('filename'))
+ filename = undoHTMLEntities(text=filename)
+ filename = urllib.unquote(filename)
+ uploader = re.sub('_', ' ', i.group('uploader'))
+ uploader = undoHTMLEntities(text=uploader)
+ uploader = urllib.unquote(uploader)
+ images.append([filename, url, uploader])
+
+ print ' Found %d images' % (len(images))
+ images.sort()
+ return images
+
def undoHTMLEntities(text=''):
""" """
text = re.sub('&lt;', '<', text) # i guess only < > & " need conversion http://www.w3schools.com/html/html_entities.asp
@@ -951,7 +1032,10 @@ def main(params=[]):
else:
print 'Image list is incomplete. Reloading...'
#do not resume, reload, to avoid inconsistences, deleted images or so
- images = getImageFilenamesURL(config=config)
+ if config['api']:
+ images = getImageFilenamesURLAPI(config=config)
+ else:
+ images = getImageFilenamesURL(config=config)
saveImageFilenamesURL(config=config, images=images)
#checking images directory
listdir = []
@@ -991,7 +1075,10 @@ def main(params=[]):
saveTitles(config=config, titles=titles)
generateXMLDump(config=config, titles=titles)
if config['images']:
- images += getImageFilenamesURL(config=config)
+ if config['api']:
+ images += getImageFilenamesURLAPI(config=config)
+ else:
+ images += getImageFilenamesURL(config=config)
saveImageFilenamesURL(config=config, images=images)
generateImageDump(config=config, other=other, images=images)
if config['logs']:
@@ -1028,4 +1115,4 @@ def main(params=[]):
bye()
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file