From a8479f9936c6c9fa6ca007814f64ba869a777835 Mon Sep 17 00:00:00 2001
From: nemobis <federicoleva@tiscali.it>
Date: Sun, 20 May 2012 08:08:28 +0000
Subject: [PATCH] Get namespaces and images info from the API. The code is
 badly copied around; we should probably use a module such as
 http://packages.python.org/simplemediawiki/ to talk to the API. Tested,
 seems to work. Fixed some typos.

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@668 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
---
 dumpgenerator.py | 109 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 98 insertions(+), 11 deletions(-)
diff --git a/dumpgenerator.py b/dumpgenerator.py
index 83854b0..d7e9c6d 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -62,14 +62,12 @@ def cleanHTML(raw=''):
         raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
     else:
         print raw[:250]
-        print 'This wiki doesn\'t use marks to split contain'
+        print 'This wiki doesn\'t use marks to split content'
         sys.exit()
     return raw
 
 def getNamespaces(config={}):
-    """  """
-    #fix get namespaces from a random Special:Export page, it is better
-    #too from API http://wikiindex.org/api.php?action=query&meta=siteinfo&siprop=general|namespaces
+    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages. Function called if no API is available. """
     namespaces = config['namespaces']
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
@@ -99,14 +97,45 @@ def getNamespaces(config={}):
     namespaces = [i for i in set(namespaces)] #uniques
     print '%d namespaces found' % (len(namespaces))
     return namespaces, namespacenames
+    
+def getNamespacesAPI(config={}):
+    """ Uses the API (action=query, meta=siteinfo, siprop=namespaces) to get the list of namespace ids and names. config['namespaces'] holds the requested ids or 'all'; if it is empty only the main namespace (0) is used. Returns (list of unique ids, dict id -> name). """
+    namespaces = config['namespaces']
+    namespacenames = {0:''} # main is 0, no prefix
+    if namespaces:
+        # ask config['api'] for the namespaces of the wiki, as XML
+        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'xml'}), headers={'User-Agent': getUserAgent()})
+        f = urllib2.urlopen(req)
+        try:
+            raw = f.read()
+        finally:
+            f.close() # release the connection even if the read fails
+
+        m = re.compile(r'<ns id="(?P<namespaceid>\d+)"[^>]*?/?>(?P<namespacename>[^<]+)?(</ns>)?').finditer(raw) # [^>]*? to include case="first-letter" canonical= etc.
+        dumpall = 'all' in namespaces
+        found = []
+        for i in m:
+            nsid = int(i.group("namespaceid"))
+            # with an explicit list, keep only the requested namespaces that really exist in this wiki
+            if dumpall or nsid in namespaces:
+                found.append(nsid)
+                # the name group is None when the <ns> element has no text; store '' (no prefix) instead of None
+                namespacenames[nsid] = i.group("namespacename") or ''
+        namespaces = found
+    else:
+        namespaces = [0]
+
+    namespaces = list(set(namespaces)) #uniques
+    print '%d namespaces found' % (len(namespaces))
+    return namespaces, namespacenames
 
 def getPageTitlesAPI(config={}):
-    """  """
+    """ Uses the API to get the list of page titles """
     titles = []
-    namespaces, namespacenames = getNamespaces(config=config)
+    namespaces, namespacenames = getNamespacesAPI(config=config)
     for namespace in namespaces:
         if namespace in config['exnamespaces']:
-            print '    Skiping namespace =', namespace
+            print '    Skipping namespace =', namespace
             continue
         
         c = 0
@@ -456,7 +485,7 @@ def getImageFilenamesURL(config={}):
         f = urllib2.urlopen(req)
         raw = f.read()
         f.close()
-        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicated wiki
+        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
             if limit > 10:
                 print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                 limit = limit/10
@@ -520,6 +549,58 @@ def getImageFilenamesURL(config={}):
     images.sort()
     return images
 
+def getImageFilenamesURLAPI(config={}):
+    """ Retrieve file list via the API (list=allimages): returns a sorted list of [filename, url, uploader]. Relative urls are completed with the scheme and domain of config['index']. Exits if a request fails twice in a row. """
+    print 'Retrieving image filenames'
+    headers = {'User-Agent': getUserAgent()}
+    imgregex = re.compile(r'(?im)<img name="(?P<filename>[^"]+)"[^>]*user="(?P<uploader>[^"]+)"[^>]* url="(?P<url>[^"]+)"[^>]*/>') # Compiled once, outside the loop. Retrieves a filename, uploader, url triple from the name, user, url field of the xml line; space before url needed to avoid getting the descriptionurl field instead.
+    aifrom = '!'
+    images = []
+    while aifrom:
+        sys.stderr.write('.') #progress
+        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'xml', 'ailimit': 500}
+        data = urllib.urlencode(params)
+        req = urllib2.Request(url=config['api'], data=data, headers=headers)
+        try:
+            f = urllib2.urlopen(req)
+        except Exception: # not a bare except, so Ctrl-C (KeyboardInterrupt) is not mistaken for a slow server
+            try:
+                print 'Server is slow... Waiting some seconds and retrying...'
+                time.sleep(10)
+                f = urllib2.urlopen(req)
+            except Exception:
+                print 'An error has occurred while retrieving image filenames with API'
+                print 'Please, resume the dump, --resume'
+                sys.exit()
+        xml = f.read()
+        f.close()
+        m = re.findall(r'<allimages aifrom="([^>]+)" />', xml) # continuation marker: where the next chunk starts
+        if m:
+            aifrom = undoHTMLEntities(text=m[0]) #&quot; = ", etc
+        else:
+            aifrom = '' # no continuation marker found, so this was the last chunk
+        for i in imgregex.finditer(xml):
+            url = i.group('url')
+            if not (url.startswith('http://') or url.startswith('https://')): #is it a relative URL?
+                if url[0] == '/': #slash is added later
+                    url = url[1:]
+                domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
+                url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
+            url = undoHTMLEntities(text=url)
+            #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
+            url = re.sub(' ', '_', url)
+            filename = re.sub('_', ' ', i.group('filename'))
+            filename = undoHTMLEntities(text=filename)
+            filename = urllib.unquote(filename)
+            uploader = re.sub('_', ' ', i.group('uploader'))
+            uploader = undoHTMLEntities(text=uploader)
+            uploader = urllib.unquote(uploader)
+            images.append([filename, url, uploader])
+
+    print '    Found %d images' % (len(images))
+    images.sort()
+    return images
+
 def undoHTMLEntities(text=''):
     """  """
     text = re.sub('&lt;', '<', text) # i guess only < > & " need conversion http://www.w3schools.com/html/html_entities.asp
@@ -951,7 +1032,10 @@ def main(params=[]):
             else:
                 print 'Image list is incomplete. Reloading...'
                 #do not resume, reload, to avoid inconsistences, deleted images or so
-                images = getImageFilenamesURL(config=config)
+                if config['api']:
+                    images=getImageFilenamesURLAPI(config=config)
+                else:
+                    images = getImageFilenamesURL(config=config)
                 saveImageFilenamesURL(config=config, images=images)
             #checking images directory
             listdir = []
@@ -991,7 +1075,10 @@ def main(params=[]):
             saveTitles(config=config, titles=titles)
             generateXMLDump(config=config, titles=titles)
         if config['images']:
-            images += getImageFilenamesURL(config=config)
+            if config['api']:
+                images += getImageFilenamesURLAPI(config=config)
+            else:
+                images += getImageFilenamesURL(config=config)
             saveImageFilenamesURL(config=config, images=images)
             generateImageDump(config=config, other=other, images=images)
         if config['logs']:
@@ -1028,4 +1115,4 @@ def main(params=[]):
     bye()
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file