From 1c1f0dbb86c2aa6eeb21f6c18f8d7e4ffe79b93e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Emilio=20J=2E=20Rodr=C3=ADguez-Posada?= <emijrp@gmail.com>
Date: Sun, 29 Jun 2014 10:01:09 +0200
Subject: [PATCH] replacing XML with JSON in image downloading

---
 dumpgenerator.py | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)
diff --git a/dumpgenerator.py b/dumpgenerator.py
index 1690825..db7ba04 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -654,7 +654,7 @@ def getImageFilenamesURLAPI(config={}):
     images = []
     while aifrom:
         sys.stderr.write('.') #progress
-        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'xml', 'ailimit': 500}
+        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} #ask for JSON; the reply is parsed with json.loads
         data = urllib.urlencode(params)
         req = urllib2.Request(url=config['api'], data=data, headers=headers)
         try:
@@ -669,35 +669,31 @@ def getImageFilenamesURLAPI(config={}):
                 print 'Please, resume the dump, --resume'
                 sys.exit()
         if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
-            xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+            jsonimages = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()) #gzip-compressed reply: inflate, then parse
         else:
-            xml = f.read()
+            jsonimages = json.loads(f.read())
         f.close()
+        #print jsonimages #debug only; would dump the whole API reply on every request
         delay(config=config)
-        # Match the query-continue, old and new format
-        m = re.findall(r'<allimages (?:aicontinue|aifrom)="([^>]+)" />', xml)
-        if m:
-            aifrom = undoHTMLEntities(text=m[0]) #&quot; = ", etc
-        else:
-            aifrom = ''
-        m = re.compile(r'(?im)<img name="(?P<filename>[^"]+)"[^>]*user="(?P<uploader>[^"]+)"[^>]* url="(?P<url>[^"]+)"[^>]*/>').finditer(xml) # Retrieves a filename, uploader, url triple from the name, user, url field of the xml line; space before url needed to avoid getting the descriptionurl field instead.
-        for i in m:
-            url = i.group('url')
+        aifrom = '' #stays empty when the reply has no continuation, which ends the while loop
+        if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): #match the query-continue, old and new format
+            if jsonimages['query-continue']['allimages'].has_key('aicontinue'): #aicontinue is checked first
+                aifrom = jsonimages['query-continue']['allimages']['aicontinue']
+            elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
+                aifrom = jsonimages['query-continue']['allimages']['aifrom']
+        #print aifrom
+        
+        for image in jsonimages['query']['allimages']:
+            url = image['url']
             if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
                 if url[0] == '/': #slash is added later
                     url = url[1:]
                 domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
                 url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
-            url = undoHTMLEntities(text=url)
-            #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
             url = re.sub(' ', '_', url)
-            filename = re.sub('_', ' ', i.group('filename'))
-            filename = undoHTMLEntities(text=filename)
-            filename = urllib.unquote(filename)
-            uploader = re.sub('_', ' ', i.group('uploader'))
-            uploader = undoHTMLEntities(text=uploader)
-            uploader = urllib.unquote(uploader)
-            images.append([filename, url, uploader])           
+            filename = re.sub('_', ' ', url.split('/')[-1]) #last path segment of the URL; NOTE(review): taken from the URL and no longer unquoted, so it may still be percent-encoded - confirm
+            uploader = re.sub('_', ' ', image['user'])
+            images.append([filename, url, uploader])
 
     if (len(images) == 1):
         print '    Found 1 image'