From d4eed1f738b836b5553109844f3872d27215b691 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Emilio=20J=2E=20Rodr=C3=ADguez-Posada?=
Date: Mon, 30 Jun 2014 20:03:32 +0200
Subject: [PATCH] fixing #127 and #134: now works with APIs that return a
 'name' field for images and those that don't (in that case we unquote over
 ASCII); also fixing a bug that re-downloaded the image list when it was
 already complete

---
 dumpgenerator.py | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 38bc50b..60f441e 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -552,8 +552,8 @@ def saveImageFilenamesURL(config={}, images=[]):
     imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
     imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
-    imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
-    imagesfile.write('\n--END--')
+    output = u"%s\n--END--" % (u'\n'.join([u'%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
+    imagesfile.write(output.encode('utf-8'))
     imagesfile.close()
 
     print 'Image filenames and URLs saved at...', imagesfilename
@@ -572,9 +572,9 @@ def getImageFilenamesURL(config={}):
         req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
         f = urllib2.urlopen(req)
         if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
-            raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+            raw = unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8')
         else:
-            raw = f.read()
+            raw = unicode(f.read(), 'utf-8')
         f.close()
         delay(config=config)
         if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
@@ -618,7 +618,7 @@ def getImageFilenamesURL(config={}):
             if url[0] == '/': #slash is added later
                 url = url[1:]
             domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
-            url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
+            url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
             url = undoHTMLEntities(text=url)
             #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
             url = re.sub(' ', '_', url)
@@ -669,17 +669,17 @@ def getImageFilenamesURLAPI(config={}):
                 print 'Please, resume the dump, --resume'
                 sys.exit()
         if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
-            jsonimages = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read())
+            jsonimages = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8'))
         else:
-            jsonimages = json.loads(f.read())
+            jsonimages = json.loads(unicode(f.read(), 'utf-8'))
         f.close()
-        print jsonimages
+        #print jsonimages
         delay(config=config)
         aifrom = ''
         if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
-            if jsontitles['query-continue']['allimages'].has_key('aicontinue'):
+            if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
                 aifrom = jsonimages['query-continue']['allimages']['aicontinue']
-            elif jsontitles['query-continue']['allimages'].has_key('aifrom'):
+            elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
                 aifrom = jsonimages['query-continue']['allimages']['aifrom']
         #print aifrom
 
@@ -689,9 +689,13 @@ def getImageFilenamesURLAPI(config={}):
             if url[0] == '/': #slash is added later
                 url = url[1:]
                 domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
-                url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
+                url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
             url = re.sub(' ', '_', url)
-            filename = re.sub('_', ' ', url.split('/')[-1])
+            if image.has_key('name'):
+                filename = re.sub('_', ' ', image['name'])
+            else:
+                #some tips http://stackoverflow.com/questions/5139249/python-url-unquote-unicode
+                filename = re.sub('_', ' ', unicode(urllib2.unquote(url.encode('ascii')).split('/')[-1], 'utf-8'))
             uploader = re.sub('_', ' ', image['user'])
             images.append([filename, url, uploader])
@@ -750,11 +754,12 @@ def generateImageDump(config={}, other={}, images=[], start=''):
             class URLopenerUserAgent(urllib.FancyURLopener):
                 version = "%s" % getUserAgent()
             urllib._urlopener = URLopenerUserAgent()
-            urllib.urlretrieve(url=url, filename='%s/%s' % (imagepath, filename2) )
+            filename3 = u'%s/%s' % (imagepath, filename2)
+            urllib.urlretrieve(url=url, filename=filename3.encode('utf-8'))
             # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?
 
             #saving description if any
-            xmlfiledesc = getXMLFileDesc(config=config, title='Image:%s' % (filename)) # use Image: for backwards compatibility
+            xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename)) # use Image: for backwards compatibility
             f = open('%s/%s.desc' % (imagepath, filename2), 'w')
             if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
                 #failure when retrieving desc? then save it as empty .desc
@@ -1185,7 +1190,7 @@ def resumePreviousDump(config={}, other={}):
         lastimage = ''
         try:
             f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
-            raw = f.read()
+            raw = unicode(f.read(), 'utf-8').strip()
             lines = raw.split('\n')
             for l in lines:
                 if re.search(r'\t', l):
@@ -1194,7 +1199,7 @@ def resumePreviousDump(config={}, other={}):
             f.close()
         except:
             pass #probably file doesnot exists
-        if lastimage == '--END--':
+        if lastimage == u'--END--':
             print 'Image list was completed in the previous session'
         else:
             print 'Image list is incomplete. Reloading...'
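
Note: a minimal standalone sketch of the filename fallback this patch adds to
getImageFilenamesURLAPI, assuming a Python 2 environment; the function name and
the sample URL below are hypothetical, real values come from the allimages API:

    # -*- coding: utf-8 -*-
    import re
    import urllib2

    def image_filename(image, url):
        if image.has_key('name'):
            # the API response carries the canonical image name directly
            return re.sub('_', ' ', image['name'])
        # no 'name' field: percent-decode the last URL segment over ASCII
        # bytes, then interpret the decoded bytes as UTF-8 (see
        # http://stackoverflow.com/questions/5139249/python-url-unquote-unicode)
        last = urllib2.unquote(url.encode('ascii')).split('/')[-1]
        return re.sub('_', ' ', unicode(last, 'utf-8'))

    # hypothetical example URL with a percent-encoded UTF-8 filename
    print image_filename({}, u'http://wiki.example.org/images/Mapa_f%C3%ADsico.png')
    # -> Mapa físico.png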