From d07a14cbce4e60a172c8d9c307ec865c108d91c6 Mon Sep 17 00:00:00 2001
From: Federico Leva <federicoleva@tiscali.it>
Date: Sun, 20 May 2018 00:00:27 +0300
Subject: [PATCH] New version of uploader.py with support for a separate
 dump directory

Listing the dump directory once with os.listdir is also much faster than
using os.walk, which lists all the images in all wikidump directories.
---
 uploader.py | 150 +++++++++++++++++++++++-----------------------------
 1 file changed, 66 insertions(+), 84 deletions(-)

diff --git a/uploader.py b/uploader.py
index 4ae3e07..99d7d67 100644
--- a/uploader.py
+++ b/uploader.py
@@ -16,6 +16,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import getopt
+import argparse
 import os
 import re
 import subprocess
@@ -30,89 +31,41 @@ from internetarchive import get_item
 
 import dumpgenerator
 
-# Configuration goes here
 # You need a file named keys.txt with access and secret keys, in two different lines
 accesskey = open('keys.txt', 'r').readlines()[0].strip()
 secretkey = open('keys.txt', 'r').readlines()[1].strip()
-# Use --admin if you are a wikiteam collection admin, or specify another collection:
-collection = 'opensource'
 
 # Nothing to change below
 convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
-listfile = sys.argv[1]
-uploadeddumps = []
-try:
-    uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
-except:
-    pass
-print '%d dumps uploaded previously' % (len(uploadeddumps))
-
-def getParameters(params=[]):
-    if not params:
-        params = sys.argv[2:]
-    config = {
-        'prune-directories': False,
-        'prune-wikidump': False,
-        'collection': collection,
-        'update': False,
-    }
-    #console params
-    try:
-        opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
-    except getopt.GetoptError, err:
-        # print help information and exit:
-        print str(err) # will print something like "option -a not recognized"
-        usage()
-        sys.exit(2)
-    for o, a in opts:
-        if o in ("-h","--help"):
-            usage()
-            sys.exit()
-        elif o in ("--prune-directories"):
-            config['prune-directories'] = True
-        elif o in ("--prune-wikidump"):
-            config['prune-wikidump'] = True
-        elif o in ("--admin"):
-            config['collection'] = "wikiteam"
-        elif o in ("--update"):
-            config['update'] = True
-    return config
-
-def usage():
-    """  """
-    print """uploader.py
 
-This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
-The list must be a text file with the wiki's api.php URLs, one per line.
-Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
-as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
-You need a file named keys.txt with access and secret keys, in two different lines
-You also need dumpgenerator.py in the same directory as this script.
-
-Use --help to print this help."""
-
-def log(wiki, dump, msg):
-    f = open('uploader-%s.log' % (listfile), 'a')
+def log(wiki, dump, msg, config={}):
+    f = open('uploader-%s.log' % (config.listfile), 'a')
     f.write('\n%s;%s;%s' % (wiki, dump, msg))
     f.close()
 
-def upload(wikis, config={}):
+def upload(wikis, config={}, uploadeddumps=[]):
     headers = {'User-Agent': dumpgenerator.getUserAgent()}
+    dumpdir = config.wikidump_dir
 
+    filelist = os.listdir(dumpdir)
     for wiki in wikis:
         print "#"*73
         print "# Uploading", wiki
         print "#"*73
         wiki = wiki.lower()
-        prefix = dumpgenerator.domain2prefix(config={'api': wiki})
+        configtemp = config
+        try:
+            prefix = dumpgenerator.domain2prefix(config={'api': wiki})
+        except KeyError:
+            print "ERROR: could not produce the prefix for %s" % wiki
+        config = configtemp
 
         wikiname = prefix.split('-')[0]
         dumps = []
-        for dirname, dirnames, filenames in os.walk('.'):
-            if dirname == '.':
-                for f in filenames:
-                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
-                        dumps.append(f)
+        for f in filelist:
+            if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
+                print "%s found" % f
+                dumps.append(f)
                 break
 
         c = 0
@@ -120,30 +73,33 @@ def upload(wikis, config={}):
             wikidate = dump.split('-')[1]
             item = get_item('wiki-' + wikiname)
             if dump in uploadeddumps:
-                if config['prune-directories']:
+                if config.prune_directories:
                     rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                     # With -f the deletion might have happened before and we won't know
                     if not os.system(rmline):
                         print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
-                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
+                if config.prune_wikidump and dump.endswith('wikidump.7z'):
                         # Simplistic quick&dirty check for the presence of this file in the item
-                        stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+                        print "Checking content in previously uploaded files"
+                        stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                         dumphash = re.sub(' +.+\n?', '', stdout)
 
                         if dumphash in map(lambda x: x['md5'], item.files):
-                            log(wiki, dump, 'verified')
-                            rmline='rm -rf %s' % dump
+                            log(wiki, dump, 'verified', config)
+                            rmline='rm -rf %s' % dumpdir + '/' + dump
                             if not os.system(rmline):
-                                print 'DELETED ' + dump
+                                print 'DELETED ' + dumpdir + '/' + dump
                             print '%s was uploaded before, skipping...' % (dump)
                             continue
                         else:
                             print 'ERROR: The online item misses ' + dump
-                            log(wiki, dump, 'missing')
+                            log(wiki, dump, 'missing', config)
                             # We'll exit this if and go upload the dump
                 else:
                     print '%s was uploaded before, skipping...' % (dump)
                     continue
+            else:
+                print '%s was not uploaded before' % dump
 
             time.sleep(0.1)
             wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
@@ -155,7 +111,7 @@ def upload(wikis, config={}):
             # Logo path
             logourl = ''
 
-            if ismissingitem or config['update']:
+            if ismissingitem or config.update:
                 #get metadata from api.php
                 #first sitename and base url
                 params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
@@ -163,7 +119,7 @@ def upload(wikis, config={}):
                 req = urllib2.Request(url=wiki, data=data, headers=headers)
                 xml = ''
                 try:
-                    f = urllib2.urlopen(req)
+                    f = urllib2.urlopen(req, timeout=10)
                     xml = f.read()
                     f.close()
                 except:
@@ -198,7 +154,7 @@ def upload(wikis, config={}):
                 req = urllib2.Request(url=wiki, data=data, headers=headers)
                 xml = ''
                 try:
-                    f = urllib2.urlopen(req)
+                    f = urllib2.urlopen(req, timeout=10)
                     xml = f.read()
                     f.close()
                 except:
@@ -214,7 +170,7 @@ def upload(wikis, config={}):
 
                 raw = ''
                 try:
-                    f = urllib.urlopen(baseurl)
+                    f = urllib.urlopen(baseurl, timeout=10)
                     raw = f.read()
                     f.close()
                 except:
@@ -238,7 +194,6 @@ def upload(wikis, config={}):
                     logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
                 except:
                     pass
-                print logourl
 
                 #retrieve some info from the wiki
                 wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
@@ -264,7 +219,7 @@ def upload(wikis, config={}):
                 # Item metadata
                 md = {
                     'mediatype': 'web',
-                    'collection': config['collection'],
+                    'collection': config.collection,
                     'title': wikititle,
                     'description': wikidesc,
                     'language': lang,
@@ -277,25 +232,52 @@ def upload(wikis, config={}):
 
             #Upload files and update metadata
             try:
-                item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
+                item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
                 item.modify_metadata(md) # update
                 print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
+                uploadeddumps.append(dump)
+                log(wiki, dump, 'ok', config)
                 if logourl:
-                    logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
+                    logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
                     logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
                     logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
                     item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
-                uploadeddumps.append(dump)
-                log(wiki, dump, 'ok')
-            except:
-                print wiki, dump, 'error when uploading?'
+            except Exception as e:
+                print wiki, dump, 'Error when uploading?'
+                print e.message
 
             c += 1
 
 def main(params=[]):
-    config = getParameters(params=params)
+    parser = argparse.ArgumentParser("""uploader.py
+
+This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
+The list must be a text file with the wiki's api.php URLs, one per line.
+Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
+as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
+You need a file named keys.txt with access and secret keys, in two different lines
+You also need dumpgenerator.py in the same directory as this script.
+
+Use --help to print this help.""")
+
+    parser.add_argument('-pd', '--prune_directories', action='store_true')
+    parser.add_argument('-pw', '--prune_wikidump', action='store_true')
+    parser.add_argument('-a', '--admin', action='store_true')
+    parser.add_argument('-c', '--collection', default='opensource')
+    parser.add_argument('-wd', '--wikidump_dir', default='.')
+    parser.add_argument('-u', '--update', action='store_true')
+    parser.add_argument('listfile')
+    config = parser.parse_args()
+    uploadeddumps = []
+    listfile = config.listfile
+    try:
+        uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
+    except:
+        pass
+    print '%d dumps uploaded previously' % (len(uploadeddumps))
     wikis = open(listfile, 'r').read().strip().splitlines()
-    upload(wikis, config)
+
+    upload(wikis, config, uploadeddumps)
 
 if __name__ == "__main__":
     main()