From d07a14cbce4e60a172c8d9c307ec865c108d91c6 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Sun, 20 May 2018 00:00:27 +0300 Subject: [PATCH] New version of uploader.py with possibility of separate directory Also much faster than using os.walk, which lists all the images in all wikidump directories. --- uploader.py | 150 +++++++++++++++++++++++----------------------------- 1 file changed, 66 insertions(+), 84 deletions(-) diff --git a/uploader.py b/uploader.py index 4ae3e07..99d7d67 100644 --- a/uploader.py +++ b/uploader.py @@ -16,6 +16,7 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. import getopt +import argparse import os import re import subprocess @@ -30,89 +31,41 @@ from internetarchive import get_item import dumpgenerator -# Configuration goes here # You need a file named keys.txt with access and secret keys, in two different lines accesskey = open('keys.txt', 'r').readlines()[0].strip() secretkey = open('keys.txt', 'r').readlines()[1].strip() -# Use --admin if you are a wikiteam collection admin, or specify another collection: -collection = 'opensource' # Nothing to change below convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'} -listfile = sys.argv[1] -uploadeddumps = [] -try: - uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1] -except: - pass -print '%d dumps uploaded previously' % (len(uploadeddumps)) - -def getParameters(params=[]): - if not params: - params = sys.argv[2:] - config = { - 'prune-directories': False, - 'prune-wikidump': False, - 'collection': collection, - 'update': False, - } - #console params - try: - opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"]) - except getopt.GetoptError, err: - # print help information and exit: - print str(err) # will 
print something like "option -a not recognized" - usage() - sys.exit(2) - for o, a in opts: - if o in ("-h","--help"): - usage() - sys.exit() - elif o in ("--prune-directories"): - config['prune-directories'] = True - elif o in ("--prune-wikidump"): - config['prune-wikidump'] = True - elif o in ("--admin"): - config['collection'] = "wikiteam" - elif o in ("--update"): - config['update'] = True - return config - -def usage(): - """ """ - print """uploader.py -This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. -The list must be a text file with the wiki's api.php URLs, one per line. -Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format -as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ). -You need a file named keys.txt with access and secret keys, in two different lines -You also need dumpgenerator.py in the same directory as this script. - -Use --help to print this help.""" - -def log(wiki, dump, msg): - f = open('uploader-%s.log' % (listfile), 'a') +def log(wiki, dump, msg, config={}): + f = open('uploader-%s.log' % (config.listfile), 'a') f.write('\n%s;%s;%s' % (wiki, dump, msg)) f.close() -def upload(wikis, config={}): +def upload(wikis, config={}, uploadeddumps=[]): headers = {'User-Agent': dumpgenerator.getUserAgent()} + dumpdir = config.wikidump_dir + filelist = os.listdir(dumpdir) for wiki in wikis: print "#"*73 print "# Uploading", wiki print "#"*73 wiki = wiki.lower() - prefix = dumpgenerator.domain2prefix(config={'api': wiki}) + configtemp = config + try: + prefix = dumpgenerator.domain2prefix(config={'api': wiki}) + except KeyError: + print "ERROR: could not produce the prefix for %s" % wiki + config = configtemp wikiname = prefix.split('-')[0] dumps = [] - for dirname, dirnames, filenames in os.walk('.'): - if dirname == '.': - for f in filenames: - if f.startswith('%s-' % (wikiname)) and 
(f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')): - dumps.append(f) + for f in filelist: + if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')): + print "%s found" % f + dumps.append(f) break c = 0 @@ -120,30 +73,33 @@ def upload(wikis, config={}): wikidate = dump.split('-')[1] item = get_item('wiki-' + wikiname) if dump in uploadeddumps: - if config['prune-directories']: + if config.prune_directories: rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate) # With -f the deletion might have happened before and we won't know if not os.system(rmline): print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate) - if config['prune-wikidump'] and dump.endswith('wikidump.7z'): + if config.prune_wikidump and dump.endswith('wikidump.7z'): # Simplistic quick&dirty check for the presence of this file in the item - stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + print "Checking content in previously uploaded files" + stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() dumphash = re.sub(' +.+\n?', '', stdout) if dumphash in map(lambda x: x['md5'], item.files): - log(wiki, dump, 'verified') - rmline='rm -rf %s' % dump + log(wiki, dump, 'verified', config) + rmline='rm -rf %s' % dumpdir + '/' + dump if not os.system(rmline): - print 'DELETED ' + dump + print 'DELETED ' + dumpdir + '/' + dump print '%s was uploaded before, skipping...' % (dump) continue else: print 'ERROR: The online item misses ' + dump - log(wiki, dump, 'missing') + log(wiki, dump, 'missing', config) # We'll exit this if and go upload the dump else: print '%s was uploaded before, skipping...' 
% (dump) continue + else: + print '%s was not uploaded before' % dump time.sleep(0.1) wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8] @@ -155,7 +111,7 @@ def upload(wikis, config={}): # Logo path logourl = '' - if ismissingitem or config['update']: + if ismissingitem or config.update: #get metadata from api.php #first sitename and base url params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'} @@ -163,7 +119,7 @@ def upload(wikis, config={}): req = urllib2.Request(url=wiki, data=data, headers=headers) xml = '' try: - f = urllib2.urlopen(req) + f = urllib2.urlopen(req, timeout=10) xml = f.read() f.close() except: @@ -198,7 +154,7 @@ def upload(wikis, config={}): req = urllib2.Request(url=wiki, data=data, headers=headers) xml = '' try: - f = urllib2.urlopen(req) + f = urllib2.urlopen(req, timeout=10) xml = f.read() f.close() except: @@ -214,7 +170,7 @@ def upload(wikis, config={}): raw = '' try: - f = urllib.urlopen(baseurl) + f = urllib.urlopen(baseurl, timeout=10) raw = f.read() f.close() except: @@ -238,7 +194,6 @@ def upload(wikis, config={}): logourl = re.findall(ur'p-logo["\'][^>]*>\s*]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0] except: pass - print logourl #retrieve some info from the wiki wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia @@ -264,7 +219,7 @@ def upload(wikis, config={}): # Item metadata md = { 'mediatype': 'web', - 'collection': config['collection'], + 'collection': config.collection, 'title': wikititle, 'description': wikidesc, 'language': lang, @@ -277,25 +232,52 @@ def upload(wikis, config={}): #Upload files and update metadata try: - item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True) + item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False) item.modify_metadata(md) # update print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname) + uploadeddumps.append(dump) + log(wiki, dump, 
'ok', config) if logourl: - logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read()) + logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read()) logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown' logo.name = 'wiki-' + wikiname + '_logo.' + logoextension item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True) - uploadeddumps.append(dump) - log(wiki, dump, 'ok') - except: - print wiki, dump, 'error when uploading?' + except Exception as e: + print wiki, dump, 'Error when uploading?' + print e.message c += 1 def main(params=[]): - config = getParameters(params=params) + parser = argparse.ArgumentParser("""uploader.py + +This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. +The list must be a text file with the wiki's api.php URLs, one per line. +Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format +as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ). +You need a file named keys.txt with access and secret keys, in two different lines +You also need dumpgenerator.py in the same directory as this script. 
+ +Use --help to print this help.""") + + parser.add_argument('-pd', '--prune_directories', action='store_true') + parser.add_argument('-pw', '--prune_wikidump', action='store_true') + parser.add_argument('-a', '--admin', action='store_true') + parser.add_argument('-c', '--collection', default='opensource') + parser.add_argument('-wd', '--wikidump_dir', default='.') + parser.add_argument('-u', '--update', action='store_true') + parser.add_argument('listfile') + config = parser.parse_args() + uploadeddumps = [] + listfile = config.listfile + try: + uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1] + except: + pass + print '%d dumps uploaded previously' % (len(uploadeddumps)) wikis = open(listfile, 'r').read().strip().splitlines() - upload(wikis, config) + + upload(wikis, config, uploadeddumps) if __name__ == "__main__": main()